Redesigned SIMD module and unit tests.

author Erik Lindahl <erik@kth.se>

Wed, 22 Jan 2014 17:30:10 +0000 (18:30 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 26 Feb 2014 10:52:57 +0000 (11:52 +0100)
author Erik Lindahl <erik@kth.se>
Wed, 22 Jan 2014 17:30:10 +0000 (18:30 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 26 Feb 2014 10:52:57 +0000 (11:52 +0100)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index cce50946949e8b94e0dcc813cfa27afc6e401d6a..a450d1a7be9159948916514d8346333bee4fb2f1 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,12 +192,14 @@ include(gmxManageGPU)
  # SIMD.
  include(gmxDetectTargetArchitecture)
  gmx_detect_target_architecture()
+
+if(GMX_CPU_ACCELERATION)
+    # Stay compatible with old Jenkins command line options for specific SIMD acceleration
+    set(GMX_SIMD "${GMX_CPU_ACCELERATION}" CACHE STRING "SIMD instruction set level and compiler optimization" FORCE)
+endif(GMX_CPU_ACCELERATION)
+
  include(gmxDetectSimd)
  gmx_detect_simd(GMX_SUGGESTED_SIMD)
-if("${GMX_SUGGESTED_SIMD}" STREQUAL "AVX2_256")
-    message(STATUS "Changing acceleration from AVX2 to AVX (until AVX2 patches commited).")
-    set(GMX_SUGGESTED_SIMD "AVX_256")
-endif()
  
  gmx_option_multichoice(
      GMX_SIMD
@@ -229,8 +231,18 @@ gmx_option_multichoice(
      None
      none gaussian mopac gamess orca)
  
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_SIMD STREQUAL REFERENCE")
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_SIMD_REF_FLOAT_WIDTH  "Reference SIMD single precision width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_SIMD_REF_DOUBLE_WIDTH "Reference SIMD double precision width" STRING "2" "GMX_SIMD STREQUAL REFERENCE")
+
+# This should be moved to a separate NBNXN cmake module when that code is cleaned up and modularized
+
+if("${GMX_SIMD}" STREQUAL "REFERENCE")
+    if(GMX_DOUBLE)
+        set(KERNEL_WIDTH ${GMX_SIMD_REF_DOUBLE_WIDTH})
+    else()
+        set(KERNEL_WIDTH ${GMX_SIMD_REF_FLOAT_WIDTH})
+    endif()
+endif()
  
  option(GMX_BROKEN_CALLOC "Work around broken calloc()" OFF)
  mark_as_advanced(GMX_BROKEN_CALLOC)
@@ -672,15 +684,6 @@ endif()
  if(HAVE_LIBM)
      list(APPEND GMX_EXTRA_LIBRARIES m)
  endif(HAVE_LIBM)
-if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene")
-    check_library_exists(mass_simd atan2f4 "" HAVE_MASS_SIMD)
-    if(HAVE_MASS_SIMD)
-        list(APPEND GMX_EXTRA_LIBRARIES mass_simd)
-    else()
-        message(FATAL_ERROR "Could not link to the SIMD version of the IBM MASS library. Please adjust your CMAKE_PREFIX_PATH to contain it")
-    endif()
-endif()
-
  
  option(GMX_NACL "Configure for Native Client builds" OFF)
  if (GMX_NACL)
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake

index 9d910073dc099e5f4757deb67c3e448e0b5c5f97..bdf4e708e7509e7412b26b12f768fb32a6a91aa6 100644 (file)
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -80,8 +80,6 @@ elseif(${GMX_SIMD} STREQUAL "SSE2")
      endif()
  
      set(GMX_SIMD_X86_SSE2 1)
-    set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
-
      set(SIMD_STATUS_MESSAGE "Enabling SSE2 SIMD instructions")
  
  elseif(${GMX_SIMD} STREQUAL "SSE4.1")
@@ -108,8 +106,6 @@ elseif(${GMX_SIMD} STREQUAL "SSE4.1")
      endif()
  
      set(GMX_SIMD_X86_SSE4_1 1)
-    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
-    set(GMX_SIMD_X86_SSE2_OR_HIGHER   1)
      set(SIMD_STATUS_MESSAGE "Enabling SSE4.1 SIMD instructions")
  
  elseif(${GMX_SIMD} STREQUAL "AVX_128_FMA")
@@ -203,10 +199,6 @@ int main(){__m128 x=_mm_set1_ps(0.5);x=_mm_frcz_ps(x);return 0;}"
      gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
  
      set(GMX_SIMD_X86_AVX_128_FMA 1)
-    set(GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER 1)
-    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER      1)
-    set(GMX_SIMD_X86_SSE2_OR_HIGHER        1)
-
      set(SIMD_STATUS_MESSAGE "Enabling 128-bit AVX SIMD Gromacs SIMD (with fused-multiply add)")
  
  elseif(${GMX_SIMD} STREQUAL "AVX_256")
@@ -231,17 +223,10 @@ elseif(${GMX_SIMD} STREQUAL "AVX_256")
      gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
  
      set(GMX_SIMD_X86_AVX_256 1)
-    set(GMX_SIMD_X86_AVX_256_OR_HIGHER  1)
-    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER   1)
-    set(GMX_SIMD_X86_SSE2_OR_HIGHER     1)
-
      set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX SIMD instructions")
  
  elseif(${GMX_SIMD} STREQUAL "AVX2_256")
  
-    # Comment out this line for AVX2 development
-    message(FATAL_ERROR "AVX2_256 is disabled until the implementation has been commited.")
-
      gmx_use_clang_as_with_gnu_compilers_on_osx()
  
      gmx_find_cflag_for_source(CFLAGS_AVX2 "C compiler AVX2 flag"
@@ -262,11 +247,6 @@ elseif(${GMX_SIMD} STREQUAL "AVX2_256")
      # No need to test for Maskload bug - it was fixed before gcc added AVX2 support
  
      set(GMX_SIMD_X86_AVX2_256 1)
-    set(GMX_SIMD_X86_AVX2_256_OR_HIGHER 1)
-    set(GMX_SIMD_X86_AVX_256_OR_HIGHER  1)
-    set(GMX_SIMD_X86_SSE4_1_OR_HIGHER   1)
-    set(GMX_SIMD_X86_SSE2_OR_HIGHER     1)
-
      set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions")
  
  elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
@@ -290,22 +270,18 @@ elseif(${GMX_SIMD} STREQUAL "SPARC64_HPC_ACE")
  
  elseif(${GMX_SIMD} STREQUAL "REFERENCE")
  
-    add_definitions(-DGMX_SIMD_REFERENCE)
-    if(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "4xn")
-        if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "2" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "4" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8")
-            add_definitions(-DGMX_NBNXN_SIMD_4XN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH})
-        else()
-            message(FATAL_ERROR "Unsupported width for 4xn reference kernels")
-        endif()
-    elseif(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "2xnn")
-        if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "16")
-            add_definitions(-DGMX_NBNXN_SIMD_2XNN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH})
-        else()
-            message(FATAL_ERROR "Unsupported width for 2xn reference kernels")
-        endif()
-    else()
-        message(FATAL_ERROR "Unsupported kernel type")
+    # NB: This file handles settings for the SIMD module, so in the interest 
+    # of proper modularization, please do NOT put any verlet kernel settings in this file.
+
+    if(GMX_SIMD_REF_FLOAT_WIDTH)
+        add_definitions(-DGMX_SIMD_REF_FLOAT_WIDTH=${GMX_SIMD_REF_FLOAT_WIDTH})
      endif()
+    if(GMX_SIMD_REF_DOUBLE_WIDTH)
+       add_definitions(-DGMX_SIMD_REF_DOUBLE_WIDTH=${GMX_SIMD_REF_DOUBLE_WIDTH})
+    endif()
+
+    set(GMX_SIMD_REFERENCE 1)
+    set(SIMD_STATUS_MESSAGE "Enabling reference (emulated) SIMD instructions.")
  
  else()
      gmx_invalid_option_value(GMX_SIMD)
diff --git a/doxygen/Doxyfile-common.cmakein b/doxygen/Doxyfile-common.cmakein

index 44a3a185dadd401dca769940cc50af4a45fa6487..e8c132e26fe1dc28f58772258f32cf0db2d69b69 100644 (file)
--- a/doxygen/Doxyfile-common.cmakein
+++ b/doxygen/Doxyfile-common.cmakein
@@ -26,10 +26,14 @@ MSCGEN_PATH            = @DOXYGEN_MSCGEN_PATH@
  
  ENABLED_SECTIONS       = @DOXYGEN_SECTIONS@
  
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
  # Extract documentation also for code in headers within #ifdef __cplusplus
  PREDEFINED             = __cplusplus
  # This is for thread_mpi to #ifdef some code out that should not be documented.
  PREDEFINED            += DOXYGEN
+# This makes 'static gmx_inline' functions appear better in the documentation.
+PREDEFINED            += gmx_inline=inline
  # This is for parser.cpp to make it produce code that Doxygen understands
  # and that does not have unnecessary function declarations.
  PREDEFINED            += __STDC__ YYMALLOC=malloc YYFREE=free
diff --git a/doxygen/Doxyfile-lib.cmakein b/doxygen/Doxyfile-lib.cmakein

index 497e358c1065df41bc372c52c06555893eb03d09..f2e40f9b15f1bfd282d1d06a129b207692c841f1 100644 (file)
--- a/doxygen/Doxyfile-lib.cmakein
+++ b/doxygen/Doxyfile-lib.cmakein
@@ -2,6 +2,8 @@
  
  ENABLED_SECTIONS      += libapi
  INTERNAL_DOCS          = NO
+# This includes 'static inline' functions from headers in the documentation.
+EXTRACT_STATIC         = YES
  HIDE_UNDOC_CLASSES     = YES
  WARN_LOGFILE           = doxygen-lib.log
  HTML_OUTPUT            = html-lib
diff --git a/doxygen/Doxyfile-user.cmakein b/doxygen/Doxyfile-user.cmakein

index da37fa9c4e11d83f8180614d8af735e06e7c7d4b..dbdd555e058836d360b6ccb386200ee1ebb66752 100644 (file)
--- a/doxygen/Doxyfile-user.cmakein
+++ b/doxygen/Doxyfile-user.cmakein
@@ -7,6 +7,7 @@ EXCLUDE               += @CMAKE_SOURCE_DIR@/src/testutils
  EXCLUDE               += @CMAKE_SOURCE_DIR@/doxygen/doxygen.md
  EXCLUDE               += @CMAKE_SOURCE_DIR@/doxygen/unittesting.md
  EXCLUDE               += @CMAKE_SOURCE_DIR@/doxygen/wrapperbinary.md
+EXCLUDE               += @CMAKE_SOURCE_DIR@/doxygen/simd.md
  
  INTERNAL_DOCS          = NO
  HIDE_UNDOC_CLASSES     = YES
diff --git a/doxygen/directories.cpp b/doxygen/directories.cpp

index 738a835ceba11b61f29be521f397fa42d268eeae..eeab018d40194623a141a4af813e5f2d6079079d 100644 (file)
--- a/doxygen/directories.cpp
+++ b/doxygen/directories.cpp
@@ -87,6 +87,19 @@ Doxygen documentation file for directories in the source tree.
  \ingroup module_selection
   */
  
+/*! \libinternal
+\dir src/gromacs/simd
+\brief \ref module_simd
+ 
+\ingroup module_simd
+ */
+/*! \libinternal
+\dir src/gromacs/simd/tests
+\brief Unit tests for \ref module_simd
+ 
+\ingroup module_simd
+ */
+
  /*!
  \dir src/gromacs/trajectoryanalysis
  \brief \ref module_trajectoryanalysis
diff --git a/doxygen/mainpage.md b/doxygen/mainpage.md

index e395782212de2ea078f45660e51e6dd3e5941acf..217cf5f8c81c433a17376c3c21ff177b1f67c4dd 100644 (file)
--- a/doxygen/mainpage.md
+++ b/doxygen/mainpage.md
@@ -66,6 +66,9 @@ give an overview of some of the topics that are documented:
     Provides an overview of unit testing in \Gromacs.
   - \subpage page_wrapperbinary <br/>
     Provides an overview of how the `gmx` wrapper binary is implemented.
+ - \subpage page_simd <br/>
+   Documentation about the new SIMD module that makes it possible to write
+   highly accelerated CPU code that is still portable.
   - \subpage thread_mpi <br/>
     This code is used internally for threading support, and also provides a
     (partial) MPI implementation that allows compiling a "thread-MPI" version of
diff --git a/doxygen/simd.md b/doxygen/simd.md

new file mode 100644 (file)

index 0000000..71eb57f
--- /dev/null
+++ b/doxygen/simd.md
@@ -0,0 +1,502 @@
+Single-instruction Multiple-data (SIMD) coding {#page_simd}
+==============================================
+
+Coding with SIMD instructions
+=============================
+
+One important way for \Gromacs to achieve high performance is
+to use modern hardware capabilities where a single assembly
+instruction operates on multiple data units, essentially short
+fixed-length vectors (usually 2,4,8, or 16 elements). This provides
+a very efficient way for the CPU to increase floating-point
+performance, but it is much less versatile than general purpose
+registers. For this reason it is difficult for the compiler to
+generate efficient SIMD code, so the user has to organize the
+data in a way where it is possible to access it as vectors, and
+these vectors often need to be aligned on cache boundaries.
+
+We have supported a number of different SIMD instruction sets in
+the group kernels for ages, and it is now also present in the
+verlet kernels and a few other places. However, with the increased
+usage and several architectures with different capabilities we now
+use a vendor-agnostic \Gromacs SIMD module, as documented in
+\ref module_simd.
+
+Design of the \Gromacs SIMD module
+==================================
+
+The macros in `src/gromacs/simd` are intended to be used for writing
+architecture-independent SIMD intrinsics code. Rather than making assumptions
+based on architecture, we have introduced a limited number of
+predefined preprocessor macros that describe the capabilities of the
+current implementation - these are the ones you need to check when
+writing SIMD code. As you will see, the functionality exposed by
+this module is typically a small subset of general SIMD implementations,
+and in particular we do not even try to expose advanced shuffling or
+permute operations, simply because we haven't been able to describe those
+in a generic way that can be implemented efficiently regardless of the
+hardware. However, the advantage of this approach is that it is straightforward
+to extend with support for new SIMD instruction sets in the future,
+and that will instantly speed up old code too.
+
+Unfortunately there is no standard for SIMD architectures. The available
+features vary a lot, but we still need to use quite a few of them to
+get the best performance possible. This means some features will only
+be available on certain platforms, and it is critical that we do NOT make
+too many assumptions about the storage formats, their size or SIMD width.
+Just to give a few examples:
+
+- On x86, double precision (64-bit) floating-point values always convert
+  to 32-bit integers, while many other platforms use 64-bit, and some cannot
+  use 32-bit integers at all. This means we cannot use a mask (boolean)
+  derived from integer operations to select double-precision floating-point
+  values, and it could get very complex for higher-level code if all these
+  decisions were exposed. Instead, we want to keep integers 32-bit since
+  all algorithms anyway need to work in single precision (w. 32-bit ints).
+- IBM QPX uses 4-wide SIMD both for single and double precision. Integer
+  support is highly limited, and the storage format means QPX does not
+  use x86-style all-ones masks (which have different widths in single/double)
+  but it uses the sign bit to denote the _false_ value. In particular, this
+  means we cannot use the bit contents for any fancy mask operations.
+- AVX1 only supports 4-wide 128-bit integer SIMD arithmetics, but the integer
+  _conversions_ can still be done 8-wide which corresponds to the single
+  precision floating-point width. Similarly, with AVX1 conversions between
+  double-precision and integers use the 32-bit 4-wide 128-bit registers where
+  we can also do integer arithmetics. AVX2 adds proper arithmetics for
+  8-wide integers. We would severely limit performance if we had to say
+  that integer support was not present, so instead we stick to 32-bit ints
+  but limit the operations we expose (and do shuffling internally).
+- For SSE2 through SSE4.1, double precision is 2-wide, but when we convert
+  to integers they will be put in the first two elements of a 4-wide integer
+  type. This means we cannot assume that floating-point SIMD registers and
+  corresponding integer registers (after conversion) have the same width.
+- The 2-wide SIMD instructions on BlueGene/L and BlueGene/P cannot do any
+  floating-point logical operations (and/andnot/or/xor) whatsoever, which
+  can be a pain when implementing approximations for math functions.
+- Since boolean values can have different width for float/double and the
+  integers corresponding to float/double, we need to use separate boolean
+  types for all these values and convert between them if we e.g. want to use
+  the result of an integer compare to select floating-point values.
+
+While this might sound complicated, it is actually far easier than writing
+separate SIMD code for 10 architectures in both single & double. The point
+is not that you need to remember the limitations above, but it is critical
+that you *never assume anything about the SIMD implementation*. We
+typically implement SIMD support for a new architecture in days with this
+new module, and the extensions required for verlet kernels
+are also very straightforward (group kernels can be more complex, but those
+are gradually on their way out). For the higher-level
+code, the only important thing is to never _assume_ anything about the SIMD
+architecture. Our general strategy in \Gromacs is to split the SIMD coding
+in three levels:
+
+<dl>
+<dt>Base level generic SIMD</dt>
+<dd>
+The base level SIMD module (which we get by including `gromacs/simd/simd.h`)
+provides the API to define and manipulate SIMD datatypes. This will be enough
+for lots of cases, and it is a huge advantage that there is roughly
+parity between different architectures.
+</dd>
+<dt>Larger architecture-specific SIMD functions</dt>
+<dd>
+For some parts of the code this is not enough. In particular, both the
+group and Verlet kernels do insane amounts of floating-point operations,
+and since we spend 85-90% of the time in these kernels it is critical that
+we can optimize them as much as possible. Here, our strategy is first to
+define larger high-level functions that e.g. take a number of distances
+and load the table interactions for this interaction. This way we can
+move this architecture-specific implementation to the SIMD module, and
+both achieve a reasonably clean kernel but still optimize a lot.
+</dd>
+<dt>Architecture-specific kernels (directories/files)</dt>
+<dd>
+When it is absolutely impossible to use a shared implementation we might
+have to code SIMD (just as GPU code). When this happens, we should create a
+subdirectory or otherwise clearly name a file with a suffix for the
+SIMD architecture, to clarify to the user that the SIMD file has a
+direct non-SIMD correspondence. Since this code can be very hard to read,
+it is important to be explicit and use lots of comments - this is not the
+type of code where you should use smart optimization with hundreds of
+preprocessor directives. Keep it simple so other developers can help you
+support it. The question is not whether you can get a function 20% faster,
+but whether it justifies the added complexity of the code.
+</dd>
+</dl>
+
+File organization
+=================
+
+The SIMD module uses a couple of different files:
+
+<dl>
+<dt>`gromacs/simd/simd.h`</dt>
+<dd>
+This is the top-level wrapper that you should always include first.
+It will check the settings made at configuration time and include a
+suitable low-level implementation (that can be either single, double,
+or both). It also contains the routines for memory alignment, and
+based on the current Gromacs precision it will set aliases to 'real'
+SIMD datatypes (see further down) so the implementations do not have
+to care about Gromacs-specific details. However, note that you might
+not get all SIMD support you hoped for: If you compiled Gromacs in
+double precision but the hardware only supports single-precision SIMD
+there will not be any SIMD routines for default Gromacs 'real' precision.
+There are \#defines you can use to check this, as described further down.
+</dd>
+<dt>`gromacs/simd/impl_reference.h`</dt>
+<dd>
+This is an example of a low-level implementation. You should never, ever,
+work directly with these in higher-level code. The reference implementation
+contains the documentation for all SIMD wrappers, though.
+</dd>
+<dt>`gromacs/simd/simd_math.h`</dt>
+<dd>
+SIMD math functions. All functions in this file have to be designed
+so they work no matter whether the hardware supports integer SIMD, logical
+operations on integer or floating-point SIMD, or arithmetic operations
+on integers. However, a few routines check for defines and use faster
+algorithms if these features are present.
+</dd>
+<dt>`gromacs/simd/vector_operations.h`</dt>
+<dd>
+This file contains a few rvec-related SIMD functions, e.g. to
+calculate scalar products, norms, or cross products. They obviously
+cannot operate on scalar Gromacs rvec types, but use separate SIMD
+variables for X,Y, and Z vector components.
+</dd>
+</dl>
+
+
+SIMD datatypes
+==============
+
+The SIMD module handles the challenges mentioned in the introduction
+by introducing a number of datatypes;
+many of these might map to the same underlying SIMD types, but we need separate
+types because some architectures use different registers e.g. for boolean
+types.
+
+Floating-point data
+-------------------
+
+<dl>
+<dt>`#gmx_simd_real_t`</dt>
+<dd>
+This is the SIMD-version of \Gromacs' real type,
+which is set based on the CMake configuration and internally aliased
+to one of the next two types.
+Operations on these variables have the suffix `_r`, e.g. `gmx_simd_add_r()`.
+</dd>
+<dt>`#gmx_simd_float_t`</dt>
+<dd>
+This is always single-precision data, but it
+might not be supported on all architectures. Suffix `_f` is used for
+explicit single-precision routines, e.g. `gmx_simd_mul_f()`.
+</dd>
+<dt>`gmx_simd_double_t`</dt>
+<dd>
+This is always double precision when available,
+and in rare cases you might want to use a specific precision.
+Suffix `_d` is used for explicit double-precision routines,
+e.g. `gmx_simd_mul_d()`.
+</dd>
+</dl>
+
+Integers corresponding to floating-point values
+-----------------------------------------------
+
+For these types, 'correspond' means that it is the integer type we
+get when we convert data e.g. from single (or double) precision
+floating-point SIMD variables. Those need to be different, since many
+common implementations only use half as many elements for double as
+for single SIMD variables, and then we only get half the number of
+integers too.
+
+<dl>
+<dt>`#gmx_simd_int32_t`</dt>
+<dd>
+This is used for integers when converting to/from Gromacs default "real" type.
+The corresponding routines have suffix `_i`, e.g. `gmx_simd_add_i()`.
+</dd>
+<dt>`gmx_simd_fint32_t`</dt>
+<dd>
+Integers obtained when converting from single precision, or intended to be
+converted to single precision floating-point. These are normal integers
+(not a special conversion type), but since some SIMD architectures such as
+SSE or AVX use different registers for integer SIMD variables having the
+same width as float and double, respectively, we need to separate these
+two types of integers. The actual operations you perform on them are normal
+ones such as addition or multiplication. The routines
+operating on these variables have suffix `_fi`, like `gmx_simd_add_fi()`.
+This will also be the widest integer data type if you want to do pure
+integer SIMD operations, but that will not be supported on all platforms.
+</dd>
+<dt>`gmx_simd_dint32_t`</dt>
+<dd>
+Integers used when converting to/from double. See the preceding item
+for a detailed explanation. On many architectures,
+including all x86 ones, this will be a narrower type than `gmx_simd_fint32_t`.
+The corresponding routines have suffix `_di`, like `gmx_simd_add_di()`.
+</dd>
+</dl>
+
+Note that all integer load/store operations defined here load/store 32-bit
+integers, even when the internal register storage might be 64-bit, and we
+set the "width" of the SIMD implementation based on how many float/double/
+integers we load/store - even if the internal width could be larger.
+
+Boolean values
+--------------
+
+We need a separate boolean datatype for masks and comparison results, since
+we cannot assume they are identical either to integers, floats or double -
+some implementations use specific predicate registers for booleans.
+
+<dl>
+<dt>`#gmx_simd_bool_t`</dt>
+<dd>
+Results from boolean operations involving reals, and the booleans we use
+to select between real values. The corresponding routines have suffix `_b`,
+like `gmx_simd_or_b()`.
+</dd>
+<dt>`gmx_simd_fbool_t`</dt>
+<dd>
+Booleans specifically for single precision. Corresponding function suffix
+is `_fb`, like `gmx_simd_or_fb()`.
+</dd>
+<dt>`gmx_simd_dbool_t`</dt>
+<dd>
+Operations specifically on double. Operations have suffix `_db`: `gmx_simd_or_db()`
+</dd>
+<dt>`#gmx_simd_ibool_t`</dt>
+<dd>
+Boolean operations on integers corresponding to real (see floating-point
+descriptions above). Operations on these booleans use suffix `_ib`,
+like `gmx_simd_or_ib()`.
+</dd>
+<dt>`gmx_simd_fibool_t`</dt>
+<dd>
+Booleans for integers corresponding to float. Operation suffix is `_fib`,
+like `gmx_simd_or_fib()`.
+</dd>
+<dt>`gmx_simd_dibool_t`</dt>
+<dd>
+Booleans for integers corresponding to double. Operation suffix is `_dib`,
+like `gmx_simd_or_dib()`.
+</dd>
+</dl>
+
+The subset you should use in practice
+-------------------------------------
+
+If this seems daunting, in practice you should only need to use these types
+when you start coding:
+
+<dl>
+<dt>`#gmx_simd_real_t`</dt>
+<dd>
+Floating-point data.
+</dd>
+<dt>`#gmx_simd_bool_t`</dt>
+<dd>
+Booleans.
+</dd>
+<dt>`#gmx_simd_int32_t`</dt>
+<dd>
+Integer data. Might not be supported, so you must check
+the preprocessor macros described below.
+</dd>
+</dl>
+
+Operations on these types will be defined to either float/double (or corresponding integers) based on the current Gromacs precision, so the documentation is occasionally more detailed for the lower-level actual implementation functions.
+
+SIMD4 Macros
+------------
+
+The above should be sufficient for code that works with the full SIMD width.
+Unfortunately reality is not that simple. Some algorithms like lattice
+summation need quartets of elements, so even when the SIMD width is >4 we
+need width-4 SIMD if it is supported. These datatypes and operations use the
+prefix `gmx_simd4_`, and availability is indicated by `GMX_SIMD4_HAVE_FLOAT`
+and `GMX_SIMD4_HAVE_DOUBLE`. For now we only support a small subset of SIMD
+operations for SIMD4, but that is trivial to extend if we need to.
+
+Predefined SIMD preprocessor macros
+===================================
+
+Functionality-wise, we have a small core set of features that we
+require to be present on all platforms, while more advanced features can be
+used in the code when defines like e.g. `GMX_SIMD_HAVE_LOADU` are set.
+
+This is a summary of the currently available preprocessor defines that
+you should use to check for support when using the corresponding features.
+We first list the float/double/int defines set by the _implementation_; in
+most cases you do not want to check directly for float/double defines, but
+you should instead use the derived "real" defines set in `gromacs/simd/simd.h` - we list
+those at the end below.
+
+Preprocessor predefined macro defines set by the low-level implementation.
+These are only set if they work for all datatypes; `GMX_SIMD_HAVE_LOADU`
+thus means we can load both float, double, and integers from unaligned memory,
+and that the unaligned loads are available for SIMD4 too.
+
+<dl>
+<dt>`GMX_SIMD_HAVE_FLOAT`</dt>
+<dd>
+Single-precision instructions available.
+</dd>
+<dt>`GMX_SIMD_HAVE_DOUBLE`</dt>
+<dd>
+Double-precision instructions available.
+</dd>
+<dt>`GMX_SIMD_HAVE_HARDWARE`</dt>
+<dd>
+Set when we are NOT emulating SIMD.
+</dd>
+<dt>`GMX_SIMD_HAVE_LOADU`</dt>
+<dd>
+Load from unaligned memory available.
+</dd>
+<dt>`GMX_SIMD_HAVE_STOREU`</dt>
+<dd>
+Store to unaligned memory available.
+</dd>
+<dt>`GMX_SIMD_HAVE_LOGICAL`</dt>
+<dd>
+Support for and/andnot/or/xor on floating-point variables.
+</dd>
+<dt>`GMX_SIMD_HAVE_FMA`</dt>
+<dd>
+Floating-point fused multiply-add.
+Note: We provide emulated FMA instructions if you do not have FMA
+support, but in that case you might be able to code it more efficiently without FMA.
+</dd>
+<dt>`GMX_SIMD_HAVE_FRACTION`</dt>
+<dd>
+Instruction to get decimal fraction. Same as FMA: This denotes
+hardware support, otherwise instruction will be emulated.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32`</dt>
+<dd>
+Integer conversions to/from float available.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_EXTRACT`</dt>
+<dd>
+Support for extracting integer SIMD elements from `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_LOGICAL`</dt>
+<dd>
+Bitwise shifts on `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_ARITHMETICS`</dt>
+<dd>
+Arithmetic ops for `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32`</dt>
+<dd>
+Integer conversions to/from double available.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_EXTRACT`</dt>
+<dd>
+Support for extracting integer SIMD elements from `gmx_simd_dint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_LOGICAL`</dt>
+<dd>
+Bitwise shifts on `gmx_simd_dint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_ARITHMETICS`</dt>
+<dd>
+Arithmetic ops for `gmx_simd_dint32_t`.
+</dd>
+</dl>
+
+There are also two macros specific to SIMD4: `GMX_SIMD4_HAVE_FLOAT` is set
+if we can use SIMD4 in single precision, and `GMX_SIMD4_HAVE_DOUBLE`
+similarly denotes support for a double-precision SIMD4 implementation. For
+generic properties (e.g. whether SIMD4 FMA is supported), you should check
+the normal SIMD macros above.
+
+Implementation properties
+-------------------------
+
+Higher-level code can use these macros to find information about the implementation,
+for instance what the SIMD width is:
+
+<dl>
+<dt>`GMX_SIMD_FLOAT_WIDTH`</dt>
+<dd>
+Number of elements in `gmx_simd_float_t`, and practical width of `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_DOUBLE_WIDTH`</dt>
+<dd>
+Number of elements in `gmx_simd_double_t`, and practical width of `gmx_simd_dint32_t`</dd>
+<dt>`GMX_SIMD_RSQRT_BITS`</dt>
+<dd>
+Accuracy (bits) of 1/sqrt(x) lookup step.
+</dd>
+<dt>`GMX_SIMD_RCP_BITS`</dt>
+<dd>
+Accuracy (bits) of 1/x lookup step.
+</dd>
+</dl>
+
+After including the low-level architecture-specific implementation, this
+header (`gromacs/simd/simd.h`) sets the following derived defines based on the current precision;
+these are the ones you should check for unless you absolutely want to dig
+deep into the explicit single/double precision implementations:
+
+<dl>
+<dt>`GMX_SIMD_HAVE_REAL`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FLOAT` or `GMX_SIMD_HAVE_DOUBLE`
+</dd>
+<dt>`GMX_SIMD4_HAVE_REAL`</dt>
+<dd>
+Set either to `GMX_SIMD4_HAVE_FLOAT` or `GMX_SIMD4_HAVE_DOUBLE`
+</dd>
+<dt>`GMX_SIMD_REAL_WIDTH`</dt>
+<dd>
+Set either to `GMX_SIMD_FLOAT_WIDTH` or `GMX_SIMD_DOUBLE_WIDTH`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32` or `GMX_SIMD_HAVE_DINT32`
+</dd>
+<dt>`GMX_SIMD_INT32_WIDTH`</dt>
+<dd>
+Set either to `GMX_SIMD_FINT32_WIDTH` or `GMX_SIMD_DINT32_WIDTH`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_EXTRACT`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_EXTRACT` or `GMX_SIMD_HAVE_DINT32_EXTRACT`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_LOGICAL`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_LOGICAL` or `GMX_SIMD_HAVE_DINT32_LOGICAL`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_ARITHMETICS`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_ARITHMETICS` or `GMX_SIMD_HAVE_DINT32_ARITHMETICS`
+</dd>
+</dl>
+
+For convenience we also define `GMX_SIMD4_WIDTH` to 4. This will never vary,
+but using it helps you make it clear that a loop or array refers to the
+SIMD4 width rather than some other '4'.
+
+While all these defines are available to specify the features of the
+hardware, we would strongly recommend that you do NOT sprinkle your code
+with defines - if nothing else it will be a debug nightmare. Instead you can
+write a slower generic SIMD function that works everywhere, and then override
+this with faster architecture-specific versions for some implementations. The
+recommended way to do that is to add a define around the generic function
+that skips it if the name is already defined. The actual implementations in
+the lowest-level files are typically defined to an architecture-specific name
+(such as `gmx_simd_sincos_d_sse2`) so we can override it (e.g. in SSE4) by
+simply undefining and setting a new definition. Still, this is an
+implementation detail you won't have to worry about until you start writing
+support for a new SIMD architecture.
+
+
+
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 1c09d5b41bb4c55e864c71817614a43e1e6da5fd..09f72342863faf709cc48ac90dfee88479f61645 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -109,18 +109,6 @@
  /* Target platform is BlueGene/Q */
  #cmakedefine GMX_TARGET_BGQ
  
-/* SSE2 instructions available */
-#cmakedefine GMX_SIMD_X86_SSE2_OR_HIGHER
-
-/* SSE4.1 instructions available */
-#cmakedefine GMX_SIMD_X86_SSE4_1_OR_HIGHER
-
-/* AVX 128-bit FMA instructions available (AMD side of the AVX world) */
-#cmakedefine GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-
-/* AVX 256-bit instructions available (Intel side of the AVX world) */
-#cmakedefine GMX_SIMD_X86_AVX_256_OR_HIGHER
-
  /* GCC bug in AVX maskload/maskstore arguments - worked around internally */
  #cmakedefine GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
  
@@ -136,12 +124,18 @@
  /* AVX 256-bit was selected as SIMD instructions */
  #cmakedefine GMX_SIMD_X86_AVX_256
  
+/* AVX2 256-bit SIMD instruction set level was selected */
+#cmakedefine GMX_SIMD_X86_AVX2_256
+
  /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
  #cmakedefine GMX_SIMD_IBM_QPX
  
  /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
  #cmakedefine GMX_SIMD_SPARC64_HPC_ACE
  
+/* Reference SIMD implementation for testing */
+#cmakedefine GMX_SIMD_REFERENCE
+
  /* String for SIMD instruction choice (for writing to log files and stdout) */
  #define GMX_SIMD_STRING "@GMX_SIMD@"
  
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt

index bad9df05712a98445b860f44b31e3ed73d4e0187..24ab5bb7610840e2b3257a6a91db37fd30701c58 100644 (file)
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -61,6 +61,7 @@ add_subdirectory(fileio)
  add_subdirectory(swap)
  add_subdirectory(essentialdynamics)
  add_subdirectory(pulling)
+add_subdirectory(simd)
  if (NOT GMX_BUILD_MDRUN_ONLY)
      add_subdirectory(legacyheaders)
      add_subdirectory(gmxana)
diff --git a/src/gromacs/gmxlib/bondfree.c b/src/gromacs/gmxlib/bondfree.c

index 006221a92bf674d2538a032da08256ebed4715d0..6ae097c006b9aca64edf42c01149f75267d2923a 100644 (file)
--- a/src/gromacs/gmxlib/bondfree.c
+++ b/src/gromacs/gmxlib/bondfree.c
@@ -58,12 +58,9 @@
  #include "force.h"
  #include "nonbonded.h"
  
-/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC
-#define SIMD_BONDEDS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
  #include "gromacs/simd/vector_operations.h"
-#endif
  
  /* Find a better place for this? */
  const int cmap_coeff_matrix[] = {
@@ -116,7 +113,7 @@ static int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
      }
  }
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
  typedef struct {
@@ -191,7 +188,7 @@ pbc_dx_simd(gmx_simd_real_t *dx, gmx_simd_real_t *dy, gmx_simd_real_t *dz,
      *dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  /*
   * Morse potential bond by Frank Everdij
@@ -1037,7 +1034,7 @@ real angles(int nbonds,
      return vtot;
  }
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As angles, but using SIMD to calculate many angles at once.
   * This routine does not calculate energies and shift forces.
@@ -1202,7 +1199,7 @@ angles_noener_simd(int nbonds,
      }
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  real linear_angles(int nbonds,
                     const t_iatom forceatoms[], const t_iparams forceparams[],
@@ -1502,7 +1499,7 @@ real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
  }
  
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
   * also calculates the pre-factor required for the dihedral force update.
@@ -1618,8 +1615,7 @@ dih_angle_simd(const rvec *x,
      *nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S));
  
      /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
-    *phi_S     = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
-
+    *phi_S     = gmx_simd_xor_sign_r(*phi_S, ipr_S);
      p_S        = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
                                    rkjx_S, rkjy_S, rkjz_S);
      p_S        = gmx_simd_mul_r(p_S, nrkj_2_S);
@@ -1632,7 +1628,7 @@ dih_angle_simd(const rvec *x,
      gmx_simd_store_r(q, q_S);
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  void do_dih_fup(int i, int j, int k, int l, real ddphi,
@@ -1967,7 +1963,7 @@ pdihs_noener(int nbonds,
  }
  
  
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
  
  /* As pdihs_noener above, but using SIMD to calculate many dihedrals at once */
  static void
@@ -2097,7 +2093,7 @@ pdihs_noener_simd(int nbonds,
      }
  }
  
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
  
  
  real idihs(int nbonds,
@@ -4162,7 +4158,7 @@ static real calc_one_bond(FILE *fplog, int thread,
                            pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
                            md, fcd, global_atom_index);
          }
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
          else if (ftype == F_ANGLES &&
                   !bCalcEnerVir && fr->efep == efepNO)
          {
@@ -4179,10 +4175,10 @@ static real calc_one_bond(FILE *fplog, int thread,
                   !bCalcEnerVir && fr->efep == efepNO)
          {
              /* No energies, shift forces, dvdl */
-#ifndef SIMD_BONDEDS
-            pdihs_noener
-#else
+#ifdef GMX_SIMD_HAVE_REAL
              pdihs_noener_simd
+#else
+            pdihs_noener
  #endif
                  (nbn, idef->il[ftype].iatoms+nb0,
                  idef->iparams,
diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c

index ec1ce471d5820bf7e8689115166ce2be9158b826..03b83fc43229aabe759ec72d3f19110c781d46d6 100644 (file)
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -138,6 +138,7 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
  {
      "CannotDetect",
      "None",
+    "Reference",
      "SSE2",
      "SSE4.1",
      "AVX_128_FMA",
@@ -222,11 +223,10 @@ gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
  
  
  
-/* What type of SIMD was compiled in, if any?
- * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for
- * AVX too, so it is important that they appear last in the list.
- */
-#ifdef GMX_SIMD_X86_AVX_256
+/* What type of SIMD was compiled in, if any? */
+#ifdef GMX_SIMD_X86_AVX2_256
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX2_256;
+#elif defined GMX_SIMD_X86_AVX_256
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_256;
  #elif defined GMX_SIMD_X86_AVX_128_FMA
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA;
@@ -238,6 +238,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2;
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
  #elif defined GMX_SIMD_IBM_QPX
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX;
+#elif defined GMX_SIMD_REFERENCE
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_REFERENCE;
  #else
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_NONE;
  #endif
diff --git a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt

index 62939813affb5a39f9709ba276ecc21106ec656b..16943e9d93f92b86f9b564a46a25e8a1aaf1141d 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
+++ b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
@@ -47,7 +47,7 @@ if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
  endif()
  
-if("${GMX_SIMD}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
+if((("${GMX_SIMD}" STREQUAL "AVX_256") OR ("${GMX_SIMD}" STREQUAL "AVX2_256")) AND NOT GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
  endif()
  
@@ -63,11 +63,11 @@ if("${GMX_SIMD}" STREQUAL "AVX_128_FMA" AND GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
  endif()
  
-if("${GMX_SIMD}" STREQUAL "AVX_256" AND GMX_DOUBLE)
+if((("${GMX_SIMD}" STREQUAL "AVX_256") OR ("${GMX_SIMD}" STREQUAL "AVX2_256")) AND GMX_DOUBLE)
      file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
  endif()
  
-if("${GMX_SIMD}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SPARC64_HPC_ACE" AND GMX_DOUBLE)
      file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
  endif()
  
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h

index c841834edd43e5c415c8af48f5cc50727bea0cdf..d6772ac8192682847ae77bbca4e94eec30341ab8 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,8 +35,22 @@
  #ifndef _kernelutil_x86_avx_128_fma_double_h_
  #define _kernelutil_x86_avx_128_fma_double_h_
  
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+#include <math.h>
+#include <immintrin.h>
+#ifdef _MSC_VER
+#    include <intrin.h>
+#else
+#    include <x86intrin.h>
+#endif
  
+#define gmx_mm_castsi128_pd   _mm_castsi128_pd
+#define gmx_mm_extract_epi32  _mm_extract_epi32
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
+        __m128d __gmx_t1 = row0;                         \
+        row0           = _mm_unpacklo_pd(row0, row1);     \
+        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
  
  static int
  gmx_mm_any_lt(__m128d a, __m128d b)
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h

index 273bf1a0038c1be729ed75cd3af97ed3af3eb53e..b9ef14ef6a41c3d1489912ae6b640362b8bf5870 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,8 +37,28 @@
  
  
  #include <math.h>
+#include <immintrin.h>
+#ifdef _MSC_VER
+#    include <intrin.h>
+#else
+#    include <x86intrin.h>
+#endif
+
+#define gmx_mm_castsi128_ps   _mm_castsi128_ps
+#define gmx_mm_extract_epi32  _mm_extract_epi32
  
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
+#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
+#else
+#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), (mask))
+#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
+#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), (mask))
+#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
+#endif
  
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h

index 0f86d54a5a8cda1874fa60f6545554313e6f074f..9c9ef1e9084ba6e84194b164bad77902bb383bc7 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,9 +35,50 @@
  #ifndef _kernelutil_x86_avx_256_double_h_
  #define _kernelutil_x86_avx_256_double_h_
  
+#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
+
+#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
+#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+#define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
+#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
+#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
+    {                                                        \
+        __m256d _t0, _t1, _t2, _t3;                          \
+        _t0  = _mm256_unpacklo_pd((row0), (row1));           \
+        _t1  = _mm256_unpackhi_pd((row0), (row1));           \
+        _t2  = _mm256_unpacklo_pd((row2), (row3));           \
+        _t3  = _mm256_unpackhi_pd((row2), (row3));           \
+        row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
+        row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
+        row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
+        row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
+    }
+
+#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
  
-#include "gromacs/simd/general_x86_avx_256.h"
+static __m256d
+gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
+{
+    return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
+}
  
+static __m256d
+gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
+{
+    return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
+}
+
+static __m256d
+gmx_mm256_set_m128d(__m128d hi, __m128d lo)
+{
+    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
+}
+
+static gmx_inline __m256
+gmx_mm256_set_m128(__m128 hi, __m128 lo)
+{
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
+}
  
  static int
  gmx_mm256_any_lt(__m256d a, __m256d b)
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h

index 59c9e053b7139c8d171d3466a0eee9bf2d776226..d4f041b119e5755a7ccf02075c82f7d892735d9b 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,7 +35,38 @@
  #ifndef _kernelutil_x86_avx_256_single_h_
  #define _kernelutil_x86_avx_256_single_h_
  
-#include "gromacs/simd/general_x86_avx_256.h"
+#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
+
+static gmx_inline __m256
+gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
+{
+    return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
+}
+
+static gmx_inline __m256
+gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
+{
+    return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
+}
+
+static gmx_inline __m256
+gmx_mm256_set_m128(__m128 hi, __m128 lo)
+{
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
+}
+
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
+#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
+#else
+#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), (mask))
+#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
+#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), (mask))
+#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
+#endif
  
  /* Transpose lower/upper half of 256-bit registers separately */
  #define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) {            \
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h

index 05ed35d17cccae16725086aec5dfdb67a7397e74..d14ab9862b85e562a429f31e6ca666f000a69e66 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,13 +37,20 @@
  
  #include <math.h>
  
-#include "gromacs/simd/general_x86_sse2.h"
  
  #include <stdio.h>
  
  
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
+#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
+        __m128d __gmx_t1 = row0;                         \
+        row0           = _mm_unpacklo_pd(row0, row1);     \
+        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
  
  static int
  gmx_mm_any_lt(__m128d a, __m128d b)
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h

index 3d8b72380879f418c0bfab16372de4818d78b8f1..0820cac76e054aa25f58eb6447a8a3bf9e305e03 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -39,7 +39,9 @@
  
  #include <math.h>
  
-#include "gromacs/simd/general_x86_sse2.h"
+#define gmx_mm_castsi128_ps   _mm_castsi128_ps
+
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
  
  
  /* Normal sum of four xmm registers */
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h

index 282dfc0ea4ec5dda9b107b6b6a084ea3e9150d60..9f759f310dde39d01f934c538dfe57834a63caf1 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,10 +37,16 @@
  
  #include <math.h>
  
-#include "gromacs/simd/general_x86_sse4_1.h"
-
  #include <stdio.h>
  
+#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
+#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
+        __m128d __gmx_t1 = row0;                         \
+        row0           = _mm_unpacklo_pd(row0, row1);     \
+        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
  
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_pd(t0, t1, t2, t3)  _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
diff --git a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h

index 4aee8a27642564e2e263a66b8dd8c23da6325247..8d8c3e671972aa2243af3fd4ee63810930b48cf0 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
+++ b/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,11 +37,12 @@
  
  #include <math.h>
  
-#include "gromacs/simd/general_x86_sse4_1.h"
-
  #undef gmx_restrict
  #define gmx_restrict
  
+#define gmx_mm_castsi128_ps   _mm_castsi128_ps
+#define gmx_mm_extract_epi32  _mm_extract_epi32
+
  /* Normal sum of four xmm registers */
  #define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
  
diff --git a/src/gromacs/gmxlib/nonbonded/nonbonded.c b/src/gromacs/gmxlib/nonbonded/nonbonded.c

index 42209eef7d5fda07ee98f9be7e51539fc9adf527..e2448ade984af968d07627559a5c563b5cad5cff 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nonbonded.c
+++ b/src/gromacs/gmxlib/nonbonded/nonbonded.c
@@ -68,7 +68,7 @@
  #include "nb_generic_cg.h"
  #include "nb_generic_adress.h"
  
-/* Different default (c) and accelerated interaction-specific kernels */
+/* Different default (c) and SIMD interaction-specific kernels */
  #include "nb_kernel_c/nb_kernel_c.h"
  
  #if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
@@ -80,7 +80,7 @@
  #if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
  #endif
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
  #    include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
  #endif
  #if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
@@ -92,7 +92,7 @@
  #if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
  #    include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
  #endif
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
  #    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
  #endif
  #if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
@@ -130,7 +130,7 @@ gmx_nonbonded_setup(t_forcerec *   fr,
  #if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size);
  #endif
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size);
  #endif
                  /* Double precision */
@@ -143,7 +143,7 @@ gmx_nonbonded_setup(t_forcerec *   fr,
  #if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size);
  #endif
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
                  nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
  #endif
  #if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
@@ -181,7 +181,7 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
      arch_and_padding[] =
      {
          /* Single precision */
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
          { "avx_256_single", 8 },
  #endif
  #if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
@@ -194,7 +194,7 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
          { "sse2_single", 4 },
  #endif
          /* Double precision */
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
          { "avx_256_double", 4 },
  #endif
  #if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
diff --git a/src/gromacs/gmxpreprocess/calc_verletbuf.c b/src/gromacs/gmxpreprocess/calc_verletbuf.c

index 6cebe7772e323bd919509d533e4c89e546cd18f4..ea383bf633d4494baad8286428a5f69f71536f9f 100644 (file)
--- a/src/gromacs/gmxpreprocess/calc_verletbuf.c
+++ b/src/gromacs/gmxpreprocess/calc_verletbuf.c
@@ -57,7 +57,7 @@
  #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
  #define GMX_USE_HALF_WIDTH_SIMD_HERE
  #endif
-#include "gromacs/simd/macros.h"
+#include "gromacs/simd/simd.h"
  #endif
  
  
diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h

index a0e1e0a8bb5793de7d6226e9f88c47b5af49c46c..d595c51a373447b2c28d950b6e0824308dae16bc 100644 (file)
--- a/src/gromacs/legacyheaders/gmx_cpuid.h
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@ -124,6 +124,7 @@ enum gmx_cpuid_simd
  {
      GMX_CPUID_SIMD_CANNOTDETECT,    /* Should only be used if something fails */
      GMX_CPUID_SIMD_NONE,
+    GMX_CPUID_SIMD_REFERENCE,
      GMX_CPUID_SIMD_X86_SSE2,
      GMX_CPUID_SIMD_X86_SSE4_1,
      GMX_CPUID_SIMD_X86_AVX_128_FMA,
diff --git a/src/gromacs/legacyheaders/types/nb_verlet.h b/src/gromacs/legacyheaders/types/nb_verlet.h

index 8c6228e2768288509c973a4fcee54091ae5fdf1f..aaeeaa8b3616d2603c4e84dd64d90cdc96e5632a 100644 (file)
--- a/src/gromacs/legacyheaders/types/nb_verlet.h
+++ b/src/gromacs/legacyheaders/types/nb_verlet.h
@@ -43,34 +43,6 @@
  extern "C" {
  #endif
  
-#ifdef GMX_SIMD_REFERENCE
-#define GMX_NBNXN_SIMD
-#endif
-
-#if (defined GMX_SIMD_X86_SSE2_OR_HIGHER) || (defined GMX_SIMD_IBM_QPX)
-/* Use SIMD accelerated nbnxn search and kernels */
-#define GMX_NBNXN_SIMD
-
-/* Uncomment the next line to use, slower, 128-bit SIMD with AVX-256 */
-/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
-
-/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense with:
- *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
- * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
- */
-#define GMX_NBNXN_SIMD_4XN
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
-#define GMX_NBNXN_SIMD_2XNN
-#endif
-
-#endif
-
-#ifdef __MIC__
-#define GMX_NBNXN_SIMD
-#define GMX_NBNXN_SIMD_2XNN
-#endif
-
  
  /*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
  typedef enum
diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c

index 64fef1f839d830bb614b3b15b17a32d03b9b68fb..e24126491b8cc0b5ac566049b155ead80ef40334 100644 (file)
--- a/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.c
@@ -68,6 +68,7 @@
  #include "qmmm.h"
  #include "copyrite.h"
  #include "mtop_util.h"
+#include "nbnxn_simd.h"
  #include "nbnxn_search.h"
  #include "nbnxn_atomdata.h"
  #include "nbnxn_consts.h"
@@ -1568,21 +1569,41 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
          *kernel_type = nbnxnk4xN_SIMD_4xN;
  #endif
  #ifdef GMX_NBNXN_SIMD_2XNN
-        /* We expect the 2xNN kernels to be faster in most cases */
          *kernel_type = nbnxnk4xN_SIMD_2xNN;
  #endif
  
-#if defined GMX_NBNXN_SIMD_4XN && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+#if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
+        /* We need to choose if we want 2x(N+N) or 4xN kernels.
+         * Currently this is based on the SIMD acceleration choice,
+         * but it might be better to decide this at runtime based on CPU.
+         *
+         * 4xN calculates more (zero) interactions, but has less pair-search
+         * work and much better kernel instruction scheduling.
+         *
+         * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
+         * which doesn't have FMA, both the analytical and tabulated Ewald
+         * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
+         * 2x(4+4) because it results in significantly fewer pairs.
+         * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+         * 10% with HT, 50% without HT. As we currently don't detect the actual
+         * use of HT, use 4x8 to avoid a potential performance hit.
+         * On Intel Haswell 4x8 is always faster.
+         */
+        *kernel_type = nbnxnk4xN_SIMD_4xN;
+
+#ifndef GMX_SIMD_HAVE_FMA
+        if (EEL_PME(ir->coulombtype) || EEL_EWALD(ir->coulombtype) ||
+            EVDW_PME(ir->vdwtype))
          {
-            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
-             * 10% with HT, 50% without HT, but extra zeros interactions
-             * can compensate. As we currently don't detect the actual use
-             * of HT, switch to 4x8 to avoid a potential performance hit.
+            /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
+             * There are enough instructions to make 2x(4+4) efficient.
               */
-            *kernel_type = nbnxnk4xN_SIMD_4xN;
+            *kernel_type = nbnxnk4xN_SIMD_2xNN;
          }
  #endif
+#endif  /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+
+
          if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
          {
  #ifdef GMX_NBNXN_SIMD_4XN
@@ -1601,11 +1622,16 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
          }
  
          /* Analytical Ewald exclusion correction is only an option in
-         * the SIMD kernel. On BlueGene/Q, this is faster regardless
-         * of precision. In single precision, this is faster on
-         * Bulldozer, and slightly faster on Sandy Bridge.
+         * the SIMD kernel.
+         * Since table lookups don't parallelize with SIMD, analytical
+         * will probably always be faster for a SIMD width of 8 or more.
+         * With FMA analytical is sometimes faster for a width of 4 as well.
+         * On BlueGene/Q, this is faster regardless of precision.
+         * In single precision, this is faster on Bulldozer.
           */
-#if ((defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_SIMD_IBM_QPX)
+#if GMX_SIMD_REAL_WIDTH >= 8 || \
+        (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
+        defined GMX_SIMD_IBM_QPX
          *ewald_excl = ewaldexclAnalytical;
  #endif
          if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
@@ -1636,36 +1662,19 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
          case nbnxnk4xN_SIMD_4xN:
          case nbnxnk4xN_SIMD_2xNN:
  #ifdef GMX_NBNXN_SIMD
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-            /* We have x86 SSE2 compatible SIMD */
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-            returnvalue = "AVX-128-FMA";
-#else
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __AVX__
-            /* x86 SIMD intrinsics can be converted to SSE or AVX depending
-             * on compiler flags. As we use nearly identical intrinsics,
-             * compiling for AVX without an AVX macros effectively results
-             * in AVX kernels.
-             * For gcc we check for __AVX__
-             * At least a check for icc should be added (if there is a macro)
-             */
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_NBNXN_HALF_WIDTH_SIMD
-            returnvalue = "AVX-256";
-#else
-            returnvalue = "AVX-128";
-#endif
-#else
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-            returnvalue  = "SSE4.1";
+#if defined GMX_SIMD_X86_SSE2
+            returnvalue = "SSE2";
+#elif defined GMX_SIMD_X86_SSE4_1
+            returnvalue = "SSE4.1";
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+            returnvalue = "AVX_128_FMA";
+#elif defined GMX_SIMD_X86_AVX_256
+            returnvalue = "AVX_256";
+#elif defined GMX_SIMD_X86_AVX2_256
+            returnvalue = "AVX2_256";
  #else
-            returnvalue  = "SSE2";
-#endif
-#endif
+            returnvalue = "SIMD";
  #endif
-#else   /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-            /* not GMX_SIMD_X86_SSE2_OR_HIGHER, but other SIMD */
-            returnvalue  = "SIMD";
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
  #else  /* GMX_NBNXN_SIMD */
              returnvalue = "not available";
  #endif /* GMX_NBNXN_SIMD */
diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h

index e71921699faf38462fc0f05bd04981c19b8ee55e..8c366ee8cb65faf256490f1cc3437f41fca980f0 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_internal.h
+++ b/src/gromacs/mdlib/nbnxn_internal.h
@@ -37,28 +37,16 @@
  #define _nbnxn_internal_h
  
  #include "typedefs.h"
+#include "nbnxn_simd.h"
  #include "domdec.h"
  #include "gromacs/timing/cyclecounter.h"
  
-#ifdef GMX_NBNXN_SIMD
-/* The include below sets the SIMD instruction type (precision+width)
- * for all nbnxn SIMD search and non-bonded kernel code.
- */
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gromacs/simd/macros.h"
-#endif
-
  
-/* Bounding box calculations are (currently) always in single precision.
+/* Bounding box calculations are (currently) always in single precision, so
+ * we only need to check for single precision support here.
   * This uses less (cache-)memory and SIMD is faster, at least on x86.
   */
-#define GMX_SIMD4_SINGLE
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
-/* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#ifdef GMX_SIMD4_HAVE_FLOAT
  #define NBNXN_SEARCH_BB_SIMD4
  #endif
  
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h

index bb84a499c48fcdb2364f634f31179029a3f35473..adb9613af39c751a70b7abe9a036794c22755495 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
@@ -67,7 +67,7 @@ prepare_table_load_buffer(const int gmx_unused *array)
  
  #else /* GMX_SIMD_REFERENCE */
  
-#if defined  GMX_SIMD_X86_SSE2_OR_HIGHER && !defined __MIC__
+#if defined  GMX_TARGET_X86 && !defined __MIC__
  /* Include x86 SSE2 compatible SIMD functions */
  
  /* Set the stride for the lookup of the two LJ parameters from their
@@ -80,46 +80,35 @@ static const int nbfp_stride = 4;
  #endif
  
  /* Align a stack-based thread-local working array. Table loads on
- * full-width AVX_256 use the array, but other implementations do
- * not. */
+ * 256-bit AVX use the array, but other implementations do not.
+ */
  static gmx_inline int *
-prepare_table_load_buffer(const int gmx_unused *array)
+prepare_table_load_buffer(int gmx_unused *array)
  {
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+#if GMX_SIMD_REAL_WIDTH >= 8 || (defined GMX_DOUBLE && GMX_SIMD_REAL_WIDTH >= 4)
      return gmx_simd_align_i(array);
  #else
      return NULL;
  #endif
  }
  
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
-
-/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
-#if GMX_SIMD_REAL_WIDTH == 8
-#define TAB_FDV0
-#endif
  #ifdef GMX_DOUBLE
+#if GMX_SIMD_REAL_WIDTH == 2
+#include "nbnxn_kernel_simd_utils_x86_128d.h"
+#else
  #include "nbnxn_kernel_simd_utils_x86_256d.h"
-#else  /* GMX_DOUBLE */
-#include "nbnxn_kernel_simd_utils_x86_256s.h"
-#endif /* GMX_DOUBLE */
-
-#else  /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
-
-/* We use the FDV0 table layout when we can use aligned table loads */
-#if GMX_SIMD_REAL_WIDTH == 4
-#define TAB_FDV0
  #endif
-
-#ifdef GMX_DOUBLE
-#include "nbnxn_kernel_simd_utils_x86_128d.h"
-#else  /* GMX_DOUBLE */
+#else /* GMX_DOUBLE */
+/* In single precision aligned FDV0 table loads are optimal */
+#define TAB_FDV0
+#if GMX_SIMD_REAL_WIDTH == 4
  #include "nbnxn_kernel_simd_utils_x86_128s.h"
+#else
+#include "nbnxn_kernel_simd_utils_x86_256s.h"
+#endif
  #endif /* GMX_DOUBLE */
  
-#endif /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
-
-#else  /* GMX_SIMD_X86_SSE2_OR_HIGHER */
+#else  /* GMX_TARGET_X86 && !__MIC__ */
  
  #if GMX_SIMD_REAL_WIDTH > 4
  /* For width>4 we use unaligned loads. And thus we can use the minimal stride */
@@ -141,64 +130,22 @@ static const int nbfp_stride = GMX_SIMD_REAL_WIDTH;
  #include "nbnxn_kernel_simd_utils_x86_mic.h"
  #endif
  
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#endif /* GMX_SIMD_REFERENCE */
-
-#if GMX_SIMD_REAL_WIDTH == 4
-#define gmx_mm_pr4    gmx_simd_real_t
-#define gmx_load_pr4  gmx_simd_load_r
-#define gmx_store_pr4 gmx_simd_store_r
-#define gmx_add_pr4   gmx_simd_add_r
-#endif
+#endif /* GMX_TARGET_X86 && !__MIC__ */
  
-#ifndef HAVE_GMX_SUM_SIMD /* should be defined for arch with hardware reduce */
-static gmx_inline real
-gmx_sum_simd2(gmx_simd_real_t x, real* b)
-{
-    gmx_simd_store_r(b, x);
-    return b[0]+b[1];
-}
-
-#if GMX_SIMD_REAL_WIDTH >= 4
-static gmx_inline real
-gmx_sum_simd4(gmx_mm_pr4 x, real* b)
-{
-    gmx_store_pr4(b, x);
-    return b[0]+b[1]+b[2]+b[3];
-}
-#endif
+#endif /* GMX_SIMD_REFERENCE */
  
-#if GMX_SIMD_REAL_WIDTH == 2
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
-    gmx_simd_store_r(b, x);
-    return b[0]+b[1];
-}
-#elif GMX_SIMD_REAL_WIDTH == 4
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
-    gmx_simd_store_r(b, x);
-    return b[0]+b[1]+b[2]+b[3];
-}
-#elif GMX_SIMD_REAL_WIDTH == 8
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
-    gmx_simd_store_r(b, x);
-    return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7];
-}
-#elif GMX_SIMD_REAL_WIDTH == 16
-/* This is getting ridiculous, SIMD horizontal adds would help,
- * but this is not performance critical (only used to reduce energies)
+/* If the simd width is 4, but simd4 instructions are not defined,
+ * reuse the simd real type and the four instructions we need.
   */
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
-    gmx_simd_store_r(b, x);
-    return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7]+b[8]+b[9]+b[10]+b[11]+b[12]+b[13]+b[14]+b[15];
-}
-#else
-#error "unsupported kernel configuration"
+#if GMX_SIMD_REAL_WIDTH == 4 && \
+    !((!defined GMX_DOUBLE && defined GMX_SIMD4_HAVE_FLOAT) || \
+    (defined GMX_DOUBLE && defined GMX_SIMD4_HAVE_DOUBLE))
+#define gmx_simd4_real_t    gmx_simd_real_t
+#define gmx_simd4_load_r    gmx_simd_load_r
+#define gmx_simd4_store_r   gmx_simd_store_r
+#define gmx_simd4_add_r     gmx_simd_add_r
+#define gmx_simd4_reduce_r  gmx_simd_reduce_r
  #endif
-#endif //HAVE_GMX_SUM_SIMD
  
  #ifdef UNROLLJ
  /* Add energy register to possibly multiple terms in the energy array */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h

index fd857475b4291108a59fb7891fde890995bb3598..1071b7899cb41929dd18ed010d4d8464c6681088 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h
@@ -38,9 +38,6 @@
  typedef gmx_simd_real_t gmx_exclfilter;
  static const int filter_stride = 1;
  
-/* The 4xn kernel operates on 4-wide i-force registers */
-typedef gmx_simd_real_t gmx_mm_pr4;
-
  /* This files contains all functions/macros for the SIMD kernels
   * which have explicit dependencies on the j-cluster size and/or SIMD-width.
   * The functionality which depends on the j-cluster size is:
@@ -113,7 +110,7 @@ gmx_shuffle_4_ps_fil2_to_1_ps(gmx_simd_real_t a, gmx_simd_real_t b,
  /* Align a stack-based thread-local working array. Table loads on QPX
   * use the array, but most other implementations do not. */
  static gmx_inline int *
-prepare_table_load_buffer(const int *array)
+prepare_table_load_buffer(int *array)
  {
      return gmx_simd_align_i(array);
  }
@@ -186,22 +183,6 @@ gmx_mm_transpose_sum4_pr(gmx_simd_real_t a, gmx_simd_real_t b,
      return gmx_simd_add_r(sum01, sim23);
  }
  
-#ifdef GMX_DOUBLE
-/* In double precision on x86 it can be faster to first calculate
- * single precision square roots for two double precision registers at
- * once and then use double precision Newton-Raphson iteration to
- * reach full double precision. For QPX, we just wrap the usual
- * reciprocal square roots.
- */
-static gmx_inline void
-gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
-                   gmx_simd_real_t *out0, gmx_simd_real_t *out1)
-{
-    *out0 = gmx_simd_invsqrt_r(in0);
-    *out1 = gmx_simd_invsqrt_r(in1);
-}
-#endif
-
  static gmx_inline void
  load_lj_pair_params(const real *nbfp, const int *type, int aj,
                      gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

index c7a6e9a6c0e9a8e93c5a99c8bb437bedd383e4c6..e7d38f2cfd5fb5c4c1e66487492614c75efd40b7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
@@ -35,7 +35,10 @@
  #ifndef _nbnxn_kernel_simd_utils_ref_h_
  #define _nbnxn_kernel_simd_utils_ref_h_
  
-typedef gmx_simd_ref_epi32      gmx_simd_ref_exclfilter;
+#
+#include "gromacs/simd/simd_math.h"
+
+typedef gmx_simd_int32_t        gmx_simd_ref_exclfilter;
  typedef gmx_simd_ref_exclfilter gmx_exclfilter;
  static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  
@@ -55,13 +58,13 @@ static const int nbfp_stride = 4;
  /* float/double SIMD register type */
  typedef struct {
      real r[4];
-} gmx_mm_pr4;
+} gmx_simd4_real_t;
  
-static gmx_inline gmx_mm_pr4
-gmx_load_pr4(const real *r)
+static gmx_inline gmx_simd4_real_t
+gmx_simd4_load_r(const real *r)
  {
-    gmx_mm_pr4 a;
-    int        i;
+    gmx_simd4_real_t a;
+    int              i;
  
      for (i = 0; i < 4; i++)
      {
@@ -72,10 +75,10 @@ gmx_load_pr4(const real *r)
  }
  
  static gmx_inline void
-gmx_store_pr4(real *dest, gmx_mm_pr4 src)
+gmx_simd4_store_r(real *dest, gmx_simd4_real_t src)
  {
-    gmx_mm_pr4 a;
-    int        i;
+    gmx_simd4_real_t a;
+    int              i;
  
      for (i = 0; i < 4; i++)
      {
@@ -83,11 +86,11 @@ gmx_store_pr4(real *dest, gmx_mm_pr4 src)
      }
  }
  
-static gmx_inline gmx_mm_pr4
-gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
+static gmx_inline gmx_simd4_real_t
+gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  {
-    gmx_mm_pr4 c;
-    int        i;
+    gmx_simd4_real_t c;
+    int              i;
  
      for (i = 0; i < 4; i++)
      {
@@ -96,6 +99,13 @@ gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
  
      return c;
  }
+
+static gmx_inline real
+gmx_simd4_reduce_r(gmx_simd4_real_t a)
+{
+    return a.r[0] + a.r[1] + a.r[2] + a.r[3];
+}
+
  #endif
  
  
@@ -137,7 +147,7 @@ gmx_set1_hpr(gmx_mm_hpr *a, real b)
  
  /* Load one real at b and one real at b+1 into halves of a, respectively */
  static gmx_inline void
-gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
+gmx_load1p1_pr(gmx_simd_real_t *a, const real *b)
  {
      int i;
  
@@ -150,7 +160,7 @@ gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
  
  /* Load reals at half-width aligned pointer b into two halves of a */
  static gmx_inline void
-gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
+gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
  {
      int i;
  
@@ -203,7 +213,7 @@ gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
  
  /* Sum over 4 half SIMD registers */
  static gmx_inline gmx_mm_hpr
-gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+gmx_sum4_hpr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
      gmx_mm_hpr c;
      int        i;
@@ -222,11 +232,11 @@ gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
  
  #ifdef GMX_NBNXN_SIMD_2XNN
  /* Sum the elements of halfs of each input register and store sums in out */
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+static gmx_inline gmx_simd4_real_t
+gmx_mm_transpose_sum4h_pr(gmx_simd_real_t a, gmx_simd_real_t b)
  {
-    gmx_mm_pr4 sum;
-    int        i;
+    gmx_simd4_real_t sum;
+    int              i;
  
      sum.r[0] = 0;
      sum.r[1] = 0;
@@ -246,7 +256,7 @@ gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
  #endif
  
  static gmx_inline void
-gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  {
      int i;
  
@@ -257,7 +267,7 @@ gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
      }
  }
  static gmx_inline void
-gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
  {
      int i;
  
@@ -273,16 +283,16 @@ gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
  
  #ifndef TAB_FDV0
  static gmx_inline void
-load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S,
+load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S,
               int gmx_unused *ti,
-             gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+             gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
  {
      int i;
  
      for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
-        ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
+        ctab0_S->r[i] = tab_coul_F[ti_S.i[i]];
+        ctab1_S->r[i] = tab_coul_F[ti_S.i[i]+1];
      }
  
      *ctab1_S  = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
@@ -290,9 +300,9 @@ load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S,
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
-               gmx_simd_ref_epi32 ti_S, int *ti,
-               gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
-               gmx_simd_ref_pr *ctabv_S)
+               gmx_simd_int32_t ti_S, int *ti,
+               gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
+               gmx_simd_real_t *ctabv_S)
  {
      int i;
  
@@ -300,30 +310,30 @@ load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
  
      for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
+        ctabv_S->r[i] = tab_coul_V[ti_S.i[i]];
      }
  }
  #endif
  
  #ifdef TAB_FDV0
  static gmx_inline void
-load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
-             gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
+             gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
  {
      int i;
  
      for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
-        ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
+        ctab0_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4];
+        ctab1_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+1];
      }
  }
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_FDV0,
-               gmx_simd_ref_epi32 ti_S, int *ti,
-               gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
-               gmx_simd_ref_pr *ctabv_S)
+               gmx_simd_int32_t ti_S, int *ti,
+               gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
+               gmx_simd_real_t *ctabv_S)
  {
      int i;
  
@@ -331,7 +341,7 @@ load_table_f_v(const real *tab_coul_FDV0,
  
      for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
+        ctabv_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+2];
      }
  }
  #endif
@@ -340,10 +350,10 @@ load_table_f_v(const real *tab_coul_FDV0,
   * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
   */
  #if GMX_SIMD_REAL_WIDTH == 2
-static gmx_inline gmx_simd_ref_pr
-gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
+static gmx_inline gmx_simd_real_t
+gmx_mm_transpose_sum2_pr(gmx_simd_real_t in0, gmx_simd_real_t in1)
  {
-    gmx_simd_ref_pr sum;
+    gmx_simd_real_t sum;
  
      sum.r[0] = in0.r[0] + in0.r[1];
      sum.r[1] = in1.r[0] + in1.r[1];
@@ -354,19 +364,19 @@ gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
  
  #if GMX_SIMD_REAL_WIDTH >= 4
  #if GMX_SIMD_REAL_WIDTH == 4
-static gmx_inline gmx_simd_ref_pr
+static gmx_inline gmx_simd_real_t
  #else
-static gmx_inline gmx_mm_pr4
+static gmx_inline gmx_simd4_real_t
  #endif
-gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
-                         gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
+gmx_mm_transpose_sum4_pr(gmx_simd_real_t in0, gmx_simd_real_t in1,
+                         gmx_simd_real_t in2, gmx_simd_real_t in3)
  {
  #if GMX_SIMD_REAL_WIDTH == 4
-    gmx_simd_ref_pr sum;
+    gmx_simd_real_t  sum;
  #else
-    gmx_mm_pr4      sum;
+    gmx_simd4_real_t sum;
  #endif
-    int             i;
+    int              i;
  
      sum.r[0] = 0;
      sum.r[1] = 0;
@@ -392,8 +402,8 @@ gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
   * For this reference code we just use a plain-C sqrt.
   */
  static gmx_inline void
-gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
-                   gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
+gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
+                   gmx_simd_real_t *out0, gmx_simd_real_t *out1)
  {
      *out0 = gmx_simd_invsqrt_r(in0);
      *out1 = gmx_simd_invsqrt_r(in1);
@@ -402,7 +412,7 @@ gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
  
  static gmx_inline void
  load_lj_pair_params(const real *nbfp, const int *type, int aj,
-                    gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+                    gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
  {
      int i;
  
@@ -417,7 +427,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj,
  static gmx_inline void
  load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
                       const int *type, int aj,
-                     gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+                     gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
  {
      int i;
  
@@ -445,9 +455,9 @@ gmx_simd_ref_load1_exclfilter(int src)
      gmx_simd_ref_exclfilter a;
      int                     i;
  
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        a.r[i] = src;
+        a.i[i] = src;
      }
  
      return a;
@@ -459,9 +469,9 @@ gmx_simd_ref_load_exclusion_filter(const int *src)
      gmx_simd_ref_exclfilter a;
      int                     i;
  
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        a.r[i] = src[i];
+        a.i[i] = src[i];
      }
  
      return a;
@@ -478,15 +488,15 @@ gmx_simd_ref_load_exclusion_filter(const int *src)
   * If the same bit is set in both input masks, return TRUE, else
   * FALSE. This function is only called with a single bit set in b.
   */
-static gmx_inline gmx_simd_ref_pb
+static gmx_inline gmx_simd_bool_t
  gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
  {
-    gmx_simd_ref_pb c;
+    gmx_simd_bool_t c;
      int             i;
  
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
      {
-        c.r[i] = ((a.r[i] & b.r[i]) != 0);
+        c.b[i] = ((a.i[i] & b.i[i]) != 0);
      }
  
      return c;
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h

index 92be81d99f3b073c1ad201d2ab201e12ae047926..d7a3d037a2cc71e1d1a23a64a358e7c5f90a96e4 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
@@ -45,8 +45,13 @@
   *   energy group pair energy storage
   */
  
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
+
  typedef gmx_simd_int32_t gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
+/* This is set to a constant for now, since the code does not adapt automatically just
+ * because we set the SIMD widths to other values.
+ */
+static const int filter_stride = 2;
  
  /* Transpose 2 double precision registers */
  static gmx_inline void
@@ -183,13 +188,14 @@ gmx_load1_exclfilter(int e)
  static gmx_inline gmx_exclfilter
  gmx_load_exclusion_filter(const unsigned *i)
  {
-    return _mm_load_si128((__m128i *) i);
+    /* For now this has to be an explicit-float load since we use stride==2 */
+    return gmx_simd_load_fi(i);
  }
  
  static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
-    return gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+    return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
  }
  
  #endif /* _nbnxn_kernel_simd_utils_x86_s128d_h_ */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h

index 0571a6cd1f836ff9449f1cec5d2be0f21efb77a3..b1d08c1ae6bed003a70c89e9433e34eb617d29d1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
@@ -125,13 +125,13 @@ load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *t
      /* Table has 4 entries, left-shift index by 2 */
      ti_S = _mm_slli_epi32(ti_S, 2);
      /* Without SSE4.1 the extract macro needs an immediate: unroll */
-    idx[0]    = gmx_mm_extract_epi32(ti_S, 0);
+    idx[0]    = gmx_simd_extract_i(ti_S, 0);
      ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
-    idx[1]    = gmx_mm_extract_epi32(ti_S, 1);
+    idx[1]    = gmx_simd_extract_i(ti_S, 1);
      ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
-    idx[2]    = gmx_mm_extract_epi32(ti_S, 2);
+    idx[2]    = gmx_simd_extract_i(ti_S, 2);
      ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
-    idx[3]    = gmx_mm_extract_epi32(ti_S, 3);
+    idx[3]    = gmx_simd_extract_i(ti_S, 3);
      ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
  
      /* Shuffle the force table entries to a convenient order */
@@ -148,13 +148,13 @@ load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused
      /* Table has 4 entries, left-shift index by 2 */
      ti_S = _mm_slli_epi32(ti_S, 2);
      /* Without SSE4.1 the extract macro needs an immediate: unroll */
-    idx[0]    = gmx_mm_extract_epi32(ti_S, 0);
+    idx[0]    = gmx_simd_extract_i(ti_S, 0);
      ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
-    idx[1]    = gmx_mm_extract_epi32(ti_S, 1);
+    idx[1]    = gmx_simd_extract_i(ti_S, 1);
      ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
-    idx[2]    = gmx_mm_extract_epi32(ti_S, 2);
+    idx[2]    = gmx_simd_extract_i(ti_S, 2);
      ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
-    idx[3]    = gmx_mm_extract_epi32(ti_S, 3);
+    idx[3]    = gmx_simd_extract_i(ti_S, 3);
      ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
  
      /* Shuffle the force table entries to a convenient order */
@@ -172,13 +172,13 @@ gmx_load1_exclfilter(int e)
  static gmx_inline gmx_exclfilter
  gmx_load_exclusion_filter(const unsigned *i)
  {
-    return _mm_load_si128((__m128i *) i);
+    return gmx_simd_load_i(i);
  }
  
  static gmx_inline gmx_simd_bool_t
  gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
  {
-    return gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+    return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
  }
  
  #endif /* _nbnxn_kernel_simd_utils_x86_s128s_h_ */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h

index 2c6fac5ba95f867bd388350543a18a87a40e453a..9a9aba1f95d57b2626d8db63136ba3f4940af1a9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
@@ -43,15 +43,6 @@
   *   energy group pair energy storage
   */
  
-typedef gmx_simd_real_t gmx_exclfilter;
-static const int filter_stride = 1;
-
-/* The 4xn kernel operates on 4-wide i-force registers */
-#define gmx_mm_pr4     __m128
-#define gmx_load_pr4   _mm_load_ps
-#define gmx_store_pr4  _mm_store_ps
-#define gmx_add_pr4    _mm_add_ps
-
  
  #ifdef GMX_NBNXN_SIMD_2XNN
  /* Half-width operations are required for the 2xnn kernels */
@@ -66,14 +57,28 @@ static const int filter_stride = 1;
  #define gmx_set1_hpr(a, b)   *(a) = _mm_set1_ps(b)
  /* Load one real at b and one real at b+1 into halves of a, respectively */
  #define gmx_load1p1_pr(a, b)  *(a) = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
-/* Load reals at half-width aligned pointer b into two halves of a */
-#define gmx_loaddh_pr(a, b)   *(a) = gmx_mm256_load4_ps(b)
  /* To half-width SIMD register b into half width aligned memory a */
  #define gmx_store_hpr(a, b)          _mm_store_ps(a, b)
  #define gmx_add_hpr                  _mm_add_ps
  #define gmx_sub_hpr                  _mm_sub_ps
+
  /* Sum over 4 half SIMD registers */
-#define gmx_sum4_hpr                 gmx_mm256_sum4h_m128
+static gmx_inline __m128 gmx_sum4_hpr(__m256 x, __m256 y)
+{
+    __m256 sum;
+    /* Add x and y element-wise, then fold the upper 128-bit half of the sum onto the lower half */
+    sum = _mm256_add_ps(x, y);
+    return _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 0x1));
+}
+
+/* Load reals at half-width aligned pointer b into two halves of a */
+static gmx_inline void
+gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
+{
+    __m128 tmp;
+    tmp = _mm_load_ps(b);
+    *a  = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 0x1);
+}
  
  static gmx_inline void
  gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
@@ -267,6 +272,35 @@ load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
      *ctabv_S = gmx_2_mm_to_m256(ctabvt_S[0], ctabvt_S[1]);
  }
  
+#ifdef GMX_SIMD_HAVE_FINT32_LOGICAL
+
+typedef gmx_simd_int32_t gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
+
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+    return _mm256_set1_epi32(e);
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+    return gmx_simd_load_i(i);
+}
+
+static gmx_inline gmx_simd_bool_t
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
+    return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_andnot_si256(m0, m1), _mm256_setzero_si256()));
+}
+
+#else /* GMX_SIMD_HAVE_FINT32_LOGICAL */
+
+/* No integer support, use a real to store the exclusion bits */
+typedef gmx_simd_real_t gmx_exclfilter;
+static const int filter_stride = 1;
+
  static gmx_inline gmx_exclfilter
  gmx_load1_exclfilter(int e)
  {
@@ -285,4 +319,6 @@ gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
      return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
  }
  
+#endif /* GMX_SIMD_HAVE_FINT32_LOGICAL */
+
  #endif /* _nbnxn_kernel_simd_utils_x86_s256s_h_ */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h

index 96ec6800c42ff9da11a7d4b2ebd9dbb8437f26d4..a7a11677f53c11c08c4de80d9385bc197e153efe 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h
@@ -44,27 +44,6 @@ static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  #define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
  #define mask_hih _mm512_int2mask(0xFF00)
  
-/* float/double SIMD register type */
-typedef __m512 gmx_mm_pr4;
-
-static gmx_inline gmx_mm_pr4
-gmx_load_pr4(const real *r)
-{
-    return _mm512_loadunpacklo_ps(_mm512_undefined_ps(), r);
-}
-
-static gmx_inline void
-gmx_store_pr4(real *dest, gmx_mm_pr4 src)
-{
-    _mm512_mask_packstorelo_ps(dest, _mm512_int2mask(0xF), src);
-}
-
-static gmx_inline gmx_mm_pr4
-gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
-{
-    return _mm512_add_ps(a, b);
-}
-
  /* Half-width SIMD real type */
  typedef __m512 gmx_mm_hpr; /* high half is ignored */
  
@@ -86,7 +65,7 @@ gmx_set1_hpr(gmx_mm_hpr *a, real b)
  
  /* Load one real at b and one real at b+1 into halves of a, respectively */
  static gmx_inline void
-gmx_load1p1_pr(gmx_mm_ps *a, const real *b)
+gmx_load1p1_pr(gmx_simd_float_t *a, const real *b)
  {
  
      *a = _mm512_mask_extload_ps(_mm512_extload_ps(b, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), mask_hih,
@@ -95,7 +74,7 @@ gmx_load1p1_pr(gmx_mm_ps *a, const real *b)
  
  /* Load reals at half-width aligned pointer b into two halves of a */
  static gmx_inline void
-gmx_loaddh_pr(gmx_mm_ps *a, const real *b)
+gmx_loaddh_pr(gmx_simd_float_t *a, const real *b)
  {
      *a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH);
  }
@@ -112,7 +91,7 @@ gmx_store_hpr(real *a, gmx_mm_hpr b)
  
  /* Sum over 4 half SIMD registers */
  static gmx_inline gmx_mm_hpr
-gmx_sum4_hpr(gmx_mm_ps a, gmx_mm_ps b)
+gmx_sum4_hpr(gmx_simd_float_t a, gmx_simd_float_t b)
  {
      a = _mm512_add_ps(a, b);
      b = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
@@ -120,8 +99,8 @@ gmx_sum4_hpr(gmx_mm_ps a, gmx_mm_ps b)
  }
  
  /* Sum the elements of halfs of each input register and store sums in out */
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4h_pr(gmx_mm_ps a, gmx_mm_ps b)
+static gmx_inline __m512
+gmx_mm_transpose_sum4h_pr(gmx_simd_float_t a, gmx_simd_float_t b)
  {
      return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a),
                             _mm512_mask_reduce_add_ps(mask_hih, a),
@@ -130,21 +109,21 @@ gmx_mm_transpose_sum4h_pr(gmx_mm_ps a, gmx_mm_ps b)
  }
  
  static gmx_inline void
-gmx_pr_to_2hpr(gmx_mm_ps a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+gmx_pr_to_2hpr(gmx_simd_float_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
  {
      *b = a;
      *c = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
  }
  
  static gmx_inline void
-gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_float_t *c)
  {
      *c = _mm512_mask_permute4f128_ps(a, mask_hih, b, PERM_LOW2HIGH);
  }
  
  /* recombine the 2 high half into c */
  static gmx_inline void
-gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_float_t *c)
  {
      *c = _mm512_mask_permute4f128_ps(b, mask_loh, a, PERM_HIGH2LOW);
  }
@@ -177,7 +156,7 @@ prepare_table_load_buffer(const int *array)
   */
  static gmx_inline void
  load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int *ti,
-             gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
+             gmx_simd_float_t *ctab0_S, gmx_simd_float_t *ctab1_S)
  {
      __m512i idx;
      __m512i ti1 = _mm512_add_epi32(ti_S, _mm512_set1_epi32(1)); /* incr by 1 for tab1 */
@@ -188,22 +167,23 @@ load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int *ti,
  
      gmx_2hpr_to_pr(tmp1, tmp2, ctab0_S);
      gmx_2hpr_high_to_pr(tmp1, tmp2, ctab1_S);
+
      *ctab1_S  = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
  }
  
  static gmx_inline void
  load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
                 gmx_simd_int32_t ti_S, int *ti,
-               gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S,
-               gmx_mm_ps *ctabv_S)
+               gmx_simd_float_t *ctab0_S, gmx_simd_float_t *ctab1_S,
+               gmx_simd_float_t *ctabv_S)
  {
      load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
      *ctabv_S = _mm512_i32gather_ps(ti_S, tab_coul_V, sizeof(float));
  }
  
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4_pr(gmx_mm_ps in0, gmx_mm_ps in1,
-                         gmx_mm_ps in2, gmx_mm_ps in3)
+static gmx_inline __m512
+gmx_mm_transpose_sum4_pr(gmx_simd_float_t in0, gmx_simd_float_t in1,
+                         gmx_simd_float_t in2, gmx_simd_float_t in3)
  {
      return _mm512_setr4_ps(_mm512_reduce_add_ps(in0),
                             _mm512_reduce_add_ps(in1),
@@ -214,7 +194,7 @@ gmx_mm_transpose_sum4_pr(gmx_mm_ps in0, gmx_mm_ps in1,
  static gmx_inline void
  load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
                       const int *type, int aj,
-                     gmx_mm_ps *c6_S, gmx_mm_ps *c12_S)
+                     gmx_simd_float_t *c6_S, gmx_simd_float_t *c12_S)
  {
      __m512i idx0, idx1, idx;
  
@@ -233,18 +213,6 @@ load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
      gmx_2hpr_high_to_pr(tmp1, tmp2, c12_S);
  }
  
-#define HAVE_GMX_SUM_SIMD
-static gmx_inline real
-gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
-    return _mm512_reduce_add_ps(x);
-}
-static gmx_inline real
-gmx_sum_simd4(gmx_simd_real_t x, real* b)
-{
-    return _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x);
-}
-
  /* Code for handling loading exclusions and converting them into
     interactions. */
  #define gmx_load1_exclfilter _mm512_set1_epi32
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c

index d8ec1d2edc4c3a863b8a9667ac4ea5c9cc8485eb..45f09bc4621d64d900ba6c9fc66effead4372743 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c
@@ -43,11 +43,11 @@
  
  #include "typedefs.h"
  
+#include "gromacs/mdlib/nbnxn_simd.h"
+
  #ifdef GMX_NBNXN_SIMD_2XNN
  
  /* Include the full-width SIMD macros */
-
-#include "gromacs/simd/macros.h"
  #include "gromacs/simd/vector_operations.h"
  
  #if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16)
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h

index 58f1e768e01a4f9df9ac0412b1a7d6329016af3a..c7ec9bcbd7db63438412c1d7a5581ef39c34fe63 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h
@@ -39,6 +39,8 @@
  
  #include "typedefs.h"
  
+#include "gromacs/mdlib/nbnxn_simd.h"
+
  #ifdef __cplusplus
  extern "C" {
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h

index 588472bed6ce5928543647a06a299a23352bc431..01d8090765b2c4eb0d01772b05310f1b21c8b81f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h
@@ -32,8 +32,8 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#include "gromacs/simd/macros.h"
-#include "gromacs/simd/four_wide_macros.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
  #include "gromacs/simd/vector_operations.h"
  #include "../../nbnxn_consts.h"
  #ifdef CALC_COUL_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h

index 8c97de1c7ea26d2c2e98187ac098bb5ef5185ca4..5663260589ca2f8af2c509f58cbdc615503d8b1e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h
@@ -342,8 +342,8 @@
  
  #ifdef CHECK_EXCLS
      /* For excluded pairs add a small number to avoid r^-6 = NaN */
-    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
-    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+    rsq_S0      = gmx_simd_add_r(rsq_S0, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S0));
+    rsq_S2      = gmx_simd_add_r(rsq_S2, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S2));
  #endif
  
      /* Calculate 1/r */
@@ -397,6 +397,11 @@
      rinv_S0     = gmx_simd_blendzero_r(rinv_S0, wco_S0);
      rinv_S2     = gmx_simd_blendzero_r(rinv_S2, wco_S2);
  #else
+    /* This needs to be modified: It makes assumptions about the internal storage
+     * of the SIMD representation, in particular that the blendv instruction always
+     * selects based on the sign bit. If the performance is really critical, it
+     * should be turned into a function that is platform-specific.
+     */
      /* We only need to mask for the cut-off: blendv is faster */
      rinv_S0     = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0));
      rinv_S2     = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2));
@@ -467,9 +472,9 @@
      /* Truncate scaled r to an int */
      ti_S0       = gmx_simd_cvtt_r2i(rs_S0);
      ti_S2       = gmx_simd_cvtt_r2i(rs_S2);
-#ifdef GMX_SIMD_HAVE_FLOOR
-    rf_S0       = gmx_simd_floor_r(rs_S0);
-    rf_S2       = gmx_simd_floor_r(rs_S2);
+#ifdef GMX_SIMD_HAVE_TRUNC
+    rf_S0       = gmx_simd_trunc_r(rs_S0);
+    rf_S2       = gmx_simd_trunc_r(rs_S2);
  #else
      rf_S0       = gmx_simd_cvt_i2r(ti_S0);
      rf_S2       = gmx_simd_cvt_i2r(ti_S2);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h

index 38b0293872682a5b49ce30bc017b3eb3b37fad0c..590c09547df299ca20279ee02be9f82a5f4863a5 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h
@@ -97,7 +97,7 @@
      gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
      gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
      /* We use an i-force SIMD register width of 4 */
-    /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
+    /* The simd4 stuff might be defined in nbnxn_kernel_simd_utils.h */
      gmx_simd4_real_t fix_S, fiy_S, fiz_S;
  
      gmx_simd_real_t  diagonal_jmi_S;
@@ -201,15 +201,6 @@
      gmx_simd_real_t  rcvdw2_S;
  #endif
  
-#ifdef CALC_ENERGIES
-    /* cppcheck-suppress unassignedVariable */
-    real       tmpsum_array[2*GMX_SIMD_REAL_WIDTH], *tmpsum;
-#endif
-#ifdef CALC_SHIFTFORCES
-    /* cppcheck-suppress unassignedVariable */
-    real       shf_array[2*GMX_SIMD_REAL_WIDTH], *shf;
-#endif
-
      int ninner;
  
  #ifdef COUNT_PAIRS
@@ -366,13 +357,6 @@
      shiftvec            = shift_vec[0];
      x                   = nbat->x;
  
-#ifdef CALC_ENERGIES
-    tmpsum   = gmx_simd_align_r(tmpsum_array);
-#endif
-#ifdef CALC_SHIFTFORCES
-    shf      = gmx_simd_align_r(shf_array);
-#endif
-
  #ifdef FIX_LJ_C
      pvdw_c6  = gmx_simd_align_r(pvdw_array);
      pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
@@ -667,27 +651,26 @@
  
          /* Add accumulated i-forces to the force array */
          fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
-        gmx_simd4_store_r(f+scix, gmx_add_pr4(fix_S, gmx_simd4_load_r(f+scix)));
+        gmx_simd4_store_r(f+scix, gmx_simd4_add_r(fix_S, gmx_simd4_load_r(f+scix)));
  
          fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
-        gmx_simd4_store_r(f+sciy, gmx_add_pr4(fiy_S, gmx_simd4_load_r(f+sciy)));
+        gmx_simd4_store_r(f+sciy, gmx_simd4_add_r(fiy_S, gmx_simd4_load_r(f+sciy)));
  
          fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
-        gmx_simd4_store_r(f+sciz, gmx_add_pr4(fiz_S, gmx_simd4_load_r(f+sciz)));
+        gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));
  
  #ifdef CALC_SHIFTFORCES
-        fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
-        fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
-        fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
+        fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
+        fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
+        fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
  #endif
  
  #ifdef CALC_ENERGIES
          if (do_coul)
          {
-            *Vc += gmx_sum_simd(vctot_S, tmpsum);
+            *Vc += gmx_simd_reduce_r(vctot_S);
          }
-
-        *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
+        *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
  #endif
  
          /* Outer loop uses 6 flops/iteration */
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c

index 06d93601fd6bb01f3f88d17a433332e2e08d7468..17608af2e7037a7fa46f647413d213f09036ada8 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c
@@ -43,13 +43,10 @@
  
  #include "typedefs.h"
  
-#ifdef GMX_NBNXN_SIMD_4XN
+#include "gromacs/mdlib/nbnxn_simd.h"
  
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
+#ifdef GMX_NBNXN_SIMD_4XN
  
-#include "gromacs/simd/macros.h"
  #include "gromacs/simd/vector_operations.h"
  #if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)
  #error "unsupported SIMD width"
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h

index 80859119efcc2181a674b340aa6108506200b041..e6e475765a84e69ec2279edf674ac31d59f64281 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h
@@ -39,6 +39,8 @@
  
  #include "typedefs.h"
  
+#include "gromacs/mdlib/nbnxn_simd.h"
+
  #ifdef __cplusplus
  extern "C" {
  #endif
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h

index 93ca4855bc63b67004338792a311a872beac27d1..5d4222167558ee28a3b0771a0574429fb6851c4e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h
@@ -32,8 +32,8 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#include "gromacs/simd/macros.h"
-#include "gromacs/simd/four_wide_macros.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
  #include "gromacs/simd/vector_operations.h"
  #include "../../nbnxn_consts.h"
  #ifdef CALC_COUL_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h

index 758f8cd4f8d7ea5721a43bca75b37623d09d8934..4b7de1678fae416dfdf61dc275fa3f199342375f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h
@@ -420,10 +420,10 @@
  
  #ifdef CHECK_EXCLS
      /* For excluded pairs add a small number to avoid r^-6 = NaN */
-    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
-    rsq_S1      = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
-    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
-    rsq_S3      = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
+    rsq_S0      = gmx_simd_add_r(rsq_S0, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S0));
+    rsq_S1      = gmx_simd_add_r(rsq_S1, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S1));
+    rsq_S2      = gmx_simd_add_r(rsq_S2, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S2));
+    rsq_S3      = gmx_simd_add_r(rsq_S3, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S3));
  #endif
  
      /* Calculate 1/r */
@@ -433,8 +433,8 @@
      rinv_S2     = gmx_simd_invsqrt_r(rsq_S2);
      rinv_S3     = gmx_simd_invsqrt_r(rsq_S3);
  #else
-    gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
-    gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
+    gmx_simd_invsqrt_pair_r(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
+    gmx_simd_invsqrt_pair_r(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
  #endif
  
  #ifdef CALC_COULOMB
@@ -596,12 +596,12 @@
      ti_S1       = gmx_simd_cvtt_r2i(rs_S1);
      ti_S2       = gmx_simd_cvtt_r2i(rs_S2);
      ti_S3       = gmx_simd_cvtt_r2i(rs_S3);
-#ifdef GMX_SIMD_HAVE_FLOOR
-    /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
-    rf_S0       = gmx_simd_floor_r(rs_S0);
-    rf_S1       = gmx_simd_floor_r(rs_S1);
-    rf_S2       = gmx_simd_floor_r(rs_S2);
-    rf_S3       = gmx_simd_floor_r(rs_S3);
+#ifdef GMX_SIMD_HAVE_TRUNC
+    /* SSE4.1 trunc is faster than gmx_cvtepi32_ps int->float cast */
+    rf_S0       = gmx_simd_trunc_r(rs_S0);
+    rf_S1       = gmx_simd_trunc_r(rs_S1);
+    rf_S2       = gmx_simd_trunc_r(rs_S2);
+    rf_S3       = gmx_simd_trunc_r(rs_S3);
  #else
      rf_S0       = gmx_simd_cvt_i2r(ti_S0);
      rf_S1       = gmx_simd_cvt_i2r(ti_S1);
diff --git a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h

index 7a636b5cd61f0ac98e64798445d0af316c8ce440..ffe53d5cd709cc26c50e69c3528a025feccae313 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
+++ b/src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
@@ -37,8 +37,8 @@
  {
      const nbnxn_ci_t   *nbln;
      const nbnxn_cj_t   *l_cj;
-    const int          *type;
-    const real         *q;
+    const int *         type;
+    const real *        q;
      const real         *shiftvec;
      const real         *x;
      const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
@@ -109,9 +109,9 @@
  #ifdef CALC_COUL_TAB
      /* Coulomb table variables */
      gmx_simd_real_t   invtsp_S;
-    const real       *tab_coul_F;
+    const real *      tab_coul_F;
  #ifndef TAB_FDV0
-    const real       *tab_coul_V;
+    const real *      tab_coul_V;
  #endif
      /* Thread-local working buffers for force and potential lookups */
      int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
@@ -192,15 +192,6 @@
      gmx_simd_real_t  rcvdw2_S;
  #endif
  
-#ifdef CALC_ENERGIES
-    /* cppcheck-suppress unassignedVariable */
-    real       tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum;
-#endif
-#ifdef CALC_SHIFTFORCES
-    /* cppcheck-suppress unassignedVariable */
-    real       shf_array[GMX_SIMD_REAL_WIDTH*2], *shf;
-#endif
-
      int ninner;
  
  #ifdef COUNT_PAIRS
@@ -375,13 +366,6 @@
      shiftvec            = shift_vec[0];
      x                   = nbat->x;
  
-#ifdef CALC_ENERGIES
-    tmpsum   = gmx_simd_align_r(tmpsum_array);
-#endif
-#ifdef CALC_SHIFTFORCES
-    shf      = gmx_simd_align_r(shf_array);
-#endif
-
  #ifdef FIX_LJ_C
      pvdw_c6  = gmx_simd_align_real(pvdw_array);
      pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
@@ -701,9 +685,9 @@
          gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));
  
  #ifdef CALC_SHIFTFORCES
-        fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
-        fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
-        fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
+        fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
+        fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
+        fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
  #endif
  #else
          fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
@@ -722,19 +706,19 @@
          gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));
  
  #ifdef CALC_SHIFTFORCES
-        fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf);
-        fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf);
-        fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf);
+        fshift[ish3+0] += gmx_simd_reduce_r(gmx_simd_add_r(fix0_S, fix2_S));
+        fshift[ish3+1] += gmx_simd_reduce_r(gmx_simd_add_r(fiy0_S, fiy2_S));
+        fshift[ish3+2] += gmx_simd_reduce_r(gmx_simd_add_r(fiz0_S, fiz2_S));
  #endif
  #endif
  
  #ifdef CALC_ENERGIES
          if (do_coul)
          {
-            *Vc += gmx_sum_simd(vctot_S, tmpsum);
+            *Vc += gmx_simd_reduce_r(vctot_S);
          }
  
-        *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
+        *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
  #endif
  
          /* Outer loop uses 6 flops/iteration */
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c

index 457db913939dfb617239c95e2f79861b8ad472ca..2f93f3036cfcb8b151579576f955cff11f58aed0 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -59,23 +59,23 @@
  #include "gromacs/fileio/gmxfio.h"
  
  #ifdef NBNXN_SEARCH_BB_SIMD4
-/* We use 4-wide SIMD for bounding box calculations */
+/* Always use 4-wide SIMD for bounding box calculations */
  
-#ifndef GMX_DOUBLE
+#    ifndef GMX_DOUBLE
  /* Single precision BBs + coordinates, we can also load coordinates with SIMD */
-#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
-#endif
+#        define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
+#    endif
  
-#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+#    if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
  /* Store bounding boxes with x, y and z coordinates in packs of 4 */
-#define NBNXN_PBB_SIMD4
-#endif
+#        define NBNXN_PBB_SIMD4
+#    endif
  
  /* The packed bounding box coordinate stride is always set to 4.
   * With AVX we could use 8, but that turns out not to be faster.
   */
-#define STRIDE_PBB        4
-#define STRIDE_PBB_2LOG   2
+#    define STRIDE_PBB        4
+#    define STRIDE_PBB_2LOG   2
  
  #endif /* NBNXN_SEARCH_BB_SIMD4 */
  
@@ -129,7 +129,7 @@
  #define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
  #define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
  #else
-#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#error "unsupported GMX_SIMD_REAL_WIDTH"
  #endif
  #endif
  #endif
@@ -808,20 +808,20 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x,
           * so we don't need to treat special cases in the rest of the code.
           */
  #ifdef NBNXN_SEARCH_BB_SIMD4
-        gmx_simd4_store_r(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
-        gmx_simd4_store_r(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));
+        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
+        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
  #else
          bbj[1] = bbj[0];
  #endif
      }
  
  #ifdef NBNXN_SEARCH_BB_SIMD4
-    gmx_simd4_store_r(&bb->lower[0],
-                      gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
-                                      gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
-    gmx_simd4_store_r(&bb->upper[0],
-                      gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
-                                      gmx_simd4_load_bb_pr(&bbj[1].upper[0])));
+    gmx_simd4_store_f(&bb->lower[0],
+                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
+                                      gmx_simd4_load_f(&bbj[1].lower[0])));
+    gmx_simd4_store_f(&bb->upper[0],
+                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
+                                      gmx_simd4_load_f(&bbj[1].upper[0])));
  #else
      {
          int i;
@@ -877,23 +877,23 @@ static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
  /* Coordinate order xyz?, bb order xyz0 */
  static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
  {
-    gmx_simd4_real_t bb_0_S, bb_1_S;
-    gmx_simd4_real_t x_S;
+    gmx_simd4_float_t bb_0_S, bb_1_S;
+    gmx_simd4_float_t x_S;
  
-    int              i;
+    int               i;
  
-    bb_0_S = gmx_simd4_load_bb_pr(x);
+    bb_0_S = gmx_simd4_load_f(x);
      bb_1_S = bb_0_S;
  
      for (i = 1; i < na; i++)
      {
-        x_S    = gmx_simd4_load_bb_pr(x+i*NNBSBB_C);
-        bb_0_S = gmx_simd4_min_r(bb_0_S, x_S);
-        bb_1_S = gmx_simd4_max_r(bb_1_S, x_S);
+        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
+        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
+        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
      }
  
-    gmx_simd4_store_r(&bb->lower[0], bb_0_S);
-    gmx_simd4_store_r(&bb->upper[0], bb_1_S);
+    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
+    gmx_simd4_store_f(&bb->upper[0], bb_1_S);
  }
  
  /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
@@ -928,14 +928,14 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
          for (c2 = sc2; c2 < sc2+nc2; c2++)
          {
  #ifdef NBNXN_SEARCH_BB_SIMD4
-            gmx_simd4_real_t min_S, max_S;
-
-            min_S = gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
-                                    gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
-            max_S = gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
-                                    gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
-            gmx_simd4_store_r(&grid->bbj[c2].lower[0], min_S);
-            gmx_simd4_store_r(&grid->bbj[c2].upper[0], max_S);
+            gmx_simd4_float_t min_S, max_S;
+
+            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
+                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
+            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
+                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
+            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
+            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
  #else
              for (j = 0; j < NNBSBB_C; j++)
              {
@@ -2075,74 +2075,74 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
  static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
                                   int csj, const nbnxn_bb_t *bb_j_all)
  {
-    gmx_simd4_real_t bb_i_S0, bb_i_S1;
-    gmx_simd4_real_t bb_j_S0, bb_j_S1;
-    gmx_simd4_real_t dl_S;
-    gmx_simd4_real_t dh_S;
-    gmx_simd4_real_t dm_S;
-    gmx_simd4_real_t dm0_S;
+    gmx_simd4_float_t bb_i_S0, bb_i_S1;
+    gmx_simd4_float_t bb_j_S0, bb_j_S1;
+    gmx_simd4_float_t dl_S;
+    gmx_simd4_float_t dh_S;
+    gmx_simd4_float_t dm_S;
+    gmx_simd4_float_t dm0_S;
  
-    bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]);
-    bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]);
-    bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]);
-    bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]);
+    bb_i_S0 = gmx_simd4_load_f(&bb_i_ci[si].lower[0]);
+    bb_i_S1 = gmx_simd4_load_f(&bb_i_ci[si].upper[0]);
+    bb_j_S0 = gmx_simd4_load_f(&bb_j_all[csj].lower[0]);
+    bb_j_S1 = gmx_simd4_load_f(&bb_j_all[csj].upper[0]);
  
-    dl_S    = gmx_simd4_sub_r(bb_i_S0, bb_j_S1);
-    dh_S    = gmx_simd4_sub_r(bb_j_S0, bb_i_S1);
+    dl_S    = gmx_simd4_sub_f(bb_i_S0, bb_j_S1);
+    dh_S    = gmx_simd4_sub_f(bb_j_S0, bb_i_S1);
  
-    dm_S    = gmx_simd4_max_r(dl_S, dh_S);
-    dm0_S   = gmx_simd4_max_r(dm_S, gmx_simd4_setzero_r());
+    dm_S    = gmx_simd4_max_f(dl_S, dh_S);
+    dm0_S   = gmx_simd4_max_f(dm_S, gmx_simd4_setzero_f());
  
-    return gmx_simd4_dotproduct3_r(dm0_S, dm0_S);
+    return gmx_simd4_dotproduct3_f(dm0_S, dm0_S);
  }
  
  /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
  #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
      {                                                \
-        int              shi;                                  \
+        int               shi;                                  \
                                                   \
-        gmx_simd4_real_t dx_0, dy_0, dz_0;                       \
-        gmx_simd4_real_t dx_1, dy_1, dz_1;                       \
+        gmx_simd4_float_t dx_0, dy_0, dz_0;                    \
+        gmx_simd4_float_t dx_1, dy_1, dz_1;                    \
                                                   \
-        gmx_simd4_real_t mx, my, mz;                             \
-        gmx_simd4_real_t m0x, m0y, m0z;                          \
+        gmx_simd4_float_t mx, my, mz;                          \
+        gmx_simd4_float_t m0x, m0y, m0z;                       \
                                                   \
-        gmx_simd4_real_t d2x, d2y, d2z;                          \
-        gmx_simd4_real_t d2s, d2t;                              \
+        gmx_simd4_float_t d2x, d2y, d2z;                       \
+        gmx_simd4_float_t d2s, d2t;                            \
                                                   \
          shi = si*NNBSBB_D*DIM;                       \
                                                   \
-        xi_l = gmx_simd4_load_bb_pr(bb_i+shi+0*STRIDE_PBB);   \
-        yi_l = gmx_simd4_load_bb_pr(bb_i+shi+1*STRIDE_PBB);   \
-        zi_l = gmx_simd4_load_bb_pr(bb_i+shi+2*STRIDE_PBB);   \
-        xi_h = gmx_simd4_load_bb_pr(bb_i+shi+3*STRIDE_PBB);   \
-        yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB);   \
-        zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB);   \
+        xi_l = gmx_simd4_load_f(bb_i+shi+0*STRIDE_PBB);   \
+        yi_l = gmx_simd4_load_f(bb_i+shi+1*STRIDE_PBB);   \
+        zi_l = gmx_simd4_load_f(bb_i+shi+2*STRIDE_PBB);   \
+        xi_h = gmx_simd4_load_f(bb_i+shi+3*STRIDE_PBB);   \
+        yi_h = gmx_simd4_load_f(bb_i+shi+4*STRIDE_PBB);   \
+        zi_h = gmx_simd4_load_f(bb_i+shi+5*STRIDE_PBB);   \
                                                   \
-        dx_0 = gmx_simd4_sub_r(xi_l, xj_h);                \
-        dy_0 = gmx_simd4_sub_r(yi_l, yj_h);                \
-        dz_0 = gmx_simd4_sub_r(zi_l, zj_h);                \
+        dx_0 = gmx_simd4_sub_f(xi_l, xj_h);                 \
+        dy_0 = gmx_simd4_sub_f(yi_l, yj_h);                 \
+        dz_0 = gmx_simd4_sub_f(zi_l, zj_h);                 \
                                                   \
-        dx_1 = gmx_simd4_sub_r(xj_l, xi_h);                \
-        dy_1 = gmx_simd4_sub_r(yj_l, yi_h);                \
-        dz_1 = gmx_simd4_sub_r(zj_l, zi_h);                \
+        dx_1 = gmx_simd4_sub_f(xj_l, xi_h);                 \
+        dy_1 = gmx_simd4_sub_f(yj_l, yi_h);                 \
+        dz_1 = gmx_simd4_sub_f(zj_l, zi_h);                 \
                                                   \
-        mx   = gmx_simd4_max_r(dx_0, dx_1);                \
-        my   = gmx_simd4_max_r(dy_0, dy_1);                \
-        mz   = gmx_simd4_max_r(dz_0, dz_1);                \
+        mx   = gmx_simd4_max_f(dx_0, dx_1);                 \
+        my   = gmx_simd4_max_f(dy_0, dy_1);                 \
+        mz   = gmx_simd4_max_f(dz_0, dz_1);                 \
                                                   \
-        m0x  = gmx_simd4_max_r(mx, zero);                  \
-        m0y  = gmx_simd4_max_r(my, zero);                  \
-        m0z  = gmx_simd4_max_r(mz, zero);                  \
+        m0x  = gmx_simd4_max_f(mx, zero);                   \
+        m0y  = gmx_simd4_max_f(my, zero);                   \
+        m0z  = gmx_simd4_max_f(mz, zero);                   \
                                                   \
-        d2x  = gmx_simd4_mul_r(m0x, m0x);                  \
-        d2y  = gmx_simd4_mul_r(m0y, m0y);                  \
-        d2z  = gmx_simd4_mul_r(m0z, m0z);                  \
+        d2x  = gmx_simd4_mul_f(m0x, m0x);                   \
+        d2y  = gmx_simd4_mul_f(m0y, m0y);                   \
+        d2z  = gmx_simd4_mul_f(m0z, m0z);                   \
                                                   \
-        d2s  = gmx_simd4_add_r(d2x, d2y);                  \
-        d2t  = gmx_simd4_add_r(d2s, d2z);                  \
+        d2s  = gmx_simd4_add_f(d2x, d2y);                   \
+        d2t  = gmx_simd4_add_f(d2s, d2z);                   \
                                                   \
-        gmx_simd4_store_r(d2+si, d2t);                     \
+        gmx_simd4_store_f(d2+si, d2t);                      \
      }
  
  /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
@@ -2150,21 +2150,21 @@ static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
                                       int nsi, const float *bb_i,
                                       float *d2)
  {
-    gmx_simd4_real_t xj_l, yj_l, zj_l;
-    gmx_simd4_real_t xj_h, yj_h, zj_h;
-    gmx_simd4_real_t xi_l, yi_l, zi_l;
-    gmx_simd4_real_t xi_h, yi_h, zi_h;
+    gmx_simd4_float_t xj_l, yj_l, zj_l;
+    gmx_simd4_float_t xj_h, yj_h, zj_h;
+    gmx_simd4_float_t xi_l, yi_l, zi_l;
+    gmx_simd4_float_t xi_h, yi_h, zi_h;
  
-    gmx_simd4_real_t zero;
+    gmx_simd4_float_t zero;
  
-    zero = gmx_simd4_setzero_r();
+    zero = gmx_simd4_setzero_f();
  
-    xj_l = gmx_simd4_set1_r(bb_j[0*STRIDE_PBB]);
-    yj_l = gmx_simd4_set1_r(bb_j[1*STRIDE_PBB]);
-    zj_l = gmx_simd4_set1_r(bb_j[2*STRIDE_PBB]);
-    xj_h = gmx_simd4_set1_r(bb_j[3*STRIDE_PBB]);
-    yj_h = gmx_simd4_set1_r(bb_j[4*STRIDE_PBB]);
-    zj_h = gmx_simd4_set1_r(bb_j[5*STRIDE_PBB]);
+    xj_l = gmx_simd4_set1_f(bb_j[0*STRIDE_PBB]);
+    yj_l = gmx_simd4_set1_f(bb_j[1*STRIDE_PBB]);
+    zj_l = gmx_simd4_set1_f(bb_j[2*STRIDE_PBB]);
+    xj_h = gmx_simd4_set1_f(bb_j[3*STRIDE_PBB]);
+    yj_h = gmx_simd4_set1_f(bb_j[4*STRIDE_PBB]);
+    zj_h = gmx_simd4_set1_f(bb_j[5*STRIDE_PBB]);
  
      /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
       * But as we know the number of iterations is 1 or 2, we unroll manually.
@@ -2211,14 +2211,6 @@ static gmx_bool subc_in_range_x(int na_c,
  }
  
  #ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
-/* When we make seperate single/double precision SIMD vector operation
- * include files, this function should be moved there (also using FMA).
- */
-static inline gmx_simd4_real_t
-gmx_simd4_calc_rsq_r(gmx_simd4_real_t x, gmx_simd4_real_t y, gmx_simd4_real_t z)
-{
-    return gmx_simd4_add_r( gmx_simd4_add_r( gmx_simd4_mul_r(x, x), gmx_simd4_mul_r(y, y) ), gmx_simd4_mul_r(z, z) );
-}
  
  /* 4-wide SIMD function which determines if any atom pair between two cells,
   * both with 8 atoms, is within distance sqrt(rl2).
@@ -2240,12 +2232,12 @@ static gmx_bool subc_in_range_simd4(int na_c,
      rc2_S   = gmx_simd4_set1_r(rl2);
  
      dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
-    ix_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
-    iy_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
-    iz_S0      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
-    ix_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
-    iy_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
-    iz_S1      = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);
+    ix_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+0)*STRIDE_PBB);
+    iy_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+1)*STRIDE_PBB);
+    iz_S0      = gmx_simd4_load_r(x_i+(si*dim_stride+2)*STRIDE_PBB);
+    ix_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+3)*STRIDE_PBB);
+    iy_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+4)*STRIDE_PBB);
+    iz_S1      = gmx_simd4_load_r(x_i+(si*dim_stride+5)*STRIDE_PBB);
  
      /* We loop from the outer to the inner particles to maximize
       * the chance that we find a pair in range quickly and return.
diff --git a/src/gromacs/mdlib/nbnxn_simd.h b/src/gromacs/mdlib/nbnxn_simd.h

new file mode 100644 (file)

index 0000000..d3a04e0
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_simd.h
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef _nbnxn_simd_h
+#define _nbnxn_simd_h
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+
+/* Include SIMD, below we select kernels based on the SIMD width */
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+
+
+#ifdef GMX_SIMD_REFERENCE
+#define GMX_NBNXN_SIMD
+#endif
+
+/* As we modularize the verlet kernels, we should remove stuff like this
+ * that checks internal SIMD implementation details.
+ */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+    (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+    (defined GMX_SIMD_X86_AVX2_256) || (defined GMX_SIMD_IBM_QPX)
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+#endif
+
+/* MIC for double is implemented in the SIMD module but so far missing in
+   mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h */
+#if defined __MIC__ && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD
+#endif
+
+#ifdef GMX_NBNXN_SIMD
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * 4xN is used for real widths 2, 4 and 8. 2xNN currently only makes sense with:
+ *  8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, works with Intel MIC in single precision
+ */
+#if GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8
+#define GMX_NBNXN_SIMD_4XN
+#endif
+#if GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+#if !(defined GMX_NBNXN_SIMD_4XN || defined GMX_NBNXN_SIMD_2XNN)
+#error "No SIMD kernel type defined"
+#endif
+
+#endif /* GMX_NBNXN_SIMD */
+
+#endif /* _nbnxn_simd_h */
diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c

index c10e26d91ad6ae9d6093c6dd3a404b691c51cc11..0a9faf23a27ba32cb1c25eed856cca42508c2a8f 100644 (file)
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@ -87,10 +87,11 @@
  #include "gromacs/utility/gmxomp.h"
  
  /* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
  /* Turn on arbitrary width SIMD intrinsics for PME solve */
-#define PME_SIMD_SOLVE
+#    define PME_SIMD_SOLVE
  #endif
  
  #define PME_GRID_QA    0 /* Gridindex for A-state for Q */
@@ -108,21 +109,19 @@ const real lb_scale_factor[] = {
  /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
  const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
  
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
  /* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#if (defined GMX_SIMD4_HAVE_REAL)
  /* Do PME spread and gather with 4-wide SIMD.
   * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
   */
-#define PME_SIMD4_SPREAD_GATHER
+#    define PME_SIMD4_SPREAD_GATHER
  
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
  /* With PME-order=4 on x86, unaligned load+store is slightly faster
   * than doubling all SIMD operations when using aligned load+store.
   */
-#define PME_SIMD4_UNALIGNED
-#endif
+#        define PME_SIMD4_UNALIGNED
+#    endif
  #endif
  
  #define DFT_TOL 1e-7
@@ -140,10 +139,10 @@ const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
  #endif
  
  #ifdef PME_SIMD4_SPREAD_GATHER
-#define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
+#    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
  #else
  /* We can use any alignment, apart from 0, so we use 4 reals */
-#define SIMD4_ALIGNMENT  (4*sizeof(real))
+#    define SIMD4_ALIGNMENT  (4*sizeof(real))
  #endif
  
  /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
@@ -1383,9 +1382,9 @@ static void spread_q_bsplines_thread(pmegrid_t                    *pmegrid,
      int            offx, offy, offz;
  
  #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
-    real           thz_buffer[12], *thz_aligned;
+    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
  
-    thz_aligned = gmx_simd4_align_real(thz_buffer);
+    thz_aligned = gmx_simd4_align_r(thz_buffer);
  #endif
  
      pnx = pmegrid->s[XX];
@@ -1787,7 +1786,7 @@ static void free_work(pme_work_t *work)
  }
  
  
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_EXP
+#if defined PME_SIMD_SOLVE
  /* Calculate exponentials through SIMD */
  inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
  {
@@ -1832,7 +1831,7 @@ inline static void calc_exponentials_q(int start, int end, real f, real *d, real
  }
  #endif
  
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_ERFC
+#if defined PME_SIMD_SOLVE
  /* Calculate exponentials through SIMD */
  inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
  {
@@ -2524,11 +2523,11 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
      pme_spline_work_t *work;
  
  #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
-    real           thz_buffer[12],  *thz_aligned;
-    real           dthz_buffer[12], *dthz_aligned;
+    real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
+    real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
  
-    thz_aligned  = gmx_simd4_align_real(thz_buffer);
-    dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+    thz_aligned  = gmx_simd4_align_r(thz_buffer);
+    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
  #endif
  
      work = pme->spline_work;
@@ -3276,14 +3275,14 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
      pme_spline_work_t *work;
  
  #ifdef PME_SIMD4_SPREAD_GATHER
-    real         tmp[12], *tmp_aligned;
+    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
      gmx_simd4_real_t zero_S;
      gmx_simd4_real_t real_mask_S0, real_mask_S1;
-    int          of, i;
+    int              of, i;
  
      snew_aligned(work, 1, SIMD4_ALIGNMENT);
  
-    tmp_aligned = gmx_simd4_align_real(tmp);
+    tmp_aligned = gmx_simd4_align_r(tmp);
  
      zero_S = gmx_simd4_setzero_r();
  
@@ -3291,14 +3290,14 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
       * as we only operate on order of the 8 grid entries that are
       * load into 2 SIMD registers.
       */
-    for (of = 0; of < 8-(order-1); of++)
+    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
      {
-        for (i = 0; i < 8; i++)
+        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
          {
              tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
          }
          real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
-        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+4);
+        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
          work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
          work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
      }
diff --git a/src/gromacs/mdlib/pme_simd4.h b/src/gromacs/mdlib/pme_simd4.h

index 4cd2213c5ded5b16b2522c6d5a991219389ce8dc..c49e54245a6e024c8b756287a4181cb1c7d5a847 100644 (file)
--- a/src/gromacs/mdlib/pme_simd4.h
+++ b/src/gromacs/mdlib/pme_simd4.h
@@ -178,7 +178,7 @@
      ty_S4 = gmx_simd4_set1_r(thy[4]);
  #endif
  
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#ifdef PME_SIMD4_UNALIGNED
      tz_S0 = gmx_simd4_loadu_r(thz-offset);
      tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
  #else
@@ -288,7 +288,7 @@
      fy_S = gmx_simd4_setzero_r();
      fz_S = gmx_simd4_setzero_r();
  
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#ifdef PME_SIMD4_UNALIGNED
      tz_S0 = gmx_simd4_loadu_r(thz-offset);
      tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
      dz_S0 = gmx_simd4_loadu_r(dthz-offset);
diff --git a/src/gromacs/mdlib/tpi.c b/src/gromacs/mdlib/tpi.c

index 3d6d5a08945416fd5df3234d44faf07dae21de13..3af90cbe10269b7030349b41bbe1d175c85d3e8d 100644 (file)
--- a/src/gromacs/mdlib/tpi.c
+++ b/src/gromacs/mdlib/tpi.c
@@ -78,11 +78,6 @@
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
  
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include "gromacs/simd/general_x86_sse2.h"
-#endif
-
-
  static void global_max(t_commrec *cr, int *n)
  {
      int *sum, i;
@@ -466,9 +461,9 @@ double do_tpi(FILE *fplog, t_commrec *cr,
              gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
      }
  
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-    /* Make sure we don't detect SSE overflow generated before this point */
-    gmx_mm_check_and_reset_overflow();
+#ifdef GMX_SIMD
+    /* Make sure we don't detect SIMD overflow generated before this point */
+    gmx_simd_check_and_reset_overflow();
  #endif
  
      while (bNotLastFrame)
diff --git a/src/gromacs/simd/CMakeLists.txt b/src/gromacs/simd/CMakeLists.txt

new file mode 100644 (file)

index 0000000..7c532ca
--- /dev/null
+++ b/src/gromacs/simd/CMakeLists.txt
@@ -0,0 +1,37 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(BUILD_TESTING)
+    add_subdirectory(tests)
+endif()
diff --git a/src/gromacs/simd/four_wide_macros.h b/src/gromacs/simd/four_wide_macros.h

deleted file mode 100644 (file)

index 8ed1d34..0000000
--- a/src/gromacs/simd/four_wide_macros.h
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code with a SIMD width of 4.
- * To support a new architecture, adding macros here should be all
- * that is needed.
- *
- * Note that this file is intended only for SIMD operations that require
- * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
- * support, more functionality and higher performance, but the SIMD width is
- * not necessarily equal to 4.
- */
-
-#ifdef GMX_SIMD_FOUR_WIDE_MACROS_H
-#error "four_wide_macros.h included twice"
-#else
-#define GMX_SIMD_FOUR_WIDE_MACROS_H
-
-
-/* The SIMD width here is always 4, since that is the whole point */
-#define GMX_SIMD4_WIDTH  4
-
-
-#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
-/* Precision set before inclusion, honour that request */
-#else
-/* Match precision to the Gromacs real precision */
-#ifdef GMX_DOUBLE
-#define GMX_SIMD4_DOUBLE
-#else
-#define GMX_SIMD4_SINGLE
-#endif
-#endif
-
-#ifdef GMX_SIMD4_DOUBLE
-typedef double  gmx_simd4_real;
-#endif
-#ifdef GMX_SIMD4_SINGLE
-typedef float   gmx_simd4_real;
-#endif
-
-/* Uncomment the next line, without other SIMD active, for testing plain-C */
-/* #define GMX_SIMD4_REFERENCE */
-#ifdef GMX_SIMD4_REFERENCE
-/* Plain C SIMD reference implementation, also serves as documentation */
-#define GMX_HAVE_SIMD4_MACROS
-
-/* Include plain-C reference implementation, also serves as documentation */
-#include "four_wide_macros_ref.h"
-
-/* float/double SIMD register type */
-#define gmx_simd4_real_t  gmx_simd4_ref_pr
-
-/* boolean SIMD register type */
-#define gmx_simd4_bool_t  gmx_simd4_ref_pb
-
-#define gmx_simd4_load_r       gmx_simd4_ref_load_pr
-#define gmx_simd4_load_bb_pr    gmx_simd4_ref_load_pr
-#define gmx_simd4_set1_r       gmx_simd4_ref_set1_pr
-#define gmx_simd4_setzero_r    gmx_simd4_ref_setzero_pr
-#define gmx_simd4_store_r      gmx_simd4_ref_store_pr
-
-/* Unaligned load+store are not required,
- * but they can speed up the PME spread+gather operations.
- */
-#define GMX_SIMD4_HAVE_UNALIGNED
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r      gmx_simd4_ref_load_pr
-#define gmx_simd4_storeu_r     gmx_simd4_ref_store_pr
-#endif
-
-#define gmx_simd4_add_r        gmx_simd4_ref_add_pr
-#define gmx_simd4_sub_r        gmx_simd4_ref_sub_pr
-#define gmx_simd4_mul_r        gmx_simd4_ref_mul_pr
-/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_simd4_fmadd_r       gmx_simd4_ref_madd_pr
-#define gmx_simd4_fnmadd_r      gmx_simd4_ref_nmsub_pr
-
-#define gmx_simd4_dotproduct3_r   gmx_simd4_ref_dotproduct3
-
-#define gmx_simd4_min_r        gmx_simd4_ref_min_pr
-#define gmx_simd4_max_r        gmx_simd4_ref_max_pr
-
-#define gmx_simd4_blendzero_r  gmx_simd4_ref_blendzero_pr
-
-/* Comparison */
-#define gmx_simd4_cmplt_r      gmx_simd4_ref_cmplt_pr
-
-/* Logical operations on SIMD booleans */
-#define gmx_simd4_and_b        gmx_simd4_ref_and_pb
-#define gmx_simd4_or_b         gmx_simd4_ref_or_pb
-
-/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_simd4_anytrue_b    gmx_simd4_ref_anytrue_pb
-
-#endif /* GMX_SIMD4_REFERENCE */
-
-
-/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
- * to instructions for) different SIMD width and float precision.
- *
- * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
- * The _pr suffix is replaced by _ps or _pd (for single or double precision).
- * Compiler settings will decide if 128-bit intrinsics will
- * be translated into SSE or AVX instructions.
- */
-
-
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-/* This is for general x86 SIMD instruction sets that also support SSE2 */
-
-#ifdef GMX_SIMD4_SINGLE
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-
-#ifdef GMX_SIMD4_DOUBLE
-/* Note that here we will use 256-bit SIMD with GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER.
- * This is inconsistent naming wise, but should give the best performance.
- */
-#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-#endif
-
-#ifdef GMX_HAVE_SIMD4_MACROS
-
-#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#else
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#include <smmintrin.h>
-#else
-/* We only have SSE2 */
-#include <emmintrin.h>
-#endif
-#endif
-
-#ifdef GMX_SIMD4_SINGLE
-
-#define gmx_simd4_real_t  __m128
-
-#define gmx_simd4_bool_t  __m128
-
-#define gmx_simd4_load_r       _mm_load_ps
-#define gmx_simd4_load_bb_pr    _mm_load_ps
-#define gmx_simd4_set1_r       _mm_set1_ps
-#define gmx_simd4_setzero_r    _mm_setzero_ps
-#define gmx_simd4_store_r      _mm_store_ps
-
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define GMX_SIMD4_HAVE_UNALIGNED
-#endif
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r      _mm_loadu_ps
-#define gmx_simd4_storeu_r     _mm_storeu_ps
-#endif
-
-#define gmx_simd4_add_r        _mm_add_ps
-#define gmx_simd4_sub_r        _mm_sub_ps
-#define gmx_simd4_mul_r        _mm_mul_ps
-
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define gmx_simd4_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
-#define gmx_simd4_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
-#else
-#define gmx_simd4_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_simd4_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
-#endif
-
-static inline float gmx_simd4_dotproduct3_r(__m128 a, __m128 b)
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-{
-    float dp;
-
-    /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
-    _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));
-
-    return dp;
-}
-#else
-{
-    float        dp_array[7], *dp;
-
-    /* Generate an aligned pointer */
-    dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));
-
-    _mm_store_ps(dp, _mm_mul_ps(a, b));
-
-    return dp[0] + dp[1] + dp[2];
-}
-#endif
-
-#define gmx_simd4_min_r        _mm_min_ps
-#define gmx_simd4_max_r        _mm_max_ps
-
-#define gmx_simd4_blendzero_r  _mm_and_ps
-
-#define gmx_simd4_cmplt_r      _mm_cmplt_ps
-#define gmx_simd4_and_b        _mm_and_ps
-#define gmx_simd4_or_b         _mm_or_ps
-
-#define gmx_simd4_anytrue_b    _mm_movemask_ps
-
-#endif /* GMX_SIMD4_SINGLE */
-
-
-#ifdef GMX_SIMD4_DOUBLE
-
-#define gmx_simd4_real_t  __m256d
-
-#define gmx_simd4_bool_t  __m256d
-
-#define gmx_simd4_load_r       _mm256_load_pd
-#define gmx_simd4_load_bb_pr    _mm256_load_pd
-#define gmx_simd4_set1_r       _mm256_set1_pd
-#define gmx_simd4_setzero_r    _mm256_setzero_pd
-#define gmx_simd4_store_r      _mm256_store_pd
-
-#define GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r      _mm256_loadu_pd
-#define gmx_simd4_storeu_r     _mm256_storeu_pd
-
-#define gmx_simd4_add_r        _mm256_add_pd
-#define gmx_simd4_sub_r        _mm256_sub_pd
-#define gmx_simd4_mul_r        _mm256_mul_pd
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define gmx_simd4_fmadd_r(a, b, c)   _mm256_macc_pd(a, b, c)
-#define gmx_simd4_fnmadd_r(a, b, c)  _mm256_nmacc_pd(a, b, c)
-#else
-#define gmx_simd4_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd4_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
-#endif
-#define gmx_simd4_min_r        _mm256_min_pd
-#define gmx_simd4_max_r        _mm256_max_pd
-
-#define gmx_simd4_blendzero_r  _mm256_and_pd
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd4_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
-#define gmx_simd4_and_b        _mm256_and_pd
-#define gmx_simd4_or_b         _mm256_or_pd
-
-#define gmx_simd4_anytrue_b    _mm256_movemask_pd
-
-#endif /* GMX_SIMD4_DOUBLE */
-
-
-#endif /* GMX_HAVE_SIMD4_MACROS */
-
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-
-#ifdef GMX_SIMD_IBM_QPX
-/* i.e. BlueGene/Q */
-
-/* This hack works on the compilers that can reach this code. A real
-   solution with broader scope will be proposed in master branch. */
-#define gmx_always_inline __attribute__((always_inline))
-
-#ifdef GMX_SIMD4_SINGLE
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-
-typedef vector4double gmx_simd4_real_t;
-typedef vector4double gmx_simd4_bool_t;
-
-/* The declarations of vec_ld* use non-const pointers, and IBM
-   can't/won't fix this any time soon. So GROMACS has to cast away the
-   const-ness of its pointers before loads. Four-wide SIMD loads
-   sometimes occur from variables of type real, and sometimes from
-   variables of type float (even at double precison), so the correct
-   cast cannot be done easily. The correct cast is necessary because
-   the resulting type determines the alignment assumption of vec_ld*,
-   which is different for float and double. So the loads of
-   always-float variables have to be done with a function that does
-   the correct cast. Since functions cannot be overloaded by type in
-   C, they have to have different names. Thus we have
-   gmx_simd4_load_r and gmx_simd4_load_bb_pr.
- */
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_r(const real *a)
-{
-#ifdef NDEBUG
-    return vec_ld(0, (real *) a);
-#else
-    return vec_lda(0, (real *) a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
-{
-#ifdef NDEBUG
-    return vec_ld(0, (float *) a);
-#else
-    return vec_lda(0, (float *) a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_set1_r(const real a)
-{
-    return vec_splats(a);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_setzero_r()
-{
-    return vec_splats(0.0);
-}
-
-/* TODO this will not yet work, because the function might be passed a
-   pointer to a float when running in double precision.
- */
-static gmx_inline void gmx_always_inline gmx_simd4_store_r(real *a, gmx_simd4_real_t b)
-{
-#ifdef NDEBUG
-    vec_st(b, 0, a);
-#else
-    vec_sta(b, 0, a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_add(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_sub_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_sub(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_mul_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_mul(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
-{
-    return vec_madd(a, b, c);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fnmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
-{
-    return vec_nmsub(a, b, c);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_min_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    /* Implemented the same way as max, but with the subtraction
-       operands swapped. */
-    return vec_sel(b, a, vec_sub(b, a));
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_max_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_sel(b, a, vec_sub(a, b));
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_blendzero_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_sel(gmx_simd_setzero_r(), a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_cmplt_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    return vec_cmplt(a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_and_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
-{
-    return vec_and(a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_or_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
-{
-    return vec_or(a, b);
-}
-
-static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
-    /* The dot product is done solely on the QPX AXU (which is the
-       only available FPU). This is awkward, because pretty much no
-       "horizontal" SIMD-vector operations exist, unlike x86 where
-       SSE4.1 added various kinds of horizontal operations. So we have
-       to make do with shifting vector elements and operating on the
-       results. This makes for lots of data dependency, but the main
-       alternative of storing to memory and reloading is not going to
-       help, either. OpenMP over 2 or 4 hardware threads per core will
-       hide much of the latency from the data dependency. The
-       vec_extract() lets the compiler correctly use a floating-point
-       comparison on the zeroth vector element, which avoids needing
-       memory at all.
-     */
-
-    gmx_simd4_real_t dp_shifted_left_0 = vec_mul(a, b);
-    gmx_simd4_real_t dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
-    gmx_simd4_real_t dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
-    gmx_simd4_real_t dp                = vec_add(dp_shifted_left_2,
-                                                 vec_add(dp_shifted_left_0, dp_shifted_left_1));
-
-    /* See comment in nbnxn_make_pairlist_part() about how this should
-       be able to return a double on PowerPC. */
-    return (float) vec_extract(dp, 0);
-}
-
-static gmx_inline int gmx_always_inline gmx_simd4_anytrue_b(gmx_simd4_bool_t a)
-{
-    return gmx_simd_anytrue_b(a);
-}
-
-#undef gmx_always_inline
-
-#endif /* GMX_SIMD_IBM_QPX */
-
-#ifdef GMX_HAVE_SIMD4_MACROS
-/* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
- * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
- * to how many you want to use, to avoid indexing outside the aligned region.
- */
-
-static gmx_inline gmx_simd4_real *
-gmx_simd4_align_real(const gmx_simd4_real *x)
-{
-    return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
-}
-#endif
-
-
-#endif
diff --git a/src/gromacs/simd/four_wide_macros_ref.h b/src/gromacs/simd/four_wide_macros_ref.h

deleted file mode 100644 (file)

index 8b47d64..0000000
--- a/src/gromacs/simd/four_wide_macros_ref.h
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_FOUR_WIDE_MACROS_REF_H
-#define GMX_SIMD_FOUR_WIDE_MACROS_REF_H
-
-/* This file contains a reference plain-C implementation of 4-wide SIMD.
- * This code is only useful for testing and documentation.
- * Either float or double precision is supported through gmx_simd4_real,
- * which is set in gmx_simd4_macros.h
- */
-
-
-#include <math.h>
-
-/* float/double SIMD register type */
-typedef struct {
-    gmx_simd4_real r[GMX_SIMD4_WIDTH];
-} gmx_simd4_ref_pr;
-
-/* boolean SIMD register type */
-typedef struct {
-    char r[GMX_SIMD4_WIDTH];
-} gmx_simd4_ref_pb;
-
-
-/* Load GMX_SIMD4_WIDTH reals for memory starting at r */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_load_pr(const gmx_simd4_real *r)
-{
-    gmx_simd4_ref_pr a;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        a.r[i] = r[i];
-    }
-
-    return a;
-}
-
-/* Set all SIMD register elements to r */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_set1_pr(gmx_simd4_real r)
-{
-    gmx_simd4_ref_pr a;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        a.r[i] = r;
-    }
-
-    return a;
-}
-
-/* Set all SIMD register elements to 0 */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_setzero_pr()
-{
-    gmx_simd4_ref_pr a;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        a.r[i] = 0.0;
-    }
-
-    return a;
-}
-
-static gmx_inline void
-gmx_simd4_ref_store_pr(gmx_simd4_real *dest, gmx_simd4_ref_pr src)
-{
-    int i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        dest[i] = src.r[i];
-    }
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_add_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = a.r[i] + b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_sub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = a.r[i] - b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_mul_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = a.r[i]*b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_madd_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
-{
-    gmx_simd4_ref_pr d;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        d.r[i] = a.r[i]*b.r[i] + c.r[i];
-    }
-
-    return d;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_nmsub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
-{
-    gmx_simd4_ref_pr d;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        d.r[i] = -a.r[i]*b.r[i] + c.r[i];
-    }
-
-    return d;
-}
-
-static gmx_inline gmx_simd4_real
-gmx_simd4_ref_dotproduct3(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_real dp;
-    int            i;
-
-    dp = 0.0;
-    for (i = 0; i < 3; i++)
-    {
-        dp += a.r[i]*b.r[i];
-    }
-
-    return dp;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_min_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_max_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_blendzero_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pb b)
-{
-    gmx_simd4_ref_pr c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (b.r[i] ? a.r[i] : 0.0);
-    }
-
-    return c;
-}
-
-/* Comparison */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_cmplt_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
-    gmx_simd4_ref_pb c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] < b.r[i]);
-    }
-
-    return c;
-}
-
-/* Logical AND on SIMD booleans */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_and_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
-{
-    gmx_simd4_ref_pb c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] && b.r[i]);
-    }
-
-    return c;
-}
-
-/* Logical OR on SIMD booleans */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
-{
-    gmx_simd4_ref_pb c;
-    int              i;
-
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] || b.r[i]);
-    }
-
-    return c;
-}
-
-/* gmx_simd_anytrue_b(x) returns if any of the boolean is x is True */
-static gmx_inline int
-gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a)
-{
-    int anytrue;
-    int i;
-
-    anytrue = 0;
-    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
-    {
-        if (a.r[i])
-        {
-            anytrue = 1;
-        }
-    }
-
-    return anytrue;
-}
-
-#endif
diff --git a/src/gromacs/simd/general_x86_avx_128_fma.h b/src/gromacs/simd/general_x86_avx_128_fma.h

deleted file mode 100644 (file)

index 19ec986..0000000
--- a/src/gromacs/simd/general_x86_avx_128_fma.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_AVX_128_FMA_H
-#define GMX_SIMD_GENERAL_AVX_128_FMA_H
-
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define _GMX_MM_BLEND(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-
-#define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
-
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
-        __m128d __gmx_t1 = row0;                         \
-        row0           = _mm_unpacklo_pd(row0, row1);     \
-        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-#  define gmx_mm_castps_ps128(a) (a)
-#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
-#  define gmx_mm_castps_si128(a) ((__m128i)(a))
-#  define gmx_mm_castps_ps128(a) ((__m128)(a))
-#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128  gmx_mm_castsi128_ps(__m128i a)
-{
-    return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
-    return *(__m128i *) &a;
-}
-static __m128  gmx_mm_castps_ps128(__m128 a)
-{
-    return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
-    return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
-    return *(__m128i *) &a;
-}
-#endif
-
-#if GMX_EMULATE_AMD_FMA
-/* Wrapper routines so we can do test builds on non-FMA or non-AMD hardware */
-static __m128
-_mm_macc_ps(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_add_ps(c, _mm_mul_ps(a, b));
-}
-
-static __m128
-_mm_nmacc_ps(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_sub_ps(c, _mm_mul_ps(a, b));
-}
-
-static __m128
-_mm_msub_ps(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_sub_ps(_mm_mul_ps(a, b), c);
-}
-
-static __m128d
-_mm_macc_pd(__m128d a, __m128d b, __m128d c)
-{
-    return _mm_add_pd(c, _mm_mul_pd(a, b));
-}
-
-static __m128d
-_mm_nmacc_pd(__m128d a, __m128d b, __m128d c)
-{
-    return _mm_sub_pd(c, _mm_mul_pd(a, b));
-}
-
-static __m128d
-_mm_msub_pd(__m128d a, __m128d b, __m128d c)
-{
-    return _mm_sub_pd(_mm_mul_pd(a, b), c);
-}
-#endif /* AMD FMA emulation support */
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
-    int i[4];
-
-    _mm_storeu_si128((__m128i *)i, xmmi);
-    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
-    int MXCSR;
-    int sse_overflow;
-
-    MXCSR = _mm_getcsr();
-    /* The overflow flag is bit 3 in the register */
-    if (MXCSR & 0x0008)
-    {
-        sse_overflow = 1;
-        /* Set the overflow flag to zero */
-        MXCSR = MXCSR & 0xFFF7;
-        _mm_setcsr(MXCSR);
-    }
-    else
-    {
-        sse_overflow = 0;
-    }
-
-    return sse_overflow;
-}
-
-/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
-#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
-#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
-#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
-#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
-#else
-#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), (mask))
-#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
-#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), (mask))
-#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
-#endif
-
-
-
-#endif
diff --git a/src/gromacs/simd/general_x86_avx_256.h b/src/gromacs/simd/general_x86_avx_256.h

deleted file mode 100644 (file)

index b7b1c23..0000000
--- a/src/gromacs/simd/general_x86_avx_256.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_AVX_256_H
-#define GMX_SIMD_GENERAL_AVX_256_H
-
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
-#define _GMX_MM_PERMUTE128D(fp1, fp0)         (((fp1) << 1) | ((fp0)))
-
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
-        __m128d __gmx_t1 = row0;                         \
-        row0           = _mm_unpacklo_pd(row0, row1);     \
-        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
-    {                                                        \
-        __m256d _t0, _t1, _t2, _t3;                          \
-        _t0  = _mm256_unpacklo_pd((row0), (row1));           \
-        _t1  = _mm256_unpackhi_pd((row0), (row1));           \
-        _t2  = _mm256_unpacklo_pd((row2), (row3));           \
-        _t3  = _mm256_unpackhi_pd((row2), (row3));           \
-        row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20);       \
-        row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20);       \
-        row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31);       \
-        row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31);       \
-    }
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-#  define gmx_mm_castps_ps128(a) (a)
-#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
-#  define gmx_mm_castps_si128(a) ((__m128i)(a))
-#  define gmx_mm_castps_ps128(a) ((__m128)(a))
-#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128  gmx_mm_castsi128_ps(__m128i a)
-{
-    return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
-    return *(__m128i *) &a;
-}
-static __m128  gmx_mm_castps_ps128(__m128 a)
-{
-    return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
-    return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
-    return *(__m128i *) &a;
-}
-#endif
-
-static gmx_inline __m256
-gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
-{
-    return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
-}
-
-static gmx_inline __m256
-gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
-{
-    return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
-}
-
-static gmx_inline __m256
-gmx_mm256_set_m128(__m128 hi, __m128 lo)
-{
-    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
-}
-
-
-static gmx_inline __m256
-gmx_mm256_load4_ps(float const * p)
-{
-    __m128 a;
-
-    a = _mm_load_ps(p);
-    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
-}
-
-
-static __m256d
-gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
-{
-    return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
-}
-
-static __m256d
-gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
-{
-    return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
-}
-
-static __m256d
-gmx_mm256_set_m128d(__m128d hi, __m128d lo)
-{
-    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
-}
-
-
-static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
-{
-    __m256 sum;
-
-    sum = _mm256_add_ps(x, y);
-    return _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 0x1));
-}
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
-    int i[4];
-
-    _mm_storeu_si128((__m128i *)i, xmmi);
-    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-static void
-gmx_mm256_printymm_ps(const char *s, __m256 ymm)
-{
-    float f[8];
-
-    _mm256_storeu_ps(f, ymm);
-    printf("%s: %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f\n", s, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
-}
-
-static void
-gmx_mm256_printymmsum_ps(const char *s, __m256 ymm)
-{
-    float f[8];
-
-    _mm256_storeu_ps(f, ymm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]+f[4]+f[5]+f[6]+f[7]);
-}
-
-
-static void
-gmx_mm256_printymm_pd(const char *s, __m256d ymm)
-{
-    double f[4];
-
-    _mm256_storeu_pd(f, ymm);
-    printf("%s: %16.12f %16.12f %16.12f %16.12f\n", s, f[0], f[1], f[2], f[3]);
-}
-
-static void
-gmx_mm256_printymmsum_pd(const char *s, __m256d ymm)
-{
-    double f[4];
-
-    _mm256_storeu_pd(f, ymm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-
-static void
-gmx_mm256_printymm_epi32(const char *s, __m256i ymmi)
-{
-    int i[8];
-
-    _mm256_storeu_si256((__m256i *)i, ymmi);
-    printf("%10s: %2d %2d %2d %2d %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
-    int MXCSR;
-    int sse_overflow;
-
-    MXCSR = _mm_getcsr();
-    /* The overflow flag is bit 3 in the register */
-    if (MXCSR & 0x0008)
-    {
-        sse_overflow = 1;
-        /* Set the overflow flag to zero */
-        MXCSR = MXCSR & 0xFFF7;
-        _mm_setcsr(MXCSR);
-    }
-    else
-    {
-        sse_overflow = 0;
-    }
-
-    return sse_overflow;
-}
-
-/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
-#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
-#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
-#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
-#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
-#else
-#    define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), (mask))
-#    define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
-#    define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), (mask))
-#    define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
-#endif
-
-
-#endif
diff --git a/src/gromacs/simd/general_x86_mic.h b/src/gromacs/simd/general_x86_mic.h

deleted file mode 100644 (file)

index 9f4c191..0000000
--- a/src/gromacs/simd/general_x86_mic.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef _general_x86_mic_h_
-#define _general_x86_mic_h_
-
-/* This file contains the SIMD implmenetation for Intel MIC
- */
-
-#include <math.h>
-#include <immintrin.h>
-
-#ifdef GMX_DOUBLE
-#error "Double precision isn't supported on Intel Phi yet"
-#endif
-
-typedef __m512 gmx_mm_ps;
-typedef __m512 gmx_simd_real_t;
-/* boolean SIMD register type */
-typedef __mmask16 gmx_simd_bool_t;
-typedef __m512i gmx_simd_int32_t;
-
-#define GMX_HAVE_SIMD_MACROS
-#define GMX_SIMD_REAL_WIDTH  16
-#define GMX_SIMD_INT32_WIDTH 16
-
-#define gmx_simd_load_r _mm512_load_ps
-
-/* Set all SIMD register elements to *r */
-static gmx_inline gmx_mm_ps
-gmx_simd_load1_r(const real *r)
-{
-    return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
-}
-
-#define gmx_simd_set1_r _mm512_set1_ps
-/* Set all SIMD register elements to 0 */
-#define gmx_simd_setzero_r _mm512_setzero_ps
-#define gmx_simd_store_r _mm512_store_ps
-
-#define gmx_simd_add_r _mm512_add_ps
-#define gmx_simd_sub_r _mm512_sub_ps
-#define gmx_simd_mul_r _mm512_mul_ps
-
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r _mm512_fmadd_ps
-#define gmx_simd_fnmadd_r _mm512_fnmadd_ps
-
-#define gmx_simd_max_r _mm512_max_ps
-
-static gmx_inline gmx_mm_ps
-gmx_simd_blendzero_r(gmx_mm_ps a, gmx_simd_bool_t b)
-{
-    return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a);
-}
-
-#define gmx_simd_round_r _mm512_rint_ps
-
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm512_floor_ps
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-static gmx_inline gmx_mm_ps
-gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b)
-{
-    __m512 zero = _mm512_setzero_ps();
-    __m512 neg1 = _mm512_set1_ps(-1);
-    /* TODO (only bond): Bitwise operations on floating points can be done after casting to int.
-       That allows us to do it the same way as AVX which might be faster. */
-    return _mm512_mask_mul_ps(b, _mm512_cmplt_ps_mask(a, zero), b, neg1);
-}
-
-/* Very specific operation required in the non-bonded kernels */
-static gmx_inline gmx_mm_ps
-gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_mm_ps b, gmx_mm_ps c)
-{
-    return _mm512_mask_add_ps(b, _mm512_knot(a), b, c);
-}
-
-/* Comparison */
-#define gmx_simd_cmplt_r _mm512_cmplt_ps_mask
-
-/* Logical AND on SIMD booleans. */
-#define gmx_simd_and_b _mm512_kand
-
-/* Logical OR on SIMD booleans. */
-#define gmx_simd_or_b _mm512_kor
-
-/* Returns a single int (0/1) which tells if any of the booleans is True
-   It returns the full mask (not 1 for True). But given that any non-zero is True this is OK. */
-#define gmx_simd_anytrue_b _mm512_mask2int
-
-/* Conversions only used for PME table lookup */
-static gmx_inline gmx_simd_int32_t
-gmx_simd_cvtt_r2i(gmx_mm_ps a)
-{
-    return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE);
-};
-
-/* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-#define gmx_simd_rsqrt_r _mm512_rsqrt23_ps
-#define gmx_simd_rcp_r _mm512_rcp23_ps
-
-#define GMX_SIMD_HAVE_EXP
-#define gmx_simd_exp_r _mm512_exp_ps
-
-#define GMX_SIMD_HAVE_ERFC
-#define gmx_simd_erfc_r _mm512_erfc_ps
-
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_simd_sqrt_r  _mm512_sqrt_ps
-
-static gmx_inline int
-gmx_simd_sincos_r(gmx_mm_ps a,
-                  gmx_mm_ps *s, gmx_mm_ps *c)
-{
-    /* TODO (only bond): optimize that both are calculated together.
-       Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */
-    *s = _mm512_sin_ps(a);
-    *c = _mm512_cos_ps(a);
-    return 0;
-}
-
-#define gmx_simd_acos_r _mm512_acos_ps
-#define gmx_simd_atan2_r _mm512_atan2_ps
-
-#endif /* _general_x86_mic_h_ */
diff --git a/src/gromacs/simd/general_x86_sse2.h b/src/gromacs/simd/general_x86_sse2.h

deleted file mode 100644 (file)

index 8aa7085..0000000
--- a/src/gromacs/simd/general_x86_sse2.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_SSE2_H
-#define GMX_SIMD_GENERAL_SSE2_H
-
-#include <emmintrin.h>
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-
-/* Create some basic definitions that are not 100% SSE2 standard and thus not
- * available on all compilers. These should be fairly self-evident by comparing
- * with an arbitrary emmintrin.h.
- */
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
-        __m128d __gmx_t1 = row0;                         \
-        row0           = _mm_unpacklo_pd(row0, row1);     \
-        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-#  define gmx_mm_castps_ps128(a) (a)
-#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
-#  define gmx_mm_castps_si128(a) ((__m128i)(a))
-#  define gmx_mm_castps_ps128(a) ((__m128)(a))
-#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128  gmx_mm_castsi128_ps(__m128i a)
-{
-    return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
-    return *(__m128i *) &a;
-}
-static __m128  gmx_mm_castps_ps128(__m128 a)
-{
-    return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
-    return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
-    return *(__m128i *) &a;
-}
-#endif
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
-    int i[4];
-
-    _mm_storeu_si128((__m128i *)i, xmmi);
-    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
-    int MXCSR;
-    int sse_overflow;
-
-    MXCSR = _mm_getcsr();
-    /* The overflow flag is bit 3 in the register */
-    if (MXCSR & 0x0008)
-    {
-        sse_overflow = 1;
-        /* Set the overflow flag to zero */
-        MXCSR = MXCSR & 0xFFF7;
-        _mm_setcsr(MXCSR);
-    }
-    else
-    {
-        sse_overflow = 0;
-    }
-
-    return sse_overflow;
-}
-
-
-#endif
diff --git a/src/gromacs/simd/general_x86_sse4_1.h b/src/gromacs/simd/general_x86_sse4_1.h

deleted file mode 100644 (file)

index 43b83ef..0000000
--- a/src/gromacs/simd/general_x86_sse4_1.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_SSE4_1_H
-#define GMX_SIMD_GENERAL_SSE4_1_H
-
-#include <smmintrin.h>
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-/* Create some basic definitions that are not 100% SSE2 standard and thus not
- * available on all compilers. These should be fairly self-evident by comparing
- * with an arbitrary emmintrin.h.
- */
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
-        __m128d __gmx_t1 = row0;                         \
-        row0           = _mm_unpacklo_pd(row0, row1);     \
-        row1           = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-#define _GMX_MM_BLEND(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-#  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-#  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-#  define gmx_mm_castps_ps128(a) (a)
-#  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-#  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-#  define gmx_mm_castsi128_ps(a) ((__m128)(a))
-#  define gmx_mm_castps_si128(a) ((__m128i)(a))
-#  define gmx_mm_castps_ps128(a) ((__m128)(a))
-#  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-#  define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128  gmx_mm_castsi128_ps(__m128i a)
-{
-    return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
-    return *(__m128i *) &a;
-}
-static __m128  gmx_mm_castps_ps128(__m128 a)
-{
-    return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
-    return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
-    return *(__m128i *) &a;
-}
-#endif
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
-    float f[4];
-
-    _mm_storeu_ps(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
-    double f[2];
-
-    _mm_storeu_pd(f, xmm);
-    printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
-    int i[4];
-
-    _mm_storeu_si128((__m128i *)i, xmmi);
-    printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
-    int MXCSR;
-    int sse_overflow;
-
-    MXCSR = _mm_getcsr();
-    /* The overflow flag is bit 3 in the register */
-    if (MXCSR & 0x0008)
-    {
-        sse_overflow = 1;
-        /* Set the overflow flag to zero */
-        MXCSR = MXCSR & 0xFFF7;
-        _mm_setcsr(MXCSR);
-    }
-    else
-    {
-        sse_overflow = 0;
-    }
-
-    return sse_overflow;
-}
-
-
-#endif
diff --git a/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h b/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h

new file mode 100644 (file)

index 0000000..ec2e858
--- /dev/null
+++ b/src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h
@@ -0,0 +1,478 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPLEMENTATION_IBM_QPX_H
+#define GMX_SIMD_IMPLEMENTATION_IBM_QPX_H
+
+#include <math.h>
+#ifdef __clang__
+#include <qpxmath.h>
+#endif
+
+/* IBM QPX SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+/* Capability definitions for IBM QPX */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#undef  GMX_SIMD_HAVE_LOADU   /* no unaligned load support in this implementation */
+#undef  GMX_SIMD_HAVE_STOREU  /* no unaligned store support in this implementation */
+#undef  GMX_SIMD_HAVE_LOGICAL
+#define GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#undef  GMX_SIMD_HAVE_FINT32_EXTRACT
+#undef  GMX_SIMD_HAVE_FINT32_LOGICAL
+#undef  GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
+#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
+#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH         4
+#define GMX_SIMD_DOUBLE_WIDTH        4
+#define GMX_SIMD_FINT32_WIDTH        4
+#define GMX_SIMD_DINT32_WIDTH        4
+#define GMX_SIMD_RSQRT_BITS         14
+#define GMX_SIMD_RCP_BITS           14
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t          vector4double
+#ifdef NDEBUG
+#    define gmx_simd_load_f(m)    vec_ld(0, (float *)(m))
+#    define gmx_simd_store_f(m, a) vec_st(a, 0, (float *)(m))
+#else
+#    define gmx_simd_load_f(m)    vec_lda(0, (float *)(m))
+#    define gmx_simd_store_f(m, a) vec_sta(a, 0, (float *)(m))
+#endif
+#    define gmx_simd_load1_f(m)   vec_lds(0, (float *)(m))
+#define gmx_simd_set1_f(x)        vec_splats(x)
+/* No support for unaligned load/store */
+#define gmx_simd_setzero_f        gmx_simd_setzero_ibm_qpx
+#define gmx_simd_add_f(a, b)       vec_add(a, b)
+#define gmx_simd_sub_f(a, b)       vec_sub(a, b)
+#define gmx_simd_mul_f(a, b)       vec_mul(a, b)
+#define gmx_simd_fmadd_f(a, b, c)   vec_madd(a, b, c)
+#define gmx_simd_fmsub_f(a, b, c)   vec_msub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_f(a, b, c)  vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_f(a, b, c)  vec_nmadd(a, b, c)
+/* gmx_simd_and_f not supported - no bitwise logical ops */
+/* gmx_simd_andnot_f not supported - no bitwise logical ops */
+/* gmx_simd_or_f not supported - no bitwise logical ops */
+/* gmx_simd_xor_f not supported - no bitwise logical ops */
+#define gmx_simd_rsqrt_f(a)       vec_rsqrte(a)
+#define gmx_simd_rcp_f(a)         vec_re(a)
+#define gmx_simd_fabs_f(a)        vec_abs(a)
+#define gmx_simd_fneg_f           gmx_simd_fneg_ibm_qpx
+#define gmx_simd_max_f(a, b)       vec_sel(b, a, vec_sub(a, b))
+#define gmx_simd_min_f(a, b)       vec_sel(b, a, vec_sub(b, a))
+/* Note: It is critical to use vec_cfid(vec_ctid(a)) for the implementation
+ * of gmx_simd_round_f(), since vec_round() does not adhere to the FP control
+ * word rounding scheme. We rely on float-to-float and float-to-integer
+ * rounding being the same for half-way values in a few algorithms.
+ */
+#define gmx_simd_round_f(a)       vec_cfid(vec_ctid(a))
+#define gmx_simd_trunc_f(a)       vec_trunc(a)
+#define gmx_simd_fraction_f(x)    vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_f(a) gmx_simd_get_exponent_ibm_qpx(a)
+#define gmx_simd_get_mantissa_f(a) gmx_simd_get_mantissa_ibm_qpx(a)
+#define gmx_simd_set_exponent_f(a) gmx_simd_set_exponent_ibm_qpx(a)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t         vector4double
+#ifdef NDEBUG
+#    define gmx_simd_load_fi(m)   vec_ldia(0, (int *)(m))
+#else
+#    define gmx_simd_load_fi(m)   vec_ldiaa(0, (int *)(m))
+#endif
+#define gmx_simd_set1_fi(i)       gmx_simd_set1_int_ibm_qpx(i)
+#define gmx_simd_store_fi(m, x)    vec_st(x, 0, (int *)(m))
+#define gmx_simd_setzero_fi       gmx_simd_setzero_ibm_qpx
+#define gmx_simd_cvt_f2i(a)       vec_ctiw(a)
+#define gmx_simd_cvtt_f2i(a)      vec_ctiwz(a)
+#define gmx_simd_cvt_i2f(a)       vec_cfid(a)
+/* Integer simd extract not available */
+/* Integer logical ops on gmx_simd_fint32_t not supported */
+/* Integer arithmetic ops on gmx_simd_fint32_t not supported */
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t          vector4double
+#define gmx_simd_cmpeq_f(a, b)     vec_cmpeq(a, b)
+#define gmx_simd_cmplt_f(a, b)     vec_cmplt((a), (b))
+#define gmx_simd_cmple_f(a, b)     gmx_simd_or_fb(vec_cmpeq(a, b), vec_cmplt(a, b))
+#define gmx_simd_and_fb(a, b)      vec_and(a, b)
+#define gmx_simd_or_fb(a, b)       vec_or(a, b)
+#define gmx_simd_anytrue_fb(a)    gmx_simd_anytrue_bool_ibm_qpx(a)
+#define gmx_simd_blendzero_f(a, sel) vec_sel(vec_splats(0.0), a, sel)
+#define gmx_simd_blendnotzero_f(a, sel) vec_sel(a, vec_splats(0.0), sel)
+#define gmx_simd_blendv_f(a, b, sel)  vec_sel(a, b, sel)
+#define gmx_simd_reduce_f(a)       gmx_simd_reduce_ibm_qpx(a)
+
+
+/* Boolean & comparison operations on gmx_simd_fint32_t not supported */
+/* Conversions between different booleans not supported */
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_fneg_ibm_qpx(vector4double a)
+{
+    return vec_neg(a);
+}
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_double_t         vector4double
+#ifdef NDEBUG
+#    define gmx_simd_load_d(m)    vec_ld(0, (double *)(m))
+#    define gmx_simd_store_d(m, a) vec_st(a, 0, (double *)(m))
+#else
+#    define gmx_simd_load_d(m)    vec_lda(0, (double *)(m))
+#    define gmx_simd_store_d(m, a) vec_sta(a, 0, (double *)(m))
+#endif
+#    define gmx_simd_load1_d(m)   vec_lds(0, (double *)(m))
+#define gmx_simd_set1_d(x)        vec_splats(x)
+/* No support for unaligned load/store */
+#define gmx_simd_setzero_d        gmx_simd_setzero_ibm_qpx
+#define gmx_simd_add_d(a, b)       vec_add(a, b)
+#define gmx_simd_sub_d(a, b)       vec_sub(a, b)
+#define gmx_simd_mul_d(a, b)       vec_mul(a, b)
+#define gmx_simd_fmadd_d(a, b, c)   vec_madd(a, b, c)
+#define gmx_simd_fmsub_d(a, b, c)   vec_msub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_d(a, b, c)  vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_d(a, b, c)  vec_nmadd(a, b, c)
+/* gmx_simd_and_d not supported - no bitwise logical ops */
+/* gmx_simd_andnot_d not supported - no bitwise logical ops */
+/* gmx_simd_or_d not supported - no bitwise logical ops */
+/* gmx_simd_xor_d not supported - no bitwise logical ops */
+#define gmx_simd_rsqrt_d(a)       vec_rsqrte(a)
+#define gmx_simd_rcp_d(a)         vec_re(a)
+#define gmx_simd_fabs_d(a)        vec_abs(a)
+#define gmx_simd_fneg_d           gmx_simd_fneg_ibm_qpx
+#define gmx_simd_max_d(a, b)       vec_sel(b, a, vec_sub(a, b))
+#define gmx_simd_min_d(a, b)       vec_sel(b, a, vec_sub(b, a))
+/* Note: It is critical to use vec_cfid(vec_ctid(a)) for the implementation
+ * of gmx_simd_round_d(), since vec_round() does not adhere to the FP control
+ * word rounding scheme. We rely on float-to-float and float-to-integer
+ * rounding being the same for half-way values in a few algorithms.
+ */
+#define gmx_simd_round_d(a)       vec_cfid(vec_ctid(a))
+#define gmx_simd_trunc_d(a)       vec_trunc(a)
+#define gmx_simd_fraction_d(x)    vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_d(a) gmx_simd_get_exponent_ibm_qpx(a)
+#define gmx_simd_get_mantissa_d(a) gmx_simd_get_mantissa_ibm_qpx(a)
+#define gmx_simd_set_exponent_d(a) gmx_simd_set_exponent_ibm_qpx(a)
+/* integer datatype corresponding to double: gmx_simd_dint32_t */
+#define gmx_simd_dint32_t         vector4double
+#ifdef NDEBUG
+#    define gmx_simd_load_di(m)   vec_ldia(0, (int *)(m))
+#else
+#    define gmx_simd_load_di(m)   vec_ldiaa(0, (int *)(m))
+#endif
+#define gmx_simd_set1_di(i)       gmx_simd_set1_int_ibm_qpx(i)
+#define gmx_simd_store_di(m, x)   vec_st(x, 0, (int *)(m))
+#define gmx_simd_setzero_di       gmx_simd_setzero_ibm_qpx
+#define gmx_simd_cvt_d2i(a)       vec_ctiw(a)
+#define gmx_simd_cvtt_d2i(a)      vec_ctiwz(a)
+#define gmx_simd_cvt_i2d(a)       vec_cfid(a)
+/* Integer simd extract not available */
+/* Integer logical ops on gmx_simd_dint32_t not supported */
+/* Integer arithmetic ops on gmx_simd_dint32_t not supported */
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t          vector4double
+#define gmx_simd_cmpeq_d(a, b)     vec_cmpeq(a, b)
+#define gmx_simd_cmplt_d(a, b)     vec_cmplt((a), (b))
+#define gmx_simd_cmple_d(a, b)     gmx_simd_or_db(vec_cmpeq(a, b), vec_cmplt(a, b)) /* (a==b) OR (a<b) */
+#define gmx_simd_and_db(a, b)      vec_and(a, b)
+#define gmx_simd_or_db(a, b)       vec_or(a, b)
+#define gmx_simd_anytrue_db(a)    gmx_simd_anytrue_bool_ibm_qpx(a)
+#define gmx_simd_blendzero_d(a, sel) vec_sel(vec_splats(0.0), a, sel)
+#define gmx_simd_blendnotzero_d(a, sel) vec_sel(a, vec_splats(0.0), sel)
+#define gmx_simd_blendv_d(a, b, sel)  vec_sel(a, b, sel)
+#define gmx_simd_reduce_d(a)      gmx_simd_reduce_ibm_qpx(a)
+
+/* Boolean & comparison operations on gmx_simd_dint32_t not supported */
+/* Conversions between different booleans not supported */
+
+
+/****************************************************
+ * IMPLEMENTATION HELPER FUNCTIONS                  *
+ ****************************************************/
+static __attribute__((always_inline)) vector4double
+gmx_simd_setzero_ibm_qpx(void)
+{
+    return vec_splats(0.0);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_get_exponent_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t    expmask   = 0x7ff0000000000000LL;
+    const gmx_int64_t    expbase   = 1023;
+    gmx_int64_t          idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory */
+    vec_st(x, 0, idata);
+    /* Perform integer arithmetics in general registers. */
+    idata[0] = ((idata[0] & expmask) >> 52) - expbase;
+    idata[1] = ((idata[1] & expmask) >> 52) - expbase;
+    idata[2] = ((idata[2] & expmask) >> 52) - expbase;
+    idata[3] = ((idata[3] & expmask) >> 52) - expbase;
+    /* Reload from memory */
+    return vec_cfid(vec_ld(0, idata));
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_get_mantissa_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t    mantissa_mask     = 0x000fffffffffffffLL; /* low 52 bits; fits in a signed 64-bit constant */
+    const gmx_int64_t    ione              = 0x3ff0000000000000LL; /* IEEE-754 bit pattern of 1.0 */
+    gmx_int64_t          idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory so the bit patterns can be edited as 64-bit integers */
+    vec_st(x, 0, idata);
+    /* Keep only the mantissa bits and replace sign+exponent with those of 1.0 */
+    idata[0] = (idata[0] & mantissa_mask) | ione;
+    idata[1] = (idata[1] & mantissa_mask) | ione;
+    idata[2] = (idata[2] & mantissa_mask) | ione;
+    idata[3] = (idata[3] & mantissa_mask) | ione;
+    /* Reload from memory */
+    return vec_ld(0, idata);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_set_exponent_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t    expbase = 1023;
+    gmx_int64_t          idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory for shifts. It is REALLY critical that we use the same
+     * rounding mode as for gmx_simd_round_r() here. In particular, for QPX
+     * this means we implement gmx_simd_round_r(a) as vec_cfid(vec_ctid(a)),
+     * since vec_round() uses a different rounding scheme.
+     */
+    vec_st(vec_ctid(x), 0, idata);
+    /* Perform integer arithmetics in general registers. */
+    idata[0] = (idata[0] + expbase) << 52;
+    idata[1] = (idata[1] + expbase) << 52;
+    idata[2] = (idata[2] + expbase) << 52;
+    idata[3] = (idata[3] + expbase) << 52;
+    /* Reload from memory */
+    return vec_ld(0, idata);
+}
+
+static __attribute__((always_inline)) double
+gmx_simd_reduce_ibm_qpx(vector4double x)
+{
+    vector4double y = vec_sldw(x, x, 2);
+    vector4double z;
+
+    y = vec_add(y, x);
+    z = vec_sldw(y, y, 1);
+    y = vec_add(y, z);
+    return vec_extract(y, 0);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_set1_int_ibm_qpx(int i)
+{
+    int idata[4] __attribute__((aligned(32)));
+
+    idata[0] = idata[1] = idata[2] = idata[3] = i; /* fill every slot so the load below never reads indeterminate stack data */
+
+    /* Reload from memory and broadcast element 0 to all elements */
+    return vec_splat(vec_ldia(0, idata), 0);
+}
+
+/* This works in both single and double */
+static __attribute__((always_inline)) int
+gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
+{
+    vector4double b = vec_sldw(a, a, 2);
+
+    a = vec_or(a, b);
+    b = vec_sldw(a, a, 1);
+    a = vec_or(a, b);
+    return (vec_extract(a, 0) > 0);
+}
+
+/* QPX is already 4-wide both in single and double, so just reuse for SIMD4 */
+
+/* SINGLE */
+#define gmx_simd4_float_t                gmx_simd_float_t
+#define gmx_simd4_load_f                 gmx_simd_load_f
+#define gmx_simd4_load1_f                gmx_simd_load1_f
+#define gmx_simd4_set1_f                 gmx_simd_set1_f
+#define gmx_simd4_store_f                gmx_simd_store_f
+#define gmx_simd4_loadu_f                gmx_simd_loadu_f
+#define gmx_simd4_storeu_f               gmx_simd_storeu_f
+#define gmx_simd4_setzero_f              gmx_simd_setzero_f
+#define gmx_simd4_add_f                  gmx_simd_add_f
+#define gmx_simd4_sub_f                  gmx_simd_sub_f
+#define gmx_simd4_mul_f                  gmx_simd_mul_f
+#define gmx_simd4_fmadd_f                gmx_simd_fmadd_f
+#define gmx_simd4_fmsub_f                gmx_simd_fmsub_f
+#define gmx_simd4_fnmadd_f               gmx_simd_fnmadd_f
+#define gmx_simd4_fnmsub_f               gmx_simd_fnmsub_f
+#define gmx_simd4_and_f                  gmx_simd_and_f
+#define gmx_simd4_andnot_f               gmx_simd_andnot_f
+#define gmx_simd4_or_f                   gmx_simd_or_f
+#define gmx_simd4_xor_f                  gmx_simd_xor_f
+#define gmx_simd4_rsqrt_f                gmx_simd_rsqrt_f
+#define gmx_simd4_rcp_f                  gmx_simd_rcp_f
+#define gmx_simd4_fabs_f                 gmx_simd_fabs_f
+#define gmx_simd4_fneg_f                 gmx_simd_fneg_f
+#define gmx_simd4_max_f                  gmx_simd_max_f
+#define gmx_simd4_min_f                  gmx_simd_min_f
+#define gmx_simd4_round_f                gmx_simd_round_f
+#define gmx_simd4_trunc_f                gmx_simd_trunc_f
+#define gmx_simd4_fraction_f             gmx_simd_fraction_f
+#define gmx_simd4_get_exponent_f         gmx_simd_get_exponent_f
+#define gmx_simd4_get_mantissa_f         gmx_simd_get_mantissa_f
+#define gmx_simd4_set_exponent_f         gmx_simd_set_exponent_f
+#define gmx_simd4_dotproduct3_f          gmx_simd4_dotproduct3_f_ibm_qpx
+#define gmx_simd4_fint32_t               gmx_simd_fint32_t
+#define gmx_simd4_load_fi                gmx_simd_load_fi
+#define gmx_simd4_load1_fi               gmx_simd_load1_fi
+#define gmx_simd4_set1_fi                gmx_simd_set1_fi
+#define gmx_simd4_store_fi               gmx_simd_store_fi
+#define gmx_simd4_loadu_fi               gmx_simd_loadu_fi
+#define gmx_simd4_storeu_fi              gmx_simd_storeu_fi
+#define gmx_simd4_setzero_fi             gmx_simd_setzero_fi
+#define gmx_simd4_cvt_f2i                gmx_simd_cvt_f2i
+#define gmx_simd4_cvtt_f2i               gmx_simd_cvtt_f2i
+#define gmx_simd4_cvt_i2f                gmx_simd_cvt_i2f
+#define gmx_simd4_fbool_t                gmx_simd_fbool_t
+#define gmx_simd4_cmpeq_f                gmx_simd_cmpeq_f
+#define gmx_simd4_cmplt_f                gmx_simd_cmplt_f
+#define gmx_simd4_cmple_f                gmx_simd_cmple_f
+#define gmx_simd4_and_fb                 gmx_simd_and_fb
+#define gmx_simd4_or_fb                  gmx_simd_or_fb
+#define gmx_simd4_anytrue_fb             gmx_simd_anytrue_fb
+#define gmx_simd4_blendzero_f            gmx_simd_blendzero_f
+#define gmx_simd4_blendnotzero_f         gmx_simd_blendnotzero_f
+#define gmx_simd4_blendv_f               gmx_simd_blendv_f
+#define gmx_simd4_reduce_f               gmx_simd_reduce_f
+/* DOUBLE */
+#define gmx_simd4_double_t               gmx_simd_double_t
+#define gmx_simd4_load_d                 gmx_simd_load_d
+#define gmx_simd4_load1_d                gmx_simd_load1_d
+#define gmx_simd4_set1_d                 gmx_simd_set1_d
+#define gmx_simd4_store_d                gmx_simd_store_d
+#define gmx_simd4_loadu_d                gmx_simd_loadu_d
+#define gmx_simd4_storeu_d               gmx_simd_storeu_d
+#define gmx_simd4_setzero_d              gmx_simd_setzero_d
+#define gmx_simd4_add_d                  gmx_simd_add_d
+#define gmx_simd4_sub_d                  gmx_simd_sub_d
+#define gmx_simd4_mul_d                  gmx_simd_mul_d
+#define gmx_simd4_fmadd_d                gmx_simd_fmadd_d
+#define gmx_simd4_fmsub_d                gmx_simd_fmsub_d
+#define gmx_simd4_fnmadd_d               gmx_simd_fnmadd_d
+#define gmx_simd4_fnmsub_d               gmx_simd_fnmsub_d
+#define gmx_simd4_and_d                  gmx_simd_and_d
+#define gmx_simd4_andnot_d               gmx_simd_andnot_d
+#define gmx_simd4_or_d                   gmx_simd_or_d
+#define gmx_simd4_xor_d                  gmx_simd_xor_d
+#define gmx_simd4_rsqrt_d                gmx_simd_rsqrt_d
+#define gmx_simd4_rcp_d                  gmx_simd_rcp_d
+#define gmx_simd4_fabs_d                 gmx_simd_fabs_d
+#define gmx_simd4_fneg_d                 gmx_simd_fneg_d
+#define gmx_simd4_max_d                  gmx_simd_max_d
+#define gmx_simd4_min_d                  gmx_simd_min_d
+#define gmx_simd4_round_d                gmx_simd_round_d
+#define gmx_simd4_trunc_d                gmx_simd_trunc_d
+#define gmx_simd4_fraction_d             gmx_simd_fraction_d
+#define gmx_simd4_get_exponent_d         gmx_simd_get_exponent_d
+#define gmx_simd4_get_mantissa_d         gmx_simd_get_mantissa_d
+#define gmx_simd4_set_exponent_d         gmx_simd_set_exponent_d
+#define gmx_simd4_dotproduct3_d          gmx_simd4_dotproduct3_d_ibm_qpx
+#define gmx_simd4_dint32_t               gmx_simd_dint32_t
+#define gmx_simd4_load_di                gmx_simd_load_di
+#define gmx_simd4_load1_di               gmx_simd_load1_di
+#define gmx_simd4_set1_di                gmx_simd_set1_di
+#define gmx_simd4_store_di               gmx_simd_store_di
+#define gmx_simd4_loadu_di               gmx_simd_loadu_di
+#define gmx_simd4_storeu_di              gmx_simd_storeu_di
+#define gmx_simd4_setzero_di             gmx_simd_setzero_di
+#define gmx_simd4_cvt_d2i                gmx_simd_cvt_d2i
+#define gmx_simd4_cvtt_d2i               gmx_simd_cvtt_d2i
+#define gmx_simd4_cvt_i2d                gmx_simd_cvt_i2d /* integer -> double conversion */
+#define gmx_simd4_dbool_t                gmx_simd_dbool_t
+#define gmx_simd4_cmpeq_d                gmx_simd_cmpeq_d
+#define gmx_simd4_cmplt_d                gmx_simd_cmplt_d
+#define gmx_simd4_cmple_d                gmx_simd_cmple_d
+#define gmx_simd4_and_db                 gmx_simd_and_db
+#define gmx_simd4_or_db                  gmx_simd_or_db
+#define gmx_simd4_anytrue_db             gmx_simd_anytrue_db
+#define gmx_simd4_blendzero_d            gmx_simd_blendzero_d
+#define gmx_simd4_blendnotzero_d         gmx_simd_blendnotzero_d
+#define gmx_simd4_blendv_d               gmx_simd_blendv_d
+#define gmx_simd4_reduce_d               gmx_simd_reduce_d
+
+static __attribute__((always_inline)) double
+gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
+{
+    vector4double dp_sh0 = vec_mul(a, b);
+    vector4double dp_sh1 = vec_sldw(dp_sh0, dp_sh0, 1);
+    vector4double dp_sh2 = vec_sldw(dp_sh0, dp_sh0, 2);
+    vector4double dp     = vec_add(dp_sh2, vec_add(dp_sh0, dp_sh1));
+
+    return vec_extract(dp, 0);
+}
+
+static __attribute__((always_inline)) float
+gmx_simd4_dotproduct3_f_ibm_qpx(vector4double a, vector4double b)
+{
+    return (float)gmx_simd4_dotproduct3_d_ibm_qpx(a, b);
+}
+
+/* Function to check whether SIMD operations have resulted in overflow.
+ * For now, this is unfortunately a dummy for this architecture.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+    return 0;
+}
+
+#endif /* GMX_SIMD_IMPLEMENTATION_IBM_QPX_H */
diff --git a/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h

new file mode 100644 (file)

index 0000000..399f20a
--- /dev/null
+++ b/src/gromacs/simd/impl_intel_mic/impl_intel_mic.h
@@ -0,0 +1,544 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_INTEL_MIC_H
+#define GMX_SIMD_IMPL_INTEL_MIC_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* Intel Xeon Phi, or
+ * the-artist-formerly-known-as-Knight's-corner, or
+ * the-artist-formerly-formerly-known-as-MIC, or
+ * the artist formerly-formerly-formerly-known-as-Larrabee
+ * 512-bit SIMD instruction wrappers.
+ */
+
+/* Capability definitions for Xeon Phi SIMD.
+ * Every GMX_SIMD_HAVE_* / GMX_SIMD4_HAVE_* macro below is defined except
+ * GMX_SIMD_HAVE_FRACTION, which is explicitly undefined even though
+ * gmx_simd_fraction_f and gmx_simd_fraction_d are provided further down.
+ * NOTE(review): presumably these flags are tested by the generic SIMD
+ * layer to pick code paths - confirm against the including header.
+ */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_SIMD_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#define GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define  GMX_SIMD_HAVE_FINT32_EXTRACT
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#define  GMX_SIMD_HAVE_DINT32_EXTRACT
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details.
+ * The *_WIDTH values are element counts: 16 floats or 8 doubles fill one
+ * 512-bit register, and the integer types paired with float and double use
+ * the same 16 and 8 element counts.
+ * The *_BITS values of 23 correspond to the rsqrt23/rcp23 intrinsics that
+ * back gmx_simd_rsqrt_f and gmx_simd_rcp_f in this file.
+ */
+#define GMX_SIMD_FLOAT_WIDTH        16
+#define GMX_SIMD_DOUBLE_WIDTH        8
+#define GMX_SIMD_FINT32_WIDTH       16
+#define GMX_SIMD_DINT32_WIDTH        8
+#define GMX_SIMD_RSQRT_BITS         23
+#define GMX_SIMD_RCP_BITS           23
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t           __m512
+#define gmx_simd_load_f            _mm512_load_ps
+#define gmx_simd_load1_f(m)        _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
+#define gmx_simd_set1_f            _mm512_set1_ps
+#define gmx_simd_store_f           _mm512_store_ps
+#define gmx_simd_loadu_f           gmx_simd_loadu_f_mic
+#define gmx_simd_storeu_f          gmx_simd_storeu_f_mic
+#define gmx_simd_setzero_f         _mm512_setzero_ps
+#define gmx_simd_add_f             _mm512_add_ps
+#define gmx_simd_sub_f             _mm512_sub_ps
+#define gmx_simd_mul_f             _mm512_mul_ps
+#define gmx_simd_fmadd_f           _mm512_fmadd_ps
+#define gmx_simd_fmsub_f           _mm512_fmsub_ps
+#define gmx_simd_fnmadd_f          _mm512_fnmadd_ps
+#define gmx_simd_fnmsub_f          _mm512_fnmsub_ps
+#define gmx_simd_and_f(a, b)        _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_andnot_f(a, b)     _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_or_f(a, b)         _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_xor_f(a, b)        _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_rsqrt_f           _mm512_rsqrt23_ps
+#define gmx_simd_rcp_f             _mm512_rcp23_ps
+#define gmx_simd_fabs_f(x)         gmx_simd_andnot_f(_mm512_set1_ps(-0.0), x)
+#define gmx_simd_fneg_f(x)         _mm512_addn_ps(x, _mm512_setzero_ps())
+#define gmx_simd_max_f             _mm512_gmax_ps
+#define gmx_simd_min_f             _mm512_gmin_ps
+#define gmx_simd_round_f(x)        _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_trunc_f(x)        _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_fraction_f(x)     _mm512_sub_ps(x, gmx_simd_trunc_f(x))
+#define gmx_simd_get_exponent_f(x) _mm512_getexp_ps(x)
+#define gmx_simd_get_mantissa_f(x) _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
+#define gmx_simd_set_exponent_f(x) gmx_simd_set_exponent_f_mic(x)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t          __m512i
+#define gmx_simd_load_fi           _mm512_load_epi32
+#define gmx_simd_set1_fi           _mm512_set1_epi32
+#define gmx_simd_store_fi          _mm512_store_epi32
+#define gmx_simd_loadu_fi          gmx_simd_loadu_fi_mic
+#define gmx_simd_storeu_fi         gmx_simd_storeu_fi_mic
+#define gmx_simd_extract_fi        gmx_simd_extract_fi_mic
+#define gmx_simd_setzero_fi        _mm512_setzero_epi32
+#define gmx_simd_cvt_f2i(a)        _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_cvtt_f2i(a)       _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_cvt_i2f(a)        _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+/* Integer logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi           _mm512_slli_epi32
+#define gmx_simd_srli_fi           _mm512_srli_epi32
+#define gmx_simd_and_fi            _mm512_and_epi32
+#define gmx_simd_andnot_fi         _mm512_andnot_epi32
+#define gmx_simd_or_fi             _mm512_or_epi32
+#define gmx_simd_xor_fi            _mm512_xor_epi32
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi            _mm512_add_epi32
+#define gmx_simd_sub_fi            _mm512_sub_epi32
+#define gmx_simd_mul_fi            _mm512_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t           __mmask16
+#define gmx_simd_cmpeq_f(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_f(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
+#define gmx_simd_cmple_f(a, b)     _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
+#define gmx_simd_and_fb            _mm512_kand
+/* (NOT a) AND b on boolean masks, matching the operand convention of
+ * gmx_simd_andnot_f (where the first argument is the one that is inverted).
+ * NOT(a OR b) would instead yield (NOT a) AND (NOT b).
+ */
+#define gmx_simd_andnot_fb(a, b)   _mm512_kand(_mm512_knot(a), b)
+#define gmx_simd_or_fb             _mm512_kor
+#define gmx_simd_anytrue_fb        _mm512_mask2int
+#define gmx_simd_blendzero_f(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
+#define gmx_simd_blendnotzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_f(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
+#define gmx_simd_reduce_f(a)       _mm512_reduce_add_ps(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t          __mmask16
+#define gmx_simd_cmpeq_fi(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
+#define gmx_simd_cmplt_fi(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
+#define gmx_simd_and_fib           _mm512_kand
+#define gmx_simd_or_fib            _mm512_kor
+#define gmx_simd_anytrue_fib       _mm512_mask2int
+#define gmx_simd_blendzero_fi(a, sel)    _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
+#define gmx_simd_blendnotzero_fi(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_fi(a, b, sel)    _mm512_mask_blend_epi32(sel, a, b)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib(x)     (x)
+#define gmx_simd_cvt_fib2fb(x)     (x)
+
+/* MIC provides full single precision of some neat functions: */
+/* 1/sqrt(x) and 1/x work fine in simd_math.h, and won't use extra iterations */
+
+#define gmx_simd_exp2_f            gmx_simd_exp2_f_mic
+#define gmx_simd_exp_f             gmx_simd_exp_f_mic
+#define gmx_simd_log_f             gmx_simd_log_f_mic
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_double_t          __m512d
+#define gmx_simd_load_d            _mm512_load_pd
+#define gmx_simd_load1_d(m)        _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
+#define gmx_simd_set1_d            _mm512_set1_pd
+#define gmx_simd_store_d           _mm512_store_pd
+#define gmx_simd_loadu_d           gmx_simd_loadu_d_mic
+#define gmx_simd_storeu_d          gmx_simd_storeu_d_mic
+#define gmx_simd_setzero_d         _mm512_setzero_pd
+#define gmx_simd_add_d             _mm512_add_pd
+#define gmx_simd_sub_d             _mm512_sub_pd
+#define gmx_simd_mul_d             _mm512_mul_pd
+#define gmx_simd_fmadd_d           _mm512_fmadd_pd
+#define gmx_simd_fmsub_d           _mm512_fmsub_pd
+#define gmx_simd_fnmadd_d          _mm512_fnmadd_pd
+#define gmx_simd_fnmsub_d          _mm512_fnmsub_pd
+#define gmx_simd_and_d(a, b)       _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_andnot_d(a, b)    _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_or_d(a, b)        _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_xor_d(a, b)       _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_rsqrt_d(x)        _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x)))
+#define gmx_simd_rcp_d(x)          _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x)))
+#define gmx_simd_fabs_d(x)         gmx_simd_andnot_d(_mm512_set1_pd(-0.0), x)
+#define gmx_simd_fneg_d(x)         _mm512_addn_pd(x, _mm512_setzero_pd())
+#define gmx_simd_max_d             _mm512_gmax_pd
+#define gmx_simd_min_d             _mm512_gmin_pd
+#define gmx_simd_round_d(a)        _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_trunc_d(a)        _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_fraction_d(x)     _mm512_sub_pd(x, gmx_simd_trunc_d(x))
+#define gmx_simd_get_exponent_d(x) _mm512_getexp_pd(x)
+#define gmx_simd_get_mantissa_d(x) _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
+#define gmx_simd_set_exponent_d(x) gmx_simd_set_exponent_d_mic(x)
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+   Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
+ */
+#define gmx_simd_dint32_t          __m512i
+#define gmx_simd_load_di(m)        _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m)
+#define gmx_simd_set1_di           _mm512_set1_epi32
+#define gmx_simd_store_di(m, a)    _mm512_mask_packstorelo_epi32(m, mask_loh, a)
+#define gmx_simd_loadu_di          gmx_simd_loadu_di_mic
+#define gmx_simd_storeu_di         gmx_simd_storeu_di_mic
+#define gmx_simd_extract_di        gmx_simd_extract_di_mic
+#define gmx_simd_setzero_di        _mm512_setzero_epi32
+#define gmx_simd_cvt_d2i(a)        _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT)
+#define gmx_simd_cvtt_d2i(a)       _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_ZERO)
+#define gmx_simd_cvt_i2d           _mm512_cvtepi32lo_pd
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di           _mm512_slli_epi32
+#define gmx_simd_srli_di           _mm512_srli_epi32
+#define gmx_simd_and_di            _mm512_and_epi32
+#define gmx_simd_andnot_di         _mm512_andnot_epi32
+#define gmx_simd_or_di             _mm512_or_epi32
+#define gmx_simd_xor_di            _mm512_xor_epi32
+/* Integer arithmetic ops on gmx_simd_dint32_t */
+#define gmx_simd_add_di            _mm512_add_epi32
+#define gmx_simd_sub_di            _mm512_sub_epi32
+#define gmx_simd_mul_di            _mm512_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t           __mmask8
+#define gmx_simd_cmpeq_d(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_d(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
+#define gmx_simd_cmple_d(a, b)     _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
+#define gmx_simd_and_db            _mm512_kand
+#define gmx_simd_or_db             _mm512_kor
+#define gmx_simd_anytrue_db(x)     _mm512_mask2int(x)
+#define gmx_simd_blendzero_d(a, sel)    _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
+#define gmx_simd_blendnotzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_d(a, b, sel)    _mm512_mask_blend_pd(sel, a, b)
+#define gmx_simd_reduce_d(a)       _mm512_reduce_add_pd(a)
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t          __mmask16
+#define gmx_simd_cmpeq_di(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
+#define gmx_simd_cmplt_di(a, b)    _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
+#define gmx_simd_and_dib           _mm512_kand
+#define gmx_simd_or_dib            _mm512_kor
+#define gmx_simd_anytrue_dib(x)    (_mm512_mask2int(x)&0xFF)
+#define gmx_simd_blendzero_di(a, sel)    _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
+#define gmx_simd_blendnotzero_di(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_di(a, b, sel)    _mm512_mask_blend_epi32(sel, a, b)
+/* Conversions between booleans. Double & dint stuff is stored in low bits */
+#define gmx_simd_cvt_db2dib(x)     (x)
+#define gmx_simd_cvt_dib2db(x)     (x)
+
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd          gmx_simd_cvt_f2dd_mic
+#define gmx_simd_cvt_dd2f          gmx_simd_cvt_dd2f_mic
+
+/****************************************************
+ *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
+ ****************************************************/
+/* Load and store are guaranteed to only access the 4 floats. All arithmetic operations
+   only operate on the 4 elements (to avoid floating-point exceptions). But other operations
+   are not guaranteed to leave the other 12 elements unmodified. E.g. setzero or blendzero
+   set the upper 12 to zero. */
+#define gmx_simd4_float_t           __m512
+#define gmx_simd4_mask              _mm512_int2mask(0xF)
+#define gmx_simd4_load_f(m)         _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m)
+#define gmx_simd4_load1_f(m)        _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
+#define gmx_simd4_set1_f            _mm512_set1_ps
+#define gmx_simd4_store_f(m, a)     _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, a)
+#define gmx_simd4_loadu_f           gmx_simd4_loadu_f_mic
+#define gmx_simd4_storeu_f          gmx_simd4_storeu_f_mic
+#define gmx_simd4_setzero_f         _mm512_setzero_ps
+#define gmx_simd4_add_f(a, b)       _mm512_mask_add_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_sub_f(a, b)       _mm512_mask_sub_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_mul_f(a, b)       _mm512_mask_mul_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_fmadd_f(a, b, c)  _mm512_mask_fmadd_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fmsub_f(a, b, c)  _mm512_mask_fmsub_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmadd_f(a, b, c) _mm512_mask_fnmadd_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmsub_f(a, b, c) _mm512_mask_fnmsub_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_and_f(a, b)       _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_andnot_f(a, b)    _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_or_f(a, b)        _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_xor_f(a, b)       _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_rsqrt_f(a)        _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, a)
+#define gmx_simd4_fabs_f(x)         gmx_simd4_andnot_f(_mm512_set1_ps(-0.0), x)
+#define gmx_simd4_fneg_f(x)         _mm512_mask_addn_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _mm512_setzero_ps())
+#define gmx_simd4_max_f(a, b)       _mm512_mask_gmax_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_min_f(a, b)       _mm512_mask_gmin_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_round_f(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd4_trunc_f(x)        _mm512_mask_round_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd4_dotproduct3_f(a, b) _mm512_mask_reduce_add_ps(_mm512_int2mask(7), _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(7), a, b))
+#define gmx_simd4_fbool_t           __mmask16
+#define gmx_simd4_cmpeq_f(a, b)     _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_f(a, b)     _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_LT_OS)
+#define gmx_simd4_cmple_f(a, b)     _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_LE_OS)
+#define gmx_simd4_and_fb            _mm512_kand
+#define gmx_simd4_or_fb             _mm512_kor
+#define gmx_simd4_anytrue_fb(x)     (_mm512_mask2int(x)&0xF)
+#define gmx_simd4_blendzero_f(a, sel)    _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
+#define gmx_simd4_blendnotzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
+#define gmx_simd4_blendv_f(a, b, sel)    _mm512_mask_blend_ps(sel, a, b)
+#define gmx_simd4_reduce_f(x)       _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x)
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD4 IMPLEMENTATION       *
+ ****************************************************/
+#define gmx_simd4_double_t          __m512d
+#define gmx_simd4_mask              _mm512_int2mask(0xF)
+#define gmx_simd4_load_d(m)         _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m)
+#define gmx_simd4_load1_d(m)        _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
+#define gmx_simd4_set1_d            _mm512_set1_pd
+#define gmx_simd4_store_d(m, a)     _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, a)
+#define gmx_simd4_loadu_d           gmx_simd4_loadu_d_mic
+#define gmx_simd4_storeu_d          gmx_simd4_storeu_d_mic
+#define gmx_simd4_setzero_d         _mm512_setzero_pd
+#define gmx_simd4_add_d(a, b)       _mm512_mask_add_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_sub_d(a, b)       _mm512_mask_sub_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_mul_d(a, b)       _mm512_mask_mul_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_fmadd_d(a, b, c)  _mm512_mask_fmadd_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fmsub_d(a, b, c)  _mm512_mask_fmsub_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmadd_d(a, b, c) _mm512_mask_fnmadd_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmsub_d(a, b, c) _mm512_mask_fnmsub_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_and_d(a, b)       _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_andnot_d(a, b)    _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_or_d(a, b)        _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_xor_d(a, b)       _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+/* Approximate 1/sqrt(a) for the 4 SIMD4 doubles: narrow to float, apply the
+ * masked rsqrt23 lookup, widen back to double. Every step is masked with
+ * gmx_simd4_mask. The body must reference the macro parameter a; naming any
+ * other identifier would capture whatever variable happens to exist at the
+ * call site.
+ */
+#define gmx_simd4_rsqrt_d(a)        _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(), gmx_simd4_mask, _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), gmx_simd4_mask, a)))
+#define gmx_simd4_fabs_d(x)         gmx_simd4_andnot_d(_mm512_set1_pd(-0.0), x)
+#define gmx_simd4_fneg_d(x)         _mm512_mask_addn_pd(_mm512_undefined_pd(), gmx_simd4_mask, x, _mm512_setzero_pd())
+#define gmx_simd4_max_d(a, b)       _mm512_mask_gmax_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_min_d(a, b)       _mm512_mask_gmin_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_round_d(a)        _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd4_trunc_d(a)        _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd4_dotproduct3_d(a, b) _mm512_mask_reduce_add_pd(_mm512_int2mask(7), _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a, b))
+#define gmx_simd4_dbool_t           __mmask16
+#define gmx_simd4_cmpeq_d(a, b)     _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_d(a, b)     _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_LT_OS)
+#define gmx_simd4_cmple_d(a, b)     _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_LE_OS)
+#define gmx_simd4_and_db            _mm512_kand
+#define gmx_simd4_or_db             _mm512_kor
+#define gmx_simd4_anytrue_db(x)     (_mm512_mask2int(x)&0xF)
+#define gmx_simd4_blendzero_d(a, sel)    _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
+#define gmx_simd4_blendnotzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
+#define gmx_simd4_blendv_d(a, b, sel)    _mm512_mask_blend_pd(sel, a, b)
+#define gmx_simd4_reduce_d(x)       _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), x)
+
+#define PERM_LOW2HIGH _MM_PERM_BABA /* 128-bit lane selector; used together with mask_hih in gmx_simd_cvt_dd2f_mic */
+#define PERM_HIGH2LOW _MM_PERM_DCDC /* 128-bit lane selector; counterpart of PERM_LOW2HIGH */
+
+#define mask_loh _mm512_int2mask(0x00FF) /* bits 0-7 set. would be better a constant - but can't initialize with a function call. */
+#define mask_hih _mm512_int2mask(0xFF00) /* bits 8-15 set */
+
+/* load store float */
+/* Unaligned load of a full float register: the lo part is unpacked from m
+ * and the hi part from m+16, starting from an undefined register.
+ */
+static gmx_inline __m512
+gmx_simd_loadu_f_mic(const float * m)
+{
+    __m512 v = _mm512_undefined_ps();
+
+    v = _mm512_loadunpacklo_ps(v, m);
+    v = _mm512_loadunpackhi_ps(v, m+16);
+    return v;
+}
+
+/* Unaligned store of a full float register: the lo part is packed to m and
+ * the hi part to m+16.
+ */
+static gmx_inline void
+gmx_simd_storeu_f_mic(float * m, __m512 s)
+{
+    float * const m_hi = m+16;
+
+    _mm512_packstorelo_ps(m, s);
+    _mm512_packstorehi_ps(m_hi, s);
+}
+
+/* load store fint32 */
+/* Unaligned load of a full register of 32-bit integers: the lo part is
+ * unpacked from m and the hi part from m+16.
+ */
+static gmx_inline __m512i
+gmx_simd_loadu_fi_mic(const gmx_int32_t * m)
+{
+    __m512i v = _mm512_undefined_epi32();
+
+    v = _mm512_loadunpacklo_epi32(v, m);
+    v = _mm512_loadunpackhi_epi32(v, m+16);
+    return v;
+}
+
+/* Unaligned store of a full register of 32-bit integers: the lo part is
+ * packed to m and the hi part to m+16.
+ */
+static gmx_inline void
+gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
+{
+    gmx_int32_t * const m_hi = m+16;
+
+    _mm512_packstorelo_epi32(m, s);
+    _mm512_packstorehi_epi32(m_hi, s);
+}
+
+/* load store double */
+/* Unaligned load of a full double register: the lo part is unpacked from m
+ * and the hi part from m+8.
+ */
+static gmx_inline __m512d
+gmx_simd_loadu_d_mic(const double * m)
+{
+    __m512d v = _mm512_undefined_pd();
+
+    v = _mm512_loadunpacklo_pd(v, m);
+    v = _mm512_loadunpackhi_pd(v, m+8);
+    return v;
+}
+
+/* Unaligned store of a full double register: the lo part is packed to m and
+ * the hi part to m+8.
+ */
+static gmx_inline void
+gmx_simd_storeu_d_mic(double * m, __m512d s)
+{
+    double * const m_hi = m+8;
+
+    _mm512_packstorelo_pd(m, s);
+    _mm512_packstorehi_pd(m_hi, s);
+}
+
+/* load store dint32 */
+/* Unaligned load of the integers paired with doubles. Both unpack steps are
+ * restricted by mask_loh (the low 8 mask bits); the hi step reads from m+16.
+ */
+static gmx_inline __m512i
+gmx_simd_loadu_di_mic(const gmx_int32_t * m)
+{
+    __m512i v = _mm512_undefined_epi32();
+
+    v = _mm512_mask_loadunpacklo_epi32(v, mask_loh, m);
+    v = _mm512_mask_loadunpackhi_epi32(v, mask_loh, m+16);
+    return v;
+}
+
+/* Unaligned store of the integers paired with doubles. Both pack steps are
+ * restricted by mask_loh (the low 8 mask bits); the hi step writes at m+16.
+ */
+static gmx_inline void
+gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
+{
+    gmx_int32_t * const m_hi = m+16;
+
+    _mm512_mask_packstorelo_epi32(m, mask_loh, s);
+    _mm512_mask_packstorehi_epi32(m_hi, mask_loh, s);
+}
+
+/* load store simd4 */
+/* Unaligned SIMD4 float load. Both unpack steps are restricted by
+ * gmx_simd4_mask (the low 4 mask bits); the hi step reads from m+16.
+ */
+static gmx_inline __m512
+gmx_simd4_loadu_f_mic(const float * m)
+{
+    __m512 v = _mm512_undefined_ps();
+
+    v = _mm512_mask_loadunpacklo_ps(v, gmx_simd4_mask, m);
+    v = _mm512_mask_loadunpackhi_ps(v, gmx_simd4_mask, m+16);
+    return v;
+}
+
+/* Unaligned SIMD4 float store. Both pack steps are restricted by
+ * gmx_simd4_mask (the low 4 mask bits); the hi step writes at m+16.
+ */
+static gmx_inline void
+gmx_simd4_storeu_f_mic(float * m, __m512 s)
+{
+    float * const m_hi = m+16;
+
+    _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
+    _mm512_mask_packstorehi_ps(m_hi, gmx_simd4_mask, s);
+}
+
+/* Unaligned SIMD4 double load. Both unpack steps are restricted by
+ * gmx_simd4_mask (the low 4 mask bits); the hi step reads from m+8.
+ */
+static gmx_inline __m512d
+gmx_simd4_loadu_d_mic(const double * m)
+{
+    __m512d v = _mm512_undefined_pd();
+
+    v = _mm512_mask_loadunpacklo_pd(v, gmx_simd4_mask, m);
+    v = _mm512_mask_loadunpackhi_pd(v, gmx_simd4_mask, m+8);
+    return v;
+}
+
+/* Unaligned SIMD4 double store. Both pack steps are restricted by
+ * gmx_simd4_mask (the low 4 mask bits); the hi step writes at m+8.
+ */
+static gmx_inline void
+gmx_simd4_storeu_d_mic(double * m, __m512d s)
+{
+    double * const m_hi = m+8;
+
+    _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
+    _mm512_mask_packstorehi_pd(m_hi, gmx_simd4_mask, s);
+}
+
+/* extract */
+/* Extract a single 32-bit integer from a gmx_simd_fint32_t.
+ *
+ * a      register to read from
+ * index  element to extract; expected to be in 0..GMX_SIMD_FINT32_WIDTH-1.
+ *        For an index whose bit falls outside the 16-bit mask nothing is
+ *        stored and the returned value is indeterminate.
+ *
+ * A one-hot write mask makes the masked pack-store write exactly the
+ * selected element to the address of the local result.
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
+{
+    gmx_int32_t r;
+    /* int -> mask is _mm512_int2mask; _mm512_mask2int is the opposite
+     * conversion and only happened to work through implicit conversions.
+     */
+    _mm512_mask_packstorelo_epi32(&r, _mm512_int2mask(1<<index), a);
+    return r;
+}
+
+/* Extract a single 32-bit integer from a gmx_simd_dint32_t.
+ *
+ * a      register to read from
+ * index  element to extract; expected to be in 0..GMX_SIMD_DINT32_WIDTH-1.
+ *        For an index whose bit falls outside the 16-bit mask nothing is
+ *        stored and the returned value is indeterminate.
+ *
+ * A one-hot write mask makes the masked pack-store write exactly the
+ * selected element to the address of the local result.
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
+{
+    gmx_int32_t r;
+    /* int -> mask is _mm512_int2mask; _mm512_mask2int is the opposite
+     * conversion and only happened to work through implicit conversions.
+     */
+    _mm512_mask_packstorelo_epi32(&r, _mm512_int2mask(1<<index), a);
+    return r;
+}
+
+/* Build a float whose exponent field is set from the (rounded) value of a:
+ * the argument is converted to integer, the bias 127 is added, and the sum
+ * is shifted left by 23 bits into the exponent position before the bits are
+ * reinterpreted as float.
+ *
+ * This is likely faster than the built-in scale operation (lat 8, t-put 3)
+ * since only integer adds and shifts are used. TODO: check, given that
+ * scale also only works on the integer part. The scale-based alternative
+ * would be _mm512_scale_ps(_mm512_set1_ps(1), <integer exponent>).
+ */
+static gmx_inline __m512
+gmx_simd_set_exponent_f_mic(__m512 a)
+{
+    const __m512i bias     = _mm512_set1_epi32(127);
+    __m512i       exponent = gmx_simd_cvt_f2i(a);
+
+    exponent = _mm512_add_epi32(exponent, bias);
+    exponent = _mm512_slli_epi32(exponent, 23);
+    return _mm512_castsi512_ps(exponent);
+}
+
+/* Double-precision counterpart of gmx_simd_set_exponent_f_mic.
+ * The 8 rounded integer exponents are each duplicated into a pair of
+ * adjacent 32-bit elements, the bias 1023 is added, and only the odd
+ * elements (mask 0xAAAA) receive the sum shifted left by 20 bits; the even
+ * elements are zero. The result is reinterpreted as 8 doubles.
+ */
+static gmx_inline __m512d
+gmx_simd_set_exponent_d_mic(__m512d a)
+{
+    const __m512i   bias      = _mm512_set1_epi32(1023);
+    const __m512i   pair_idx  = _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+    const __mmask16 odd_elems = _mm512_int2mask(0xAAAA);
+    __m512i         exponent;
+
+    exponent = _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT);
+    exponent = _mm512_permutevar_epi32(pair_idx, exponent);
+    exponent = _mm512_add_epi32(exponent, bias);
+    exponent = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), odd_elems, exponent, 20);
+    return _mm512_castsi512_pd(exponent);
+}
+
+/* Widen one register of 16 floats into two registers of 8 doubles.
+ *
+ * f   input floats
+ * d0  receives floats 0..7 converted to double
+ * d1  receives floats 8..15 converted to double
+ *
+ * _mm512_cvtpslo_pd converts the low 8 floats only, so for d1 the two upper
+ * 128-bit lanes (C, D) are first moved to the low half. PERM_HIGH2LOW
+ * (_MM_PERM_DCDC) keeps them in order: lane C lands in lane 0 and lane D in
+ * lane 1. A selector with C and D swapped would hand back floats 12..15
+ * ahead of floats 8..11.
+ */
+static gmx_inline void
+gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
+{
+    __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), PERM_HIGH2LOW);
+
+    *d0 = _mm512_cvtpslo_pd(f);
+    *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
+}
+
+/* Narrow two registers of doubles into one register of floats.
+ * Each input is converted into the low half of a float register; the
+ * converted d1 is then permuted with PERM_LOW2HIGH and merged into the
+ * elements selected by mask_hih, while the remaining elements keep the
+ * converted d0.
+ */
+static gmx_inline __m512
+gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
+{
+    const __m512 lo = _mm512_cvtpd_pslo(d0);
+    const __m512 hi = _mm512_cvtpd_pslo(d1);
+
+    return _mm512_mask_permute4f128_ps(lo, mask_hih, hi, PERM_LOW2HIGH);
+}
+
+/* 2^x in single precision: x is converted to fixed point (round to nearest,
+ * _MM_EXPADJ_24) and handed to the exp223 intrinsic.
+ */
+static gmx_inline __m512
+gmx_simd_exp2_f_mic(__m512 x)
+{
+    const __m512i fixed = _mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24);
+
+    return _mm512_exp223_ps(fixed);
+}
+
+/* exp(x) in single precision.
+ * The first estimate r = exp2(x*log2(e)) has only 59ulp accuracy, so one
+ * extra refinement iteration is applied, using
+ * http://yacas.sourceforge.net/Algochapter5.html 5.4 Method 3:
+ *   t = x - ln(2)*log2(r),  result = r*t + r
+ * Elements where r compares equal to zero are excluded by the mask and
+ * return r unchanged.
+ */
+static gmx_inline __m512
+gmx_simd_exp_f_mic(__m512 x)
+{
+    const __m512 log2_e  = _mm512_set1_ps(1.44269504088896341);
+    const __m512 ln_2    = _mm512_set1_ps(0.693147180559945286226764);
+    __m512       r       = gmx_simd_exp2_f(_mm512_mul_ps(x, log2_e));
+    __mmask16    nonzero = _mm512_cmpneq_ps_mask(r, _mm512_setzero_ps());
+    __m512       log2_r  = _mm512_mask_log2ae23_ps(_mm512_undefined_ps(), nonzero, r);
+    __m512       t       = _mm512_mask_fnmadd_ps(log2_r, nonzero, ln_2, x);
+
+    return _mm512_mask_fmadd_ps(r, nonzero, t, r);
+}
+
+/* Natural logarithm in single precision, computed as ln(2) multiplied by
+ * the result of the log2ae23 intrinsic.
+ */
+static gmx_inline __m512
+gmx_simd_log_f_mic(__m512 x)
+{
+    const __m512 ln_2   = _mm512_set1_ps(0.693147180559945286226764);
+    const __m512 log2_x = _mm512_log2ae23_ps(x);
+
+    return _mm512_mul_ps(ln_2, log2_x);
+}
+
+/* Check whether SIMD operations have resulted in overflow.
+ * Reads the control/status register, and if the overflow flag is set,
+ * clears that flag and writes the register back.
+ * Returns 1 when the flag was set, 0 otherwise.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+    /* The overflow flag is bit 3 in the register */
+    const unsigned int overflow_bit = 0x8;
+    int                csr          = _mm_getcsr();
+
+    if (!(csr & overflow_bit))
+    {
+        return 0;
+    }
+
+    /* Set the overflow flag to zero */
+    csr = csr & ~overflow_bit;
+    _mm_setcsr(csr);
+    return 1;
+}
+
+#endif /* GMX_SIMD_IMPL_INTEL_MIC_H */
diff --git a/src/gromacs/simd/impl_reference/impl_reference.h b/src/gromacs/simd/impl_reference/impl_reference.h

new file mode 100644 (file)

index 0000000..f123c84
--- /dev/null
+++ b/src/gromacs/simd/impl_reference/impl_reference.h
@@ -0,0 +1,3711 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_REFERENCE_H
+#define GMX_SIMD_IMPL_REFERENCE_H
+
+/*! \libinternal \file
+ *
+ * \brief Reference SIMD implementation, including SIMD documentation.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \ingroup module_simd
+ */
+
+
+#include <math.h>
+#include <string.h>
+
+#include "gmx_fatal.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name SIMD implementation capability definitions
+ *  \{
+ */
+
+/*! \brief
+ * Defined when SIMD float support is present.
+ *
+ * You should only use this to specifically check for single precision SIMD
+ * support, even when the rest of Gromacs uses double precision.
+ * \sa GMX_SIMD_HAVE_REAL, GMX_SIMD_HAVE_DOUBLE
+ */
+#define GMX_SIMD_HAVE_FLOAT
+
+/*! \brief Defined if SIMD double support is present. */
+#define GMX_SIMD_HAVE_DOUBLE
+
+/*! \brief Defined if SIMD is implemented with real hardware instructions. */
+#define GMX_SIMD_HAVE_HARDWARE /* For Doxygen */
+#undef  GMX_SIMD_HAVE_HARDWARE /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD implementation supports unaligned loads. */
+#define GMX_SIMD_HAVE_LOADU
+
+/*! \brief Defined if the SIMD implementation supports unaligned stores. */
+#define GMX_SIMD_HAVE_STOREU
+
+/*! \brief Defined if SIMD implementation has logical operations on floating-point data. */
+#define GMX_SIMD_HAVE_LOGICAL
+
+/*! \brief Defined if SIMD fused multiply-add uses hardware instructions */
+#define GMX_SIMD_HAVE_FMA  /* For Doxygen */
+#undef  GMX_SIMD_HAVE_FMA  /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD fraction has a direct hardware instruction. */
+#define GMX_SIMD_HAVE_FRACTION /* For Doxygen */
+#undef  GMX_SIMD_HAVE_FRACTION /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD implementation has \ref gmx_simd_fint32_t. */
+#define GMX_SIMD_HAVE_FINT32
+
+/*! \brief Support for extracting integers from \ref gmx_simd_fint32_t. */
+#define GMX_SIMD_HAVE_FINT32_EXTRACT
+
+/*! \brief Defined if SIMD logical operations are supported for \ref gmx_simd_fint32_t */
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+
+/*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_fint32_t */
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+
+/*! \brief Defined if the SIMD implementation has \ref gmx_simd_dint32_t.
+ *
+ * \note The Gromacs SIMD module works entirely with 32 bit integers, both
+ * in single and double precision, since some platforms do not support 64 bit
+ * SIMD integers at all. In particular, this means it is up to each
+ * implementation to get this working even if the architecture's internal
+ * representation uses 64 bit integers when converting to/from double SIMD
+ * variables. For now we will try HARD to use conversions, packing or shuffling
+ * so the integer datatype has the same width as the floating-point type, i.e.
+ * if you use double precision SIMD with a width of 8, we want the integers
+ * we work with to also use a SIMD width of 8 to make it easy to load/store
+ * indices from arrays. This refers entirely to the function calls
+ * and how many integers we load/store in one call; the actual SIMD registers
+ * might be wider for integers internally (e.g. on x86 gmx_simd_dint32_t will
+ * only fill half the register), but this is none of the user's business.
+ * While this works for all current architectures, and we think it will work
+ * for future ones, we might have to alter this decision in the future. To
+ * avoid rewriting every single instance that refers to the SIMD width we still
+ * provide separate defines for the width of SIMD integer variables that you
+ * should use.
+ */
+#define GMX_SIMD_HAVE_DINT32
+
+/*! \brief Support for extracting integer from \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_EXTRACT
+
+/*! \brief Defined if logical operations are supported for \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+
+/*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+
+/*! \brief Defined if the implementation provides \ref gmx_simd4_float_t. */
+#define GMX_SIMD4_HAVE_FLOAT
+
+/*! \brief Defined if the implementation provides \ref gmx_simd4_double_t. */
+#define GMX_SIMD4_HAVE_DOUBLE
+
+#ifdef GMX_SIMD_REF_FLOAT_WIDTH
+#    define GMX_SIMD_FLOAT_WIDTH             GMX_SIMD_REF_FLOAT_WIDTH
+#else
+/*! \brief Width of the \ref gmx_simd_float_t datatype. */
+#    define GMX_SIMD_FLOAT_WIDTH             4
+#endif
+
+#ifdef GMX_SIMD_REF_DOUBLE_WIDTH
+#    define GMX_SIMD_DOUBLE_WIDTH            GMX_SIMD_REF_DOUBLE_WIDTH
+#else
+/*! \brief Width of the \ref gmx_simd_double_t datatype. */
+#    define GMX_SIMD_DOUBLE_WIDTH            4
+#endif
+
+/*! \brief Width of the \ref gmx_simd_fint32_t datatype. */
+#define GMX_SIMD_FINT32_WIDTH            GMX_SIMD_FLOAT_WIDTH
+
+/*! \brief Width of the \ref gmx_simd_dint32_t datatype. */
+#define GMX_SIMD_DINT32_WIDTH            GMX_SIMD_DOUBLE_WIDTH
+
+/*! \brief Accuracy of SIMD 1/sqrt(x) lookup. Used to determine number of iterations. */
+#define GMX_SIMD_RSQRT_BITS             23
+
+/*! \brief Accuracy of SIMD 1/x lookup. Used to determine number of iterations. */
+#define GMX_SIMD_RCP_BITS               23
+
+/*! \}
+ *
+ * \name SIMD implementation data types
+ * \{
+ */
+/*! \brief Float SIMD variable. Supported with GMX_SIMD_HAVE_FLOAT.
+ */
+typedef struct
+{
+    float r[GMX_SIMD_FLOAT_WIDTH]; /*!< One float per emulated SIMD lane. Implementation dependent. Don't touch. */
+}
+gmx_simd_float_t;
+
+/*! \brief Floating-point SIMD variable type in double precision.
+ *
+ * Supported with GMX_SIMD_HAVE_DOUBLE.
+ */
+typedef struct
+{
+    double r[GMX_SIMD_DOUBLE_WIDTH]; /*!< One double per emulated SIMD lane. Implementation dependent. Don't touch. */
+}
+gmx_simd_double_t;
+
+/*! \brief Integer SIMD variable type to use for conversions to/from float.
+ *
+ * This is also the widest integer SIMD type.
+ */
+typedef struct
+{
+    gmx_int32_t i[GMX_SIMD_FINT32_WIDTH]; /*!< One 32-bit integer per lane, same lane count as the float type. Implementation dependent. Don't touch. */
+}
+gmx_simd_fint32_t;
+
+/*! \brief Integer SIMD variable type to use for conversions to/from double.
+ *
+ * Available with GMX_SIMD_HAVE_DINT32.
+ */
+typedef struct
+{
+    gmx_int32_t i[GMX_SIMD_DINT32_WIDTH]; /*!< One 32-bit integer per lane, same lane count as the double type. Implementation dependent. Don't touch. */
+}
+gmx_simd_dint32_t;
+
+/*! \brief Boolean type for float SIMD data.
+ *
+ * You should likely use gmx_simd_bool_t
+ * (for gmx_simd_real_t) instead, unless you really know what you are doing.
+ */
+typedef struct
+{
+    gmx_int32_t b[GMX_SIMD_FLOAT_WIDTH]; /*!< One 32-bit flag per float lane (true/false encoding not visible in this section - TODO confirm). Implementation dependent. Don't touch. */
+}
+gmx_simd_fbool_t;
+
+/*! \brief Boolean type for double precision SIMD data.
+ *
+ * Use the generic gmx_simd_bool_t
+ * (for gmx_simd_real_t) instead, unless you really know what you are doing.
+ */
+typedef struct
+{
+    gmx_int32_t b[GMX_SIMD_DOUBLE_WIDTH]; /*!< One 32-bit flag per double lane (true/false encoding not visible in this section - TODO confirm). Implementation dependent. Don't touch. */
+}
+gmx_simd_dbool_t;
+
+/*! \brief Boolean type for integer datatypes corresponding to float SIMD. */
+typedef struct
+{
+    gmx_int32_t b[GMX_SIMD_FINT32_WIDTH]; /*!< One 32-bit flag per integer lane of \ref gmx_simd_fint32_t. Implementation dependent. Don't touch. */
+}
+gmx_simd_fibool_t;
+
+/*! \brief Boolean type for integer datatypes corresponding to double SIMD.
+ *
+ * You should likely use gmx_simd_ibool_t (for gmx_simd_int32_t) instead,
+ * unless you really know what you are doing.
+ */
+typedef struct
+{
+    gmx_int32_t b[GMX_SIMD_DINT32_WIDTH]; /*!< One 32-bit flag per integer lane of \ref gmx_simd_dint32_t. Implementation dependent. Don't touch. */
+}
+gmx_simd_dibool_t;
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for single precision floating point
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_FLOAT_WIDTH numbers from aligned memory.
+ *
+ * \param m Pointer to memory aligned to the SIMD width.
+ * \return SIMD variable with data loaded.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_load_f(const float *m)
+{
+    gmx_simd_float_t  loaded;
+    int               lane = 0;
+
+    /* The reference implementation copies memory one lane at a time */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        loaded.r[lane] = m[lane];
+        lane++;
+    }
+    return loaded;
+}
+
+/*! \brief Set all SIMD variable elements to float pointed to by m (unaligned).
+ *
+ * \param m Pointer to single value in memory.
+ * \return SIMD variable with all elements set to *m.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_load1_f(const float *m)
+{
+    gmx_simd_float_t  splat;
+    const float       value = *m;
+    int               lane;
+
+    /* Memory is read once; the value is then replicated to every lane */
+    for (lane = 0; lane != GMX_SIMD_FLOAT_WIDTH; ++lane)
+    {
+        splat.r[lane] = value;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD float variable elements to the value r.
+ *
+ *  \param r floating-point constant
+ *  \return SIMD variable with all elements set to r.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set1_f(float r)
+{
+    gmx_simd_float_t  splat;
+    int               lane = 0;
+
+    /* Broadcast the scalar to all lanes */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        splat.r[lane] = r;
+        lane++;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD float variable elements to 0.0f.
+ *
+ *  \return The value 0.0 in all elements of a SIMD variable.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_setzero_f(void)
+{
+    gmx_simd_float_t  a;
+    int               i;
+
+    /* (void) gives a real prototype in C as well as C++; the float literal
+     * avoids a needless double-to-float conversion.
+     */
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        a.r[i] = 0.0f;
+    }
+    return a;
+}
+
+/*! \brief Store the contents of the SIMD float variable pr to aligned memory m.
+ *
+ * \param[out] m Pointer to memory, aligned to SIMD width.
+ * \param a SIMD variable to store
+ */
+static gmx_inline void
+gmx_simd_store_f(float *m, gmx_simd_float_t a)
+{
+    int lane = 0;
+
+    /* Write the lanes back to memory in order */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        m[lane] = a.r[lane];
+        lane++;
+    }
+}
+
+/*! \brief Load SIMD float from unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \param m Pointer to memory, no alignment requirement.
+ * \return SIMD variable with data loaded.
+ */
+#define gmx_simd_loadu_f gmx_simd_load_f
+
+/*! \brief Store SIMD float to unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \param[out] m Pointer to memory, no alignment requirement.
+ * \param a SIMD variable to store.
+ */
+#define gmx_simd_storeu_f gmx_simd_store_f
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for double precision floating point
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_DOUBLE_WIDTH numbers from aligned memory.
+ *
+ * \copydetails gmx_simd_load_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_load_d(const double *m)
+{
+    gmx_simd_double_t  loaded;
+    int                lane = 0;
+
+    /* The reference implementation copies memory one lane at a time */
+    while (lane < GMX_SIMD_DOUBLE_WIDTH)
+    {
+        loaded.r[lane] = m[lane];
+        lane++;
+    }
+    return loaded;
+}
+
+/*! \brief Set all SIMD variable elements to double pointed to by m (unaligned).
+ *
+ * \copydetails gmx_simd_load1_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_load1_d(const double *m)
+{
+    gmx_simd_double_t  splat;
+    const double       value = *m;
+    int                lane;
+
+    /* Memory is read once; the value is then replicated to every lane */
+    for (lane = 0; lane != GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        splat.r[lane] = value;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD double variable elements to the value r.
+ *
+ * \copydetails gmx_simd_set1_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set1_d(double r)
+{
+    gmx_simd_double_t  splat;
+    int                lane = 0;
+
+    /* Broadcast the scalar to all lanes */
+    while (lane < GMX_SIMD_DOUBLE_WIDTH)
+    {
+        splat.r[lane] = r;
+        lane++;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD double variable elements to 0.0.
+ *
+ * \copydetails gmx_simd_setzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_setzero_d(void)
+{
+    gmx_simd_double_t  a;
+    int                i;
+
+    /* (void) gives a real prototype in C as well as C++ */
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        a.r[i] = 0.0;
+    }
+    return a;
+}
+
+/*! \brief Store the contents of the SIMD double variable pr to aligned memory m.
+ *
+ * \copydetails gmx_simd_store_f
+ */
+static gmx_inline void
+gmx_simd_store_d(double *m, gmx_simd_double_t a)
+{
+    int lane = 0;
+
+    /* Write the lanes back to memory in order */
+    while (lane < GMX_SIMD_DOUBLE_WIDTH)
+    {
+        m[lane] = a.r[lane];
+        lane++;
+    }
+}
+
+/*! \brief Load SIMD double from unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \copydetails gmx_simd_loadu_f
+ */
+#define gmx_simd_loadu_d gmx_simd_load_d
+
+/*! \brief Store SIMD double to unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \copydetails gmx_simd_storeu_f
+ */
+#define gmx_simd_storeu_d gmx_simd_store_d
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for integers (corresponding to float)
+ * \{
+ */
+
+/*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_load_i.
+ *
+ * \param m Pointer to memory, aligned to integer SIMD width.
+ * \return SIMD integer variable.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_load_fi(const gmx_int32_t * m)
+{
+    gmx_simd_fint32_t  a;
+    int                i;
+
+    /* Copy one integer per lane from memory */
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        a.i[i] = m[i];
+    }
+    return a;
+}
+
+/*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_set1_i.
+ *
+ *  \param b integer value to set variable to.
+ *  \return SIMD variable with all elements set to b.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_set1_fi(gmx_int32_t b)
+{
+    gmx_simd_fint32_t  splat;
+    int                lane = 0;
+
+    /* Broadcast the integer to all lanes */
+    while (lane < GMX_SIMD_FINT32_WIDTH)
+    {
+        splat.i[lane] = b;
+        lane++;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_setzero_i.
+ *
+ * \return SIMD integer variable with all bits set to zero.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_setzero_fi(void)
+{
+    gmx_simd_fint32_t  a;
+    int                i;
+
+    /* (void) gives a real prototype in C as well as C++ */
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        a.i[i] = 0;
+    }
+    return a;
+}
+
+/*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_store_i.
+ *
+ * \param[out] m Memory aligned to integer SIMD width.
+ * \param a SIMD variable to store.
+ * \return The variable a that was stored, unchanged.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_store_fi(int * m, gmx_simd_fint32_t a)
+{
+    int                i;
+
+    /* Write one integer per lane to memory */
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        m[i] = a.i[i];
+    }
+    return a;
+}
+
+/*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_loadu_i.
+ *
+ * Supported with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \param m Pointer to memory, no alignment requirements.
+ * \return SIMD integer variable.
+ */
+#define gmx_simd_loadu_fi  gmx_simd_load_fi
+
+/*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_storeu_i.
+ *
+ * Supported with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \param m Memory pointer, no alignment requirements.
+ * \param a SIMD variable to store.
+ */
+#define gmx_simd_storeu_fi gmx_simd_store_fi
+
+/*! \brief Extract element with index i from \ref gmx_simd_fint32_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_extract_i.
+ *
+ * Available with \ref GMX_SIMD_HAVE_FINT32_EXTRACT.
+ *
+ * \param a SIMD variable
+ * \param index Position to extract integer from
+ * \return Single integer from position index in SIMD variable.
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_fi(gmx_simd_fint32_t a, int index)
+{
+    /* No range check is done on index */
+    const gmx_int32_t element = a.i[index];
+
+    return element;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for integers (corresponding to double)
+ * \{
+ */
+
+/*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_load_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_load_di(const gmx_int32_t * m)
+{
+    gmx_simd_dint32_t  a;
+    int                i;
+
+    /* Copy one integer per lane from memory */
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        a.i[i] = m[i];
+    }
+    return a;
+}
+
+/*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_double_t.
+ *
+ *  \copydetails gmx_simd_set1_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_set1_di(gmx_int32_t b)
+{
+    gmx_simd_dint32_t  splat;
+    int                lane = 0;
+
+    /* Broadcast the integer to all lanes */
+    while (lane < GMX_SIMD_DINT32_WIDTH)
+    {
+        splat.i[lane] = b;
+        lane++;
+    }
+    return splat;
+}
+
+/*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_setzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_setzero_di(void)
+{
+    gmx_simd_dint32_t  a;
+    int                i;
+
+    /* (void) gives a real prototype in C as well as C++ */
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        a.i[i] = 0;
+    }
+    return a;
+}
+
+/*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_store_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_store_di(gmx_int32_t * m, gmx_simd_dint32_t a)
+{
+    int                i;
+
+    /* Write one integer per lane to memory */
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        m[i] = a.i[i];
+    }
+    /* The stored variable is handed back unchanged */
+    return a;
+}
+
+/*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_loadu_fi
+ */
+#define gmx_simd_loadu_di  gmx_simd_load_di
+
+/*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_storeu_fi
+ */
+#define gmx_simd_storeu_di gmx_simd_store_di
+
+/*! \brief Extract element with index i from \ref gmx_simd_dint32_t.
+ *
+ * \copydetails gmx_simd_extract_fi
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_di(gmx_simd_dint32_t a, int index)
+{
+    /* No range check is done on index */
+    const gmx_int32_t element = a.i[index];
+
+    return element;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point bitwise logical operations
+ * \{
+ */
+
+/*! \brief Bitwise and for two SIMD float variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 & data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_and_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  c;
+    int               i;
+    gmx_int32_t       val1, val2, res;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Move the raw bit patterns with memcpy: unlike reading a float
+         * through an integer reference, this is well-defined in both C
+         * and C++, so a single code path serves both languages.
+         */
+        memcpy(&val1, &a.r[i], sizeof(val1));
+        memcpy(&val2, &b.r[i], sizeof(val2));
+        res = val1 & val2;
+        memcpy(&c.r[i], &res, sizeof(res));
+    }
+    return c;
+}
+
+/*! \brief Bitwise andnot for SIMD float. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_andnot_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return (~data1) & data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_andnot_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  c;
+    int               i;
+    gmx_int32_t       val1, val2, res;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Move the raw bit patterns with memcpy: unlike reading a float
+         * through an integer reference, this is well-defined in both C
+         * and C++, so a single code path serves both languages.
+         */
+        memcpy(&val1, &a.r[i], sizeof(val1));
+        memcpy(&val2, &b.r[i], sizeof(val2));
+        /* Note the operand order: the FIRST argument is the one inverted */
+        res = (~val1) & val2;
+        memcpy(&c.r[i], &res, sizeof(res));
+    }
+    return c;
+}
+
+/*! \brief Bitwise or for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 | data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_or_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  c;
+    int               i;
+    gmx_int32_t       val1, val2, res;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Move the raw bit patterns with memcpy: unlike reading a float
+         * through an integer reference, this is well-defined in both C
+         * and C++, so a single code path serves both languages.
+         */
+        memcpy(&val1, &a.r[i], sizeof(val1));
+        memcpy(&val2, &b.r[i], sizeof(val2));
+        res = val1 | val2;
+        memcpy(&c.r[i], &res, sizeof(res));
+    }
+    return c;
+}
+
+/*! \brief Bitwise xor for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_xor_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 ^ data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_xor_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  c;
+    int               i;
+    gmx_int32_t       val1, val2, res;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Move the raw bit patterns with memcpy: unlike reading a float
+         * through an integer reference, this is well-defined in both C
+         * and C++, so a single code path serves both languages.
+         */
+        memcpy(&val1, &a.r[i], sizeof(val1));
+        memcpy(&val2, &b.r[i], sizeof(val2));
+        res = val1 ^ val2;
+        memcpy(&c.r[i], &res, sizeof(res));
+    }
+    return c;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point arithmetics
+ * \{
+ */
+/*! \brief Add two float SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_add_r.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a+b
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_add_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  sum;
+    int               lane = 0;
+
+    /* Lane-wise addition */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        sum.r[lane] = a.r[lane] + b.r[lane];
+        lane++;
+    }
+    return sum;
+}
+
+/*! \brief Subtract two SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_sub_r.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a-b
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sub_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  difference;
+    int               lane = 0;
+
+    /* Lane-wise subtraction, a minus b */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        difference.r[lane] = a.r[lane] - b.r[lane];
+        lane++;
+    }
+    return difference;
+}
+
+/*! \brief Multiply two SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_mul_r.
+ *
+ * \param a factor1
+ * \param b factor2
+ * \return a*b.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_mul_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  product;
+    int               lane = 0;
+
+    /* Lane-wise multiplication */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        product.r[lane] = a.r[lane]*b.r[lane];
+        lane++;
+    }
+    return product;
+}
+
+/*! \brief Fused-multiply-add. Result is a*b+c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fmadd_r.
+ *
+ *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return a*b+c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fmadd_f(a, b, c) gmx_simd_add_f(gmx_simd_mul_f(a, b), c)
+
+
+/*! \brief Fused-multiply-subtract. Result is a*b-c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fmsub_r.
+ *
+ *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return a*b-c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_mul_f(a, b), c)
+
+
+/*! \brief Fused-negated-multiply-add. Result is -a*b+c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fnmadd_r.
+ *
+ *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return -a*b+c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fnmadd_f(a, b, c) gmx_simd_sub_f(c, gmx_simd_mul_f(a, b))
+
+
+/*! \brief Fused-negated-multiply-sub. Result is -a*b-c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fnmsub_r.
+ *
+ *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return -a*b-c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fnmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_setzero_f(), gmx_simd_fmadd_f(a, b, c))
+
+/*! \brief SIMD 1.0/sqrt(x) lookup.
+ *
+ * You should typically call the real-precision \ref gmx_simd_rsqrt_r.
+ *
+ * This is a low-level instruction that should only be called from routines
+ * implementing the inverse square root in simd_math.h.
+ *
+ * \param x Argument, x>0
+ * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rsqrt_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t  b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Any lane that is not strictly positive yields 0 instead of inf/NaN */
+        b.r[i] = (x.r[i] > 0.0f) ? 1.0f/sqrtf(x.r[i]) : 0.0f;
+    }
+    return b;
+}
+
+/*! \brief SIMD 1.0/x lookup.
+ *
+ * You should typically call the real-precision \ref gmx_simd_rcp_r.
+ *
+ * This is a low-level instruction that should only be called from routines
+ * implementing the reciprocal in simd_math.h.
+ *
+ * \param x Argument, x!=0
+ * \return Approximation of 1/x, accuracy is \ref GMX_SIMD_RCP_BITS.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rcp_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t  b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* A zero lane yields 0 rather than a division by zero */
+        b.r[i] = (x.r[i] != 0.0f) ? 1.0f/x.r[i] : 0.0f;
+    }
+    return b;
+}
+
+/*! \brief SIMD Floating-point fabs().
+ *
+ * You should typically call the real-precision \ref gmx_simd_fabs_r.
+ *
+ * \param a any floating point values
+ * \return fabs(a) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fabs_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t  magnitude;
+    int               lane = 0;
+
+    /* Lane-wise absolute value */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        magnitude.r[lane] = fabsf(a.r[lane]);
+        lane++;
+    }
+    return magnitude;
+}
+
+/*! \brief SIMD floating-point negate.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fneg_r.
+ *
+ * \param a Any floating-point value
+ * \return -a
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fneg_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t  negated;
+    int               lane = 0;
+
+    /* Lane-wise sign flip */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        negated.r[lane] = -a.r[lane];
+        lane++;
+    }
+    return negated;
+}
+
+/*! \brief Set each SIMD element to the largest from two variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_max_r.
+ *
+ * \param a Any floating-point value
+ * \param b Any floating-point value
+ * \return max(a,b) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_max_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  largest;
+    int               lane;
+
+    for (lane = 0; lane != GMX_SIMD_FLOAT_WIDTH; ++lane)
+    {
+        /* b is taken whenever a >= b does not hold */
+        if (a.r[lane] >= b.r[lane])
+        {
+            largest.r[lane] = a.r[lane];
+        }
+        else
+        {
+            largest.r[lane] = b.r[lane];
+        }
+    }
+    return largest;
+}
+
+/*! \brief Set each SIMD element to the smallest from two variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_min_r.
+ *
+ * \param a Any floating-point value
+ * \param b Any floating-point value
+ * \return min(a,b) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_min_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  smallest;
+    int               lane;
+
+    for (lane = 0; lane != GMX_SIMD_FLOAT_WIDTH; ++lane)
+    {
+        /* b is taken whenever a <= b does not hold */
+        if (a.r[lane] <= b.r[lane])
+        {
+            smallest.r[lane] = a.r[lane];
+        }
+        else
+        {
+            smallest.r[lane] = b.r[lane];
+        }
+    }
+    return smallest;
+}
+
+/*! \brief Round to nearest integer value (in floating-point format).
+ *
+ * You should typically call the real-precision \ref gmx_simd_round_r.
+ *
+ * \param a Any floating-point value
+ * \return The nearest integer, represented in floating-point format.
+ *
+ * \note The reference implementation rounds exact half-way cases
+ * away from zero, whereas most SIMD intrinsics will round to nearest even.
+ * This could be fixed by using rint/rintf, but the bigger problem is that
+ * MSVC does not support full C99, and none of the round or rint
+ * functions are defined. For MSVC we therefore emulate round() with the
+ * C89 floor()/ceil() functions, evaluated in double precision so that
+ * adding one half is exact for every float and values outside the range
+ * of int are handled correctly.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_round_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t  b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+#ifdef _MSC_VER
+        /* Half-way cases go away from zero, matching roundf() */
+        double temp = (a.r[i] >= 0.0f) ? floor((double)a.r[i] + 0.5) : ceil((double)a.r[i] - 0.5);
+        b.r[i] = (float)temp;
+#else
+        b.r[i] = roundf(a.r[i]);
+#endif
+    }
+    return b;
+}
+
+/*! \brief Truncate SIMD, i.e. round towards zero - common hardware instruction.
+ *
+ * You should typically call the real-precision \ref gmx_simd_trunc_r.
+ *
+ * \param a Any floating-point value
+ * \return Integer rounded towards zero, represented in floating-point format.
+ *
+ * \note This is truncation towards zero, not floor(). The reason for this
+ * is that truncation is virtually always present as a dedicated hardware
+ * instruction, but floor() frequently isn't.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_trunc_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t  truncated;
+    int               lane = 0;
+
+    /* Lane-wise rounding towards zero */
+    while (lane < GMX_SIMD_FLOAT_WIDTH)
+    {
+        truncated.r[lane] = truncf(a.r[lane]);
+        lane++;
+    }
+    return truncated;
+}
+
+
+/*! \brief Fractional part of each SIMD floating-point element.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fraction_r.
+ *
+ * \param a Any floating-point value
+ * \return a-trunc(a)
+ *
+ * For maximum compatibility the definition of fraction follows the one used
+ * e.g. by the AMD64 hardware instructions: the integer part is obtained by
+ * truncation towards zero, so the remaining fraction can be positive or
+ * negative. As an example, -1.42 yields the fraction -0.42.
+ *
+ * Hardware support with \ref GMX_SIMD_HAVE_FRACTION, otherwise emulated.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fraction_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t  whole;
+
+    whole = gmx_simd_trunc_f(a);
+    return gmx_simd_sub_f(a, whole);
+}
+
+/*! \brief Extract the (integer) exponent from single precision SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_get_exponent_r.
+ *
+ * \param a Any floating-point value
+ * \return Exponent value, represented in floating-point format.
+ *
+ * The IEEE754 exponent field is isolated, the bias is subtracted, and the
+ * result is stored as a normal floating-point SIMD value.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f(gmx_simd_float_t a)
+{
+    /* Bits set exactly where the single precision exponent field lives */
+    const gmx_int32_t  expmask = 0x7f800000;
+    gmx_simd_float_t   res;
+    gmx_int32_t        biased;
+    int                lane;
+    union
+    {
+        float        f;
+        gmx_int32_t  i;
+    }
+    bits;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        bits.f = a.r[lane];
+        /* Isolate the exponent and move it down past the 23 mantissa bits */
+        biased = (bits.i & expmask) >> 23;
+        /* Remove the exponent bias (127) */
+        res.r[lane] = biased - 127;
+    }
+    return res;
+}
+
+/*! \brief Get the mantissa of each single precision SIMD element.
+ *
+ * You should typically call the real-precision \ref gmx_simd_get_mantissa_r.
+ *
+ * \param a Any floating-point value
+ * \return Mantissa, represented in floating-point format.
+ *
+ * The mantissa field is kept and combined with a new, neutral exponent.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_mantissa_f(gmx_simd_float_t a)
+{
+    const gmx_int32_t  mantmask = 0x007fffff;
+    const gmx_int32_t  one      = 0x3f800000;
+    gmx_simd_float_t   res;
+    int                lane;
+    union
+    {
+        float        f;
+        gmx_int32_t  i;
+    }
+    bits;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        bits.f = a.r[lane];
+        /* Drop sign and exponent, keeping only the mantissa bits ... */
+        bits.i = bits.i & mantmask;
+        /* ... then insert the biased exponent of 1.0 (i.e., 2^0=1) */
+        bits.i      = bits.i | one;
+        res.r[lane] = bits.f;
+    }
+    return res;
+}
+
+/*! \brief Set (integer) exponent from single precision floating-point SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_set_exponent_r.
+ *
+ * \param a A floating point value that will not overflow as 2^a.
+ * \return 2^(round(a)).
+ *
+ * The input is \a rounded to the nearest integer, the exponent bias is added
+ * to this integer, and the bits are shifted to the IEEE754 exponent part of the number.
+ *
+ * \note The argument will be \a rounded to nearest integer since that is what
+ * we need for the exponential functions, and this integer x will be set as the
+ * exponent so the new floating-point number will be 2^x.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f(gmx_simd_float_t a)
+{
+    gmx_simd_float_t   b;
+    gmx_int32_t        iexp;
+    int                i;
+    union
+    {
+        float        f;
+        gmx_int32_t  i;
+    }
+    conv;
+
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        /* Critical to use same algorithm as for gmx_simd_round_f() */
+#ifdef _MSC_VER
+        iexp = (a.r[i] >= 0.0f) ? (a.r[i] + 0.5f) : (a.r[i] - 0.5f);
+#else
+        iexp = roundf(a.r[i]);
+#endif
+        /* Add bias (127), and shift 23 bits left (mantissa size) */
+        conv.i = (iexp + 127) << 23;
+        b.r[i] = conv.f;
+    }
+    return b;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point comparisons, boolean, selection.
+ * \{
+ */
+/*! \brief Element-wise a==b comparison for single precision SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmpeq_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a==b.
+ *
+ * Keep in mind that exact floating-point comparisons are difficult.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmpeq_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_fbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] == b.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Element-wise a<b comparison for single precision SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmplt_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a<b.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmplt_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_fbool_t   res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] < b.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Element-wise a<=b comparison for single precision SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmple_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a<=b.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmple_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_fbool_t   res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] <= b.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Logical \a and of single precision SIMD booleans.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_r.
+ *
+ * \param a logical vars 1
+ * \param b logical vars 2
+ * \return For each element, the result boolean is true if a \& b are true.
+ *
+ * \note This is not necessarily a bitwise operation - the storage format
+ * of booleans is implementation-dependent.
+ *
+ * \sa gmx_simd_and_ib
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_and_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
+{
+    gmx_simd_fbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        /* Logical (not bitwise) combination, so the stored value is 0 or 1 */
+        res.b[lane] = (a.b[lane] && b.b[lane]);
+    }
+    return res;
+}
+
+/*! \brief Logical \a or of single precision SIMD booleans.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_r.
+ *
+ * \param a logical vars 1
+ * \param b logical vars 2
+ * \return For each element, the result boolean is true if a or b is true.
+ *
+ * \note This is not necessarily a bitwise operation - the storage format
+ * of booleans is implementation-dependent.
+ *
+ * \sa gmx_simd_or_ib
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_or_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
+{
+    gmx_simd_fbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        /* Logical (not bitwise) combination, so the stored value is 0 or 1 */
+        res.b[lane] = (a.b[lane] || b.b[lane]);
+    }
+    return res;
+}
+
+/*! \brief Returns non-zero if any of the booleans in a is true, otherwise 0.
+ *
+ * You should typically call the real-precision \ref gmx_simd_anytrue_b.
+ *
+ * \param a Logical variable.
+ * \return non-zero if any element in a is true, otherwise 0.
+ *
+ * The actual value returned for truth depends on the architecture,
+ * so any non-zero value should be considered truth.
+ */
+static gmx_inline int
+gmx_simd_anytrue_fb(gmx_simd_fbool_t a)
+{
+    int             found = 0;
+    int             lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        if (a.b[lane])
+        {
+            found = 1;
+        }
+    }
+    return found;
+}
+
+/*! \brief Keep single precision SIMD elements where the boolean is true.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendzero_r.
+ *
+ * \param a Floating-point variable to select from
+ * \param sel Boolean selector
+ * \return  For each element, a is selected for true, 0 for false.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
+{
+    gmx_simd_float_t   res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = a.r[lane];
+        }
+        else
+        {
+            res.r[lane] = 0.0;
+        }
+    }
+    return res;
+}
+
+/*! \brief Keep single precision SIMD elements where the boolean is false.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendnotzero_r.
+ *
+ * \param a Floating-point variable to select from
+ * \param sel Boolean selector
+ * \return  For each element, a is selected for false, 0 for true (sic).
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendnotzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
+{
+    gmx_simd_float_t   res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = 0.0;
+        }
+        else
+        {
+            res.r[lane] = a.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Vector-blend selection between two single precision SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendv_r.
+ *
+ * \param a First source
+ * \param b Second source
+ * \param sel Boolean selector
+ * \return For each element, select b if sel is true, a otherwise.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendv_f(gmx_simd_float_t a, gmx_simd_float_t b, gmx_simd_fbool_t sel)
+{
+    gmx_simd_float_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = b.r[lane];
+        }
+        else
+        {
+            res.r[lane] = a.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Sum all elements of a SIMD float variable.
+ *
+ * You should typically call the real-precision \ref gmx_simd_reduce_r.
+ *
+ * \param a SIMD variable to reduce/sum.
+ * \return The sum of all elements in the argument variable.
+ *
+ */
+static gmx_inline float
+gmx_simd_reduce_f(gmx_simd_float_t a)
+{
+    float     total = 0.0;
+    int       lane;
+
+    /* Accumulate in element order, starting from element 0 */
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; lane++)
+    {
+        total += a.r[lane];
+    }
+    return total;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point bitwise logical operations
+ * \{
+ */
+/*! \brief Bitwise and of two SIMD double variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_and_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_and_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+#ifdef __cplusplus
+    gmx_int64_t        bitsA, bitsB, bitsOut;
+#else
+    union
+    {
+        double       r;
+        gmx_int64_t  i;
+    }
+    lhs, rhs;
+#endif
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+#ifdef __cplusplus
+        /* C++: view the stored doubles as 64-bit integers */
+        bitsA       = reinterpret_cast<gmx_int64_t &>(a.r[lane]);
+        bitsB       = reinterpret_cast<gmx_int64_t &>(b.r[lane]);
+        bitsOut     = bitsA & bitsB;
+        res.r[lane] = reinterpret_cast<double &>(bitsOut);
+#else
+        /* C: reach the bit pattern through a union */
+        lhs.r       = a.r[lane];
+        rhs.r       = b.r[lane];
+        lhs.i      &= rhs.i;
+        res.r[lane] = lhs.r;
+#endif
+    }
+    return res;
+}
+
+/*! \brief Bitwise andnot for SIMD double. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_andnot_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_andnot_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  c;
+    int                i;
+#ifdef __cplusplus
+    gmx_int64_t        val1, val2, res;
+#else
+    union
+    {
+        double       r;
+        gmx_int64_t  i;
+    }
+    conv1, conv2;
+#endif
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+#ifdef __cplusplus
+        val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+        val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+        res    = (~val1) & val2;
+        c.r[i] = reinterpret_cast<double &>(res);
+#else
+        conv1.r = a.r[i];
+        conv2.r = b.r[i];
+        /* Invert the first operand before masking, exactly as the C++ branch does */
+        conv1.i = (~conv1.i) & conv2.i;
+        c.r[i]  = conv1.r;
+#endif
+    }
+    return c;
+}
+
+/*! \brief Bitwise or for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_or_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_or_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  c;
+    int                i;
+#ifdef __cplusplus
+    gmx_int64_t        val1, val2, res;
+#else
+    union
+    {
+        double       r;
+        gmx_int64_t  i;
+    }
+    conv1, conv2;
+#endif
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+#ifdef __cplusplus
+        val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+        val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+        res    = val1 | val2;
+        c.r[i] = reinterpret_cast<double &>(res);
+#else
+        conv1.r = a.r[i];
+        conv2.r = b.r[i];
+        /* Bitwise or, matching the C++ branch */
+        conv1.i = conv1.i | conv2.i;
+        c.r[i]  = conv1.r;
+#endif
+    }
+    return c;
+}
+
+/*! \brief Bitwise xor for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_xor_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_xor_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  c;
+    int                i;
+#ifdef __cplusplus
+    gmx_int64_t        val1, val2, res;
+#else
+    union
+    {
+        double       r;
+        gmx_int64_t  i;
+    }
+    conv1, conv2;
+#endif
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+#ifdef __cplusplus
+        val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+        val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+        res    = val1 ^ val2;
+        c.r[i] = reinterpret_cast<double &>(res);
+#else
+        conv1.r = a.r[i];
+        conv2.r = b.r[i];
+        /* Bitwise xor, matching the C++ branch */
+        conv1.i = conv1.i ^ conv2.i;
+        c.r[i]  = conv1.r;
+#endif
+    }
+    return c;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point arithmetics
+ * \{
+ */
+/*! \brief Element-wise sum of two double SIMD variables.
+ *
+ * \copydetails gmx_simd_add_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_add_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = a.r[lane] + b.r[lane];
+    }
+    return res;
+}
+
+/*! \brief Subtract two double SIMD variables (element-wise a-b).
+ *
+ * \copydetails gmx_simd_sub_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sub_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = a.r[lane] - b.r[lane];
+    }
+    return res;
+}
+
+/*! \brief Element-wise product of two double SIMD variables.
+ *
+ * \copydetails gmx_simd_mul_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_mul_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = a.r[lane]*b.r[lane];
+    }
+    return res;
+}
+
+/*! \brief Fused-multiply-add. Result is a*b+c.
+ *
+ * The reference version expands to a separate multiply followed by an add.
+ *
+ * \copydetails gmx_simd_fmadd_f
+ */
+#define gmx_simd_fmadd_d(a, b, c) gmx_simd_add_d(gmx_simd_mul_d(a, b), c)
+
+/*! \brief Fused-multiply-subtract. Result is a*b-c.
+ *
+ * The reference version expands to a separate multiply followed by a subtract.
+ *
+ * \copydetails gmx_simd_fmsub_f
+ */
+#define gmx_simd_fmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_mul_d(a, b), c)
+
+/*! \brief Fused-negated-multiply-add. Result is -a*b+c.
+ *
+ * The reference version computes this as c-(a*b).
+ *
+ * \copydetails gmx_simd_fnmadd_f
+ */
+#define gmx_simd_fnmadd_d(a, b, c) gmx_simd_sub_d(c, gmx_simd_mul_d(a, b))
+
+/*! \brief Fused-negated-multiply-subtract. Result is -a*b-c.
+ *
+ * The reference version computes this as 0-(a*b+c).
+ *
+ * \copydetails gmx_simd_fnmsub_f
+ */
+#define gmx_simd_fnmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_setzero_d(), gmx_simd_fmadd_d(a, b, c))
+
+/*! \brief SIMD 1.0/sqrt(x) lookup.
+ *
+ * Elements that are not strictly positive produce 0.0 in this reference version.
+ *
+ * \copydetails gmx_simd_rsqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rsqrt_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        /* Sic - we only need single precision for the reference lookup, since
+         * we have defined GMX_SIMD_RSQRT_BITS to 23.
+         */
+        b.r[i] = (x.r[i] > 0.0) ? 1.0f/sqrtf(x.r[i]) : 0.0;
+    }
+    return b;
+}
+
+/*! \brief 1.0/x lookup.
+ *
+ * Elements equal to zero produce 0.0 in this reference version.
+ *
+ * \copydetails gmx_simd_rcp_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rcp_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        /* Sic - we only need single precision for the reference lookup, since
+         * we have defined GMX_SIMD_RCP_BITS to 23.
+         * NOTE(review): x.r[i] appears to be a double, so this division is
+         * presumably evaluated in double precision despite the float literal
+         * - confirm whether that is intended.
+         */
+        b.r[i] = (x.r[i] != 0.0) ? 1.0f/x.r[i] : 0.0;
+    }
+    return b;
+}
+
+/*! \brief Element-wise fabs() for double SIMD.
+ *
+ * \copydetails gmx_simd_fabs_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fabs_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = fabs(a.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Element-wise negation for double SIMD.
+ *
+ * \copydetails gmx_simd_fneg_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fneg_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = -a.r[lane];
+    }
+    return res;
+}
+
+/*! \brief Element-wise maximum of two double SIMD variables.
+ *
+ * \copydetails gmx_simd_max_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_max_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        /* b is chosen whenever a>=b does not hold */
+        if (a.r[lane] >= b.r[lane])
+        {
+            res.r[lane] = a.r[lane];
+        }
+        else
+        {
+            res.r[lane] = b.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Element-wise minimum of two double SIMD variables.
+ *
+ * \copydetails gmx_simd_min_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_min_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        /* b is chosen whenever a<=b does not hold */
+        if (a.r[lane] <= b.r[lane])
+        {
+            res.r[lane] = a.r[lane];
+        }
+        else
+        {
+            res.r[lane] = b.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Round each element to the nearest integer (kept in double floating-point format).
+ *
+ * \copydetails gmx_simd_round_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_round_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+#ifdef _MSC_VER
+        /* Move half a unit away from zero; the conversion to int then truncates */
+        int nearest = (a.r[lane] >= 0.0) ? (a.r[lane] + 0.5) : (a.r[lane] - 0.5);
+        res.r[lane] = nearest;
+#else
+        res.r[lane] = round(a.r[lane]);
+#endif
+    }
+    return res;
+}
+
+/*! \brief Truncate each double SIMD element, i.e. round towards zero.
+ *
+ * \copydetails gmx_simd_trunc_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_trunc_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.r[lane] = trunc(a.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Fractional part of each double SIMD element, a-trunc(a).
+ *
+ * \copydetails gmx_simd_fraction_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fraction_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t  whole;
+
+    whole = gmx_simd_trunc_d(a);
+    return gmx_simd_sub_d(a, whole);
+}
+
+
+/*! \brief Extract the (integer) exponent from double precision SIMD.
+ *
+ * \copydetails gmx_simd_get_exponent_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_exponent_d(gmx_simd_double_t a)
+{
+    /* Bits set exactly where the double precision exponent field lives */
+    const gmx_int64_t      expmask = 0x7ff0000000000000LL;
+    gmx_simd_double_t      res;
+    gmx_int64_t            biased;
+    int                    lane;
+    union
+    {
+        double             d;
+        gmx_int64_t        i;
+    }
+    bits;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        bits.d = a.r[lane];
+        /* Keep only the exponent field (this drops the sign), and move it
+         * down past the 52 mantissa bits.
+         */
+        biased = (bits.i & expmask) >> 52;
+        /* Remove the exponent bias (1023) */
+        res.r[lane] = biased - 1023;
+    }
+    return res;
+}
+
+/*! \brief Get the mantissa of each double precision SIMD element.
+ *
+ * \copydetails gmx_simd_get_mantissa_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_mantissa_d(gmx_simd_double_t a)
+{
+    const gmx_int64_t      mantmask = 0x000fffffffffffffLL;
+    const gmx_int64_t      one      = 0x3ff0000000000000LL;
+    gmx_simd_double_t      res;
+    int                    lane;
+    union
+    {
+        double          d;
+        gmx_int64_t     i;
+    }
+    bits;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        bits.d = a.r[lane];
+        /* Drop sign and exponent, keeping only the mantissa bits ... */
+        bits.i = bits.i & mantmask;
+        /* ... then insert the biased exponent of 1.0 */
+        bits.i      = bits.i | one;
+        res.r[lane] = bits.d;
+    }
+    return res;
+}
+
+/*! \brief Set (integer) exponent from single precision floating-point SIMD.
+ *
+ * \copydetails gmx_simd_set_exponent_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set_exponent_d(gmx_simd_double_t a)
+{
+    gmx_simd_double_t      b;
+    int                    i;
+    gmx_int64_t            iexp;
+    union
+    {
+        double          d;
+        gmx_int64_t     i;
+    }
+    conv;
+
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        /* Critical to use same algorithm as for gmx_simd_round_d() */
+#ifdef _MSC_VER
+        iexp = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+        iexp = round(a.r[i]);
+#endif
+        /* Add bias (1023), and shift 52 bits left (mantissa size) */
+        conv.i = (iexp + 1023) << 52;
+        b.r[i] = conv.d;
+    }
+    return b;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point comparison, boolean, selection.
+ * \{
+ */
+/*! \brief Element-wise a==b comparison for double SIMD.
+ *
+ * \copydetails gmx_simd_cmpeq_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmpeq_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_dbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] == b.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Element-wise a<b comparison for double SIMD.
+ *
+ * \copydetails gmx_simd_cmplt_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmplt_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_dbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] < b.r[lane]);
+    }
+    return res;
+}
+
+/*! \brief Element-wise a<=b comparison for double SIMD.
+ *
+ * \copydetails gmx_simd_cmple_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmple_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    gmx_simd_dbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        res.b[lane] = (a.r[lane] <= b.r[lane]);
+    }
+    return res;
+}
+
+
+/*! \brief Logical \a and of double precision SIMD booleans.
+ *
+ * \copydetails gmx_simd_and_fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_and_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
+{
+    gmx_simd_dbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        /* Logical (not bitwise) combination, so the stored value is 0 or 1 */
+        res.b[lane] = (a.b[lane] && b.b[lane]);
+    }
+    return res;
+}
+
+/*! \brief Logical \a or of double precision SIMD booleans.
+ *
+ * \copydetails gmx_simd_or_fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_or_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
+{
+    gmx_simd_dbool_t  res;
+    int               lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        /* Logical (not bitwise) combination, so the stored value is 0 or 1 */
+        res.b[lane] = (a.b[lane] || b.b[lane]);
+    }
+    return res;
+}
+
+
+/*! \brief Returns non-zero if any of the booleans in a is true, otherwise 0.
+ *
+ * \copydetails gmx_simd_anytrue_fb
+ */
+static gmx_inline int
+gmx_simd_anytrue_db(gmx_simd_dbool_t a)
+{
+    int         found = 0;
+    int         lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        if (a.b[lane])
+        {
+            found = 1;
+        }
+    }
+    return found;
+}
+
+
+/*! \brief Keep double SIMD elements where the boolean is true, zero the rest.
+ *
+ * \copydetails gmx_simd_blendzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = a.r[lane];
+        }
+        else
+        {
+            res.r[lane] = 0.0;
+        }
+    }
+    return res;
+}
+
+/*! \brief Keep double SIMD elements where the boolean is false, zero the rest.
+ *
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendnotzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = 0.0;
+        }
+        else
+        {
+            res.r[lane] = a.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Vector-blend selection between two double SIMD variables.
+ *
+ * \copydetails gmx_simd_blendv_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendv_d(gmx_simd_double_t a, gmx_simd_double_t b, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        if (sel.b[lane])
+        {
+            res.r[lane] = b.r[lane];
+        }
+        else
+        {
+            res.r[lane] = a.r[lane];
+        }
+    }
+    return res;
+}
+
+/*! \brief Sum all elements of a SIMD double variable.
+ *
+ * \copydetails gmx_simd_reduce_f
+ *
+ */
+static gmx_inline double
+gmx_simd_reduce_d(gmx_simd_double_t a)
+{
+    double    total = 0.0;
+    int       lane;
+
+    /* Accumulate in element order, starting from element 0 */
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; lane++)
+    {
+        total += a.r[lane];
+    }
+    return total;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) bitwise logical operations
+ * \{
+ */
+
+/*! \brief SIMD integer shift left logical, based on immediate value.
+ *
+ * You should typically call the real-precision \ref gmx_simd_slli_i.
+ *
+ *  Logical shift. Each element is shifted (independently) up to 32 positions
+ *  left, while zeros are shifted in from the right. Only available if
+ * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
+ *  (double) is defined.
+ *
+ * \param a integer data to shift
+ * \param n number of positions to shift left. n<=32.
+ * \return shifted values
+ *
+ * \note The shift is done on the unsigned bit pattern, since left-shifting
+ * a negative signed value is undefined in C. A shift count of 32 or more
+ * (also undefined in C for 32-bit operands) yields 0, i.e. all bits shifted out.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_slli_fi(gmx_simd_fint32_t a, int n)
+{
+    gmx_simd_fint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* Assumes 32-bit elements and a 32-bit unsigned int - TODO confirm */
+        c.i[i] = (n < 32) ? (gmx_int32_t)((unsigned int)a.i[i] << n) : 0;
+    }
+    return c;
+}
+
+/*! \brief SIMD integer shift right logical, based on immediate value.
+ *
+ * You should typically call the real-precision \ref gmx_simd_srli_i.
+ *
+ *  Logical shift. Each element is shifted (independently) up to 32 positions
+ *  right, while zeros are shifted in from the left. Only available if
+ * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
+ *  (double) is defined.
+ *
+ * \param a integer data to shift
+ * \param n number of positions to shift right. n<=32.
+ * \return shifted values
+ *
+ * \note The shift is done on the unsigned bit pattern. Right-shifting a
+ * negative signed value is implementation-defined in C and typically copies
+ * the sign bit, which would not be a logical shift. A shift count of 32 or
+ * more (undefined in C for 32-bit operands) yields 0, i.e. all bits shifted out.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_srli_fi(gmx_simd_fint32_t a, int n)
+{
+    gmx_simd_fint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* Assumes 32-bit elements and a 32-bit unsigned int - TODO confirm */
+        c.i[i] = (n < 32) ? (gmx_int32_t)((unsigned int)a.i[i] >> n) : 0;
+    }
+    return c;
+}
+
+/*! \brief Bitwise and of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \note You can \a not use this operation directly to select based on a boolean
+ * SIMD variable, since booleans are separate from integer SIMD. If that
+ * is what you need, have a look at \ref gmx_simd_blendzero_i instead.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a \& b (bitwise and)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_and_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane] & b.i[lane];
+    }
+    return res;
+}
+
+/*! \brief Bitwise not-and of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_andnot_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \note You can \a not use this operation directly to select based on a boolean
+ * SIMD variable, since booleans are separate from integer SIMD. If that
+ * is what you need, have a look at \ref gmx_simd_blendnotzero_i instead.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return (~a) \& b (bitwise andnot)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_andnot_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        /* Only the first operand is inverted */
+        res.i[lane] = (~a.i[lane]) & b.i[lane];
+    }
+    return res;
+}
+
+/*! \brief Bitwise or of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a \| b (bitwise or)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_or_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane] | b.i[lane];
+    }
+    return res;
+}
+
+/*! \brief Bitwise xor of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_xor_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a ^ b (bitwise xor)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_xor_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane] ^ b.i[lane];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) arithmetics
+ * \{
+ */
+/*! \brief Element-wise sum of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_add_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a+b
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_add_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane] + b.i[lane];
+    }
+    return res;
+}
+
+/*! \brief Element-wise difference of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_sub_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a-b
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_sub_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane] - b.i[lane];
+    }
+    return res;
+}
+
+/*! \brief Element-wise product of two integer SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_mul_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a factor1
+ * \param b factor2
+ * \return a*b.
+ *
+ * \note Only the low 32 bits are retained, so this can overflow.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_mul_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t  res;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; lane++)
+    {
+        res.i[lane] = a.i[lane]*b.i[lane];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) comparisons, boolean, selection
+ * \{
+ */
+
+/*! \brief Test two SIMD integers (corresponding to float) for equality.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmpeq_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a First SIMD integer
+ * \param b Second SIMD integer
+ * \return SIMD integer boolean that is true for the elements where a==b
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cmpeq_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fibool_t  isEqual;
+    int                lane;
+
+    /* Compare the two inputs one element at a time */
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        isEqual.b[lane] = (a.i[lane] == b.i[lane]);
+    }
+    return isEqual;
+}
+
+/*! \brief Test whether one SIMD integer (corresponding to float) is less than another.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmplt_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a First SIMD integer
+ * \param b Second SIMD integer
+ * \return SIMD integer boolean that is true for the elements where a<b
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cmplt_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fibool_t  isLess;
+    int                lane;
+
+    /* Compare the two inputs one element at a time */
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        isLess.b[lane] = (a.i[lane] < b.i[lane]);
+    }
+    return isLess;
+}
+
+/*! \brief Element-wise logical AND of two gmx_simd_fibool_t variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a First SIMD boolean
+ * \param b Second SIMD boolean
+ * \return True for the elements where both a and b are true.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_and_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
+{
+    gmx_simd_fibool_t  both;
+    int                lane;
+
+    /* Combine the two booleans one element at a time */
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        both.b[lane] = (a.b[lane] && b.b[lane]);
+    }
+    return both;
+}
+
+/*! \brief Element-wise logical OR of two gmx_simd_fibool_t variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a First SIMD boolean
+ * \param b Second SIMD boolean
+ * \return True for the elements where a, b, or both are true.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_or_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
+{
+    gmx_simd_fibool_t  either;
+    int                lane;
+
+    /* Combine the two booleans one element at a time */
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        either.b[lane] = (a.b[lane] || b.b[lane]);
+    }
+    return either;
+}
+
+/*! \brief Returns non-zero if any of the booleans in a is true, otherwise 0.
+ *
+ * You should typically call the real-precision \ref gmx_simd_anytrue_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * The actual return value for "any true" will depend on the architecture.
+ * Any non-zero value should be considered truth.
+ *
+ * \param a SIMD boolean
+ * \return Nonzero integer if any of the elements in a is true, otherwise 0.
+ */
+static gmx_inline int
+gmx_simd_anytrue_fib(gmx_simd_fibool_t a)
+{
+    int             found = 0;
+    int             lane;
+
+    /* Every element is inspected; the flag latches once a true one is seen */
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        if (a.b[lane])
+        {
+            found = 1;
+        }
+    }
+    return found;
+}
+
+/*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is true.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendzero_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer to select from
+ * \param sel Boolean selector
+ * \return Elements from a where sel is true, 0 otherwise.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* Integer zero: a floating-point literal here would make the whole
+         * conditional expression a double that is converted back on store.
+         */
+        c.i[i] = sel.b[i] ? a.i[i] : 0;
+    }
+    return c;
+}
+
+/*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is false.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendnotzero_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer to select from
+ * \param sel Boolean selector
+ * \return Elements from a where sel is false, 0 otherwise (sic).
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendnotzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* Integer zero: a floating-point literal here would make the whole
+         * conditional expression a double that is converted back on store.
+         */
+        c.i[i] = sel.b[i] ? 0 : a.i[i];
+    }
+    return c;
+}
+
+/*! \brief Blend two SIMD integer variables according to a boolean selector.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendv_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a Source used where sel is false
+ * \param b Source used where sel is true
+ * \param sel Boolean selector
+ * \return For each element, b if sel is true, a otherwise.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendv_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t  blended;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_FINT32_WIDTH; ++lane)
+    {
+        if (sel.b[lane])
+        {
+            blended.i[lane] = b.i[lane];
+        }
+        else
+        {
+            blended.i[lane] = a.i[lane];
+        }
+    }
+    return blended;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) bitwise logical operations
+ * \{
+ */
+
+/*! \brief Shift SIMD integers (corresponding to double) left by an immediate value.
+ *
+ * \copydetails gmx_simd_slli_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_slli_di(gmx_simd_dint32_t a, int n)
+{
+    gmx_simd_dint32_t  shifted;
+    int                lane;
+
+    /* Every element is shifted by the same count n */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        shifted.i[lane] = a.i[lane] << n;
+    }
+    return shifted;
+}
+
+/*! \brief Shift SIMD integers (corresponding to double) right by an immediate value.
+ *
+ * \copydetails gmx_simd_srli_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_srli_di(gmx_simd_dint32_t a, int n)
+{
+    gmx_simd_dint32_t  shifted;
+    int                lane;
+
+    /* Every element is shifted by the same count n.
+     * NOTE(review): if the element type is signed, >> on a negative value is
+     * implementation-defined in C - confirm this matches the intended shift.
+     */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        shifted.i[lane] = a.i[lane] >> n;
+    }
+    return shifted;
+}
+
+/*! \brief Bitwise and of two SIMD integers (corresponding to double).
+ *
+ * \copydetails gmx_simd_and_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_and_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  bits;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        bits.i[lane] = a.i[lane] & b.i[lane];
+    }
+    return bits;
+}
+
+/*! \brief Bitwise not-and of two SIMD integers (corresponding to double).
+ *
+ * \copydetails gmx_simd_andnot_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_andnot_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  bits;
+    int                lane;
+
+    /* Note the operand order: the first argument is the one inverted */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        bits.i[lane] = (~a.i[lane]) & b.i[lane];
+    }
+    return bits;
+}
+
+/*! \brief Bitwise or of two SIMD integers (corresponding to double).
+ *
+ * \copydetails gmx_simd_or_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_or_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  bits;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        bits.i[lane] = a.i[lane] | b.i[lane];
+    }
+    return bits;
+}
+
+/*! \brief Bitwise xor of two SIMD integers (corresponding to double).
+ *
+ * \copydetails gmx_simd_xor_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_xor_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  bits;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        bits.i[lane] = a.i[lane] ^ b.i[lane];
+    }
+    return bits;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) arithmetics
+ * \{
+ */
+/*! \brief Element-wise addition of SIMD integers corresponding to double precision.
+ *
+ * \copydetails gmx_simd_add_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_add_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  sum;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        sum.i[lane] = a.i[lane] + b.i[lane];
+    }
+    return sum;
+}
+
+/*! \brief Element-wise subtraction of SIMD integers corresponding to double precision.
+ *
+ * \copydetails gmx_simd_sub_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_sub_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  diff;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        diff.i[lane] = a.i[lane] - b.i[lane];
+    }
+    return diff;
+}
+
+/*! \brief Element-wise multiplication of SIMD integers corresponding to double precision.
+ *
+ * \copydetails gmx_simd_mul_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_mul_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t  prod;
+    int                lane;
+
+    /* Reference implementation: handle one SIMD element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        prod.i[lane] = a.i[lane] * b.i[lane];
+    }
+    return prod;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) comparisons, boolean selection
+ * \{
+ */
+
+/*! \brief Test two SIMD integers (corresponding to double) for equality.
+ *
+ * \copydetails gmx_simd_cmpeq_fi
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cmpeq_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dibool_t  isEqual;
+    int                lane;
+
+    /* Compare the two inputs one element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        isEqual.b[lane] = (a.i[lane] == b.i[lane]);
+    }
+    return isEqual;
+}
+
+/*! \brief Test whether one SIMD integer (corresponding to double) is less than another.
+ *
+ * \copydetails gmx_simd_cmplt_fi
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cmplt_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dibool_t  isLess;
+    int                lane;
+
+    /* Compare the two inputs one element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        isLess.b[lane] = (a.i[lane] < b.i[lane]);
+    }
+    return isLess;
+}
+
+/*! \brief Element-wise logical AND of two gmx_simd_dibool_t variables.
+ *
+ * \copydetails gmx_simd_and_fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_and_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
+{
+    gmx_simd_dibool_t  both;
+    int                lane;
+
+    /* Combine the two booleans one element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        both.b[lane] = (a.b[lane] && b.b[lane]);
+    }
+    return both;
+}
+
+/*! \brief Element-wise logical OR of two gmx_simd_dibool_t variables.
+ *
+ * \copydetails gmx_simd_or_fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_or_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
+{
+    gmx_simd_dibool_t  either;
+    int                lane;
+
+    /* Combine the two booleans one element at a time */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        either.b[lane] = (a.b[lane] || b.b[lane]);
+    }
+    return either;
+}
+
+/*! \brief Returns non-zero if any of the double-int SIMD booleans in a is true, otherwise 0.
+ *
+ * \copydetails gmx_simd_anytrue_fib
+ */
+static gmx_inline int
+gmx_simd_anytrue_dib(gmx_simd_dibool_t a)
+{
+    int             found = 0;
+    int             lane;
+
+    /* Every element is inspected; the flag latches once a true one is seen */
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        if (a.b[lane])
+        {
+            found = 1;
+        }
+    }
+    return found;
+}
+
+/*! \brief Select from SIMD ints (corresponding to double) where boolean is true.
+ *
+ * \copydetails gmx_simd_blendzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        /* Integer zero: a floating-point literal here would make the whole
+         * conditional expression a double that is converted back on store.
+         */
+        c.i[i] = sel.b[i] ? a.i[i] : 0;
+    }
+    return c;
+}
+
+/*! \brief Select from SIMD ints (corresponding to double) where boolean is false.
+ *
+ * \copydetails gmx_simd_blendnotzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendnotzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t  c;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        /* Integer zero: a floating-point literal here would make the whole
+         * conditional expression a double that is converted back on store.
+         */
+        c.i[i] = sel.b[i] ? 0 : a.i[i];
+    }
+    return c;
+}
+
+/*! \brief Blend two double-int SIMD variables according to a boolean selector.
+ *
+ * \copydetails gmx_simd_blendv_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendv_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t  blended;
+    int                lane;
+
+    for (lane = 0; lane < GMX_SIMD_DINT32_WIDTH; ++lane)
+    {
+        if (sel.b[lane])
+        {
+            blended.i[lane] = b.i[lane];
+        }
+        else
+        {
+            blended.i[lane] = a.i[lane];
+        }
+    }
+    return blended;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation conversion operations
+ * \{
+ */
+
+/*! \brief Round single precision floating point to integer.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_r2i.
+ *
+ * \param a SIMD floating-point
+ * \return SIMD integer, rounded to nearest integer.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_cvt_f2i(gmx_simd_float_t a)
+{
+    gmx_simd_fint32_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+#ifdef _MSC_VER
+        /* No roundf() on this path: offset by half away from zero and let
+         * the conversion to integer on assignment drop the fraction.
+         */
+        b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+        b.i[i] = roundf(a.r[i]);
+#endif
+    }
+    return b;
+}
+
+/*! \brief Truncate single precision floating point to integer.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvtt_r2i.
+ *
+ * \param a SIMD floating-point
+ * \return SIMD integer, truncated towards zero.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_cvtt_f2i(gmx_simd_float_t a)
+{
+    gmx_simd_fint32_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* The conversion to integer on assignment discards the fraction */
+        b.i[i] = a.r[i];
+    }
+    return b;
+}
+
+/*! \brief Convert integer to single precision floating-point.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_i2r.
+ *
+ * \param a SIMD integer
+ * \return SIMD floating-point
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_i2f(gmx_simd_fint32_t a)
+{
+    gmx_simd_float_t   b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        b.r[i] = a.i[i];
+    }
+    return b;
+}
+
+/*! \brief Round double precision floating point to integer.
+ *
+ * \copydetails gmx_simd_cvt_f2i
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_cvt_d2i(gmx_simd_double_t a)
+{
+    gmx_simd_dint32_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+#ifdef _MSC_VER
+        /* No round() on this path: offset by half away from zero and let
+         * the conversion to integer on assignment drop the fraction.
+         */
+        b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+        b.i[i] = round(a.r[i]);
+#endif
+    }
+    return b;
+}
+
+/*! \brief Truncate double precision floating point to integer.
+ *
+ * \copydetails gmx_simd_cvtt_f2i
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_cvtt_d2i(gmx_simd_double_t a)
+{
+    gmx_simd_dint32_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        /* The conversion to integer on assignment discards the fraction */
+        b.i[i] = a.r[i];
+    }
+    return b;
+}
+
+/*! \brief Convert integer to double precision floating-point.
+ *
+ * \copydetails gmx_simd_cvt_i2f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cvt_i2d(gmx_simd_dint32_t a)
+{
+    gmx_simd_double_t  b;
+    int                i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        b.r[i] = a.i[i];
+    }
+    return b;
+}
+
+/*! \brief Turn a float boolean into the corresponding integer boolean.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_b2ib.
+ *
+ * \param a Boolean corresponding to SIMD floating-point
+ * \return Boolean that can be applied to SIMD integer operations.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cvt_fb2fib(gmx_simd_fbool_t a)
+{
+    gmx_simd_fibool_t  intBool;
+    int                lane;
+
+    /* Integer width >= float width, so only the float-width elements are copied */
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; ++lane)
+    {
+        intBool.b[lane] = a.b[lane];
+    }
+    return intBool;
+}
+
+/*! \brief Turn an integer boolean (corresponding to float) into a float boolean.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_ib2b.
+ *
+ * \param a Boolean corresponding to SIMD integer
+ * \return Boolean that can be applied to SIMD floating-point.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cvt_fib2fb(gmx_simd_fibool_t a)
+{
+    gmx_simd_fbool_t  floatBool;
+    int               lane;
+
+    /* Integer width >= float width, so only the float-width elements are copied */
+    for (lane = 0; lane < GMX_SIMD_FLOAT_WIDTH; ++lane)
+    {
+        floatBool.b[lane] = a.b[lane];
+    }
+    return floatBool;
+}
+
+/*! \brief Turn a double boolean into the corresponding integer boolean.
+ *
+ * \copydetails gmx_simd_cvt_fb2fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib(gmx_simd_dbool_t a)
+{
+    gmx_simd_dibool_t  intBool;
+    int                lane;
+
+    /* Integer width >= double width, so only the double-width elements are copied */
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        intBool.b[lane] = a.b[lane];
+    }
+    return intBool;
+}
+
+/*! \brief Turn an integer boolean (corresponding to double) into a double boolean.
+ *
+ * \copydetails gmx_simd_cvt_fib2fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db(gmx_simd_dibool_t a)
+{
+    gmx_simd_dbool_t  doubleBool;
+    int               lane;
+
+    /* Integer width >= double width, so only the double-width elements are copied */
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        doubleBool.b[lane] = a.b[lane];
+    }
+    return doubleBool;
+}
+
+/*! \brief Widen a SIMD float to a SIMD double of the same width.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
+ * \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * When the widths differ, calling this routine ends in gmx_fatal().
+ *
+ * \param f Single-precision SIMD variable
+ * \return Double-precision SIMD variable of the same width
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cvt_f2d(gmx_simd_float_t f)
+{
+    gmx_simd_double_t widened;
+#if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
+    int               lane;
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        widened.r[lane] = f.r[lane];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_f2d() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
+    /* Touch both variables so the compiler does not warn about them */
+    widened.r[0] = f.r[0];
+#endif
+    return widened;
+}
+
+/*! \brief Narrow a SIMD double to a SIMD float of the same width.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
+ * \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * When the widths differ, calling this routine ends in gmx_fatal().
+ *
+ * \param d Double-precision SIMD variable
+ * \return Single-precision SIMD variable of the same width
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_d2f(gmx_simd_double_t d)
+{
+    gmx_simd_float_t narrowed;
+#if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
+    int              lane;
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        narrowed.r[lane] = d.r[lane];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_d2f() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
+    /* Touch both variables so the compiler does not warn about them */
+    narrowed.r[0] = d.r[0];
+#endif
+    return narrowed;
+}
+
+/*! \brief Split one SIMD float into two SIMD doubles.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
+ * as \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * When the width requirement is not met, calling this routine ends in gmx_fatal().
+ *
+ * \param f Single-precision SIMD variable
+ * \param[out] d0 Double-precision SIMD variable, first half of values from f.
+ * \param[out] d1 Double-precision SIMD variable, second half of values from f.
+ */
+static gmx_inline void
+gmx_simd_cvt_f2dd(gmx_simd_float_t f, gmx_simd_double_t *d0, gmx_simd_double_t *d1)
+{
+#if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
+    int lane;
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        /* Low half of f goes to d0, high half to d1 */
+        d0->r[lane] = f.r[lane];
+        d1->r[lane] = f.r[GMX_SIMD_DOUBLE_WIDTH+lane];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_f2dd() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
+    /* Touch every argument so the compiler does not warn about unused ones */
+    d0->r[0] = f.r[0];
+    d1->r[0] = f.r[0];
+#endif
+}
+
+/*! \brief Merge two SIMD doubles into one SIMD float.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
+ * as \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * When the width requirement is not met, calling this routine ends in gmx_fatal().
+ *
+ * \param d0 Double-precision SIMD variable, first half of values to put in f.
+ * \param d1 Double-precision SIMD variable, second half of values to put in f.
+ * \return Single-precision SIMD variable with all values.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_dd2f(gmx_simd_double_t d0, gmx_simd_double_t d1)
+{
+    gmx_simd_float_t merged;
+#if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
+    int              lane;
+    for (lane = 0; lane < GMX_SIMD_DOUBLE_WIDTH; ++lane)
+    {
+        /* d0 fills the low half of the result, d1 the high half */
+        merged.r[lane]                       = d0.r[lane];
+        merged.r[GMX_SIMD_DOUBLE_WIDTH+lane] = d1.r[lane];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_dd2f() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
+    /* Touch the arguments and the result so the compiler does not warn */
+    merged.r[0] = d0.r[0] + d1.r[0];
+#endif
+    return merged;
+}
+
+/*! \} */
+
+/*! \name SIMD4. Constant width-4 SIMD types and instructions
+ * \{
+ */
+
+#if (GMX_SIMD_FLOAT_WIDTH == 4) || (defined DOXYGEN)
+
+/* When the float SIMD width is 4, every SIMD4 float name below is simply an
+ * alias for the corresponding normal SIMD float type or routine.
+ */
+
+/*! \brief SIMD4 float type. Available with \ref GMX_SIMD4_HAVE_FLOAT.
+ *
+ * Unless you specifically want a single-precision type you should check
+ * \ref gmx_simd4_real_t instead.
+ *
+ * While the SIMD4 datatype is identical to the normal SIMD type in the
+ * reference implementation, this will often not be the case for
+ * other architectures.
+ */
+#    define gmx_simd4_float_t    gmx_simd_float_t
+
+/*! \brief Load SIMD4 float from aligned memory.
+ *  \copydetails gmx_simd_load_f
+ */
+#    define gmx_simd4_load_f     gmx_simd_load_f
+
+/*! \brief Set all elements of SIMD4 float from single pointer.
+ *  \copydetails gmx_simd_load1_f
+ */
+#    define gmx_simd4_load1_f    gmx_simd_load1_f
+
+/*! \brief Set all SIMD4 float elements to the value r.
+ *  \copydetails gmx_simd_set1_f
+ */
+#    define gmx_simd4_set1_f     gmx_simd_set1_f
+
+/*! \brief Store the contents of SIMD4 float pr to aligned memory m.
+ *  \copydetails gmx_simd_store_f
+ */
+#    define gmx_simd4_store_f    gmx_simd_store_f
+
+/*! \brief Load SIMD4 float from unaligned memory.
+ * \copydetails gmx_simd_loadu_f
+ */
+#    define gmx_simd4_loadu_f    gmx_simd_loadu_f
+
+/*! \brief Store SIMD4 float to unaligned memory.
+ * \copydetails gmx_simd_storeu_f
+ */
+#    define gmx_simd4_storeu_f   gmx_simd_storeu_f
+
+/*! \brief Set all SIMD4 float elements to 0.
+ * \copydetails gmx_simd_setzero_f
+ */
+#    define gmx_simd4_setzero_f  gmx_simd_setzero_f
+
+/*! \brief Bitwise and for two SIMD4 float variables.
+ * \copydetails gmx_simd_and_f
+ */
+#    define gmx_simd4_and_f      gmx_simd_and_f
+
+/*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
+ * \copydetails gmx_simd_andnot_f
+ */
+#    define gmx_simd4_andnot_f   gmx_simd_andnot_f
+
+/*! \brief Bitwise or for two SIMD4 float variables.
+ * \copydetails gmx_simd_or_f
+ */
+#    define gmx_simd4_or_f       gmx_simd_or_f
+
+/*! \brief Bitwise xor for two SIMD4 float variables.
+ * \copydetails gmx_simd_xor_f
+ */
+#    define gmx_simd4_xor_f      gmx_simd_xor_f
+
+/*! \brief Add two SIMD4 float variables.
+ * \copydetails gmx_simd_add_f
+ */
+#    define gmx_simd4_add_f      gmx_simd_add_f
+
+/*! \brief Subtract two SIMD4 float variables.
+ * \copydetails gmx_simd_sub_f
+ */
+#    define gmx_simd4_sub_f      gmx_simd_sub_f
+
+/*! \brief Multiply two SIMD4 float variables.
+ * \copydetails gmx_simd_mul_f
+ */
+#    define gmx_simd4_mul_f      gmx_simd_mul_f
+
+/*! \brief Fused-multiply-add for SIMD4 float. Result is a*b+c.
+ * \copydetails gmx_simd_fmadd_f
+ */
+#    define gmx_simd4_fmadd_f    gmx_simd_fmadd_f
+
+/*! \brief Fused-multiply-subtract for SIMD4 float. Result is a*b-c.
+ * \copydetails gmx_simd_fmsub_f
+ */
+#    define gmx_simd4_fmsub_f    gmx_simd_fmsub_f
+
+/*! \brief Fused-negated-multiply-add for SIMD4 float. Result is -a*b+c.
+ * \copydetails gmx_simd_fnmadd_f
+ */
+#    define gmx_simd4_fnmadd_f   gmx_simd_fnmadd_f
+
+/*! \brief Fused-negated-multiply-sub for SIMD4 float. Result is -a*b-c.
+ * \copydetails gmx_simd_fnmsub_f
+ */
+#    define gmx_simd4_fnmsub_f   gmx_simd_fnmsub_f
+
+/*! \brief Lookup of approximate 1/sqrt(x) for SIMD4 float.
+ * \copydetails gmx_simd_rsqrt_f
+ */
+#    define gmx_simd4_rsqrt_f    gmx_simd_rsqrt_f
+
+/*! \brief Floating-point absolute value for SIMD4 float.
+ * \copydetails gmx_simd_fabs_f
+ */
+#    define gmx_simd4_fabs_f     gmx_simd_fabs_f
+
+/*! \brief Floating-point negate for SIMD4 float.
+ * \copydetails gmx_simd_fneg_f
+ */
+#    define gmx_simd4_fneg_f     gmx_simd_fneg_f
+
+/*! \brief Set each SIMD4 float element to the largest from two variables.
+ * \copydetails gmx_simd_max_f
+ */
+#    define gmx_simd4_max_f      gmx_simd_max_f
+
+/*! \brief Set each SIMD4 float element to the smallest from two variables.
+ * \copydetails gmx_simd_min_f
+ */
+#    define gmx_simd4_min_f      gmx_simd_min_f
+
+/*! \brief Round to nearest integer value for SIMD4 float.
+ * \copydetails gmx_simd_round_f
+ */
+#    define gmx_simd4_round_f    gmx_simd_round_f
+
+/*! \brief Truncate SIMD4 float, i.e. round towards zero.
+ * \copydetails gmx_simd_trunc_f
+ */
+#    define gmx_simd4_trunc_f    gmx_simd_trunc_f
+
+/*! \brief Return dot product of two single precision SIMD4 variables.
+ *
+ * The dot product is calculated between the first three elements in the two
+ * vectors, while the fourth is ignored. The result is returned as a scalar.
+ *
+ * \param a vector1
+ * \param b vector2
+ * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    /* Only elements 0, 1 and 2 take part in the sum */
+    return a.r[0]*b.r[0]+a.r[1]*b.r[1]+a.r[2]*b.r[2];
+}
+
+/*! \brief SIMD4 variable type to use for logical comparisons on floats.
+ * \copydetails gmx_simd_fbool_t
+ */
+#    define gmx_simd4_fbool_t   gmx_simd_fbool_t
+
+/*! \brief Equality comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmpeq_f
+ */
+#    define gmx_simd4_cmpeq_f   gmx_simd_cmpeq_f
+
+/*! \brief Less-than comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmplt_f
+ */
+#    define gmx_simd4_cmplt_f   gmx_simd_cmplt_f
+
+/*! \brief Less-than-or-equal comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmple_f
+ */
+#    define gmx_simd4_cmple_f   gmx_simd_cmple_f
+
+/*! \brief Logical AND on float SIMD4 booleans.
+ * \copydetails gmx_simd_and_fb
+ */
+#    define gmx_simd4_and_fb gmx_simd_and_fb
+
+/*! \brief Logical OR on float SIMD4 booleans.
+ * \copydetails gmx_simd_or_fb
+ */
+#    define gmx_simd4_or_fb gmx_simd_or_fb
+
+/*! \brief Returns non-zero if any of the SIMD4 boolean in x is True.
+ * \copydetails gmx_simd_anytrue_fb
+ */
+#    define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb
+
+/*! \brief Select from single precision SIMD4 variable where boolean is true.
+ * \copydetails gmx_simd_blendzero_f
+ */
+#    define gmx_simd4_blendzero_f gmx_simd_blendzero_f
+
+/*! \brief Select from single precision SIMD4 variable where boolean is false.
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+#    define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f
+
+/*! \brief Vector-blend instruction for SIMD4 float.
+ * \copydetails gmx_simd_blendv_f
+ */
+#    define gmx_simd4_blendv_f  gmx_simd_blendv_f
+
+/*! \brief Return sum of all elements in SIMD4 float.
+ * \copydetails gmx_simd_reduce_f
+ */
+#    define gmx_simd4_reduce_f  gmx_simd_reduce_f
+
+#else /* GMX_SIMD_FLOAT_WIDTH!=4 */
+/* No width-4 float SIMD in this configuration, so withdraw the SIMD4 float flag */
+#    undef GMX_SIMD4_HAVE_FLOAT
+#endif
+
+
+#if (GMX_SIMD_DOUBLE_WIDTH == 4) || (defined DOXYGEN)
+
+/* When the double SIMD width is 4, every SIMD4 double name below is simply an
+ * alias for the corresponding normal SIMD double type or routine.
+ */
+
+/*! \brief SIMD4 double type. Available with \ref GMX_SIMD4_HAVE_DOUBLE.
+ *
+ * Unless you specifically want a double-precision type you should check
+ * \ref gmx_simd4_real_t instead.
+ *
+ * While the SIMD4 datatype is identical to the normal SIMD type in the
+ * reference implementation, this will often not be the case for
+ * other architectures.
+ */
+#    define gmx_simd4_double_t   gmx_simd_double_t
+
+/*! \brief Double precision SIMD4 load aligned.
+ * \copydetails gmx_simd_load_d
+ */
+#    define gmx_simd4_load_d     gmx_simd_load_d
+
+/*! \brief Double precision SIMD4 load single value to all elements.
+ * \copydetails gmx_simd_load1_d
+ */
+#    define gmx_simd4_load1_d    gmx_simd_load1_d
+
+/*! \brief Double precision SIMD4 set all elements from value.
+ * \copydetails gmx_simd_set1_d
+ */
+#    define gmx_simd4_set1_d     gmx_simd_set1_d
+
+/*! \brief Double precision SIMD4 store to aligned memory.
+ * \copydetails gmx_simd_store_d
+ */
+#    define gmx_simd4_store_d   gmx_simd_store_d
+
+/*! \brief Load unaligned SIMD4 double.
+ * \copydetails gmx_simd_loadu_d
+ */
+#    define gmx_simd4_loadu_d   gmx_simd_loadu_d
+
+/*! \brief Store unaligned SIMD4 double.
+ * \copydetails gmx_simd_storeu_d
+ */
+#    define gmx_simd4_storeu_d  gmx_simd_storeu_d
+
+/*! \brief Set all elements in SIMD4 double to 0.0.
+ * \copydetails gmx_simd_setzero_d
+ */
+#    define gmx_simd4_setzero_d gmx_simd_setzero_d
+
+/*! \brief Bitwise and for two SIMD4 double variables.
+ * \copydetails gmx_simd_and_d
+ */
+#    define gmx_simd4_and_d     gmx_simd_and_d
+
+/*! \brief Bitwise andnot for SIMD4 double. c=(~a) & b.
+ * \copydetails gmx_simd_andnot_d
+ */
+#    define gmx_simd4_andnot_d  gmx_simd_andnot_d
+
+/*! \brief Bitwise or for SIMD4 double.
+ * \copydetails gmx_simd_or_d
+ */
+#    define gmx_simd4_or_d      gmx_simd_or_d
+
+/*! \brief Bitwise xor for SIMD4 double.
+ * \copydetails gmx_simd_xor_d
+ */
+#    define gmx_simd4_xor_d     gmx_simd_xor_d
+
+/*! \brief Add two SIMD4 double values.
+ * \copydetails gmx_simd_add_d
+ */
+#    define gmx_simd4_add_d     gmx_simd_add_d
+
+/*! \brief Subtract two SIMD4 double values.
+ * \copydetails gmx_simd_sub_d
+ */
+#    define gmx_simd4_sub_d     gmx_simd_sub_d
+
+/*! \brief Multiply two SIMD4 double values.
+ * \copydetails gmx_simd_mul_d
+ */
+#    define gmx_simd4_mul_d     gmx_simd_mul_d
+
+/*! \brief Fused-multiply-add for SIMD4 double. Result is a*b+c.
+ * \copydetails gmx_simd_fmadd_d
+ */
+#    define gmx_simd4_fmadd_d   gmx_simd_fmadd_d
+
+/*! \brief Fused-multiply-subtract for SIMD4 double. Result is a*b-c.
+ * \copydetails gmx_simd_fmsub_d
+ */
+#    define gmx_simd4_fmsub_d   gmx_simd_fmsub_d
+
+/*! \brief Fused-negated-multiply-add for SIMD4 double. Result is -a*b+c.
+ * \copydetails gmx_simd_fnmadd_d
+ */
+#    define gmx_simd4_fnmadd_d  gmx_simd_fnmadd_d
+
+/*! \brief Fused-negated-multiply-sub for SIMD4 double. Result is -a*b-c.
+ * \copydetails gmx_simd_fnmsub_d
+ */
+#    define gmx_simd4_fnmsub_d  gmx_simd_fnmsub_d
+
+/*! \brief SIMD4 double 1.0/sqrt(x) lookup.
+ * \copydetails gmx_simd_rsqrt_d
+ */
+#    define gmx_simd4_rsqrt_d   gmx_simd_rsqrt_d
+
+/*! \brief SIMD4 double Floating-point fabs().
+ * \copydetails gmx_simd_fabs_d
+ */
+#    define gmx_simd4_fabs_d    gmx_simd_fabs_d
+
+/*! \brief SIMD4 double floating-point negate.
+ * \copydetails gmx_simd_fneg_d
+ */
+#    define gmx_simd4_fneg_d    gmx_simd_fneg_d
+
+/*! \brief Set each SIMD4 element to the largest from two variables.
+ * \copydetails gmx_simd_max_d
+ */
+#    define gmx_simd4_max_d     gmx_simd_max_d
+
+/*! \brief Set each SIMD4 element to the smallest from two variables.
+ * \copydetails gmx_simd_min_d
+ */
+#    define gmx_simd4_min_d     gmx_simd_min_d
+
+/*! \brief Round SIMD4 double to nearest integer value (in floating-point format).
+ * \copydetails gmx_simd_round_d
+ */
+#    define gmx_simd4_round_d   gmx_simd_round_d
+
+/*! \brief Truncate SIMD4 double, i.e. round towards zero.
+ * \copydetails gmx_simd_trunc_d
+ */
+#    define gmx_simd4_trunc_d   gmx_simd_trunc_d
+
+/*! \brief Return dot product of two double precision SIMD4 variables.
+ * \copydetails gmx_simd4_dotproduct3_f
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    /* Only elements 0, 1 and 2 take part in the sum */
+    return a.r[0]*b.r[0]+a.r[1]*b.r[1]+a.r[2]*b.r[2];
+}
+
+/*! \brief SIMD4 variable type to use for logical comparisons on doubles.
+ * \copydetails gmx_simd_dbool_t
+ */
+#    define gmx_simd4_dbool_t   gmx_simd_dbool_t
+
+/*! \brief Equality comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmpeq_d
+ */
+#    define gmx_simd4_cmpeq_d   gmx_simd_cmpeq_d
+
+/*! \brief Less-than comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmplt_d
+ */
+#    define gmx_simd4_cmplt_d   gmx_simd_cmplt_d
+
+/*! \brief Less-than-or-equal comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmple_d
+ */
+#    define gmx_simd4_cmple_d   gmx_simd_cmple_d
+
+/*! \brief Logical AND on double SIMD4 booleans.
+ * \copydetails gmx_simd_and_db
+ */
+#    define gmx_simd4_and_db gmx_simd_and_db
+
+/*! \brief Logical OR on double SIMD4 booleans.
+ * \copydetails gmx_simd_or_db
+ */
+#    define gmx_simd4_or_db gmx_simd_or_db
+
+/*! \brief Returns non-zero if any of the SIMD4 booleans in x is True.
+ * \copydetails gmx_simd_anytrue_db
+ */
+#    define gmx_simd4_anytrue_db gmx_simd_anytrue_db
+
+/*! \brief Select from double precision SIMD4 variable where boolean is true.
+ * \copydetails gmx_simd_blendzero_d
+ */
+#    define gmx_simd4_blendzero_d gmx_simd_blendzero_d
+
+/*! \brief Select from double precision SIMD4 variable where boolean is false.
+ * \copydetails gmx_simd_blendnotzero_d
+ */
+#    define gmx_simd4_blendnotzero_d gmx_simd_blendnotzero_d
+
+/*! \brief Vector-blend instruction for SIMD4 double.
+ * \copydetails gmx_simd_blendv_d
+ */
+#    define gmx_simd4_blendv_d  gmx_simd_blendv_d
+
+/*! \brief Return sum of all elements in SIMD4 double.
+ * \copydetails gmx_simd_reduce_d
+ */
+#    define gmx_simd4_reduce_d  gmx_simd_reduce_d
+
+#else /* GMX_SIMD_DOUBLE_WIDTH!=4 */
+/* No width-4 double SIMD in this configuration, so withdraw the SIMD4 double flag */
+#    undef GMX_SIMD4_HAVE_DOUBLE
+#endif
+
+/*! \} */
+
+
+/*! \brief Return 1 if SIMD floating-point ops have overflowed, and reset check.
+ *
+ * This function checks whether SIMD operations have resulted in overflow,
+ * and returns 1 if it occurred, 0 otherwise.
+ * For now, this is unfortunately a dummy for all architectures except x86.
+ * This reference implementation always returns 0.
+ *
+ * \return 1 if an overflow was detected, 0 otherwise.
+ */
+static gmx_inline int
+gmx_simd_check_and_reset_overflow(void)
+{
+    return 0;
+}
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_IMPL_REFERENCE_H */
diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h

new file mode 100644 (file)

index 0000000..1b995e0
--- /dev/null
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h
@@ -0,0 +1,205 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX2_256_H
+#define GMX_SIMD_IMPL_X86_AVX2_256_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* x86 256-bit AVX2 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for details
+ */
+
+/* Inherit parts of AVX2_256 from AVX_256: everything defined by the AVX_256
+ * header stays in effect unless it is explicitly #undef'd and redefined below.
+ */
+#include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h"
+/* Increment over AVX_256 capabilities */
+#define GMX_SIMD_X86_AVX2_256_OR_HIGHER
+
+/* Override some capability definitions for things added in AVX2.
+ * The AVX_256 header #undef's all three of these; they are switched on here
+ * because the macros further down provide the corresponding operations.
+ */
+#define GMX_SIMD_HAVE_FMA
+#define GMX_SIMD_HAVE_FINT32_LOGICAL     /* AVX2 adds 256-bit integer shifts */
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX2 adds 256-bit integer +,-,*  */
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+/* Each #undef/#define pair below replaces a definition inherited from the
+ * AVX_256 header. The fused multiply-add family maps directly onto the
+ * 256-bit FMA intrinsics instead of the separate multiply and add/subtract
+ * used by the AVX_256 versions.
+ */
+#undef  gmx_simd_fmadd_f
+#define gmx_simd_fmadd_f           _mm256_fmadd_ps
+#undef  gmx_simd_fmsub_f
+#define gmx_simd_fmsub_f           _mm256_fmsub_ps
+#undef  gmx_simd_fnmadd_f
+#define gmx_simd_fnmadd_f          _mm256_fnmadd_ps
+#undef  gmx_simd_fnmsub_f
+#define gmx_simd_fnmsub_f          _mm256_fnmsub_ps
+/* Exponent helpers are redirected to the AVX2 helper functions defined
+ * later in this file, which use 256-bit integer shifts and arithmetic.
+ */
+#undef  gmx_simd_get_exponent_f
+#define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_avx2_256
+#undef  gmx_simd_set_exponent_f
+#define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_avx2_256
+/* Previously undefined logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi           _mm256_slli_epi32
+#define gmx_simd_srli_fi           _mm256_srli_epi32
+#define gmx_simd_and_fi            _mm256_and_si256
+#define gmx_simd_andnot_fi         _mm256_andnot_si256
+#define gmx_simd_or_fi             _mm256_or_si256
+#define gmx_simd_xor_fi            _mm256_xor_si256
+/* Previously undefined arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi            _mm256_add_epi32
+#define gmx_simd_sub_fi            _mm256_sub_epi32
+#define gmx_simd_mul_fi            _mm256_mullo_epi32
+/* Previously undefined boolean ops on gmx_simd_fint32_t */
+#define gmx_simd_cmpeq_fi          _mm256_cmpeq_epi32
+/* a < b is expressed as b > a, using the greater-than compare intrinsic */
+#define gmx_simd_cmplt_fi(a, b)     _mm256_cmpgt_epi32(b, a)
+#define gmx_simd_and_fib           _mm256_and_si256
+#define gmx_simd_or_fib            _mm256_or_si256
+#define gmx_simd_anytrue_fib       _mm256_movemask_epi8
+#define gmx_simd_blendzero_fi      _mm256_and_si256
+/* Note the swapped argument order: the selector is passed first to andnot */
+#define gmx_simd_blendnotzero_fi(a, sel) _mm256_andnot_si256(sel, a)
+#define gmx_simd_blendv_fi         _mm256_blendv_epi8
+
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+/* Replace the multiply+add emulation inherited from the AVX_256 header with
+ * the 256-bit double precision FMA intrinsics.
+ */
+#undef  gmx_simd_fmadd_d
+#define gmx_simd_fmadd_d           _mm256_fmadd_pd
+#undef  gmx_simd_fmsub_d
+#define gmx_simd_fmsub_d           _mm256_fmsub_pd
+#undef  gmx_simd_fnmadd_d
+#define gmx_simd_fnmadd_d          _mm256_fnmadd_pd
+#undef  gmx_simd_fnmsub_d
+#define gmx_simd_fnmsub_d          _mm256_fnmsub_pd
+/* Exponent handling and boolean conversions are redirected to the AVX2
+ * helper functions defined later in this file.
+ */
+#undef  gmx_simd_get_exponent_d
+#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_avx2_256
+#undef  gmx_simd_set_exponent_d
+#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_avx2_256
+#undef  gmx_simd_cvt_db2dib
+#define gmx_simd_cvt_db2dib        gmx_simd_cvt_db2dib_avx2_256
+#undef  gmx_simd_cvt_dib2db
+#define gmx_simd_cvt_dib2db        gmx_simd_cvt_dib2db_avx2_256
+
+/****************************************************
+ *      SIMD4 SINGLE PRECISION IMPLEMENTATION       *
+ ****************************************************/
+/* SIMD4 float is a 128-bit type in the AVX_256 header (__m128), so the
+ * 128-bit FMA intrinsics are used here rather than the 256-bit ones.
+ */
+#undef  gmx_simd4_fmadd_f
+#define gmx_simd4_fmadd_f          _mm_fmadd_ps
+#undef  gmx_simd4_fmsub_f
+#define gmx_simd4_fmsub_f          _mm_fmsub_ps
+#undef  gmx_simd4_fnmadd_f
+#define gmx_simd4_fnmadd_f         _mm_fnmadd_ps
+#undef  gmx_simd4_fnmsub_f
+#define gmx_simd4_fnmsub_f         _mm_fnmsub_ps
+
+/* No need to update SIMD4 double, since those instructions
+ * are aliased to the general SIMD double instructions above.
+ */
+
+/*********************************************************
+ * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased binary exponent of every element of x, converted to
+ * floating point. The exponent field is isolated with the mask 0x7F800000,
+ * shifted down by 23 bits, and the bias of 127 is subtracted using 256-bit
+ * integer operations.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
+{
+    const __m256i bias       = _mm256_set1_epi32(127);
+    const __m256  fieldMask  = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
+    /* Keep only the exponent bits; the sign and mantissa bits are cleared */
+    const __m256  expBits    = _mm256_and_ps(x, fieldMask);
+    /* Move the exponent field down to the least significant bits */
+    const __m256i biasedExp  = _mm256_srli_epi32(_mm256_castps_si256(expBits), 23);
+
+    return _mm256_cvtepi32_ps(_mm256_sub_epi32(biasedExp, bias));
+}
+
+/* Build a floating-point value whose exponent field is taken from x.
+ * Each element of x is converted to a 32-bit integer, the bias of 127 is
+ * added, and the sum is shifted left by 23 bits into the exponent position.
+ * The resulting bit pattern is returned reinterpreted as float.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
+{
+    const __m256i  bias         = _mm256_set1_epi32(127);
+    const __m256i  unbiasedExp  = _mm256_cvtps_epi32(x);
+    const __m256i  biasedExp    = _mm256_add_epi32(unbiasedExp, bias);
+
+    return _mm256_castsi256_ps(_mm256_slli_epi32(biasedExp, 23));
+}
+
+/*********************************************************
+ * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased binary exponent of every element of x, converted to
+ * double precision. The exponent field is isolated with the mask
+ * 0x7FF0000000000000, shifted down by 52 bits, and the bias of 1023 is
+ * subtracted as 64-bit integers. The four 64-bit results are then narrowed
+ * to four 32-bit integers in one 128-bit register before the conversion.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
+{
+    const __m256i  bias         = _mm256_set1_epi64x(1023LL);
+    const __m256d  fieldMask    = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
+    __m256i        exp64;
+    __m128i        lowerLane;
+    __m128i        upperLane;
+
+    /* Isolate the exponent bits and move them to the bottom of each 64-bit element */
+    exp64 = _mm256_srli_epi64(_mm256_castpd_si256(_mm256_and_pd(x, fieldMask)), 52);
+    exp64 = _mm256_sub_epi64(exp64, bias);
+    /* Within each 128-bit lane, gather the low 32 bits of both 64-bit
+     * results into the lower 64 bits of that lane.
+     */
+    exp64 = _mm256_shuffle_epi32(exp64, _MM_SHUFFLE(3, 1, 2, 0));
+
+    /* Join the lower 64 bits of the two lanes into a single 128-bit register */
+    lowerLane = _mm256_castsi256_si128(exp64);
+    upperLane = _mm256_extractf128_si256(exp64, 1);
+    return _mm256_cvtepi32_pd(_mm_unpacklo_epi64(lowerLane, upperLane));
+}
+
+/* Build a double precision value whose exponent field is taken from x.
+ * Each element of x is converted to a 32-bit integer and widened to 64 bits,
+ * the bias of 1023 is added, and the sum is shifted left by 52 bits into the
+ * exponent position. The bit pattern is returned reinterpreted as double.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
+{
+    const __m256i  bias         = _mm256_set1_epi64x(1023LL);
+    const __m128i  exp32        = _mm256_cvtpd_epi32(x);
+    const __m256i  exp64        = _mm256_cvtepi32_epi64(exp32);
+    const __m256i  biasedExp    = _mm256_add_epi64(exp64, bias);
+
+    return _mm256_castsi256_pd(_mm256_slli_epi64(biasedExp, 52));
+}
+
+/* Convert a double precision boolean (four 64-bit elements in a 256-bit
+ * register) into the corresponding integer boolean (four 32-bit elements in
+ * a 128-bit register). The two 128-bit halves are packed with signed
+ * saturation from 32-bit to 16-bit, so each pair of adjacent 16-bit results
+ * forms one 32-bit output element.
+ * NOTE(review): this assumes each input element is either all ones or all
+ * zeros, as produced by the comparison macros - confirm against callers.
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
+{
+    const __m256i bits      = _mm256_castpd_si256(a);
+    const __m128i lowerHalf = _mm256_castsi256_si128(bits);
+    const __m128i upperHalf = _mm256_extractf128_si256(bits, 0x1);
+
+    return _mm_packs_epi32(lowerHalf, upperHalf);
+}
+
+/* Convert an integer boolean (four 32-bit elements in a 128-bit register)
+ * into a double precision boolean (four 64-bit elements in a 256-bit
+ * register). Each 32-bit element is duplicated by interleaving the input
+ * with itself, so it fills a whole 64-bit element of the result.
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db_avx2_256(gmx_simd_dibool_t ia)
+{
+    /* Elements 0 and 1, each doubled to 64 bits */
+    const __m128i widenedLo = _mm_unpacklo_epi32(ia, ia);
+    /* Elements 2 and 3, each doubled to 64 bits */
+    const __m128i widenedHi = _mm_unpackhi_epi32(ia, ia);
+    const __m256d lowerOnly = _mm256_castpd128_pd256(_mm_castsi128_pd(widenedLo));
+
+    return _mm256_insertf128_pd(lowerOnly, _mm_castsi128_pd(widenedHi), 0x1);
+}
+
+#endif /* GMX_SIMD_IMPL_X86_AVX2_256_H */
diff --git a/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h b/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h

new file mode 100644 (file)

index 0000000..7d66d00
--- /dev/null
+++ b/src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h
@@ -0,0 +1,158 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX_128_FMA_H
+#define GMX_SIMD_IMPL_X86_AVX_128_FMA_H
+
+#include <math.h>
+#include <immintrin.h>
+#include <x86intrin.h>
+
+/* x86 128-bit AVX with FMA SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for details
+ */
+
+/* Inherit parts of AVX_128_FMA from SSE4.1: definitions from the included
+ * header stay in effect unless they are explicitly replaced below.
+ */
+#include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h"
+/* Increment over SSE4.1 capabilities */
+#define GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
+
+/* Override some capability definitions for things added in AVX over SSE4.1.
+ * The SIMD4 double support announced here is provided by the 256-bit
+ * macros and helper functions further down in this file.
+ */
+#define GMX_SIMD_HAVE_FMA
+#define GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD4_HAVE_DOUBLE  /* We can use 256-bit operations for this */
+
+/* SINGLE
+ *
+ * Replace the fused multiply-add family and the fraction operation with the
+ * single-instruction forms from <x86intrin.h>. The macro names carry the _f
+ * suffix used by the rest of the SIMD API (e.g. gmx_simd_fmadd_f); overriding
+ * any other name would leave the inherited definitions in place, presumably
+ * those from the SSE4.1 header included above, and the FMA instructions
+ * would never be used despite GMX_SIMD_HAVE_FMA being defined.
+ *
+ * All of these are plain aliases so the arguments are forwarded unchanged:
+ *   fmadd  = a*b+c,  fmsub  = a*b-c,  fnmadd = -a*b+c,  fnmsub = -a*b-c
+ */
+#undef  gmx_simd_fmadd_f
+#define gmx_simd_fmadd_f            _mm_macc_ps
+#undef  gmx_simd_fmsub_f
+#define gmx_simd_fmsub_f            _mm_msub_ps
+#undef  gmx_simd_fnmadd_f
+#define gmx_simd_fnmadd_f           _mm_nmacc_ps
+#undef  gmx_simd_fnmsub_f
+#define gmx_simd_fnmsub_f           _mm_nmsub_ps
+#undef  gmx_simd_fraction_f
+#define gmx_simd_fraction_f         _mm_frcz_ps
+
+/* DOUBLE
+ *
+ * Same overrides as for single precision, using the _d suffix of the SIMD API.
+ */
+#undef  gmx_simd_fmadd_d
+#define gmx_simd_fmadd_d            _mm_macc_pd
+#undef  gmx_simd_fmsub_d
+#define gmx_simd_fmsub_d            _mm_msub_pd
+#undef  gmx_simd_fnmadd_d
+#define gmx_simd_fnmadd_d           _mm_nmacc_pd
+#undef  gmx_simd_fnmsub_d
+#define gmx_simd_fnmsub_d           _mm_nmsub_pd
+#undef  gmx_simd_fraction_d
+#define gmx_simd_fraction_d         _mm_frcz_pd
+
+/* Even if the _main_ SIMD implementation for this architecture file corresponds
+ * to 128-bit AVX (since it will be faster), the 256-bit operations will always
+ * be available in AVX, so we can use them for double precision SIMD4!
+ */
+/* SIMD4 Double precision floating point.
+ * Four doubles are held in one 256-bit register (__m256d), and the
+ * operations map onto 256-bit intrinsics.
+ */
+#define gmx_simd4_double_t               __m256d
+#define gmx_simd4_load_d                 _mm256_load_pd
+#define gmx_simd4_load1_d                _mm256_broadcast_sd
+#define gmx_simd4_set1_d                 _mm256_set1_pd
+#define gmx_simd4_store_d                _mm256_store_pd
+#define gmx_simd4_loadu_d                _mm256_loadu_pd
+#define gmx_simd4_storeu_d               _mm256_storeu_pd
+#define gmx_simd4_setzero_d              _mm256_setzero_pd
+#define gmx_simd4_add_d                  _mm256_add_pd
+#define gmx_simd4_sub_d                  _mm256_sub_pd
+#define gmx_simd4_mul_d                  _mm256_mul_pd
+#define gmx_simd4_fmadd_d                _mm256_macc_pd
+#define gmx_simd4_fmsub_d                _mm256_msub_pd
+#define gmx_simd4_fnmadd_d               _mm256_nmacc_pd
+#define gmx_simd4_fnmsub_d               _mm256_nmsub_pd
+#define gmx_simd4_and_d                  _mm256_and_pd
+#define gmx_simd4_andnot_d               _mm256_andnot_pd
+#define gmx_simd4_or_d                   _mm256_or_pd
+#define gmx_simd4_xor_d                  _mm256_xor_pd
+/* The reciprocal square root estimate goes through single precision:
+ * convert to float, use the 128-bit float rsqrt, and convert back.
+ */
+#define gmx_simd4_rsqrt_d(x)             _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
+/* -0.0 has only the sign bit set: andnot clears the sign, xor flips it */
+#define gmx_simd4_fabs_d(x)              _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
+#define gmx_simd4_fneg_d(x)              _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
+#define gmx_simd4_max_d                  _mm256_max_pd
+#define gmx_simd4_min_d                  _mm256_min_pd
+#define gmx_simd4_round_d(x)             _mm256_round_pd(x, _MM_FROUND_NINT)
+#define gmx_simd4_trunc_d(x)             _mm256_round_pd(x, _MM_FROUND_TRUNC)
+#define gmx_simd4_dotproduct3_d          gmx_simd4_dotproduct3_d_avx_128_fma
+/* SIMD4 booleans corresponding to double */
+#define gmx_simd4_dbool_t                __m256d
+#define gmx_simd4_cmpeq_d(a, b)           _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_d(a, b)           _mm256_cmp_pd(a, b, _CMP_LT_OQ)
+#define gmx_simd4_cmple_d(a, b)           _mm256_cmp_pd(a, b, _CMP_LE_OQ)
+#define gmx_simd4_and_db                 _mm256_and_pd
+#define gmx_simd4_or_db                  _mm256_or_pd
+#define gmx_simd4_anytrue_db             _mm256_movemask_pd
+#define gmx_simd4_blendzero_d            _mm256_and_pd
+/* Note the swapped argument order: the selector is passed first to andnot */
+#define gmx_simd4_blendnotzero_d(a, sel)  _mm256_andnot_pd(sel, a)
+#define gmx_simd4_blendv_d               _mm256_blendv_pd
+#define gmx_simd4_reduce_d               gmx_simd4_reduce_d_avx_128_fma
+/* SIMD4 float/double conversion */
+#define gmx_simd4_cvt_f2d                _mm256_cvtps_pd
+#define gmx_simd4_cvt_d2f                _mm256_cvtpd_ps
+
+/* Return the sum of the four double precision elements in a.
+ * A horizontal add first forms the pairwise sums inside each 128-bit half;
+ * the two halves are then added and the lowest element is returned.
+ */
+static gmx_inline double
+gmx_simd4_reduce_d_avx_128_fma(__m256d a)
+{
+    /* Each 128-bit half now holds the sum of its own two elements */
+    const __m256d pairSums  = _mm256_hadd_pd(a, a);
+    const __m128d lowerHalf = _mm256_castpd256_pd128(pairSums);
+    const __m128d upperHalf = _mm256_extractf128_pd(pairSums, 0x1);
+    double        total;
+
+    _mm_store_sd(&total, _mm_add_sd(lowerHalf, upperHalf));
+    return total;
+}
+
+/* Return the dot product of the first three elements of a and b.
+ * The fourth element of the product is computed but never reaches the
+ * lowest position, which is the only one stored to the result.
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d_avx_128_fma(__m256d a, __m256d b)
+{
+    const __m256d products  = _mm256_mul_pd(a, b);
+    /* Products of elements 0 and 1 */
+    const __m128d lowerPair = _mm256_castpd256_pd128(products);
+    /* Products of elements 2 and 3 */
+    const __m128d upperPair = _mm256_extractf128_pd(products, 0x1);
+    /* Lower pair with its two elements swapped, so the add sums 0 and 1 */
+    const __m128d swapped   = _mm_permute_pd(lowerPair, _MM_SHUFFLE2(0, 1));
+    double        result;
+
+    _mm_store_sd(&result, _mm_add_pd(_mm_add_pd(lowerPair, swapped), upperPair));
+    return result;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_AVX_128_FMA_H */
diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h

new file mode 100644 (file)

index 0000000..1b08ec2
--- /dev/null
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h
@@ -0,0 +1,565 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX_256_H
+#define GMX_SIMD_IMPL_X86_AVX_256_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* It is cleaner to start the AVX implementation from scratch rather than
+ * first inheriting from SSE4.1, which in turn inherits from SSE2. However,
+ * the capabilities still form a superset, so the lower-level "OR_HIGHER"
+ * markers are defined here as well.
+ */
+#define GMX_SIMD_X86_SSE2_OR_HIGHER
+#define GMX_SIMD_X86_SSE4_1_OR_HIGHER
+#define GMX_SIMD_X86_AVX_256_OR_HIGHER
+
+
+/* x86 256-bit AVX SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Capability definitions for 256-bit AVX - no inheritance from SSE.
+ * Capabilities that are not available are explicitly #undef'd; the AVX2
+ * implementation, which includes this header, defines some of them again.
+ */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_SIMD_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#undef  GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT     /* Emulated */
+#undef  GMX_SIMD_HAVE_FINT32_LOGICAL     /* AVX1 cannot do 256-bit int shifts */
+#undef  GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX1 cannot do 256-bit int +,-,*  */
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT     /* Native, dint uses 128-bit SIMD    */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details.
+ * The widths are element counts: the float and float-integer types use
+ * 256-bit registers (8 x 32 bits), the double type uses a 256-bit register
+ * (4 x 64 bits), and the double-integer type uses a 128-bit register
+ * (4 x 32 bits).
+ * NOTE(review): the *_BITS values presumably give the accuracy of the
+ * approximate rsqrt/rcp instructions - confirm against simd.h.
+ */
+#define GMX_SIMD_FLOAT_WIDTH         8
+#define GMX_SIMD_DOUBLE_WIDTH        4
+#define GMX_SIMD_FINT32_WIDTH        8
+#define GMX_SIMD_DINT32_WIDTH        4
+#define GMX_SIMD_RSQRT_BITS         11
+#define GMX_SIMD_RCP_BITS           11
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t           __m256
+#define gmx_simd_load_f            _mm256_load_ps
+#define gmx_simd_load1_f           _mm256_broadcast_ss
+#define gmx_simd_set1_f            _mm256_set1_ps
+#define gmx_simd_store_f           _mm256_store_ps
+#define gmx_simd_loadu_f           _mm256_loadu_ps
+#define gmx_simd_storeu_f          _mm256_storeu_ps
+#define gmx_simd_setzero_f         _mm256_setzero_ps
+#define gmx_simd_add_f             _mm256_add_ps
+#define gmx_simd_sub_f             _mm256_sub_ps
+#define gmx_simd_mul_f             _mm256_mul_ps
+/* GMX_SIMD_HAVE_FMA is not defined for this implementation, so the fused
+ * multiply-add family is emulated with a separate multiply and add/subtract:
+ *   fmadd  = a*b+c,  fmsub  = a*b-c,  fnmadd = c-a*b,  fnmsub = 0-(a*b+c)
+ */
+#define gmx_simd_fmadd_f(a, b, c)    _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define gmx_simd_fmsub_f(a, b, c)    _mm256_sub_ps(_mm256_mul_ps(a, b), c)
+#define gmx_simd_fnmadd_f(a, b, c)   _mm256_sub_ps(c, _mm256_mul_ps(a, b))
+#define gmx_simd_fnmsub_f(a, b, c)   _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
+#define gmx_simd_and_f             _mm256_and_ps
+#define gmx_simd_andnot_f          _mm256_andnot_ps
+#define gmx_simd_or_f              _mm256_or_ps
+#define gmx_simd_xor_f             _mm256_xor_ps
+#define gmx_simd_rsqrt_f           _mm256_rsqrt_ps
+#define gmx_simd_rcp_f             _mm256_rcp_ps
+/* -0.0 has only the sign bit set: andnot clears the sign, xor flips it */
+#define gmx_simd_fabs_f(x)         _mm256_andnot_ps(_mm256_set1_ps(-0.0), x)
+#define gmx_simd_fneg_f(x)         _mm256_xor_ps(x, _mm256_set1_ps(-0.0))
+#define gmx_simd_max_f             _mm256_max_ps
+#define gmx_simd_min_f             _mm256_min_ps
+#define gmx_simd_round_f(x)        _mm256_round_ps(x, _MM_FROUND_NINT)
+#define gmx_simd_trunc_f(x)        _mm256_round_ps(x, _MM_FROUND_TRUNC)
+/* Fraction is emulated as x minus its truncated value */
+#define gmx_simd_fraction_f(x)     _mm256_sub_ps(x, gmx_simd_trunc_f(x))
+/* Exponent/mantissa handling is done by helper functions in this file */
+#define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_avx_256
+#define gmx_simd_get_mantissa_f    gmx_simd_get_mantissa_f_avx_256
+#define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_avx_256
+/* integer datatype corresponding to float: gmx_simd_fint32_t
+ *
+ * The load/store macros go through the float load/store intrinsics and
+ * reinterpret the register. The pointer argument is parenthesized before it
+ * is cast, so the cast applies to the whole argument expression rather than
+ * only to its first operand.
+ */
+#define gmx_simd_fint32_t          __m256i
+#define gmx_simd_load_fi(m)        _mm256_castps_si256(_mm256_load_ps((const float *)(m)))
+#define gmx_simd_set1_fi           _mm256_set1_epi32
+#define gmx_simd_store_fi(m, x)     _mm256_store_ps((float *)(m), _mm256_castsi256_ps(x))
+#define gmx_simd_loadu_fi(m)       _mm256_castps_si256(_mm256_loadu_ps((const float *)(m)))
+#define gmx_simd_storeu_fi(m, x)    _mm256_storeu_ps((float *)(m), _mm256_castsi256_ps(x))
+#define gmx_simd_setzero_fi        _mm256_setzero_si256
+#define gmx_simd_cvt_f2i           _mm256_cvtps_epi32
+#define gmx_simd_cvtt_f2i          _mm256_cvttps_epi32
+#define gmx_simd_cvt_i2f           _mm256_cvtepi32_ps
+/* Emulated extract: select the 128-bit half with i>>2, then the element
+ * inside that half with i&3.
+ */
+#define gmx_simd_extract_fi(x, i)   _mm_extract_epi32(_mm256_extractf128_si256(x, (i)>>2), (i)&0x3)
+/* Integer logical ops on gmx_simd_fint32_t */
+/* gmx_simd_slli_fi not supported    */
+/* gmx_simd_srli_fi not supported    */
+/* gmx_simd_and_fi not supported     */
+/* gmx_simd_andnot_fi not supported  */
+/* gmx_simd_or_fi not supported      */
+/* gmx_simd_xor_fi not supported     */
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+/* gmx_simd_add_fi not supported     */
+/* gmx_simd_sub_fi not supported     */
+/* gmx_simd_mul_fi not supported     */
+/* Boolean & comparison operations on gmx_simd_float_t.
+ * Booleans use the same register type as the data, so and/or/blendzero
+ * are plain bitwise operations on the register.
+ */
+#define gmx_simd_fbool_t           __m256
+#define gmx_simd_cmpeq_f(a, b)      _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LT_OQ)
+#define gmx_simd_cmple_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LE_OQ)
+#define gmx_simd_and_fb            _mm256_and_ps
+#define gmx_simd_or_fb             _mm256_or_ps
+#define gmx_simd_anytrue_fb        _mm256_movemask_ps
+#define gmx_simd_blendzero_f       _mm256_and_ps
+/* Note the swapped argument order: the selector is passed first to andnot */
+#define gmx_simd_blendnotzero_f(a, sel)  _mm256_andnot_ps(sel, a)
+#define gmx_simd_blendv_f          _mm256_blendv_ps
+#define gmx_simd_reduce_f          gmx_simd_reduce_f_avx_256
+/* Boolean & comparison operations on gmx_simd_fint32_t.
+ * Only the type is defined here; none of the operations are provided.
+ */
+#define gmx_simd_fibool_t          __m256i
+/* gmx_simd_cmpeq_fi not supported        */
+/* gmx_simd_cmplt_fi not supported        */
+/* gmx_simd_and_fib not supported         */
+/* gmx_simd_or_fib not supported          */
+/* gmx_simd_anytrue_fib not supported     */
+/* gmx_simd_blendzero_fi not supported    */
+/* gmx_simd_blendnotzero_fi not supported    */
+/* gmx_simd_blendv_fi not supported       */
+/* Conversions between different booleans: pure reinterpreting casts */
+#define gmx_simd_cvt_fb2fib        _mm256_castps_si256
+#define gmx_simd_cvt_fib2fb        _mm256_castsi256_ps
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_double_t          __m256d
+#define gmx_simd_load_d            _mm256_load_pd
+#define gmx_simd_load1_d           _mm256_broadcast_sd
+#define gmx_simd_set1_d            _mm256_set1_pd
+#define gmx_simd_store_d           _mm256_store_pd
+#define gmx_simd_loadu_d           _mm256_loadu_pd
+#define gmx_simd_storeu_d          _mm256_storeu_pd
+#define gmx_simd_setzero_d         _mm256_setzero_pd
+#define gmx_simd_add_d             _mm256_add_pd
+#define gmx_simd_sub_d             _mm256_sub_pd
+#define gmx_simd_mul_d             _mm256_mul_pd
+/* GMX_SIMD_HAVE_FMA is not defined for this implementation, so the fused
+ * multiply-add family is emulated with a separate multiply and add/subtract:
+ *   fmadd  = a*b+c,  fmsub  = a*b-c,  fnmadd = c-a*b,  fnmsub = 0-(a*b+c)
+ */
+#define gmx_simd_fmadd_d(a, b, c)    _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define gmx_simd_fmsub_d(a, b, c)    _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+#define gmx_simd_fnmadd_d(a, b, c)   _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd_fnmsub_d(a, b, c)   _mm256_sub_pd(_mm256_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
+#define gmx_simd_and_d             _mm256_and_pd
+#define gmx_simd_andnot_d          _mm256_andnot_pd
+#define gmx_simd_or_d              _mm256_or_pd
+#define gmx_simd_xor_d             _mm256_xor_pd
+/* The rsqrt/rcp estimates go through single precision: convert to float,
+ * use the 128-bit float estimate instruction, and convert back to double.
+ */
+#define gmx_simd_rsqrt_d(x)        _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
+#define gmx_simd_rcp_d(x)          _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x)))
+/* -0.0 has only the sign bit set: andnot clears the sign, xor flips it */
+#define gmx_simd_fabs_d(x)         _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
+#define gmx_simd_fneg_d(x)         _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
+#define gmx_simd_max_d             _mm256_max_pd
+#define gmx_simd_min_d             _mm256_min_pd
+#define gmx_simd_round_d(x)        _mm256_round_pd(x, _MM_FROUND_NINT)
+#define gmx_simd_trunc_d(x)        _mm256_round_pd(x, _MM_FROUND_TRUNC)
+/* Fraction is emulated as x minus its truncated value */
+#define gmx_simd_fraction_d(x)     _mm256_sub_pd(x, gmx_simd_trunc_d(x))
+/* Exponent/mantissa handling is done by helper functions in this file */
+#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_avx_256
+#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_avx_256
+#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_avx_256
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+ *
+ * Four 32-bit integers in a 128-bit register. The pointer argument of the
+ * load/store macros is parenthesized before it is cast to a vector pointer.
+ * Without the parentheses an argument such as "p + 4" would be cast first
+ * and then advanced by four 16-byte vectors instead of four integers.
+ */
+#define gmx_simd_dint32_t          __m128i
+#define gmx_simd_load_di(m)        _mm_load_si128((const __m128i *)(m))
+#define gmx_simd_set1_di           _mm_set1_epi32
+#define gmx_simd_store_di(m, x)     _mm_store_si128((__m128i *)(m), x)
+#define gmx_simd_loadu_di(m)       _mm_loadu_si128((const __m128i *)(m))
+#define gmx_simd_storeu_di(m, x)    _mm_storeu_si128((__m128i *)(m), x)
+#define gmx_simd_setzero_di        _mm_setzero_si128
+#define gmx_simd_cvt_d2i           _mm256_cvtpd_epi32
+#define gmx_simd_cvtt_d2i          _mm256_cvttpd_epi32
+#define gmx_simd_cvt_i2d           _mm256_cvtepi32_pd
+#define gmx_simd_extract_di        _mm_extract_epi32
+/* Integer logical ops on gmx_simd_dint32_t (128-bit integer intrinsics) */
+#define gmx_simd_slli_di           _mm_slli_epi32
+#define gmx_simd_srli_di           _mm_srli_epi32
+#define gmx_simd_and_di            _mm_and_si128
+#define gmx_simd_andnot_di         _mm_andnot_si128
+#define gmx_simd_or_di             _mm_or_si128
+#define gmx_simd_xor_di            _mm_xor_si128
+/* Integer arithmetic ops on integer datatype corresponding to double */
+#define gmx_simd_add_di            _mm_add_epi32
+#define gmx_simd_sub_di            _mm_sub_epi32
+#define gmx_simd_mul_di            _mm_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_double_t.
+ * Booleans use the same register type as the data, so and/or/blendzero
+ * are plain bitwise operations on the register.
+ */
+#define gmx_simd_dbool_t           __m256d
+#define gmx_simd_cmpeq_d(a, b)      _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_d(a, b)      _mm256_cmp_pd(a, b, _CMP_LT_OQ)
+#define gmx_simd_cmple_d(a, b)      _mm256_cmp_pd(a, b, _CMP_LE_OQ)
+#define gmx_simd_and_db            _mm256_and_pd
+#define gmx_simd_or_db             _mm256_or_pd
+#define gmx_simd_anytrue_db        _mm256_movemask_pd
+#define gmx_simd_blendzero_d       _mm256_and_pd
+/* Note the swapped argument order: the selector is passed first to andnot */
+#define gmx_simd_blendnotzero_d(a, sel)  _mm256_andnot_pd(sel, a)
+#define gmx_simd_blendv_d          _mm256_blendv_pd
+#define gmx_simd_reduce_d          gmx_simd_reduce_d_avx_256
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t          __m128i
+#define gmx_simd_cmpeq_di          _mm_cmpeq_epi32
+#define gmx_simd_cmplt_di          _mm_cmplt_epi32
+#define gmx_simd_and_dib           _mm_and_si128
+#define gmx_simd_or_dib            _mm_or_si128
+#define gmx_simd_anytrue_dib       _mm_movemask_epi8
+#define gmx_simd_blendzero_di      _mm_and_si128
+#define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
+#define gmx_simd_blendv_di         _mm_blendv_epi8
+/* Conversions between different booleans.
+ * The double boolean is 256 bits wide and the integer boolean 128 bits,
+ * so these need helper functions rather than a reinterpreting cast.
+ */
+#define gmx_simd_cvt_db2dib        gmx_simd_cvt_db2dib_avx_256
+#define gmx_simd_cvt_dib2db        gmx_simd_cvt_dib2db_avx_256
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd          gmx_simd_cvt_f2dd_avx_256
+#define gmx_simd_cvt_dd2f          gmx_simd_cvt_dd2f_avx_256
+
+/****************************************************
+ *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
+ ****************************************************/
+/* SIMD4 float holds four floats in a 128-bit register (__m128), so the
+ * 128-bit intrinsics are used instead of the 256-bit ones.
+ */
+#define gmx_simd4_float_t          __m128
+#define gmx_simd4_load_f           _mm_load_ps
+#define gmx_simd4_load1_f          _mm_broadcast_ss
+#define gmx_simd4_set1_f           _mm_set1_ps
+#define gmx_simd4_store_f          _mm_store_ps
+#define gmx_simd4_loadu_f          _mm_loadu_ps
+#define gmx_simd4_storeu_f         _mm_storeu_ps
+#define gmx_simd4_setzero_f        _mm_setzero_ps
+#define gmx_simd4_add_f            _mm_add_ps
+#define gmx_simd4_sub_f            _mm_sub_ps
+#define gmx_simd4_mul_f            _mm_mul_ps
+/* Fused multiply-add family emulated with separate multiply and add/subtract */
+#define gmx_simd4_fmadd_f(a, b, c)   _mm_add_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd4_fmsub_f(a, b, c)   _mm_sub_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd4_fnmadd_f(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd4_fnmsub_f(a, b, c)  _mm_sub_ps(_mm_setzero_ps(), gmx_simd4_fmadd_f(a, b, c))
+#define gmx_simd4_and_f            _mm_and_ps
+#define gmx_simd4_andnot_f         _mm_andnot_ps
+#define gmx_simd4_or_f             _mm_or_ps
+#define gmx_simd4_xor_f            _mm_xor_ps
+#define gmx_simd4_rsqrt_f          _mm_rsqrt_ps
+/* -0.0 has only the sign bit set: andnot clears the sign, xor flips it */
+#define gmx_simd4_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(-0.0), x)
+#define gmx_simd4_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(-0.0))
+#define gmx_simd4_max_f            _mm_max_ps
+#define gmx_simd4_min_f            _mm_min_ps
+#define gmx_simd4_round_f(x)       _mm_round_ps(x, _MM_FROUND_NINT)
+#define gmx_simd4_trunc_f(x)       _mm_round_ps(x, _MM_FROUND_TRUNC)
+#define gmx_simd4_dotproduct3_f    gmx_simd4_dotproduct3_f_avx_256
+/* SIMD4 booleans corresponding to float */
+#define gmx_simd4_fbool_t          __m128
+#define gmx_simd4_cmpeq_f          _mm_cmpeq_ps
+#define gmx_simd4_cmplt_f          _mm_cmplt_ps
+#define gmx_simd4_cmple_f          _mm_cmple_ps
+#define gmx_simd4_and_fb           _mm_and_ps
+#define gmx_simd4_or_fb            _mm_or_ps
+#define gmx_simd4_anytrue_fb       _mm_movemask_ps
+#define gmx_simd4_blendzero_f      _mm_and_ps
+/* Note the swapped argument order: the selector is passed first to andnot */
+#define gmx_simd4_blendnotzero_f(a, sel)  _mm_andnot_ps(sel, a)
+#define gmx_simd4_blendv_f         _mm_blendv_ps
+#define gmx_simd4_reduce_f         gmx_simd4_reduce_f_avx_256
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD4 IMPLEMENTATION       *
+ ****************************************************/
+/* The full-width double type already holds four elements
+ * (GMX_SIMD_DOUBLE_WIDTH is 4), so SIMD4 double is a set of aliases for the
+ * general double precision macros. Because macros are expanded at the point
+ * of use, any later redefinition of a gmx_simd_*_d macro is picked up by
+ * the corresponding alias too. Only the 3-element dot product has a helper
+ * of its own.
+ */
+#define gmx_simd4_double_t          gmx_simd_double_t
+#define gmx_simd4_load_d            gmx_simd_load_d
+#define gmx_simd4_load1_d           gmx_simd_load1_d
+#define gmx_simd4_set1_d            gmx_simd_set1_d
+#define gmx_simd4_store_d           gmx_simd_store_d
+#define gmx_simd4_loadu_d           gmx_simd_loadu_d
+#define gmx_simd4_storeu_d          gmx_simd_storeu_d
+#define gmx_simd4_setzero_d         gmx_simd_setzero_d
+#define gmx_simd4_add_d             gmx_simd_add_d
+#define gmx_simd4_sub_d             gmx_simd_sub_d
+#define gmx_simd4_mul_d             gmx_simd_mul_d
+#define gmx_simd4_fmadd_d           gmx_simd_fmadd_d
+#define gmx_simd4_fmsub_d           gmx_simd_fmsub_d
+#define gmx_simd4_fnmadd_d          gmx_simd_fnmadd_d
+#define gmx_simd4_fnmsub_d          gmx_simd_fnmsub_d
+#define gmx_simd4_and_d             gmx_simd_and_d
+#define gmx_simd4_andnot_d          gmx_simd_andnot_d
+#define gmx_simd4_or_d              gmx_simd_or_d
+#define gmx_simd4_xor_d             gmx_simd_xor_d
+#define gmx_simd4_rsqrt_d           gmx_simd_rsqrt_d
+#define gmx_simd4_fabs_d            gmx_simd_fabs_d
+#define gmx_simd4_fneg_d            gmx_simd_fneg_d
+#define gmx_simd4_max_d             gmx_simd_max_d
+#define gmx_simd4_min_d             gmx_simd_min_d
+#define gmx_simd4_round_d           gmx_simd_round_d
+#define gmx_simd4_trunc_d           gmx_simd_trunc_d
+#define gmx_simd4_dotproduct3_d     gmx_simd4_dotproduct3_d_avx_256
+/* SIMD4 booleans corresponding to double */
+#define gmx_simd4_dbool_t           gmx_simd_dbool_t
+#define gmx_simd4_cmpeq_d           gmx_simd_cmpeq_d
+#define gmx_simd4_cmplt_d           gmx_simd_cmplt_d
+#define gmx_simd4_cmple_d           gmx_simd_cmple_d
+#define gmx_simd4_and_db            gmx_simd_and_db
+#define gmx_simd4_or_db             gmx_simd_or_db
+#define gmx_simd4_anytrue_db        gmx_simd_anytrue_db
+#define gmx_simd4_blendzero_d       gmx_simd_blendzero_d
+#define gmx_simd4_blendnotzero_d    gmx_simd_blendnotzero_d
+#define gmx_simd4_blendv_d          gmx_simd_blendv_d
+#define gmx_simd4_reduce_d          gmx_simd_reduce_d
+/* SIMD4 float/double conversion */
+#define gmx_simd4_cvt_f2d           _mm256_cvtps_pd
+#define gmx_simd4_cvt_d2f           _mm256_cvtpd_ps
+
+/*********************************************************
+ * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased binary exponent of each float in x, as a float.
+ *
+ * The exponent field is isolated with a bit mask. The integer shift and
+ * bias subtraction are then carried out separately on the lower and upper
+ * 128-bit halves with SSE integer instructions, and the halves are joined
+ * again before the final integer-to-float conversion.
+ */
+static gmx_inline __m256
+gmx_simd_get_exponent_f_avx_256(__m256 x)
+{
+    const __m128i bias     = _mm_set1_epi32(127);
+    const __m256  expfield = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
+    __m256i       bits     = _mm256_castps_si256(_mm256_and_ps(x, expfield));
+    __m128i       hi       = _mm256_extractf128_si256(bits, 0x1);
+    __m128i       lo       = _mm256_castsi256_si128(bits);
+
+    /* Move the exponent field down to bit 0 and remove the IEEE bias */
+    lo   = _mm_sub_epi32(_mm_srli_epi32(lo, 23), bias);
+    hi   = _mm_sub_epi32(_mm_srli_epi32(hi, 23), bias);
+
+    bits = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1);
+    return _mm256_cvtepi32_ps(bits);
+}
+
+/* Return the mantissa of each float in x: the 23 fraction bits are kept
+ * and combined with the bit pattern of 1.0, which replaces the original
+ * sign and exponent.
+ */
+static gmx_inline __m256
+gmx_simd_get_mantissa_f_avx_256(__m256 x)
+{
+    const __m256 unity    = _mm256_set1_ps(1.0);
+    const __m256 fracbits = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
+
+    return _mm256_or_ps(_mm256_and_ps(x, fracbits), unity);
+}
+
+/* Construct floats whose exponent field is taken from x.
+ *
+ * Each element of x is converted to a 32-bit integer, the bias 127 is
+ * added, and the sum is shifted into the exponent position (bit 23).
+ * The integer add and shift are done per 128-bit half with SSE integer
+ * instructions.
+ */
+static gmx_inline __m256
+gmx_simd_set_exponent_f_avx_256(__m256 x)
+{
+    const __m128i bias = _mm_set1_epi32(127);
+    __m256i       n    = _mm256_cvtps_epi32(x);
+    __m128i       hi   = _mm256_extractf128_si256(n, 0x1);
+    __m128i       lo   = _mm256_castsi256_si128(n);
+
+    lo = _mm_add_epi32(lo, bias);
+    hi = _mm_add_epi32(hi, bias);
+    lo = _mm_slli_epi32(lo, 23);
+    hi = _mm_slli_epi32(hi, 23);
+
+    n  = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1);
+    return _mm256_castsi256_ps(n);
+}
+
+/* Return the sum of all eight floats in a.
+ *
+ * Two horizontal adds leave the sum of each 128-bit half in element 0
+ * of that half; the two partial sums are then added as scalars.
+ */
+static gmx_inline float
+gmx_simd_reduce_f_avx_256(__m256 a)
+{
+    __m256 pairs, quads;
+    __m128 lower, upper;
+    float  sum;
+
+    pairs = _mm256_hadd_ps(a, a);
+    quads = _mm256_hadd_ps(pairs, pairs);
+    lower = _mm256_castps256_ps128(quads);
+    upper = _mm256_extractf128_ps(quads, 0x1);
+    _mm_store_ss(&sum, _mm_add_ss(lower, upper));
+    return sum;
+}
+
+/*********************************************************
+ * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased binary exponent of each double in x, as a double.
+ *
+ * After masking and a 64-bit shift per 128-bit half, each exponent sits
+ * in the low 32 bits of its 64-bit lane (32-bit lanes 0 and 2) and the
+ * odd 32-bit lanes are zero. The shuffles pack the lower half's values
+ * into lanes 0-1 and the upper half's into lanes 2-3, using a zero lane
+ * as filler, so a plain OR merges them into four consecutive integers.
+ */
+static gmx_inline __m256d
+gmx_simd_get_exponent_d_avx_256(__m256d x)
+{
+    const __m128i bias     = _mm_set1_epi32(1023);
+    const __m256d expfield = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
+    __m256i       bits     = _mm256_castpd_si256(_mm256_and_pd(x, expfield));
+    __m128i       hi       = _mm256_extractf128_si256(bits, 0x1);
+    __m128i       lo       = _mm256_castsi256_si128(bits);
+    __m128i       packed;
+
+    lo     = _mm_srli_epi64(lo, 52);
+    hi     = _mm_srli_epi64(hi, 52);
+    lo     = _mm_shuffle_epi32(lo, _MM_SHUFFLE(1, 1, 2, 0)); /* e0 e1 0  0  */
+    hi     = _mm_shuffle_epi32(hi, _MM_SHUFFLE(2, 0, 1, 1)); /* 0  0  e2 e3 */
+    packed = _mm_sub_epi32(_mm_or_si128(lo, hi), bias);
+    return _mm256_cvtepi32_pd(packed);
+}
+
+/* Return the mantissa of each double in x: the 52 fraction bits are kept
+ * and combined with the bit pattern of 1.0, which replaces the original
+ * sign and exponent.
+ */
+static gmx_inline __m256d
+gmx_simd_get_mantissa_d_avx_256(__m256d x)
+{
+    const __m256d unity    = _mm256_set1_pd(1.0);
+    const __m256d fracbits = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
+
+    return _mm256_or_pd(_mm256_and_pd(x, fracbits), unity);
+}
+
+/* Construct doubles whose exponent field is taken from x.
+ *
+ * The four elements of x are converted to 32-bit integers and biased by
+ * 1023. Each integer is then duplicated over both 32-bit words of a
+ * 64-bit lane, so the following 64-bit left shift by 52 places its low
+ * bits in the exponent field and pushes the duplicate out of the lane.
+ */
+static gmx_inline __m256d
+gmx_simd_set_exponent_d_avx_256(__m256d x)
+{
+    const __m128i bias   = _mm_set1_epi32(1023);
+    __m128i       biased = _mm_add_epi32(_mm256_cvtpd_epi32(x), bias);
+    __m128i       hi     = _mm_shuffle_epi32(biased, _MM_SHUFFLE(3, 3, 2, 2));
+    __m128i       lo     = _mm_shuffle_epi32(biased, _MM_SHUFFLE(1, 1, 0, 0));
+
+    hi = _mm_slli_epi64(hi, 52);
+    lo = _mm_slli_epi64(lo, 52);
+    return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1));
+}
+
+/* Return the sum of all four doubles in a.
+ *
+ * One horizontal add leaves the pair sum of each 128-bit half in element
+ * 0 of that half; the two partial sums are then added as scalars.
+ */
+static gmx_inline double
+gmx_simd_reduce_d_avx_256(__m256d a)
+{
+    __m256d pairs;
+    __m128d lower, upper;
+    double  sum;
+
+    pairs = _mm256_hadd_pd(a, a);
+    lower = _mm256_castpd256_pd128(pairs);
+    upper = _mm256_extractf128_pd(pairs, 0x1);
+    _mm_store_sd(&sum, _mm_add_sd(lower, upper));
+    return sum;
+}
+
+/* Convert a double-precision boolean (four 64-bit masks) to the matching
+ * integer boolean (four 32-bit masks).
+ *
+ * In each 128-bit half the low 32-bit word of both 64-bit masks is picked
+ * (32-bit lanes 0 and 2) and repeated across the register; the blend then
+ * takes the lower 64 bits from the lower half and the upper 64 bits from
+ * the upper half.
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
+{
+    __m128i upper = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
+    __m128i lower = _mm256_castsi256_si128(_mm256_castpd_si256(a));
+
+    lower = _mm_shuffle_epi32(lower, _MM_SHUFFLE(2, 0, 2, 0));
+    upper = _mm_shuffle_epi32(upper, _MM_SHUFFLE(2, 0, 2, 0));
+    return _mm_blend_epi16(lower, upper, 0xF0);
+}
+
+/* Convert an integer boolean (four 32-bit masks) to the matching
+ * double-precision boolean (four 64-bit masks) by duplicating every
+ * 32-bit mask over both words of a 64-bit lane.
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
+{
+    __m128i upper = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2)); /* masks 2 and 3 */
+    __m128i lower = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0)); /* masks 0 and 1 */
+    __m256i wide  = _mm256_insertf128_si256(_mm256_castsi128_si256(lower), upper, 0x1);
+
+    return _mm256_castsi256_pd(wide);
+}
+
+/* Widen eight floats to two registers of four doubles each.
+ *
+ * f   input floats
+ * d0  receives elements 0-3 of f, converted to double
+ * d1  receives elements 4-7 of f, converted to double
+ */
+static gmx_inline void
+gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
+{
+    const __m128 lower = _mm256_castps256_ps128(f);
+    const __m128 upper = _mm256_extractf128_ps(f, 0x1);
+
+    *d0 = _mm256_cvtps_pd(lower);
+    *d1 = _mm256_cvtps_pd(upper);
+}
+
+/* Narrow two registers of four doubles to one register of eight floats:
+ * d0 fills elements 0-3 of the result and d1 fills elements 4-7.
+ */
+static gmx_inline __m256
+gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
+{
+    __m128 lower = _mm256_cvtpd_ps(d0);
+    __m128 upper = _mm256_cvtpd_ps(d1);
+    __m256 both  = _mm256_castps128_ps256(lower);
+
+    return _mm256_insertf128_ps(both, upper, 0x1);
+}
+
+/* SIMD4 reduce helper: return the sum of the four floats in a, computed
+ * with two successive horizontal adds.
+ */
+static gmx_inline float
+gmx_simd4_reduce_f_avx_256(__m128 a)
+{
+    __m128 pairs = _mm_hadd_ps(a, a);
+    __m128 total = _mm_hadd_ps(pairs, pairs);
+    float  sum;
+
+    _mm_store_ss(&sum, total);
+    return sum;
+}
+
+/* SIMD4 dot product helper: return a[0]*b[0] + a[1]*b[1] + a[2]*b[2].
+ * The fourth element of the inputs does not contribute to the result.
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
+{
+    __m128 prod = _mm_mul_ps(a, b);
+    __m128 acc;
+    float  dot;
+
+    /* Rotate product 1, then product 2, into element 0 and accumulate */
+    acc = _mm_add_ps(prod, _mm_permute_ps(prod, _MM_SHUFFLE(0, 3, 2, 1)));
+    acc = _mm_add_ps(acc, _mm_permute_ps(prod, _MM_SHUFFLE(1, 0, 3, 2)));
+    _mm_store_ss(&dot, acc);
+    return dot;
+}
+
+/* SIMD4 dot product helper: return a[0]*b[0] + a[1]*b[1] + a[2]*b[2].
+ * The fourth element of the inputs does not contribute to the result.
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
+{
+    __m256d prod  = _mm256_mul_pd(a, b);
+    __m128d lower = _mm256_castpd256_pd128(prod);   /* products 0 and 1 */
+    __m128d upper = _mm256_extractf128_pd(prod, 0x1); /* products 2 and 3 */
+    double  dot;
+
+    /* Swap the two lower products so element 0 becomes p0 + p1 */
+    lower = _mm_add_pd(lower, _mm_permute_pd(lower, _MM_SHUFFLE2(0, 1)));
+    /* Element 0 of the upper half is p2 */
+    lower = _mm_add_pd(lower, upper);
+    _mm_store_sd(&dot, lower);
+    return dot;
+}
+
+/* Check whether SIMD operations have resulted in overflow.
+ *
+ * Reads the MXCSR control/status register and tests its overflow
+ * exception flag. If the flag is set it is cleared again, leaving every
+ * other MXCSR bit as it was.
+ *
+ * Returns 1 if the overflow flag was set, 0 otherwise.
+ *
+ * Declared gmx_inline like the other helpers in this header, so that
+ * source files which include the header without calling this function
+ * do not end up with an unused static function.
+ */
+static gmx_inline int
+gmx_simd_check_and_reset_overflow(void)
+{
+    /* The overflow flag is bit 3 in the register */
+    const unsigned int overflow_flag = 0x0008;
+    unsigned int       mxcsr         = _mm_getcsr();
+    int                sse_overflow  = ((mxcsr & overflow_flag) != 0);
+
+    if (sse_overflow)
+    {
+        /* Set only the overflow flag to zero; keep all other bits */
+        _mm_setcsr(mxcsr & ~overflow_flag);
+    }
+    return sse_overflow;
+}
+
+
+#endif /* GMX_SIMD_IMPL_X86_AVX_256_H */
diff --git a/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h b/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h

new file mode 100644 (file)

index 0000000..5481de1
--- /dev/null
+++ b/src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h
@@ -0,0 +1,451 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_SSE2_H
+#define GMX_SIMD_IMPL_X86_SSE2_H
+
+#include <math.h>
+#include <emmintrin.h>
+
+/* Set capabilities that can be inherited.
+ * NOTE(review): presumably tested by implementations layered on top of
+ * SSE2 - confirm against the files that include this header. */
+#define GMX_SIMD_X86_SSE2_OR_HIGHER
+
+/* x86 SSE2 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Capability definitions for SSE2 */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#undef  GMX_SIMD_HAVE_FMA              /* fmadd & co. below are built from separate multiply and add/sub */
+#undef  GMX_SIMD_HAVE_FRACTION         /* NOTE(review): fraction macros are still defined below via trunc; presumably this flags native support only - confirm in simd.h */
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT   /* No SSE2 instruction, but use shifts */
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT   /* No SSE2 instruction, but use shifts */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#undef  GMX_SIMD4_HAVE_DOUBLE          /* SSE2 cannot do double-precision SIMD4 */
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH         4 /* floats per gmx_simd_float_t (__m128) */
+#define GMX_SIMD_DOUBLE_WIDTH        2 /* doubles per gmx_simd_double_t (__m128d) */
+#define GMX_SIMD_FINT32_WIDTH        4 /* 32-bit integers used per gmx_simd_fint32_t */
+#define GMX_SIMD_DINT32_WIDTH        2 /* only the low two 32-bit integers of gmx_simd_dint32_t are used */
+#define GMX_SIMD_RSQRT_BITS         11 /* presumably the accuracy in bits of the rsqrt approximation - see simd.h */
+#define GMX_SIMD_RCP_BITS           11 /* presumably the accuracy in bits of the rcp approximation - see simd.h */
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t          __m128
+#define gmx_simd_load_f           _mm_load_ps
+#define gmx_simd_load1_f          _mm_load1_ps
+#define gmx_simd_set1_f           _mm_set1_ps
+#define gmx_simd_store_f          _mm_store_ps
+#define gmx_simd_loadu_f          _mm_loadu_ps
+#define gmx_simd_storeu_f         _mm_storeu_ps
+#define gmx_simd_setzero_f        _mm_setzero_ps
+#define gmx_simd_add_f            _mm_add_ps
+#define gmx_simd_sub_f            _mm_sub_ps
+#define gmx_simd_mul_f            _mm_mul_ps
+#define gmx_simd_fmadd_f(a, b, c)   _mm_add_ps(_mm_mul_ps(a, b), c)  /*   a*b + c, as separate multiply and add */
+#define gmx_simd_fmsub_f(a, b, c)   _mm_sub_ps(_mm_mul_ps(a, b), c)  /*   a*b - c */
+#define gmx_simd_fnmadd_f(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))  /* -(a*b) + c */
+#define gmx_simd_fnmsub_f(a, b, c)  _mm_sub_ps(_mm_setzero_ps(), gmx_simd_fmadd_f(a, b, c)) /* 0 - (a*b + c) */
+#define gmx_simd_and_f            _mm_and_ps
+#define gmx_simd_andnot_f         _mm_andnot_ps
+#define gmx_simd_or_f             _mm_or_ps
+#define gmx_simd_xor_f            _mm_xor_ps
+#define gmx_simd_rsqrt_f          _mm_rsqrt_ps
+#define gmx_simd_rcp_f            _mm_rcp_ps
+#define gmx_simd_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(-0.0), x) /* clear the sign bit (the only bit set in -0.0) */
+#define gmx_simd_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(-0.0))    /* flip the sign bit */
+#define gmx_simd_max_f            _mm_max_ps
+#define gmx_simd_min_f            _mm_min_ps
+#define gmx_simd_round_f(x)       _mm_cvtepi32_ps(_mm_cvtps_epi32(x))  /* goes via 32-bit integers, so x must fit in that range */
+#define gmx_simd_trunc_f(x)       _mm_cvtepi32_ps(_mm_cvttps_epi32(x)) /* goes via 32-bit integers, so x must fit in that range */
+#define gmx_simd_fraction_f(x)    _mm_sub_ps(x, gmx_simd_trunc_f(x))   /* x - trunc(x); x is expanded twice */
+#define gmx_simd_get_exponent_f   gmx_simd_get_exponent_f_sse2 /* these three are helper functions in this file */
+#define gmx_simd_get_mantissa_f   gmx_simd_get_mantissa_f_sse2
+#define gmx_simd_set_exponent_f   gmx_simd_set_exponent_f_sse2
+/* integer datatype corresponding to float: gmx_simd_fint32_t
+ *
+ * The load/store macros take a pointer to 32-bit integers and cast it to
+ * the __m128i pointer type expected by the intrinsics. The pointer
+ * argument is parenthesized before the cast so that an expression such
+ * as (p + 4) is cast as a whole rather than having the cast bind to p
+ * alone, which would do the pointer arithmetic in units of __m128i.
+ */
+#define gmx_simd_fint32_t         __m128i
+#define gmx_simd_load_fi(m)       _mm_load_si128((const __m128i *)(m))
+#define gmx_simd_set1_fi          _mm_set1_epi32
+#define gmx_simd_store_fi(m, x)    _mm_store_si128((__m128i *)(m), x)
+#define gmx_simd_loadu_fi(m)      _mm_loadu_si128((const __m128i *)(m))
+#define gmx_simd_storeu_fi(m, x)   _mm_storeu_si128((__m128i *)(m), x)
+#define gmx_simd_setzero_fi       _mm_setzero_si128
+#define gmx_simd_cvt_f2i          _mm_cvtps_epi32
+#define gmx_simd_cvtt_f2i         _mm_cvttps_epi32
+#define gmx_simd_cvt_i2f          _mm_cvtepi32_ps
+/* Extract element i by byte-shifting it down to element 0 */
+#define gmx_simd_extract_fi(x, i)  _mm_cvtsi128_si32(_mm_srli_si128((x), sizeof(int) * (i)))
+/* Integer logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi          _mm_slli_epi32
+#define gmx_simd_srli_fi          _mm_srli_epi32
+#define gmx_simd_and_fi           _mm_and_si128
+#define gmx_simd_andnot_fi        _mm_andnot_si128
+#define gmx_simd_or_fi            _mm_or_si128
+#define gmx_simd_xor_fi           _mm_xor_si128
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi           _mm_add_epi32
+#define gmx_simd_sub_fi           _mm_sub_epi32
+#define gmx_simd_mul_fi           gmx_simd_mul_fi_sse2 /* helper function in this file, built on _mm_mul_epu32 */
+/* Boolean & comparison operations on gmx_simd_float_t.
+ * Booleans are __m128 bit masks as produced by the compare intrinsics. */
+#define gmx_simd_fbool_t          __m128
+#define gmx_simd_cmpeq_f          _mm_cmpeq_ps
+#define gmx_simd_cmplt_f          _mm_cmplt_ps
+#define gmx_simd_cmple_f          _mm_cmple_ps
+#define gmx_simd_and_fb           _mm_and_ps
+#define gmx_simd_or_fb            _mm_or_ps
+#define gmx_simd_anytrue_fb       _mm_movemask_ps /* nonzero if the sign bit of any element is set */
+#define gmx_simd_blendzero_f      _mm_and_ps      /* bitwise AND of value and mask */
+#define gmx_simd_blendnotzero_f(a, sel)   _mm_andnot_ps(sel, a) /* (~sel) & a; note the swapped operand order */
+#define gmx_simd_blendv_f(a, b, s)  _mm_or_ps(_mm_andnot_ps(s, a), _mm_and_ps(s, b)) /* b where s is set, a elsewhere; s is expanded twice */
+#define gmx_simd_reduce_f(a)      gmx_simd_reduce_f_sse2(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t         __m128i
+#define gmx_simd_cmpeq_fi         _mm_cmpeq_epi32
+#define gmx_simd_cmplt_fi         _mm_cmplt_epi32
+#define gmx_simd_and_fib          _mm_and_si128
+#define gmx_simd_or_fib           _mm_or_si128
+#define gmx_simd_anytrue_fib      _mm_movemask_epi8 /* nonzero if the top bit of any byte is set */
+#define gmx_simd_blendzero_fi     _mm_and_si128
+#define gmx_simd_blendnotzero_fi(a, sel) _mm_andnot_si128(sel, a) /* (~sel) & a */
+#define gmx_simd_blendv_fi(a, b, s) _mm_or_si128(_mm_andnot_si128(s, a), _mm_and_si128(s, b)) /* b where s is set, a elsewhere */
+/* Conversions between different booleans: pure reinterpreting casts,
+ * since both boolean kinds are 128-bit masks with 32-bit elements. */
+#define gmx_simd_cvt_fb2fib       _mm_castps_si128
+#define gmx_simd_cvt_fib2fb       _mm_castsi128_ps
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_double_t          __m128d
+#define gmx_simd_load_d            _mm_load_pd
+#define gmx_simd_load1_d           _mm_load1_pd
+#define gmx_simd_set1_d            _mm_set1_pd
+#define gmx_simd_store_d           _mm_store_pd
+#define gmx_simd_loadu_d           _mm_loadu_pd
+#define gmx_simd_storeu_d          _mm_storeu_pd
+#define gmx_simd_setzero_d         _mm_setzero_pd
+#define gmx_simd_add_d             _mm_add_pd
+#define gmx_simd_sub_d             _mm_sub_pd
+#define gmx_simd_mul_d             _mm_mul_pd
+#define gmx_simd_fmadd_d(a, b, c)    _mm_add_pd(_mm_mul_pd(a, b), c)  /*   a*b + c, as separate multiply and add */
+#define gmx_simd_fmsub_d(a, b, c)    _mm_sub_pd(_mm_mul_pd(a, b), c)  /*   a*b - c */
+#define gmx_simd_fnmadd_d(a, b, c)   _mm_sub_pd(c, _mm_mul_pd(a, b))  /* -(a*b) + c */
+#define gmx_simd_fnmsub_d(a, b, c)   _mm_sub_pd(_mm_setzero_pd(), gmx_simd_fmadd_d(a, b, c)) /* 0 - (a*b + c) */
+#define gmx_simd_and_d             _mm_and_pd
+#define gmx_simd_andnot_d          _mm_andnot_pd
+#define gmx_simd_or_d              _mm_or_pd
+#define gmx_simd_xor_d             _mm_xor_pd
+#define gmx_simd_rsqrt_d(x)        _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x))) /* single-precision approximation: convert down, rsqrt, convert back */
+/* Don't use FMA for sqrt N-R iterations - this saves 1 instruction without FMA hardware.
+ * NOTE(review): no N-R iteration is visible next to this comment; it presumably refers to code elsewhere - confirm or move. */
+#define gmx_simd_rcp_d(x)          _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x)))   /* single-precision approximation: convert down, rcp, convert back */
+#define gmx_simd_fabs_d(x)         _mm_andnot_pd(_mm_set1_pd(-0.0), x) /* clear the sign bit (the only bit set in -0.0) */
+#define gmx_simd_fneg_d(x)         _mm_xor_pd(x, _mm_set1_pd(-0.0))    /* flip the sign bit */
+#define gmx_simd_max_d             _mm_max_pd
+#define gmx_simd_min_d             _mm_min_pd
+#define gmx_simd_round_d(x)        _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))  /* goes via 32-bit integers, so x must fit in that range */
+#define gmx_simd_trunc_d(x)        _mm_cvtepi32_pd(_mm_cvttpd_epi32(x)) /* goes via 32-bit integers, so x must fit in that range */
+#define gmx_simd_fraction_d(x)     _mm_sub_pd(x, gmx_simd_trunc_d(x))   /* x - trunc(x); x is expanded twice */
+#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_sse2 /* these three are helper functions in this file */
+#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_sse2
+#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_sse2
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+ *
+ * Only the two low 32-bit integers of the __m128i are used, so loads and
+ * stores move 64 bits with _mm_loadl_epi64/_mm_storel_epi64; the aligned
+ * and unaligned variants map to the same intrinsics. The pointer
+ * argument is parenthesized before the cast so that an expression such
+ * as (p + 2) is cast as a whole rather than having the cast bind to p
+ * alone, which would do the pointer arithmetic in units of __m128i.
+ */
+#define gmx_simd_dint32_t          __m128i
+#define gmx_simd_load_di(m)        _mm_loadl_epi64((const __m128i *)(m))
+#define gmx_simd_set1_di           _mm_set1_epi32
+#define gmx_simd_store_di(m, x)     _mm_storel_epi64((__m128i *)(m), x)
+#define gmx_simd_loadu_di(m)       _mm_loadl_epi64((const __m128i *)(m))
+#define gmx_simd_storeu_di(m, x)    _mm_storel_epi64((__m128i *)(m), x)
+#define gmx_simd_setzero_di        _mm_setzero_si128
+#define gmx_simd_cvt_d2i           _mm_cvtpd_epi32
+#define gmx_simd_cvtt_d2i          _mm_cvttpd_epi32
+#define gmx_simd_cvt_i2d           _mm_cvtepi32_pd
+/* Extract element i by byte-shifting it down to element 0 */
+#define gmx_simd_extract_di(x, i)   _mm_cvtsi128_si32(_mm_srli_si128((x), sizeof(int) * (i)))
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di           _mm_slli_epi32
+#define gmx_simd_srli_di           _mm_srli_epi32
+#define gmx_simd_and_di            _mm_and_si128
+#define gmx_simd_andnot_di         _mm_andnot_si128
+#define gmx_simd_or_di             _mm_or_si128
+#define gmx_simd_xor_di            _mm_xor_si128
+/* Integer arithmetic ops on integer datatype corresponding to double */
+#define gmx_simd_add_di            _mm_add_epi32
+#define gmx_simd_sub_di            _mm_sub_epi32
+#define gmx_simd_mul_di            gmx_simd_mul_di_sse2 /* helper function in this file, built on _mm_mul_epu32 */
+/* Boolean & comparison operations on gmx_simd_double_t.
+ * Booleans are __m128d bit masks as produced by the compare intrinsics. */
+#define gmx_simd_dbool_t            __m128d
+#define gmx_simd_cmpeq_d            _mm_cmpeq_pd
+#define gmx_simd_cmplt_d            _mm_cmplt_pd
+#define gmx_simd_cmple_d            _mm_cmple_pd
+#define gmx_simd_and_db             _mm_and_pd
+#define gmx_simd_or_db              _mm_or_pd
+#define gmx_simd_anytrue_db         _mm_movemask_pd /* nonzero if the sign bit of any element is set */
+#define gmx_simd_blendzero_d        _mm_and_pd      /* bitwise AND of value and mask */
+#define gmx_simd_blendnotzero_d(a, sel) _mm_andnot_pd(sel, a) /* (~sel) & a; note the swapped operand order */
+#define gmx_simd_blendv_d(a, b, sel)  _mm_or_pd(_mm_andnot_pd(sel, a), _mm_and_pd(sel, b)) /* b where sel is set, a elsewhere; sel is expanded twice */
+#define gmx_simd_reduce_d(a)        gmx_simd_reduce_d_sse2(a)
+
+/* Boolean & comparison operations on gmx_simd_dint32_t.
+ * Only the two low 32-bit elements carry data. */
+#define gmx_simd_dibool_t           __m128i
+#define gmx_simd_cmpeq_di           _mm_cmpeq_epi32
+#define gmx_simd_cmplt_di           _mm_cmplt_epi32
+#define gmx_simd_and_dib            _mm_and_si128
+#define gmx_simd_or_dib             _mm_or_si128
+/* Copy elements 0-1 over elements 2-3 first, so that only the two valid
+ * elements contribute to the byte mask. */
+#define gmx_simd_anytrue_dib(x)     _mm_movemask_epi8(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 1, 0)))
+#define gmx_simd_blendzero_di       _mm_and_si128
+#define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
+#define gmx_simd_blendv_di(a, b, sel) _mm_or_si128(_mm_andnot_si128(sel, a), _mm_and_si128(sel, b))
+/* Double boolean -> integer boolean: pick the low 32-bit word of each
+ * 64-bit mask (32-bit elements 0 and 2) into elements 0 and 1. */
+#define gmx_simd_cvt_db2dib(x)      _mm_shuffle_epi32(_mm_castpd_si128(x), _MM_SHUFFLE(2, 0, 2, 0))
+/* Integer boolean -> double boolean: duplicate each 32-bit mask over
+ * both words of a 64-bit element. */
+#define gmx_simd_cvt_dib2db(x)      _mm_castsi128_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 0, 0)))
+/* Float/double conversion.
+ *
+ * gmx_simd_cvt_f2dd is a brace-enclosed statement block, not an
+ * expression: d0 and d1 are pointers that receive elements 0-1 and 2-3
+ * of f converted to double. The pointer arguments are parenthesized
+ * before being dereferenced so that pointer expressions are handled
+ * correctly. Note that f is expanded three times.
+ *
+ * gmx_simd_cvt_dd2f puts the two converted elements of d0 in elements
+ * 0-1 of the result and those of d1 in elements 2-3.
+ */
+#define gmx_simd_cvt_f2dd(f, d0, d1)  { *(d0) = _mm_cvtps_pd(f); *(d1) = _mm_cvtps_pd(_mm_movehl_ps(f, f)); }
+#define gmx_simd_cvt_dd2f(d0, d1)    _mm_movelh_ps(_mm_cvtpd_ps(d0), _mm_cvtpd_ps(d1))
+
+
+/****************************************************
+ * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+/* Return the unbiased binary exponent of each float in x, as a float:
+ * mask out the exponent field, shift it down to bit 0, subtract the
+ * bias 127 and convert the integer result to float.
+ */
+static gmx_inline __m128
+gmx_simd_get_exponent_f_sse2(__m128 x)
+{
+    const __m128i bias     = _mm_set1_epi32(127);
+    const __m128  expfield = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
+    __m128i       biased   = _mm_castps_si128(_mm_and_ps(x, expfield));
+
+    biased = _mm_srli_epi32(biased, 23);
+    return _mm_cvtepi32_ps(_mm_sub_epi32(biased, bias));
+}
+
+/* Return the mantissa of each float in x: the 23 fraction bits are kept
+ * and combined with the bit pattern of 1.0f, which replaces the original
+ * sign and exponent.
+ */
+static gmx_inline __m128
+gmx_simd_get_mantissa_f_sse2(__m128 x)
+{
+    const __m128 unity    = _mm_set1_ps(1.0f);
+    const __m128 fracbits = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
+
+    return _mm_or_ps(_mm_and_ps(x, fracbits), unity);
+}
+
+/* Construct floats whose exponent field is taken from x: each element is
+ * converted to a 32-bit integer, biased by 127 and shifted into the
+ * exponent position (bit 23).
+ */
+static gmx_inline __m128
+gmx_simd_set_exponent_f_sse2(__m128 x)
+{
+    const __m128i bias   = _mm_set1_epi32(127);
+    __m128i       biased = _mm_add_epi32(_mm_cvtps_epi32(x), bias);
+
+    return _mm_castsi128_ps(_mm_slli_epi32(biased, 23));
+}
+
+/* Element-wise product of four 32-bit integers, keeping the low 32 bits
+ * of each product.
+ *
+ * _mm_mul_epu32 multiplies only elements 0 and 2 and yields 64-bit
+ * results, so it is applied twice: once to the inputs as they are and
+ * once to the inputs shifted down by one element. The low words of the
+ * products are then gathered and interleaved.
+ */
+static gmx_inline __m128i
+gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
+{
+    __m128i even = _mm_mul_epu32(a, b);                    /* a[2]*b[2] a[0]*b[0] as 64-bit values */
+    __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),     /* - a[3] a[2] a[1] */
+                                 _mm_srli_si128(b, 4));    /* - b[3] b[2] b[1] */
+
+    /* Bring the low 32 bits of both products down to elements 0 and 1 */
+    even = _mm_shuffle_epi32(even, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[2]*b[2] a[0]*b[0] */
+    odd  = _mm_shuffle_epi32(odd, _MM_SHUFFLE(3, 1, 2, 0));  /* - - a[3]*b[3] a[1]*b[1] */
+
+    /* Interleave even and odd products back into element order */
+    return _mm_unpacklo_epi32(even, odd);
+}
+
+/* Return the sum of the four floats in a.
+ *
+ * The upper pair is first added onto the lower pair, then element 1 of
+ * that partial result is added onto element 0.
+ */
+static gmx_inline float
+gmx_simd_reduce_f_sse2(__m128 a)
+{
+    __m128 halves = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128 total  = _mm_add_ss(halves, _mm_shuffle_ps(halves, halves, _MM_SHUFFLE(0, 3, 2, 1)));
+    float  sum;
+
+    _mm_store_ss(&sum, total);
+    return sum;
+}
+
+/****************************************************
+ * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+/* Return the unbiased binary exponent of each double in x, as a double.
+ *
+ * After masking and a 64-bit shift, the two exponents sit in 32-bit
+ * elements 0 and 2. The bias 1023 is subtracted and the values are moved
+ * to elements 0 and 1, which is where _mm_cvtepi32_pd reads its input.
+ */
+static gmx_inline __m128d
+gmx_simd_get_exponent_d_sse2(__m128d x)
+{
+    const __m128i bias     = _mm_set1_epi32(1023);
+    /* Built from 32-bit pieces: _mm_set1_epi64x() is only supported for 64-bit builds on MSVC */
+    const __m128d expfield = _mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
+    __m128i       e        = _mm_castpd_si128(_mm_and_pd(x, expfield));
+
+    e = _mm_srli_epi64(e, 52);
+    e = _mm_sub_epi32(e, bias);
+    e = _mm_shuffle_epi32(e, _MM_SHUFFLE(3, 1, 2, 0) );
+    return _mm_cvtepi32_pd(e);
+}
+
+/* Return the mantissa of each double in x: the 52 fraction bits are kept
+ * and combined with the bit pattern of 1.0, which replaces the original
+ * sign and exponent.
+ */
+static gmx_inline __m128d
+gmx_simd_get_mantissa_d_sse2(__m128d x)
+{
+    const __m128d unity    = _mm_set1_pd(1.0);
+    /* Built from 32-bit pieces: _mm_set1_epi64x() is only supported for 64-bit builds on MSVC */
+    const __m128d fracbits = _mm_castsi128_pd( _mm_set_epi32(0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF) );
+
+    return _mm_or_pd(_mm_and_pd(x, fracbits), unity);
+}
+
+/* Construct doubles whose exponent field is taken from x: each element is
+ * converted to a 32-bit integer, biased by 1023 and shifted into the
+ * exponent position (bit 52) of a 64-bit element.
+ */
+static gmx_inline __m128d
+gmx_simd_set_exponent_d_sse2(__m128d x)
+{
+    const __m128i bias = _mm_set1_epi32(1023);
+    __m128i       n    = _mm_cvtpd_epi32(x);
+
+    /* The conversion leaves the integers in elements 0 and 1. Spread them
+     * to elements 0 and 2, i.e. the low word of each 64-bit element, so
+     * that one 64-bit shift lifts both into the exponent field. */
+    n = _mm_shuffle_epi32(n, _MM_SHUFFLE(3, 1, 2, 0));
+    n = _mm_add_epi32(n, bias);
+    n = _mm_slli_epi64(n, 52);
+    return _mm_castsi128_pd(n);
+}
+
+/* Element-wise product of the two low 32-bit integers in a and b, with
+ * the low 32 bits of the products returned in elements 0 and 1.
+ *
+ * The inputs are first interleaved with zeros so the two values occupy
+ * elements 0 and 2, which are the elements _mm_mul_epu32 multiplies.
+ */
+static gmx_inline __m128i
+gmx_simd_mul_di_sse2(__m128i a, __m128i b)
+{
+    const __m128i zero   = _mm_setzero_si128();
+    __m128i       wide_a = _mm_unpacklo_epi32(a, zero);    /* 0 a[1] 0 a[0] */
+    __m128i       wide_b = _mm_unpacklo_epi32(b, zero);    /* 0 b[1] 0 b[0] */
+    __m128i       prod   = _mm_mul_epu32(wide_a, wide_b);  /* two 64-bit products */
+
+    /* Gather the low word of each product into elements 0 and 1 */
+    return _mm_shuffle_epi32(prod, _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+/* Return the sum of the two doubles in a: element 1 is copied down and
+ * added onto element 0 as a scalar.
+ */
+static gmx_inline double
+gmx_simd_reduce_d_sse2(__m128d a)
+{
+    __m128d upper = _mm_shuffle_pd(a, a, _MM_SHUFFLE2(1, 1));
+    double  sum;
+
+    _mm_store_sd(&sum, _mm_add_sd(a, upper));
+    return sum;
+}
+
+/* Check whether SIMD operations have resulted in overflow.
+ *
+ * Reads the MXCSR control/status register and tests its overflow
+ * exception flag. If the flag is set it is cleared again, leaving every
+ * other MXCSR bit as it was.
+ *
+ * Returns 1 if the overflow flag was set, 0 otherwise.
+ *
+ * Declared gmx_inline like the other helpers in this header, so that
+ * source files which include the header without calling this function
+ * do not end up with an unused static function.
+ */
+static gmx_inline int
+gmx_simd_check_and_reset_overflow(void)
+{
+    /* The overflow flag is bit 3 in the register */
+    const unsigned int overflow_flag = 0x0008;
+    unsigned int       mxcsr         = _mm_getcsr();
+    int                sse_overflow  = ((mxcsr & overflow_flag) != 0);
+
+    if (sse_overflow)
+    {
+        /* Set only the overflow flag to zero; keep all other bits */
+        _mm_setcsr(mxcsr & ~overflow_flag);
+    }
+    return sse_overflow;
+}
+
+/* SSE2 is already 4-wide in single, so we just reuse float datatype for SIMD4.
+ * SSE2 cannot do double-precision SIMD4.
+ *
+ * Every SIMD4 name below is a plain alias of the corresponding
+ * gmx_simd_*_f definition, except the three-element dot product, which
+ * has its own helper function in this file.
+ */
+#define gmx_simd4_float_t                gmx_simd_float_t
+#define gmx_simd4_load_f                 gmx_simd_load_f
+#define gmx_simd4_load1_f                gmx_simd_load1_f
+#define gmx_simd4_set1_f                 gmx_simd_set1_f
+#define gmx_simd4_store_f                gmx_simd_store_f
+#define gmx_simd4_loadu_f                gmx_simd_loadu_f
+#define gmx_simd4_storeu_f               gmx_simd_storeu_f
+#define gmx_simd4_setzero_f              gmx_simd_setzero_f
+#define gmx_simd4_add_f                  gmx_simd_add_f
+#define gmx_simd4_sub_f                  gmx_simd_sub_f
+#define gmx_simd4_mul_f                  gmx_simd_mul_f
+#define gmx_simd4_fmadd_f                gmx_simd_fmadd_f
+#define gmx_simd4_fmsub_f                gmx_simd_fmsub_f
+#define gmx_simd4_fnmadd_f               gmx_simd_fnmadd_f
+#define gmx_simd4_fnmsub_f               gmx_simd_fnmsub_f
+#define gmx_simd4_and_f                  gmx_simd_and_f
+#define gmx_simd4_andnot_f               gmx_simd_andnot_f
+#define gmx_simd4_or_f                   gmx_simd_or_f
+#define gmx_simd4_xor_f                  gmx_simd_xor_f
+#define gmx_simd4_rsqrt_f                gmx_simd_rsqrt_f
+#define gmx_simd4_fabs_f                 gmx_simd_fabs_f
+#define gmx_simd4_fneg_f                 gmx_simd_fneg_f
+#define gmx_simd4_max_f                  gmx_simd_max_f
+#define gmx_simd4_min_f                  gmx_simd_min_f
+#define gmx_simd4_round_f                gmx_simd_round_f
+#define gmx_simd4_trunc_f                gmx_simd_trunc_f
+#define gmx_simd4_dotproduct3_f          gmx_simd4_dotproduct3_f_sse2 /* SIMD4-specific helper function in this file */
+#define gmx_simd4_fbool_t                gmx_simd_fbool_t
+#define gmx_simd4_cmpeq_f                gmx_simd_cmpeq_f
+#define gmx_simd4_cmplt_f                gmx_simd_cmplt_f
+#define gmx_simd4_cmple_f                gmx_simd_cmple_f
+#define gmx_simd4_and_fb                 gmx_simd_and_fb
+#define gmx_simd4_or_fb                  gmx_simd_or_fb
+#define gmx_simd4_anytrue_fb             gmx_simd_anytrue_fb
+#define gmx_simd4_blendzero_f            gmx_simd_blendzero_f
+#define gmx_simd4_blendnotzero_f         gmx_simd_blendnotzero_f
+#define gmx_simd4_blendv_f               gmx_simd_blendv_f
+#define gmx_simd4_reduce_f               gmx_simd_reduce_f
+
+/* SIMD4 dot product helper: return a[0]*b[0] + a[1]*b[1] + a[2]*b[2].
+ * The fourth element of the inputs does not contribute to the result.
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
+{
+    __m128 prod = _mm_mul_ps(a, b);
+    __m128 acc;
+    float  dot;
+
+    /* Bring product 1, then product 2, into element 0 and accumulate */
+    acc = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 1, 2, 1)));
+    acc = _mm_add_ps(acc, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(3, 2, 3, 2)));
+    _mm_store_ss(&dot, acc);
+    return dot;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_SSE2_H */
diff --git a/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h b/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h

new file mode 100644 (file)

index 0000000..ecb46b2
--- /dev/null
+++ b/src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h
@@ -0,0 +1,129 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_SSE4_1_H
+#define GMX_SIMD_IMPL_X86_SSE4_1_H
+
+#include <math.h>
+#include <smmintrin.h>
+
+
+/* x86 SSE4.1 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+
+/* Inherit most of SSE4.1 from SSE2 */
+#include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
+/* Increment over SSE2 capabilities */
+#define GMX_SIMD_X86_SSE4_1_OR_HIGHER
+
+
+/* Override capability definitions from SSE2 */
+#define  GMX_SIMD4_HAVE_FLOAT_DOTPRODUCT3
+
+/* Nearly every wrapper is inherited unchanged from the SSE2 header included
+ * above; only the ones that newer intrinsics implement more efficiently are redefined.
+ */
+#undef  gmx_simd_round_f
+#define gmx_simd_round_f(x)       _mm_round_ps(x, _MM_FROUND_NINT)  /* round to nearest integer */
+#undef  gmx_simd_trunc_f
+#define gmx_simd_trunc_f(x)       _mm_round_ps(x, _MM_FROUND_TRUNC) /* round toward zero */
+#undef  gmx_simd_round_d
+#define gmx_simd_round_d(x)       _mm_round_pd(x, _MM_FROUND_NINT)  /* round to nearest integer */
+#undef  gmx_simd_trunc_d
+#define gmx_simd_trunc_d(x)       _mm_round_pd(x, _MM_FROUND_TRUNC) /* round toward zero */
+
+#undef  gmx_simd_extract_fi
+#define gmx_simd_extract_fi       _mm_extract_epi32
+#undef  gmx_simd_mul_fi
+#define gmx_simd_mul_fi           _mm_mullo_epi32 /* keeps the low 32 bits of each product */
+
+#undef  gmx_simd_extract_di
+#define gmx_simd_extract_di       _mm_extract_epi32
+#undef  gmx_simd_mul_di
+#define gmx_simd_mul_di           _mm_mullo_epi32 /* keeps the low 32 bits of each product */
+
+#undef  gmx_simd_blendv_f
+#define gmx_simd_blendv_f         _mm_blendv_ps
+#undef  gmx_simd_blendv_d
+#define gmx_simd_blendv_d         _mm_blendv_pd
+
+#undef  gmx_simd_reduce_f
+#define gmx_simd_reduce_f(a)      gmx_simd_reduce_f_sse4_1(a)
+#undef  gmx_simd_reduce_d
+#define gmx_simd_reduce_d(a)      gmx_simd_reduce_d_sse4_1(a)
+
+#undef  gmx_simd_blendv_fi
+#define gmx_simd_blendv_fi        _mm_blendv_epi8 /* byte-wise select; NOTE(review): assumes mask is uniform within each 32-bit element - confirm */
+#undef  gmx_simd_blendv_di
+#define gmx_simd_blendv_di        _mm_blendv_epi8 /* byte-wise select; NOTE(review): assumes mask is uniform within each 32-bit element - confirm */
+
+#undef  gmx_simd4_dotproduct3_f
+#define gmx_simd4_dotproduct3_f   gmx_simd4_dotproduct3_f_sse4_1
+
+/* Horizontal sum of a single-precision register: returns (a[0]+a[1]) + (a[2]+a[3]) */
+static gmx_inline float
+gmx_simd_reduce_f_sse4_1(__m128 a)
+{
+    float  total;
+    /* The first hadd forms the pairwise sums, the second adds the two partial sums */
+    __m128 pairs = _mm_hadd_ps(a, a);
+    __m128 all   = _mm_hadd_ps(pairs, pairs);
+    _mm_store_ss(&total, all);
+    return total;
+}
+
+/* SIMD4 helper: returns a[0]*b[0] + a[1]*b[1] + a[2]*b[2]; element 3 does not contribute */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
+{
+    float f;
+    _MM_EXTRACT_FLOAT(f, _mm_dp_ps(a, b, 0x71), 0); /* 0x71: multiply elements 0-2, write the sum to element 0 */
+    return f;
+}
+
+static gmx_inline double
+gmx_simd_reduce_d_sse4_1(__m128d a)
+{
+    /* Horizontal sum of a double-precision register: returns a[0] + a[1] */
+    double  total;
+    __m128d pair = _mm_hadd_pd(a, a);
+    _mm_store_sd(&total, pair);
+    return total;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_SSE4_1_H */
diff --git a/src/gromacs/simd/macros.h b/src/gromacs/simd/macros.h

deleted file mode 100644 (file)

index a24cd56..0000000
--- a/src/gromacs/simd/macros.h
+++ /dev/null
@@ -1,832 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code.
- * To support a new architecture, adding macros here should be (nearly)
- * all that is needed.
- */
-
-#ifdef GMX_SIMD_MACROS_H
-#error "gromacs/simd/macros.h included twice"
-#else
-#define GMX_SIMD_MACROS_H
-
-/* NOTE: SSE2 acceleration does not include floor or blendv */
-
-#ifdef GMX_SIMD_REFERENCE
-/* Plain C SIMD reference implementation, also serves as documentation */
-#define GMX_HAVE_SIMD_MACROS
-
-/* Include plain-C reference implementation, also serves as documentation */
-#include "gromacs/simd/macros_ref.h"
-
-#define GMX_SIMD_REAL_WIDTH  GMX_SIMD_REF_WIDTH
-
-/* float/double SIMD register type */
-#define gmx_simd_real_t  gmx_simd_ref_pr
-
-/* boolean SIMD register type */
-#define gmx_simd_bool_t  gmx_simd_ref_pb
-
-/* integer SIMD register type, only for table indexing and exclusion masks */
-#define gmx_simd_int32_t  gmx_simd_ref_epi32
-#define GMX_SIMD_INT32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
-
-/* Load GMX_SIMD_REAL_WIDTH reals for memory starting at r */
-#define gmx_simd_load_r       gmx_simd_ref_load_pr
-/* Set all SIMD register elements to *r */
-#define gmx_simd_load1_r      gmx_simd_ref_load1_pr
-#define gmx_simd_set1_r       gmx_simd_ref_set1_pr
-#define gmx_simd_setzero_r    gmx_simd_ref_setzero_pr
-#define gmx_simd_store_r      gmx_simd_ref_store_pr
-
-#define gmx_simd_add_r        gmx_simd_ref_add_pr
-#define gmx_simd_sub_r        gmx_simd_ref_sub_pr
-#define gmx_simd_mul_r        gmx_simd_ref_mul_pr
-/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_simd_fmadd_r       gmx_simd_ref_madd_pr
-#define gmx_simd_fnmadd_r      gmx_simd_ref_nmsub_pr
-
-#define gmx_simd_max_r        gmx_simd_ref_max_pr
-#define gmx_simd_blendzero_r  gmx_simd_ref_blendzero_pr
-
-#define gmx_simd_round_r      gmx_simd_ref_round_pr
-
-/* Not required, only used to speed up the nbnxn tabulated PME kernels */
-#define GMX_SIMD_HAVE_FLOOR
-#ifdef GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r      gmx_simd_ref_floor_pr
-#endif
-
-/* Not required, only used when blendv is faster than comparison */
-#define GMX_SIMD_HAVE_BLENDV
-#ifdef GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r     gmx_simd_ref_blendv_pr
-#endif
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
-
-/* Very specific operation required in the non-bonded kernels */
-#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
-
-/* Comparison */
-#define gmx_simd_cmplt_r      gmx_simd_ref_cmplt_pr
-
-/* Logical operations on SIMD booleans */
-#define gmx_simd_and_b        gmx_simd_ref_and_pb
-#define gmx_simd_or_b         gmx_simd_ref_or_pb
-
-/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_simd_anytrue_b    gmx_simd_ref_anytrue_pb
-
-/* Conversions only used for PME table lookup */
-#define gmx_simd_cvtt_r2i  gmx_simd_ref_cvttpr_epi32
-#define gmx_simd_cvt_i2r   gmx_simd_ref_cvtepi32_pr
-
-/* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-#define gmx_simd_rsqrt_r      gmx_simd_ref_rsqrt_pr
-#define gmx_simd_rcp_r        gmx_simd_ref_rcp_pr
-
-/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
-#define GMX_SIMD_HAVE_EXP
-#ifdef GMX_SIMD_HAVE_EXP
-#define gmx_simd_exp_r        gmx_simd_ref_exp_pr
-#endif
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_simd_sqrt_r       gmx_simd_ref_sqrt_pr
-#define gmx_simd_sincos_r     gmx_simd_ref_sincos_pr
-#define gmx_simd_acos_r       gmx_simd_ref_acos_pr
-#define gmx_simd_atan2_r      gmx_simd_ref_atan2_pr
-#endif
-
-#endif /* GMX_SIMD_REFERENCE */
-
-
-/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
- * to instructions for) different SIMD width and float precision.
- *
- * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
- * The _pr suffix is replaced by _ps or _pd (for single or double precision).
- * Compiler settings will decide if 128-bit intrinsics will
- * be translated into SSE or AVX instructions.
- */
-
-
-#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__
-/* We have half SIMD width support, continue */
-#else
-#error "half SIMD width intrinsics are not supported"
-#endif
-#endif
-
-#if defined GMX_TARGET_X86 && !defined __MIC__
-
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-/* This is for general x86 SIMD instruction sets that also support SSE2 */
-#define GMX_HAVE_SIMD_MACROS
-
-/* Include the highest supported x86 SIMD intrisics + math functions */
-#ifdef GMX_SIMD_X86_AVX_256_OR_HIGHER
-#include "general_x86_avx_256.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_avx_256_double.h"
-#else  /* GMX_DOUBLE */
-#include "math_x86_avx_256_single.h"
-#endif /* GMX_DOUBLE */
-#else  /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#include "general_x86_avx_128_fma.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_avx_128_fma_double.h"
-#else  /* GMX_DOUBLE */
-#include "math_x86_avx_128_fma_single.h"
-#endif /* GMX_DOUBLE */
-#else  /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
-#ifdef GMX_SIMD_X86_SSE4_1
-#include "general_x86_sse4_1.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_sse4_1_double.h"
-#else  /* GMX_DOUBLE */
-#include "math_x86_sse4_1_single.h"
-#endif /* GMX_DOUBLE */
-#else  /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include "general_x86_sse2.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_sse2_double.h"
-#else  /* GMX_DOUBLE */
-#include "math_x86_sse2_single.h"
-#endif /* GMX_DOUBLE */
-#else  /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#error No x86 acceleration defined
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#endif /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
-#endif /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
-#endif /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
-
-/* exp and trigonometric functions are included above */
-#define GMX_SIMD_HAVE_EXP
-#define GMX_SIMD_HAVE_ERFC
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-
-#if !defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined GMX_USE_HALF_WIDTH_SIMD_HERE
-
-#ifndef GMX_DOUBLE
-
-#define GMX_SIMD_REAL_WIDTH  4
-
-#define gmx_simd_real_t  __m128
-
-#define gmx_simd_bool_t  __m128
-
-#define gmx_simd_int32_t  __m128i
-#define GMX_SIMD_INT32_WIDTH  4
-
-#define gmx_simd_load_r       _mm_load_ps
-#define gmx_simd_load1_r      _mm_load1_ps
-#define gmx_simd_set1_r       _mm_set1_ps
-#define gmx_simd_setzero_r    _mm_setzero_ps
-#define gmx_simd_store_r      _mm_store_ps
-
-#define gmx_simd_add_r        _mm_add_ps
-#define gmx_simd_sub_r        _mm_sub_ps
-#define gmx_simd_mul_r        _mm_mul_ps
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
-#define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
-#else
-#define gmx_simd_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
-#endif
-#define gmx_simd_max_r        _mm_max_ps
-#define gmx_simd_blendzero_r  _mm_and_ps
-
-#define gmx_simd_cmplt_r      _mm_cmplt_ps
-#define gmx_simd_and_b        _mm_and_ps
-#define gmx_simd_or_b         _mm_or_ps
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define gmx_simd_round_r(x)   _mm_round_ps(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r      _mm_floor_ps
-#else
-#define gmx_simd_round_r(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
-#endif
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r     _mm_blendv_ps
-#endif
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    /* The value -0.0 has only the sign-bit set */
-    gmx_simd_real_t sign_mask = _mm_set1_ps(-0.0);
-    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return _mm_add_ps(b, _mm_andnot_ps(a, c));
-};
-
-#define gmx_simd_anytrue_b    _mm_movemask_ps
-
-#define gmx_simd_cvtt_r2i  _mm_cvttps_epi32
-#define gmx_simd_cvt_i2r   _mm_cvtepi32_ps
-
-#define gmx_simd_rsqrt_r      _mm_rsqrt_ps
-#define gmx_simd_rcp_r        _mm_rcp_ps
-
-#define gmx_simd_exp_r        gmx_mm_exp_ps
-#define gmx_simd_sqrt_r       gmx_mm_sqrt_ps
-#define gmx_simd_sincos_r     gmx_mm_sincos_ps
-#define gmx_simd_acos_r       gmx_mm_acos_ps
-#define gmx_simd_atan2_r      gmx_mm_atan2_ps
-#define gmx_simd_erfc_r       gmx_mm_erfc_ps
-
-#else /* ifndef GMX_DOUBLE */
-
-#define GMX_SIMD_REAL_WIDTH  2
-
-#define gmx_simd_real_t  __m128d
-
-#define gmx_simd_bool_t  __m128d
-
-#define gmx_simd_int32_t  __m128i
-#define GMX_SIMD_INT32_WIDTH  4
-
-#define gmx_simd_load_r       _mm_load_pd
-#define gmx_simd_load1_r      _mm_load1_pd
-#define gmx_simd_set1_r       _mm_set1_pd
-#define gmx_simd_setzero_r    _mm_setzero_pd
-#define gmx_simd_store_r      _mm_store_pd
-
-#define gmx_simd_add_r        _mm_add_pd
-#define gmx_simd_sub_r        _mm_sub_pd
-#define gmx_simd_mul_r        _mm_mul_pd
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r(a, b, c)   _mm_macc_pd(a, b, c)
-#define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_pd(a, b, c)
-#else
-#define gmx_simd_fmadd_r(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
-#define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
-#endif
-#define gmx_simd_max_r        _mm_max_pd
-#define gmx_simd_blendzero_r  _mm_and_pd
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define gmx_simd_round_r(x)   _mm_round_pd(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r      _mm_floor_pd
-#else
-#define gmx_simd_round_r(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
-/* gmx_simd_floor_r is not used in code for pre-SSE4_1 hardware */
-#endif
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r     _mm_blendv_pd
-#endif
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    gmx_simd_real_t sign_mask = _mm_set1_pd(-0.0);
-    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return _mm_add_pd(b, _mm_andnot_pd(a, c));
-};
-
-#define gmx_simd_cmplt_r      _mm_cmplt_pd
-
-#define gmx_simd_and_b        _mm_and_pd
-#define gmx_simd_or_b         _mm_or_pd
-
-#define gmx_simd_anytrue_b    _mm_movemask_pd
-
-#define gmx_simd_cvtt_r2i  _mm_cvttpd_epi32
-#define gmx_simd_cvt_i2r   _mm_cvtepi32_pd
-
-#define gmx_simd_rsqrt_r(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
-#define gmx_simd_rcp_r(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
-
-#define gmx_simd_exp_r        gmx_mm_exp_pd
-#define gmx_simd_sqrt_r       gmx_mm_sqrt_pd
-#define gmx_simd_sincos_r     gmx_mm_sincos_pd
-#define gmx_simd_acos_r       gmx_mm_acos_pd
-#define gmx_simd_atan2_r      gmx_mm_atan2_pd
-#define gmx_simd_erfc_r       gmx_mm_erfc_pd
-
-#endif /* ifndef GMX_DOUBLE */
-
-#else
-/* We have GMX_SIMD_X86_AVX_256_OR_HIGHER and not GMX_USE_HALF_WIDTH_SIMD_HERE,
- * so we use 256-bit SIMD.
- */
-
-#ifndef GMX_DOUBLE
-
-#define GMX_SIMD_REAL_WIDTH  8
-
-#define gmx_simd_real_t  __m256
-
-#define gmx_simd_bool_t  __m256
-
-#define gmx_simd_int32_t  __m256i
-#define GMX_SIMD_INT32_WIDTH  8
-
-#define gmx_simd_load_r       _mm256_load_ps
-#define gmx_simd_load1_r(x)   _mm256_set1_ps((x)[0])
-#define gmx_simd_set1_r       _mm256_set1_ps
-#define gmx_simd_setzero_r    _mm256_setzero_ps
-#define gmx_simd_store_r      _mm256_store_ps
-
-#define gmx_simd_add_r        _mm256_add_ps
-#define gmx_simd_sub_r        _mm256_sub_ps
-#define gmx_simd_mul_r        _mm256_mul_ps
-#define gmx_simd_fmadd_r(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
-#define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
-#define gmx_simd_max_r        _mm256_max_ps
-#define gmx_simd_blendzero_r  _mm256_and_ps
-
-#define gmx_simd_round_r(x)   _mm256_round_ps(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r      _mm256_floor_ps
-
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r     _mm256_blendv_ps
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    gmx_simd_real_t sign_mask = _mm256_set1_ps(-0.0);
-    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
-};
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd_cmplt_r(x, y) _mm256_cmp_ps(x, y, 0x11)
-#define gmx_simd_and_b        _mm256_and_ps
-#define gmx_simd_or_b         _mm256_or_ps
-
-#define gmx_simd_anytrue_b    _mm256_movemask_ps
-
-#define gmx_simd_cvtt_r2i  _mm256_cvttps_epi32
-
-#define gmx_simd_rsqrt_r      _mm256_rsqrt_ps
-#define gmx_simd_rcp_r        _mm256_rcp_ps
-
-#define gmx_simd_exp_r        gmx_mm256_exp_ps
-#define gmx_simd_sqrt_r       gmx_mm256_sqrt_ps
-#define gmx_simd_sincos_r     gmx_mm256_sincos_ps
-#define gmx_simd_acos_r       gmx_mm256_acos_ps
-#define gmx_simd_atan2_r      gmx_mm256_atan2_ps
-#define gmx_simd_erfc_r       gmx_mm256_erfc_ps
-
-#else /* ifndef GMX_DOUBLE */
-
-#define GMX_SIMD_REAL_WIDTH  4
-
-#define gmx_simd_real_t  __m256d
-
-#define gmx_simd_bool_t  __m256d
-
-/* We use 128-bit integer registers because of missing 256-bit operations */
-#define gmx_simd_int32_t  __m128i
-#define GMX_SIMD_INT32_WIDTH  4
-
-#define gmx_simd_load_r       _mm256_load_pd
-#define gmx_simd_load1_r(x)   _mm256_set1_pd((x)[0])
-#define gmx_simd_set1_r       _mm256_set1_pd
-#define gmx_simd_setzero_r    _mm256_setzero_pd
-#define gmx_simd_store_r      _mm256_store_pd
-
-#define gmx_simd_add_r        _mm256_add_pd
-#define gmx_simd_sub_r        _mm256_sub_pd
-#define gmx_simd_mul_r        _mm256_mul_pd
-#define gmx_simd_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd_max_r        _mm256_max_pd
-#define gmx_simd_blendzero_r  _mm256_and_pd
-
-#define gmx_simd_round_r(x)   _mm256_round_pd(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r      _mm256_floor_pd
-
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r     _mm256_blendv_pd
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    gmx_simd_real_t sign_mask = _mm256_set1_pd(-0.0);
-    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
-};
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
-
-#define gmx_simd_and_b        _mm256_and_pd
-#define gmx_simd_or_b         _mm256_or_pd
-
-#define gmx_simd_anytrue_b    _mm256_movemask_pd
-
-#define gmx_simd_cvtt_r2i  _mm256_cvttpd_epi32
-
-#define gmx_simd_rsqrt_r(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
-#define gmx_simd_rcp_r(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
-
-#define gmx_simd_exp_r        gmx_mm256_exp_pd
-#define gmx_simd_sqrt_r       gmx_mm256_sqrt_pd
-#define gmx_simd_sincos_r     gmx_mm256_sincos_pd
-#define gmx_simd_acos_r       gmx_mm256_acos_pd
-#define gmx_simd_atan2_r      gmx_mm256_atan2_pd
-#define gmx_simd_erfc_r       gmx_mm256_erfc_pd
-
-#endif /* ifndef GMX_DOUBLE */
-
-#endif /* 128- or 256-bit x86 SIMD */
-
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-
-#endif /* GMX_TARGET_X86 */
-
-#ifdef GMX_SIMD_IBM_QPX
-
-/* This hack works on the compilers that can reach this code. A real
-   solution with broader scope will be proposed in master branch. */
-#define gmx_always_inline __attribute__((always_inline))
-
-/* This is for the A2 core on BlueGene/Q that supports IBM's QPX
-   vector built-in functions */
-#include <mass_simd.h>
-#define GMX_HAVE_SIMD_MACROS
-#ifdef __clang__
-#include <qpxmath.h>
-#endif
-
-/* No need to version the code by the precision, because the QPX AXU
-   extends to and truncates from double precision for free. */
-
-#define GMX_SIMD_REAL_WIDTH  4
-typedef vector4double gmx_simd_real_t;
-typedef vector4double gmx_simd_bool_t;
-typedef vector4double gmx_simd_int32_t;
-#define GMX_SIMD_INT32_WIDTH  4
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load_r(const real *a)
-{
-#ifdef NDEBUG
-    return vec_ld(0, (real *) a);
-#else
-    return vec_lda(0, (real *) a);
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load1_r(const real *a)
-{
-    return vec_splats(*a);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_set1_r(real a)
-{
-    return vec_splats(a);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_setzero_r()
-{
-    return vec_splats(0.0);
-}
-
-static gmx_inline void gmx_always_inline gmx_simd_store_r(real *a, gmx_simd_real_t b)
-{
-#ifdef NDEBUG
-    vec_st(b, 0, a);
-#else
-    vec_sta(b, 0, a);
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_add_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_add(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sub_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_sub(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_mul_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_mul(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return vec_madd(a, b, c);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fnmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return vec_nmsub(a, b, c);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_max_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_sel(b, a, vec_sub(a, b));
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendzero_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_sel(gmx_simd_setzero_r(), a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_cmplt_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_cmplt(a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_and_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
-{
-    return vec_and(a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_or_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
-{
-    return vec_or(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_round_r(gmx_simd_real_t a)
-{
-    return vec_round(a);
-}
-
-#define GMX_SIMD_HAVE_FLOOR
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_floor_r(gmx_simd_real_t a)
-{
-    return vec_floor(a);
-}
-
-#define GMX_SIMD_HAVE_BLENDV
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendv_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return vec_sel(b, a, gmx_simd_cmplt_r(gmx_simd_setzero_r(), c));
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-    return vec_cpsgn(a, b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
-    return vec_add(b, vec_sel(c, gmx_simd_setzero_r(), a));
-};
-
-static gmx_inline gmx_bool gmx_always_inline
-GMX_SIMD_IS_TRUE(real x)
-{
-    return x >= 0.0;
-}
-
-static gmx_inline gmx_simd_int32_t gmx_always_inline gmx_simd_cvtt_r2i(gmx_simd_real_t a)
-{
-    return vec_ctiwuz(a);
-}
-/* Don't want this, we have floor */
-/* #define gmx_simd_cvt_i2r   vec_cvtepi32 */
-
-/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
-   Architecture only promises 2^-8. So probably no need for
-   Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rsqrt_r(gmx_simd_real_t a)
-{
-    return vec_rsqrte(a);
-}
-
-/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
-   Architecture only promises 2^-5. So probably no need for
-   Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rcp_r(gmx_simd_real_t a)
-{
-    return vec_re(a);
-}
-
-/* Note that here, and below, we use the built-in SLEEF port when
-   compiling on BlueGene/Q with clang */
-
-#define GMX_SIMD_HAVE_EXP
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_exp_r(gmx_simd_real_t a)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
-    return xexpf(a);
-#else
-    return xexp(a);
-#endif
-#else
-#ifndef GMX_DOUBLE
-    return expf4(a);
-#else
-    return expd4(a);
-#endif
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sqrt_r(gmx_simd_real_t a)
-{
-#ifdef NDEBUG
-    return vec_swsqrt_nochk(a);
-#else
-    return vec_swsqrt(a);
-#endif
-}
-
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-static gmx_inline int gmx_always_inline gmx_simd_sincos_r(gmx_simd_real_t a, gmx_simd_real_t *b, gmx_simd_real_t *c)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
-    xsincosf(a, b, c);
-#else
-    xsincos(a, b, c);
-#endif
-#else
-#ifndef GMX_DOUBLE
-    sincosf4(a, b, c);
-#else
-    sincosd4(a, b, c);
-#endif
-#endif
-    return 1;
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_acos_r(gmx_simd_real_t a)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
-    return xacosf(a);
-#else
-    return xacos(a);
-#endif
-#else
-#ifndef GMX_DOUBLE
-    return acosf4(a);
-#else
-    return acosd4(a);
-#endif
-#endif
-}
-
-/* NB The order of parameters here is correct; the
-   documentation of atan2[df]4 in SIMD MASS is wrong. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_atan2_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
-    return xatan2f(a, b);
-#else
-    return xatan2(a, b);
-#endif
-#else
-#ifndef GMX_DOUBLE
-    return atan2f4(a, b);
-#else
-    return atan2d4(a, b);
-#endif
-#endif
-}
-
-#define GMX_SIMD_HAVE_ERFC
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_erfc_r(gmx_simd_real_t a)
-{
-    /* The BG/Q qpxmath.h vector math library intended for use with
-       bgclang does not have erfc, so we need to use a function from
-       mass_simd.h. If this changes, then the #include <mass_simd.h> can
-       become conditional. */
-#ifndef GMX_DOUBLE
-    return erfcf4(a);
-#else
-    return erfcd4(a);
-#endif
-}
-
-/* TODO: gmx_mm_erfc_p[sd] should be generalized using gmx_*_pr, so that it just works on BlueGene */
-
-static gmx_inline int gmx_always_inline
-gmx_simd_anytrue_b(gmx_simd_bool_t a)
-{
-    /* The "anytrue" is done solely on the QPX AXU (which is the only
-       available FPU). This is awkward, because pretty much no
-       "horizontal" SIMD-vector operations exist, unlike x86 where
-       SSE4.1 added various kinds of horizontal operations. So we have
-       to make do with shifting vector elements and operating on the
-       results. This makes for lots of data dependency, but the main
-       alternative of storing to memory and reloading is not going to
-       help, either. OpenMP over 2 or 4 hardware threads per core will
-       hide much of the latency from the data dependency. The
-       vec_extract() lets the compiler correctly use a floating-point
-       comparison on the zeroth vector element, which avoids needing
-       memory at all.
-     */
-    gmx_simd_bool_t vec_shifted_left_0 = a;
-    gmx_simd_bool_t vec_shifted_left_1 = vec_sldw(a, a, 1);
-    gmx_simd_bool_t vec_shifted_left_2 = vec_sldw(a, a, 2);
-    gmx_simd_bool_t vec_shifted_left_3 = vec_sldw(a, a, 3);
-
-    gmx_simd_bool_t vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
-                                        vec_or(vec_shifted_left_0, vec_shifted_left_1));
-    return (0.0 < vec_extract(vec_return, 0));
-};
-
-#undef gmx_always_inline
-
-#endif /* GMX_SIMD_IBM_QPX */
-
-#ifdef __MIC__
-#include "general_x86_mic.h"
-#endif
-
-#ifdef GMX_HAVE_SIMD_MACROS
-/* Generic functions to extract a SIMD aligned pointer from a pointer x.
- * x should have at least GMX_SIMD_REAL_WIDTH elements extra compared
- * to how many you want to use, to avoid indexing outside the aligned region.
- */
-
-static gmx_inline real *
-gmx_simd_align_r(const real *x)
-{
-    return (real *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(real)-1))));
-}
-
-static gmx_inline int *
-gmx_simd_align_i(const int *x)
-{
-    return (int  *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(int )-1))));
-}
-
-
-/* Include the math functions which only need the above macros,
- * generally these are the ones that don't need masking operations.
- */
-#ifdef GMX_DOUBLE
-#include "math_double.h"
-#else
-#include "math_single.h"
-#endif
-
-
-#endif /* GMX_HAVE_SIMD_MACROS */
-
-#endif
diff --git a/src/gromacs/simd/macros_ref.h b/src/gromacs/simd/macros_ref.h

deleted file mode 100644 (file)

index 2f11e04..0000000
--- a/src/gromacs/simd/macros_ref.h
+++ /dev/null
@@ -1,526 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_MACROS_REF_H
-#define GMX_SIMD_MACROS_REF_H
-
-/* This file contains a reference plain-C implementation of arbitrary width.
- * This code is only useful for testing and documentation.
- * The SIMD width is set by defining GMX_SIMD_REF_WIDTH before including.
- */
-
-
-#ifndef GMX_SIMD_REF_WIDTH
-#error "GMX_SIMD_REF_WIDTH should be defined before including gromacs/simd/macros_ref.h"
-#endif
-
-#include <math.h>
-
-/* float/double SIMD register type */
-typedef struct {
-    real r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_pr;
-
-/* boolean SIMD register type */
-typedef struct {
-    char r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_pb;
-
-/* integer SIMD register type, only for table indexing and exclusion masks */
-typedef struct {
-    int r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_epi32;
-#define GMX_SIMD_REF_EPI32_WIDTH  GMX_SIMD_REF_WIDTH
-
-/* Load GMX_SIMD_REF_WIDTH reals from memory starting at r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_load_pr(const real *r)
-{
-    gmx_simd_ref_pr a;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        a.r[i] = r[i];
-    }
-
-    return a;
-}
-
-/* Set all SIMD register elements to *r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_load1_pr(const real *r)
-{
-    gmx_simd_ref_pr a;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        a.r[i] = *r;
-    }
-
-    return a;
-}
-
-/* Set all SIMD register elements to r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_set1_pr(real r)
-{
-    gmx_simd_ref_pr a;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        a.r[i] = r;
-    }
-
-    return a;
-}
-
-/* Set all SIMD register elements to 0 */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_setzero_pr()
-{
-    gmx_simd_ref_pr a;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        a.r[i] = 0.0;
-    }
-
-    return a;
-}
-
-static gmx_inline void
-gmx_simd_ref_store_pr(real *dest, gmx_simd_ref_pr src)
-{
-    int i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        dest[i] = src.r[i];
-    }
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_add_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = a.r[i] + b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_sub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = a.r[i] - b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_mul_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = a.r[i]*b.r[i];
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_madd_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
-    gmx_simd_ref_pr d;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        d.r[i] = a.r[i]*b.r[i] + c.r[i];
-    }
-
-    return d;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_nmsub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
-    gmx_simd_ref_pr d;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        d.r[i] = -a.r[i]*b.r[i] + c.r[i];
-    }
-
-    return d;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_max_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
-    }
-
-    return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_blendzero_pr(gmx_simd_ref_pr a, gmx_simd_ref_pb b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (b.r[i] ? a.r[i] : 0.0);
-    }
-
-    return c;
-}
-
-/* Note that this reference implementation rounds away from zero,
- * whereas most SIMD intrinsics will round to nearest even. Since this
- * function is only used for periodic image calculations, the rounding
- * of mantissas close to 0.5 is irrelevant, except in testing. This
- * could be fixed by using rint/rintf, but the bigger problem is that
- * MSVC does not support full C99, and none of the round or rint
- * functions are defined. It's much easier to approximately implement
- * round() than rint(), so we do that and hope we never get bitten in
- * testing. (Thanks, Microsoft.)
- */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_round_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-#ifdef _MSC_VER
-        int temp = (a.r[i] >= 0.)
-            ? (a.r[i] + 0.5)
-            : (a.r[i] - 0.5);
-        b.r[i] = (real) temp;
-#elif defined GMX_DOUBLE
-        b.r[i] = round(a.r[i]);
-#else
-        b.r[i] = roundf(a.r[i]);
-#endif
-    }
-
-    return b;
-}
-
-/* Not required, only used to speed up the nbnxn tabulated PME kernels */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_floor_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-#ifdef GMX_DOUBLE
-        b.r[i] = floor(a.r[i]);
-#else
-        b.r[i] = floorf(a.r[i]);
-#endif
-    }
-
-    return b;
-}
-
-/* Not required, only used when blendv is faster than comparison */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_blendv_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
-    gmx_simd_ref_pr d;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        d.r[i] = (c.r[i] >= 0) ? a.r[i] : b.r[i];
-    }
-
-    return d;
-}
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_cpsgn_nonneg_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] >= 0) ? b.r[i] : -b.r[i];
-    }
-
-    return c;
-}
-
-/* Very specific operation required in the non-bonded kernels */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_masknot_add_pr(gmx_simd_ref_pb a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
-    gmx_simd_ref_pr d;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        d.r[i] = a.r[i] ? b.r[i] : b.r[i] + c.r[i];
-    }
-
-    return d;
-}
-
-/* Comparison */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pb c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] < b.r[i]);
-    }
-
-    return c;
-}
-
-/* Logical AND on SIMD booleans. */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
-{
-    gmx_simd_ref_pb c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] && b.r[i]);
-    }
-
-    return c;
-}
-
-/* Logical OR on SIMD booleans. */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
-{
-    gmx_simd_ref_pb c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = (a.r[i] || b.r[i]);
-    }
-
-    return c;
-}
-
-/* Returns a single int (0/1) which tells if any of the booleans is True */
-static gmx_inline int
-gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a)
-{
-    int anytrue;
-    int i;
-
-    anytrue = 0;
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        if (a.r[i])
-        {
-            anytrue = 1;
-        }
-    }
-
-    return anytrue;
-}
-
-/* Conversions only used for PME table lookup */
-static gmx_inline gmx_simd_ref_epi32
-gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_epi32 b;
-    int                i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        b.r[i] = (int)a.r[i];
-    }
-
-    return b;
-};
-
-/* These two functions only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-#ifdef GMX_DOUBLE
-        b.r[i] = 1.0/sqrt(a.r[i]);
-#else
-        b.r[i] = 1.0/sqrtf(a.r[i]);
-#endif
-    }
-
-    return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_rcp_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        b.r[i] = 1.0/a.r[i];
-    }
-
-    return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_exp_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-#ifdef GMX_DOUBLE
-        b.r[i] = exp(a.r[i]);
-#else
-        b.r[i] = expf(a.r[i]);
-#endif
-    }
-
-    return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_sqrt_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-#ifdef GMX_DOUBLE
-        b.r[i] = sqrt(a.r[i]);
-#else
-        b.r[i] = sqrtf(a.r[i]);
-#endif
-    }
-
-    return b;
-}
-
-static gmx_inline int
-gmx_simd_ref_sincos_pr(gmx_simd_ref_pr a,
-                       gmx_simd_ref_pr *s, gmx_simd_ref_pr *c)
-{
-    int i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        s->r[i] = sin(a.r[i]);
-        c->r[i] = cos(a.r[i]);
-    }
-
-    return 0;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_acos_pr(gmx_simd_ref_pr a)
-{
-    gmx_simd_ref_pr b;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        b.r[i] = acos(a.r[i]);
-    }
-
-    return b;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_atan2_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
-    gmx_simd_ref_pr c;
-    int             i;
-
-    for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
-    {
-        c.r[i] = atan2(a.r[i], b.r[i]);
-    }
-
-    return c;
-}
-
-#endif
diff --git a/src/gromacs/simd/math_double.h b/src/gromacs/simd/math_double.h

deleted file mode 100644 (file)

index 8e7d733..0000000
--- a/src/gromacs/simd/math_double.h
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_MATH_DOUBLE_H_
-#define GMX_SIMD_MATH_DOUBLE_H_
-
-
-/* 1.0/sqrt(x) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_invsqrt_r(gmx_simd_real_t x)
-{
-    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
-    const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    gmx_simd_real_t lu = gmx_simd_rsqrt_r(x);
-
-    lu = gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
-    return gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
-}
-
-
-/* 1.0/x */
-static gmx_inline gmx_simd_real_t
-gmx_simd_inv_r(gmx_simd_real_t x)
-{
-    const gmx_simd_real_t two  = gmx_simd_set1_r(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    gmx_simd_real_t lu = gmx_simd_rcp_r(x);
-
-    /* Perform two N-R steps for double precision */
-    lu         = gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
-    return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
-{
-    const gmx_simd_real_t  FN10     = gmx_simd_set1_r(-8.0072854618360083154e-14);
-    const gmx_simd_real_t  FN9      = gmx_simd_set1_r(1.1859116242260148027e-11);
-    const gmx_simd_real_t  FN8      = gmx_simd_set1_r(-8.1490406329798423616e-10);
-    const gmx_simd_real_t  FN7      = gmx_simd_set1_r(3.4404793543907847655e-8);
-    const gmx_simd_real_t  FN6      = gmx_simd_set1_r(-9.9471420832602741006e-7);
-    const gmx_simd_real_t  FN5      = gmx_simd_set1_r(0.000020740315999115847456);
-    const gmx_simd_real_t  FN4      = gmx_simd_set1_r(-0.00031991745139313364005);
-    const gmx_simd_real_t  FN3      = gmx_simd_set1_r(0.0035074449373659008203);
-    const gmx_simd_real_t  FN2      = gmx_simd_set1_r(-0.031750380176100813405);
-    const gmx_simd_real_t  FN1      = gmx_simd_set1_r(0.13884101728898463426);
-    const gmx_simd_real_t  FN0      = gmx_simd_set1_r(-0.75225277815249618847);
-
-    const gmx_simd_real_t  FD5      = gmx_simd_set1_r(0.000016009278224355026701);
-    const gmx_simd_real_t  FD4      = gmx_simd_set1_r(0.00051055686934806966046);
-    const gmx_simd_real_t  FD3      = gmx_simd_set1_r(0.0081803507497974289008);
-    const gmx_simd_real_t  FD2      = gmx_simd_set1_r(0.077181146026670287235);
-    const gmx_simd_real_t  FD1      = gmx_simd_set1_r(0.41543303143712535988);
-    const gmx_simd_real_t  FD0      = gmx_simd_set1_r(1.0);
-
-    gmx_simd_real_t        z4;
-    gmx_simd_real_t        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = gmx_simd_mul_r(z2, z2);
-
-    polyFD1        = gmx_simd_fmadd_r(FD5, z4, FD3);
-    polyFD1        = gmx_simd_fmadd_r(polyFD1, z4, FD1);
-    polyFD1        = gmx_simd_mul_r(polyFD1, z2);
-    polyFD0        = gmx_simd_fmadd_r(FD4, z4, FD2);
-    polyFD0        = gmx_simd_fmadd_r(polyFD0, z4, FD0);
-    polyFD0        = gmx_simd_add_r(polyFD0, polyFD1);
-
-    polyFD0        = gmx_simd_inv_r(polyFD0);
-
-    polyFN0        = gmx_simd_fmadd_r(FN10, z4, FN8);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN6);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN4);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN2);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN0);
-    polyFN1        = gmx_simd_fmadd_r(FN9, z4, FN7);
-    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN5);
-    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN3);
-    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN1);
-    polyFN0        = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
-
-    return gmx_simd_mul_r(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
-{
-    const gmx_simd_real_t  VN9      = gmx_simd_set1_r(-9.3723776169321855475e-13);
-    const gmx_simd_real_t  VN8      = gmx_simd_set1_r(1.2280156762674215741e-10);
-    const gmx_simd_real_t  VN7      = gmx_simd_set1_r(-7.3562157912251309487e-9);
-    const gmx_simd_real_t  VN6      = gmx_simd_set1_r(2.6215886208032517509e-7);
-    const gmx_simd_real_t  VN5      = gmx_simd_set1_r(-4.9532491651265819499e-6);
-    const gmx_simd_real_t  VN4      = gmx_simd_set1_r(0.00025907400778966060389);
-    const gmx_simd_real_t  VN3      = gmx_simd_set1_r(0.0010585044856156469792);
-    const gmx_simd_real_t  VN2      = gmx_simd_set1_r(0.045247661136833092885);
-    const gmx_simd_real_t  VN1      = gmx_simd_set1_r(0.11643931522926034421);
-    const gmx_simd_real_t  VN0      = gmx_simd_set1_r(1.1283791671726767970);
-
-    const gmx_simd_real_t  VD5      = gmx_simd_set1_r(0.000021784709867336150342);
-    const gmx_simd_real_t  VD4      = gmx_simd_set1_r(0.00064293662010911388448);
-    const gmx_simd_real_t  VD3      = gmx_simd_set1_r(0.0096311444822588683504);
-    const gmx_simd_real_t  VD2      = gmx_simd_set1_r(0.085608012351550627051);
-    const gmx_simd_real_t  VD1      = gmx_simd_set1_r(0.43652499166614811084);
-    const gmx_simd_real_t  VD0      = gmx_simd_set1_r(1.0);
-
-    gmx_simd_real_t        z4;
-    gmx_simd_real_t        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = gmx_simd_mul_r(z2, z2);
-
-    polyVD1        = gmx_simd_fmadd_r(VD5, z4, VD3);
-    polyVD0        = gmx_simd_fmadd_r(VD4, z4, VD2);
-    polyVD1        = gmx_simd_fmadd_r(polyVD1, z4, VD1);
-    polyVD0        = gmx_simd_fmadd_r(polyVD0, z4, VD0);
-    polyVD0        = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
-
-    polyVD0        = gmx_simd_inv_r(polyVD0);
-
-    polyVN1        = gmx_simd_fmadd_r(VN9, z4, VN7);
-    polyVN0        = gmx_simd_fmadd_r(VN8, z4, VN6);
-    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN5);
-    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN4);
-    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN3);
-    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN2);
-    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN1);
-    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN0);
-    polyVN0        = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
-
-    return gmx_simd_mul_r(polyVN0, polyVD0);
-}
-
-
-#endif
diff --git a/src/gromacs/simd/math_single.h b/src/gromacs/simd/math_single.h

deleted file mode 100644 (file)

index c956b9a..0000000
--- a/src/gromacs/simd/math_single.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_MATH_SINGLE_H_
-#define GMX_SIMD_MATH_SINGLE_H_
-
-
-/* 1.0/sqrt(x) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_invsqrt_r(gmx_simd_real_t x)
-{
-    /* This is one of the few cases where FMA adds a FLOP, but ends up with
-     * fewer instructions in total when FMA is available in hardware.
-     * Usually we would not optimize this far, but invsqrt is used often.
-     */
-#ifdef GMX_SIMD_HAVE_FMA
-    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
-    const gmx_simd_real_t one   = gmx_simd_set1_r(1.0);
-
-    gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
-
-    return gmx_simd_fmadd_r(gmx_simd_fnmadd_r(x, gmx_simd_mul_r(lu, lu), one), gmx_simd_mul_r(lu, half), lu);
-#else
-    const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
-    const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
-
-    gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
-
-    return gmx_simd_mul_r(half, gmx_simd_mul_r(gmx_simd_sub_r(three, gmx_simd_mul_r(gmx_simd_mul_r(lu, lu), x)), lu));
-#endif
-}
-
-
-/* 1.0/x */
-static gmx_inline gmx_simd_real_t
-gmx_simd_inv_r(gmx_simd_real_t x)
-{
-    const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
-
-    gmx_simd_real_t       lu = gmx_simd_rcp_r(x);
-
-    return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
-{
-    const gmx_simd_real_t  FN6      = gmx_simd_set1_r(-1.7357322914161492954e-8f);
-    const gmx_simd_real_t  FN5      = gmx_simd_set1_r(1.4703624142580877519e-6f);
-    const gmx_simd_real_t  FN4      = gmx_simd_set1_r(-0.000053401640219807709149f);
-    const gmx_simd_real_t  FN3      = gmx_simd_set1_r(0.0010054721316683106153f);
-    const gmx_simd_real_t  FN2      = gmx_simd_set1_r(-0.019278317264888380590f);
-    const gmx_simd_real_t  FN1      = gmx_simd_set1_r(0.069670166153766424023f);
-    const gmx_simd_real_t  FN0      = gmx_simd_set1_r(-0.75225204789749321333f);
-
-    const gmx_simd_real_t  FD4      = gmx_simd_set1_r(0.0011193462567257629232f);
-    const gmx_simd_real_t  FD3      = gmx_simd_set1_r(0.014866955030185295499f);
-    const gmx_simd_real_t  FD2      = gmx_simd_set1_r(0.11583842382862377919f);
-    const gmx_simd_real_t  FD1      = gmx_simd_set1_r(0.50736591960530292870f);
-    const gmx_simd_real_t  FD0      = gmx_simd_set1_r(1.0f);
-
-    gmx_simd_real_t        z4;
-    gmx_simd_real_t        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = gmx_simd_mul_r(z2, z2);
-
-    polyFD0        = gmx_simd_fmadd_r(FD4, z4, FD2);
-    polyFD1        = gmx_simd_fmadd_r(FD3, z4, FD1);
-    polyFD0        = gmx_simd_fmadd_r(polyFD0, z4, FD0);
-    polyFD0        = gmx_simd_fmadd_r(polyFD1, z2, polyFD0);
-
-    polyFD0        = gmx_simd_inv_r(polyFD0);
-
-    polyFN0        = gmx_simd_fmadd_r(FN6, z4, FN4);
-    polyFN1        = gmx_simd_fmadd_r(FN5, z4, FN3);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN2);
-    polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN1);
-    polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN0);
-    polyFN0        = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
-
-    return gmx_simd_mul_r(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_simd_pmecorrF_r() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
-{
-    const gmx_simd_real_t  VN6      = gmx_simd_set1_r(1.9296833005951166339e-8f);
-    const gmx_simd_real_t  VN5      = gmx_simd_set1_r(-1.4213390571557850962e-6f);
-    const gmx_simd_real_t  VN4      = gmx_simd_set1_r(0.000041603292906656984871f);
-    const gmx_simd_real_t  VN3      = gmx_simd_set1_r(-0.00013134036773265025626f);
-    const gmx_simd_real_t  VN2      = gmx_simd_set1_r(0.038657983986041781264f);
-    const gmx_simd_real_t  VN1      = gmx_simd_set1_r(0.11285044772717598220f);
-    const gmx_simd_real_t  VN0      = gmx_simd_set1_r(1.1283802385263030286f);
-
-    const gmx_simd_real_t  VD3      = gmx_simd_set1_r(0.0066752224023576045451f);
-    const gmx_simd_real_t  VD2      = gmx_simd_set1_r(0.078647795836373922256f);
-    const gmx_simd_real_t  VD1      = gmx_simd_set1_r(0.43336185284710920150f);
-    const gmx_simd_real_t  VD0      = gmx_simd_set1_r(1.0f);
-
-    gmx_simd_real_t        z4;
-    gmx_simd_real_t        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = gmx_simd_mul_r(z2, z2);
-
-    polyVD1        = gmx_simd_fmadd_r(VD3, z4, VD1);
-    polyVD0        = gmx_simd_fmadd_r(VD2, z4, VD0);
-    polyVD0        = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
-
-    polyVD0        = gmx_simd_inv_r(polyVD0);
-
-    polyVN0        = gmx_simd_fmadd_r(VN6, z4, VN4);
-    polyVN1        = gmx_simd_fmadd_r(VN5, z4, VN3);
-    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN2);
-    polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN1);
-    polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN0);
-    polyVN0        = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
-
-    return gmx_simd_mul_r(polyVN0, polyVD0);
-}
-
-
-#endif
diff --git a/src/gromacs/simd/math_x86_avx_128_fma_double.h b/src/gromacs/simd/math_x86_avx_128_fma_double.h

index 778f3de0d694a682d1a49024bc4c60578ede7aba..fed6d546946bda6576d1216ff83f5a521c51627b 100644 (file)
--- a/src/gromacs/simd/math_x86_avx_128_fma_double.h
+++ b/src/gromacs/simd/math_x86_avx_128_fma_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1316 +35,19 @@
  #ifndef GMX_SIMD_MATH_AVX_128_FMA_DOUBLE_H
  #define GMX_SIMD_MATH_AVX_128_FMA_DOUBLE_H
  
-#include <immintrin.h> /* AVX */
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#include <math.h>
-
-#include "general_x86_avx_128_fma.h"
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
-    const __m128d half  = _mm_set1_pd(0.5);
-    const __m128d three = _mm_set1_pd(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
-    lu = _mm_mul_pd(_mm_mul_pd(half, lu), _mm_nmacc_pd(_mm_mul_pd(lu, lu), x, three));
-    return _mm_mul_pd(_mm_mul_pd(half, lu), _mm_nmacc_pd(_mm_mul_pd(lu, lu), x, three));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
-    const __m128d half   = _mm_set1_pd(0.5);
-    const __m128d three  = _mm_set1_pd(3.0);
-    const __m128  halff  = _mm_set1_ps(0.5f);
-    const __m128  threef = _mm_set1_ps(3.0f);
-
-    __m128        xf, luf;
-    __m128d       lu1, lu2;
-
-    /* Do first N-R step in float for 2x throughput */
-    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
-    luf = _mm_rsqrt_ps(xf);
-
-    luf = _mm_mul_ps(_mm_mul_ps(halff, luf), _mm_nmacc_ps(_mm_mul_ps(luf, luf), xf, threef));
-
-
-    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
-    lu1 = _mm_cvtps_pd(luf);
-
-    *invsqrt1 = _mm_mul_pd(_mm_mul_pd(half, lu1), _mm_nmacc_pd(_mm_mul_pd(lu1, lu1), x1, three));
-    *invsqrt2 = _mm_mul_pd(_mm_mul_pd(half, lu2), _mm_nmacc_pd(_mm_mul_pd(lu2, lu2), x2, three));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
-    __m128d mask;
-    __m128d res;
-
-    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
-    res  = _mm_mul_pd(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
-    const __m128d two  = _mm_set1_pd(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
-    /* Perform two N-R steps for double precision */
-    lu         = _mm_mul_pd(lu, _mm_nmacc_pd(lu, x, two));
-    return _mm_mul_pd(lu, _mm_nmacc_pd(lu, x, two));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
-    const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
-    const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
-    /* Q2 == 1.0 */
-    const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
-    const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       z, z2;
-    __m128d       PolyP, PolyQ;
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(x, intpart);
-    z2        = _mm_mul_pd(z, z);
-
-    PolyP     = _mm_macc_pd(P2, z2, P1);
-    PolyQ     = _mm_add_pd(z2, Q1);
-    PolyP     = _mm_macc_pd(PolyP, z2, P0);
-    PolyQ     = _mm_macc_pd(PolyQ, z2, Q0);
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_macc_pd(two, z, one);
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
-    const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
-    const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
-
-    const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
-    const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
-    /* P0 == 1.0 */
-    const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
-    const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
-    const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
-    /* Q0 == 2.0 */
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       x, z, z2;
-    __m128d       PolyP, PolyQ;
-
-    x             = _mm_mul_pd(exparg, argscale);
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
-    z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
-    z2        = _mm_mul_pd(z, z);
-
-    PolyQ     = _mm_macc_pd(Q3, z2, Q2);
-    PolyP     = _mm_macc_pd(P2, z2, P1);
-    PolyQ     = _mm_macc_pd(PolyQ, z2, Q1);
-
-    PolyP     = _mm_macc_pd(PolyP, z2, one);
-    PolyQ     = _mm_macc_pd(PolyQ, z2, two);
-
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_macc_pd(two, z, one);
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
-    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d two        = _mm_set1_pd(2.0);
-    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
-
-    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
-    const __m128d corr2      = _mm_set1_pd(0.693359375);
-
-    const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
-    const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
-    const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
-    const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
-    const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
-    const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
-
-    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
-    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
-    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
-    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
-    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
-
-    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
-    const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
-    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
-
-    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
-    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
-    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
-
-    __m128d       fexp;
-    __m128i       iexp;
-
-    __m128d       mask1, mask2;
-    __m128d       corr, t1, t2, q;
-    __m128d       zA, yA, xA, zB, yB, xB, z;
-    __m128d       polyR, polyS;
-    __m128d       polyP1, polyP2, polyQ1, polyQ2;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp   = _mm_and_pd(x, expmask);
-    iexp   = gmx_mm_castpd_si128(fexp);
-    iexp   = _mm_srli_epi64(iexp, 52);
-    iexp   = _mm_sub_epi32(iexp, expbase_m1);
-    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
-    fexp   = _mm_cvtepi32_pd(iexp);
-
-    x      = _mm_andnot_pd(expmask, x);
-    x      = _mm_or_pd(x, one);
-    x      = _mm_mul_pd(x, half);
-
-    mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
-    mask2     = _mm_cmplt_pd(x, invsq2);
-
-    fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
-    /* If mask1 is set ('A') */
-    zA     = _mm_sub_pd(x, half);
-    t1     = _mm_blendv_pd( zA, x, mask2 );
-    zA     = _mm_sub_pd(t1, half);
-    t2     = _mm_blendv_pd( x, zA, mask2 );
-    yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
-    xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
-    zA     = _mm_mul_pd(xA, xA);
-
-    /* EVALUATE POLY */
-    polyR  = _mm_macc_pd(R2, zA, R1);
-    polyR  = _mm_macc_pd(polyR, zA, R0);
-
-    polyS  = _mm_add_pd(zA, S2);
-    polyS  = _mm_macc_pd(polyS, zA, S1);
-    polyS  = _mm_macc_pd(polyS, zA, S0);
-
-    q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
-    zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
-    zA     = _mm_macc_pd(corr1, fexp, zA);
-    zA     = _mm_add_pd(zA, xA);
-    zA     = _mm_macc_pd(corr2, fexp, zA);
-
-    /* If mask1 is not set ('B') */
-    corr   = _mm_and_pd(mask2, x);
-    xB     = _mm_add_pd(x, corr);
-    xB     = _mm_sub_pd(xB, one);
-    zB     = _mm_mul_pd(xB, xB);
-
-    polyP1 = _mm_macc_pd(P5, zB, P3);
-    polyP2 = _mm_macc_pd(P4, zB, P2);
-    polyP1 = _mm_macc_pd(polyP1, zB, P1);
-    polyP2 = _mm_macc_pd(polyP2, zB, P0);
-    polyP1 = _mm_macc_pd(polyP1, xB, polyP2);
-
-    polyQ2 = _mm_macc_pd(Q4, zB, Q2);
-    polyQ1 = _mm_add_pd(zB, Q3);
-    polyQ1 = _mm_macc_pd(polyQ1, zB, Q1);
-    polyQ2 = _mm_macc_pd(polyQ2, zB, Q0);
-    polyQ1 = _mm_macc_pd(polyQ1, xB, polyQ2);
-
-    fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
-    q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
-    yB     = _mm_macc_pd(_mm_mul_pd(xB, zB), q, _mm_mul_pd(corr1, fexp));
-
-    yB     = _mm_nmacc_pd(half, zB, yB);
-    zB     = _mm_add_pd(xB, yB);
-    zB     = _mm_macc_pd(corr2, fexp, zB);
-
-    z      = _mm_blendv_pd( zB, zA, mask1 );
-
-    return z;
-}
-
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_macc_pd(CAP4, x4, CAP2);
-    PolyAP1  = _mm_macc_pd(CAP3, x4, CAP1);
-    PolyAP0  = _mm_macc_pd(PolyAP0, x4, CAP0);
-    PolyAP0  = _mm_macc_pd(PolyAP1, x2, PolyAP0);
-
-    PolyAQ1  = _mm_macc_pd(CAQ5, x4, CAQ3);
-    PolyAQ0  = _mm_macc_pd(CAQ4, x4, CAQ2);
-    PolyAQ1  = _mm_macc_pd(PolyAQ1, x4, CAQ1);
-    PolyAQ0  = _mm_macc_pd(PolyAQ0, x4, one);
-    PolyAQ0  = _mm_macc_pd(PolyAQ1, x2, PolyAQ0);
-
-    res_erf  = _mm_macc_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0), CAoffset);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_macc_pd(CBP6, t2, CBP4);
-    PolyBP1  = _mm_macc_pd(CBP5, t2, CBP3);
-    PolyBP0  = _mm_macc_pd(PolyBP0, t2, CBP2);
-    PolyBP1  = _mm_macc_pd(PolyBP1, t2, CBP1);
-    PolyBP0  = _mm_macc_pd(PolyBP0, t2, CBP0);
-    PolyBP0  = _mm_macc_pd(PolyBP1, t, PolyBP0);
-
-    PolyBQ1 = _mm_macc_pd(CBQ7, t2, CBQ5);
-    PolyBQ0 = _mm_macc_pd(CBQ6, t2, CBQ4);
-    PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ3);
-    PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, CBQ2);
-    PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ1);
-    PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, one);
-    PolyBQ0 = _mm_macc_pd(PolyBQ1, t, PolyBQ0);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_macc_pd(CCP6, w2, CCP4);
-    PolyCP1  = _mm_macc_pd(CCP5, w2, CCP3);
-    PolyCP0  = _mm_macc_pd(PolyCP0, w2, CCP2);
-    PolyCP1  = _mm_macc_pd(PolyCP1, w2, CCP1);
-    PolyCP0  = _mm_macc_pd(PolyCP0, w2, CCP0);
-    PolyCP0  = _mm_macc_pd(PolyCP1, w, PolyCP0);
-
-    PolyCQ0  = _mm_macc_pd(CCQ6, w2, CCQ4);
-    PolyCQ1  = _mm_macc_pd(CCQ5, w2, CCQ3);
-    PolyCQ0  = _mm_macc_pd(PolyCQ0, w2, CCQ2);
-    PolyCQ1  = _mm_macc_pd(PolyCQ1, w2, CCQ1);
-    PolyCQ0  = _mm_macc_pd(PolyCQ0, w2, one);
-    PolyCQ0  = _mm_macc_pd(PolyCQ1, w, PolyCQ0);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_macc_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0), CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_macc_pd(CAP4, x4, CAP2);
-    PolyAP1  = _mm_macc_pd(CAP3, x4, CAP1);
-    PolyAP0  = _mm_macc_pd(PolyAP0, x4, CAP0);
-    PolyAP0  = _mm_macc_pd(PolyAP1, x2, PolyAP0);
-
-    PolyAQ1  = _mm_macc_pd(CAQ5, x4, CAQ3);
-    PolyAQ0  = _mm_macc_pd(CAQ4, x4, CAQ2);
-    PolyAQ1  = _mm_macc_pd(PolyAQ1, x4, CAQ1);
-    PolyAQ0  = _mm_macc_pd(PolyAQ0, x4, one);
-    PolyAQ0  = _mm_macc_pd(PolyAQ1, x2, PolyAQ0);
-
-    res_erf  = _mm_macc_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0), CAoffset);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_macc_pd(CBP6, t2, CBP4);
-    PolyBP1  = _mm_macc_pd(CBP5, t2, CBP3);
-    PolyBP0  = _mm_macc_pd(PolyBP0, t2, CBP2);
-    PolyBP1  = _mm_macc_pd(PolyBP1, t2, CBP1);
-    PolyBP0  = _mm_macc_pd(PolyBP0, t2, CBP0);
-    PolyBP0  = _mm_macc_pd(PolyBP1, t, PolyBP0);
-
-    PolyBQ1 = _mm_macc_pd(CBQ7, t2, CBQ5);
-    PolyBQ0 = _mm_macc_pd(CBQ6, t2, CBQ4);
-    PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ3);
-    PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, CBQ2);
-    PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ1);
-    PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, one);
-    PolyBQ0 = _mm_macc_pd(PolyBQ1, t, PolyBQ0);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_macc_pd(CCP6, w2, CCP4);
-    PolyCP1  = _mm_macc_pd(CCP5, w2, CCP3);
-    PolyCP0  = _mm_macc_pd(PolyCP0, w2, CCP2);
-    PolyCP1  = _mm_macc_pd(PolyCP1, w2, CCP1);
-    PolyCP0  = _mm_macc_pd(PolyCP0, w2, CCP0);
-    PolyCP0  = _mm_macc_pd(PolyCP1, w, PolyCP0);
-
-    PolyCQ0  = _mm_macc_pd(CCQ6, w2, CCQ4);
-    PolyCQ1  = _mm_macc_pd(CCQ5, w2, CCQ3);
-    PolyCQ0  = _mm_macc_pd(PolyCQ0, w2, CCQ2);
-    PolyCQ1  = _mm_macc_pd(PolyCQ1, w2, CCQ1);
-    PolyCQ0  = _mm_macc_pd(PolyCQ0, w2, one);
-    PolyCQ0  = _mm_macc_pd(PolyCQ1, w, PolyCQ0);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_macc_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0), CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
+#include "simd_math.h"
  
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
-    return res;
-}
-
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
-    const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
-    const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
-    const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
-    const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
-    const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
-    const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
-    const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
-    const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
-    const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
-    const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
-    const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
-
-    const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
-    const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
-    const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
-    const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
-    const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
-    const __m128d  FD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-
-    polyFD1        = _mm_macc_pd(FD5, z4, FD3);
-    polyFD1        = _mm_macc_pd(polyFD1, z4, FD1);
-    polyFD1        = _mm_mul_pd(polyFD1, z2);
-    polyFD0        = _mm_macc_pd(FD4, z4, FD2);
-    polyFD0        = _mm_macc_pd(polyFD0, z4, FD0);
-    polyFD0        = _mm_add_pd(polyFD0, polyFD1);
-
-    polyFD0        = gmx_mm_inv_pd(polyFD0);
-
-    polyFN0        = _mm_macc_pd(FN10, z4, FN8);
-    polyFN0        = _mm_macc_pd(polyFN0, z4, FN6);
-    polyFN0        = _mm_macc_pd(polyFN0, z4, FN4);
-    polyFN0        = _mm_macc_pd(polyFN0, z4, FN2);
-    polyFN0        = _mm_macc_pd(polyFN0, z4, FN0);
-    polyFN1        = _mm_macc_pd(FN9, z4, FN7);
-    polyFN1        = _mm_macc_pd(polyFN1, z4, FN5);
-    polyFN1        = _mm_macc_pd(polyFN1, z4, FN3);
-    polyFN1        = _mm_macc_pd(polyFN1, z4, FN1);
-    polyFN0        = _mm_macc_pd(polyFN1, z2, polyFN0);
-
-    return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- *
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
-    const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
-    const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
-    const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
-    const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
-    const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
-    const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
-    const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
-    const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
-    const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
-    const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
-
-    const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
-    const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
-    const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
-    const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
-    const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
-    const __m128d  VD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-
-    polyVD1        = _mm_macc_pd(VD5, z4, VD3);
-    polyVD0        = _mm_macc_pd(VD4, z4, VD2);
-    polyVD1        = _mm_macc_pd(polyVD1, z4, VD1);
-    polyVD0        = _mm_macc_pd(polyVD0, z4, VD0);
-    polyVD0        = _mm_macc_pd(polyVD1, z2, polyVD0);
-
-    polyVD0        = gmx_mm_inv_pd(polyVD0);
-
-    polyVN1        = _mm_macc_pd(VN9, z4, VN7);
-    polyVN0        = _mm_macc_pd(VN8, z4, VN6);
-    polyVN1        = _mm_macc_pd(polyVN1, z4, VN5);
-    polyVN0        = _mm_macc_pd(polyVN0, z4, VN4);
-    polyVN1        = _mm_macc_pd(polyVN1, z4, VN3);
-    polyVN0        = _mm_macc_pd(polyVN0, z4, VN2);
-    polyVN1        = _mm_macc_pd(polyVN1, z4, VN1);
-    polyVN0        = _mm_macc_pd(polyVN0, z4, VN0);
-    polyVN0        = _mm_macc_pd(polyVN1, z2, polyVN0);
-
-    return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_pd(__m128d  x,
-                 __m128d *sinval,
-                 __m128d *cosval)
-{
-#ifdef _MSC_VER
-    __declspec(align(16))
-    const double sintable[34] =
-    {
-        1.00000000000000000e+00, 0.00000000000000000e+00,
-        9.95184726672196929e-01, 9.80171403295606036e-02,
-        9.80785280403230431e-01, 1.95090322016128248e-01,
-        9.56940335732208824e-01, 2.90284677254462331e-01,
-        9.23879532511286738e-01, 3.82683432365089782e-01,
-        8.81921264348355050e-01, 4.71396736825997642e-01,
-        8.31469612302545236e-01, 5.55570233019602178e-01,
-        7.73010453362736993e-01, 6.34393284163645488e-01,
-        7.07106781186547573e-01, 7.07106781186547462e-01,
-        6.34393284163645599e-01, 7.73010453362736882e-01,
-        5.55570233019602289e-01, 8.31469612302545125e-01,
-        4.71396736825997809e-01, 8.81921264348354939e-01,
-        3.82683432365089837e-01, 9.23879532511286738e-01,
-        2.90284677254462276e-01, 9.56940335732208935e-01,
-        1.95090322016128304e-01, 9.80785280403230431e-01,
-        9.80171403295607702e-02, 9.95184726672196818e-01,
-        0.0, 1.00000000000000000e+00
-    };
-#else
-    const __m128d sintable[17] =
-    {
-        _mm_set_pd( 0.0, 1.0 ),
-        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd(  1.0, 0.0 )
-    };
-#endif
-
-    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128i signbit_epi32  = _mm_set1_epi32(0x80000000);
-
-    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
-    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
-    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
-    const __m128i ione          = _mm_set1_epi32(1);
-    const __m128i i32           = _mm_set1_epi32(32);
-    const __m128i i16           = _mm_set1_epi32(16);
-    const __m128i tabmask       = _mm_set1_epi32(0x3F);
-    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
-    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
-    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
-    const __m128d sinP1         = _mm_set1_pd(1.0);
-
-    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
-    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
-    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
-    const __m128d cosP0         = _mm_set1_pd(1.0);
-
-    __m128d       scalex;
-    __m128i       tabidx, corridx;
-    __m128d       xabs, z, z2, polySin, polyCos;
-    __m128d       xpoint;
-    __m128d       ypoint0, ypoint1;
-
-    __m128d       sinpoint, cospoint;
-    __m128d       xsign, ssign, csign;
-    __m128i       imask, sswapsign, cswapsign;
-    __m128d       minusone;
-
-    xsign    = _mm_andnot_pd(signmask, x);
-    xabs     = _mm_and_pd(x, signmask);
-
-    scalex   = _mm_mul_pd(tabscale, xabs);
-    tabidx   = _mm_cvtpd_epi32(scalex);
-
-    xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Extended precision arithmetics */
-    z        = _mm_nmacc_pd(invtabscale0, xpoint, xabs);
-    z        = _mm_nmacc_pd(invtabscale1, xpoint, z);
-
-    /* Range reduction to 0..2*Pi */
-    tabidx   = _mm_and_si128(tabidx, tabmask);
-
-    /* tabidx is now in range [0,..,64] */
-    imask     = _mm_cmpgt_epi32(tabidx, i32);
-    sswapsign = imask;
-    cswapsign = imask;
-    corridx   = _mm_and_si128(imask, i32);
-    tabidx    = _mm_sub_epi32(tabidx, corridx);
-
-    /* tabidx is now in range [0..32] */
-    imask     = _mm_cmpgt_epi32(tabidx, i16);
-    cswapsign = _mm_xor_si128(cswapsign, imask);
-    corridx   = _mm_sub_epi32(i32, tabidx);
-    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
-    /* tabidx is now in range [0..16] */
-    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
-    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
-    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
-    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
-    ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
-    ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
-    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
-    cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
-    sinpoint = _mm_mul_pd(sinpoint, ssign);
-    cospoint = _mm_mul_pd(cospoint, csign);
-
-    z2       = _mm_mul_pd(z, z);
-
-    polySin  = _mm_macc_pd(sinP7, z2, sinP5);
-    polySin  = _mm_macc_pd(polySin, z2, sinP3);
-    polySin  = _mm_macc_pd(polySin, z2, sinP1);
-    polySin  = _mm_mul_pd(polySin, z);
-
-    polyCos  = _mm_macc_pd(cosP6, z2, cosP4);
-    polyCos  = _mm_macc_pd(polyCos, z2, cosP2);
-    polyCos  = _mm_macc_pd(polyCos, z2, cosP0);
-
-    *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
-    *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
-    return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
-    __m128d s, c;
-    gmx_mm_sincos_pd(x, &s, &c);
-    return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
-    __m128d s, c;
-    gmx_mm_sincos_pd(x, &s, &c);
-    return c;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
-    __m128d sinval, cosval;
-    __m128d tanval;
-
-    gmx_mm_sincos_pd(x, &sinval, &cosval);
-
-    tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
-    return tanval;
-}
-
-
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.625);
-    const __m128d limit2    = _mm_set1_pd(1e-8);
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
-
-    const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
-    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
-    const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
-    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
-    const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
-    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
-
-    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
-    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
-    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
-    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
-    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
-
-    const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
-    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
-    const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
-    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
-    const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
-
-    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
-    const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
-    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
-    const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
-
-    __m128d       sign;
-    __m128d       mask;
-    __m128d       xabs;
-    __m128d       zz, ww, z, q, w, y, zz2, ww2;
-    __m128d       PA, PB;
-    __m128d       QA, QB;
-    __m128d       RA, RB;
-    __m128d       SA, SB;
-    __m128d       nom, denom;
-
-    sign  = _mm_andnot_pd(signmask, x);
-    xabs  = _mm_and_pd(x, signmask);
-
-    mask  = _mm_cmpgt_pd(xabs, limit1);
-
-    zz    = _mm_sub_pd(one, xabs);
-    ww    = _mm_mul_pd(xabs, xabs);
-    zz2   = _mm_mul_pd(zz, zz);
-    ww2   = _mm_mul_pd(ww, ww);
-
-    /* R */
-    RA    = _mm_macc_pd(R4, zz2, R2);
-    RB    = _mm_macc_pd(R3, zz2, R1);
-    RA    = _mm_macc_pd(RA, zz2, R0);
-    RA    = _mm_macc_pd(RB, zz, RA);
-
-    /* S, SA = zz2 */
-    SB    = _mm_macc_pd(S3, zz2, S1);
-    SA    = _mm_add_pd(zz2, S2);
-    SA    = _mm_macc_pd(SA, zz2, S0);
-    SA    = _mm_macc_pd(SB, zz, SA);
-
-    /* P */
-    PA    = _mm_macc_pd(P5, ww2, P3);
-    PB    = _mm_macc_pd(P4, ww2, P2);
-    PA    = _mm_macc_pd(PA, ww2, P1);
-    PB    = _mm_macc_pd(PB, ww2, P0);
-    PA    = _mm_macc_pd(PA, ww, PB);
-
-    /* Q, QA = ww2 */
-    QB    = _mm_macc_pd(Q4, ww2, Q2);
-    QA    = _mm_add_pd(ww2, Q3);
-    QA    = _mm_macc_pd(QA, ww2, Q1);
-    QB    = _mm_macc_pd(QB, ww2, Q0);
-    QA    = _mm_macc_pd(QA, ww, QB);
-
-    RA    = _mm_mul_pd(RA, zz);
-    PA    = _mm_mul_pd(PA, ww);
-
-    nom   = _mm_blendv_pd( PA, RA, mask );
-    denom = _mm_blendv_pd( QA, SA, mask );
-
-    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
-    zz    = _mm_add_pd(zz, zz);
-    zz    = gmx_mm_sqrt_pd(zz);
-    z     = _mm_sub_pd(quarterpi, zz);
-    zz    = _mm_mul_pd(zz, q);
-    zz    = _mm_sub_pd(zz, morebits);
-    z     = _mm_sub_pd(z, zz);
-    z     = _mm_add_pd(z, quarterpi);
-
-    w     = _mm_macc_pd(xabs, q, xabs);
-
-    z     = _mm_blendv_pd( w, z, mask );
-
-    mask  = _mm_cmpgt_pd(xabs, limit2);
-    z     = _mm_blendv_pd( xabs, z, mask );
-
-    z = _mm_xor_pd(z, sign);
-
-    return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
-    const __m128d signmask   = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d pi         = _mm_set1_pd(M_PI);
-    const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
-    const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
-    __m128d mask1;
-
-    __m128d z, z1, z2;
-
-    mask1 = _mm_cmpgt_pd(x, half);
-    z1    = _mm_mul_pd(half, _mm_sub_pd(one, x));
-    z1    = gmx_mm_sqrt_pd(z1);
-    z     = _mm_blendv_pd( x, z1, mask1 );
-
-    z     = gmx_mm_asin_pd(z);
-
-    z1    = _mm_add_pd(z, z);
-
-    z2    = _mm_sub_pd(quarterpi0, z);
-    z2    = _mm_add_pd(z2, quarterpi1);
-    z2    = _mm_add_pd(z2, quarterpi0);
-
-    z     = _mm_blendv_pd(z2, z1, mask1);
-
-    return z;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.66);
-    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
-    const __m128d mone      = _mm_set1_pd(-1.0);
-    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
-    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
-    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
-    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
-    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
-    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
-    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
-
-    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
-    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
-    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
-    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
-    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
-
-    __m128d       sign;
-    __m128d       mask1, mask2;
-    __m128d       y, t1, t2;
-    __m128d       z, z2;
-    __m128d       P_A, P_B, Q_A, Q_B;
-
-    sign   = _mm_andnot_pd(signmask, x);
-    x      = _mm_and_pd(x, signmask);
-
-    mask1  = _mm_cmpgt_pd(x, limit1);
-    mask2  = _mm_cmpgt_pd(x, limit2);
-
-    t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
-    t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
-    y      = _mm_and_pd(mask1, quarterpi);
-    y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
-    x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
-    x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
-    z      = _mm_mul_pd(x, x);
-    z2     = _mm_mul_pd(z, z);
-
-    P_A    = _mm_macc_pd(P4, z2, P2);
-    P_B    = _mm_macc_pd(P3, z2, P1);
-    P_A    = _mm_macc_pd(P_A, z2, P0);
-    P_A    = _mm_macc_pd(P_B, z, P_A);
-
-    /* Q_A = z2 */
-    Q_B    = _mm_macc_pd(Q4, z2, Q2);
-    Q_A    = _mm_add_pd(z2, Q3);
-    Q_A    = _mm_macc_pd(Q_A, z2, Q1);
-    Q_B    = _mm_macc_pd(Q_B, z2, Q0);
-    Q_A    = _mm_macc_pd(Q_A, z, Q_B);
-
-    z      = _mm_mul_pd(z, P_A);
-    z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
-    z      = _mm_macc_pd(z, x, x);
-
-    t1     = _mm_and_pd(mask1, morebits1);
-    t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
-    z      = _mm_add_pd(z, t1);
-    y      = _mm_add_pd(y, z);
-
-    y      = _mm_xor_pd(y, sign);
-
-    return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
-    const __m128d pi          = _mm_set1_pd(M_PI);
-    const __m128d minuspi     = _mm_set1_pd(-M_PI);
-    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
-    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
-    __m128d       z, z1, z3, z4;
-    __m128d       w;
-    __m128d       maskx_lt, maskx_eq;
-    __m128d       masky_lt, masky_eq;
-    __m128d       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
-    masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
-    maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
-    z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
-    z         = gmx_mm_atan_pd(z);
-
-    mask1     = _mm_and_pd(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_pd(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
-    z         = _mm_andnot_pd(maskall, z);
-    z1        = _mm_and_pd(mask1, minushalfpi);
-    z3        = _mm_and_pd(mask3, halfpi);
-    z4        = _mm_and_pd(mask4, pi);
-
-    z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
-    w         = _mm_blendv_pd(pi, minuspi, masky_lt);
-    w         = _mm_and_pd(w, maskx_lt);
-
-    w         = _mm_andnot_pd(maskall, w);
-
-    z         = _mm_add_pd(z, w);
  
-    return z;
-}
+#define gmx_mm_invsqrt_pd   gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd       gmx_simd_inv_d
+#define gmx_mm_log_pd       gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd  gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd  gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd    gmx_simd_sincos_d
  
  #endif
diff --git a/src/gromacs/simd/math_x86_avx_128_fma_single.h b/src/gromacs/simd/math_x86_avx_128_fma_single.h

index b8454278cc673b0b0322c5e456f7aa365a88bd0b..fc7113dd3a426f645a9f18890b2cb6d254ad10fc 100644 (file)
--- a/src/gromacs/simd/math_x86_avx_128_fma_single.h
+++ b/src/gromacs/simd/math_x86_avx_128_fma_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1048 +35,19 @@
  #ifndef GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
  #define GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
  
-#include <immintrin.h> /* AVX */
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#include <math.h>
-
-#include "general_x86_avx_128_fma.h"
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
-    const __m128 half  = _mm_set1_ps(0.5);
-    const __m128 one   = _mm_set1_ps(1.0);
-
-    __m128       lu = _mm_rsqrt_ps(x);
-
-    return _mm_macc_ps(_mm_nmacc_ps(x, _mm_mul_ps(lu, lu), one), _mm_mul_ps(lu, half), lu);
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
-    __m128 mask;
-    __m128 res;
-
-    mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
-    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
-    res  = _mm_mul_ps(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
-    const __m128 two = _mm_set1_ps(2.0);
-
-    __m128       lu = _mm_rcp_ps(x);
-
-    return _mm_mul_ps(lu, _mm_nmacc_ps(lu, x, two));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
-    return _mm_and_ps(x, signmask);
-}
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
-    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
-    const __m128  half       = _mm_set1_ps(0.5f);
-    const __m128  one        = _mm_set1_ps(1.0f);
-    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
-    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
-    const __m128  corr2      = _mm_set1_ps(0.693359375f);
-
-    const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
-    const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
-    const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
-    const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
-    const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
-    const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
-    const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
-    const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
-    const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
-
-    __m128        fexp, fexp1;
-    __m128i       iexp;
-    __m128        mask;
-    __m128        x1, x2;
-    __m128        y;
-    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp  = _mm_and_ps(x, expmask);
-    iexp  = gmx_mm_castps_si128(fexp);
-    iexp  = _mm_srli_epi32(iexp, 23);
-    iexp  = _mm_sub_epi32(iexp, expbase_m1);
-
-    x     = _mm_andnot_ps(expmask, x);
-    x     = _mm_or_ps(x, one);
-    x     = _mm_mul_ps(x, half);
-
-    mask  = _mm_cmp_ps(x, invsq2, _CMP_LT_OQ);
-
-    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
-    x     = _mm_sub_ps(x, one);
-    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
-    x2    = _mm_mul_ps(x, x);
-
-    pA    = _mm_mul_ps(CA_1, x);
-
-    pB    = _mm_add_ps(x, CB_1);
-    pC    = _mm_add_ps(x, CC_1);
-    pD    = _mm_add_ps(x, CD_1);
-    pE    = _mm_add_ps(x, CE_1);
-
-    pB    = _mm_macc_ps(x, pB, CB_0);
-    pC    = _mm_macc_ps(x, pC, CC_0);
-    pD    = _mm_macc_ps(x, pD, CD_0);
-    pE    = _mm_macc_ps(x, pE, CE_0);
-
-    pA    = _mm_mul_ps(pA, pB);
-    pC    = _mm_mul_ps(pC, pD);
-    pE    = _mm_mul_ps(pE, x2);
-    pA    = _mm_mul_ps(pA, pC);
-    y     = _mm_mul_ps(pA, pE);
-
-    fexp  = _mm_cvtepi32_ps(iexp);
-    y     = _mm_macc_ps(fexp, corr1, y);
-    y     = _mm_nmacc_ps(half, x2, y);
-
-    x2    = _mm_add_ps(x, y);
-    x2    = _mm_macc_ps(fexp, corr2, x2);
-
-    return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128  arglimit = _mm_set1_ps(126.0f);
-
-    const __m128i expbase  = _mm_set1_epi32(127);
-    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
-    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
-    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
-    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
-    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
-    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
-    const __m128  CA0      = _mm_set1_ps(1.0f);
-
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-    __m128        x2;
-    __m128        p0, p1;
-
-    iexppart  = _mm_cvtps_epi32(x);
-    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(x), _CMP_GE_OQ);
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    x         = _mm_sub_ps(x, intpart);
-    x2        = _mm_mul_ps(x, x);
-
-    p0        = _mm_macc_ps(CA6, x2, CA4);
-    p1        = _mm_macc_ps(CA5, x2, CA3);
-    p0        = _mm_macc_ps(p0, x2, CA2);
-    p1        = _mm_macc_ps(p1, x2, CA1);
-    p0        = _mm_macc_ps(p0, x2, CA0);
-    p0        = _mm_macc_ps(p1, x, p0);
-    x         = _mm_mul_ps(p0, fexppart);
-
-    return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
-    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m128  arglimit      = _mm_set1_ps(126.0f);
-    const __m128i expbase       = _mm_set1_epi32(127);
-
-    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
-    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
-
-    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
-    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
-    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
-    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
-    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
-    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
-    const __m128  one           = _mm_set1_ps(1.0f);
-
-    __m128        y, x2;
-    __m128        p0, p1;
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-
-    y = _mm_mul_ps(x, argscale);
-
-    iexppart  = _mm_cvtps_epi32(y);
-    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(y), _CMP_GE_OQ);
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    /* Extended precision arithmetics */
-    x         = _mm_nmacc_ps(invargscale0, intpart, x);
-    x         = _mm_nmacc_ps(invargscale1, intpart, x);
-
-    x2        = _mm_mul_ps(x, x);
-
-    p1        = _mm_macc_ps(CC5, x2, CC3);
-    p0        = _mm_macc_ps(CC4, x2, CC2);
-    p1        = _mm_macc_ps(p1, x2, CC1);
-    p0        = _mm_macc_ps(p0, x2, CC0);
-    p0        = _mm_macc_ps(p1, x, p0);
-    p0        = _mm_macc_ps(p0, x2, one);
-
-    x         = _mm_add_ps(x, p0);
-
-    x         = _mm_mul_ps(x, fexppart);
-
-    return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_macc_ps(CA6, x4, CA4);
-    pA1  = _mm_macc_ps(CA5, x4, CA3);
-    pA0  = _mm_macc_ps(pA0, x4, CA2);
-    pA1  = _mm_macc_ps(pA1, x4, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA0  = _mm_macc_ps(pA1, x2, pA0);
-    /* Constant term must come last for precision reasons */
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_macc_ps(CD4, q, CD3);
-    corr    = _mm_macc_ps(corr, q, CD2);
-    corr    = _mm_macc_ps(corr, q, one);
-    corr    = _mm_macc_ps(corr, q, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_macc_ps(CB9, w2, CB7);
-    pB0  = _mm_macc_ps(CB8, w2, CB6);
-    pB1  = _mm_macc_ps(pB1, w2, CB5);
-    pB0  = _mm_macc_ps(pB0, w2, CB4);
-    pB1  = _mm_macc_ps(pB1, w2, CB3);
-    pB0  = _mm_macc_ps(pB0, w2, CB2);
-    pB1  = _mm_macc_ps(pB1, w2, CB1);
-    pB0  = _mm_macc_ps(pB0, w2, CB0);
-    pB0  = _mm_macc_ps(pB1, w, pB0);
-
-    pC0  = _mm_macc_ps(CC10, t2, CC8);
-    pC1  = _mm_macc_ps(CC9, t2, CC7);
-    pC0  = _mm_macc_ps(pC0, t2, CC6);
-    pC1  = _mm_macc_ps(pC1, t2, CC5);
-    pC0  = _mm_macc_ps(pC0, t2, CC4);
-    pC1  = _mm_macc_ps(pC1, t2, CC3);
-    pC0  = _mm_macc_ps(pC0, t2, CC2);
-    pC1  = _mm_macc_ps(pC1, t2, CC1);
-
-    pC0  = _mm_macc_ps(pC0, t2, CC0);
-    pC0  = _mm_macc_ps(pC1, t, pC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
-    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P(1/x-1) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P(1/x) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_macc_ps(CA6, x4, CA4);
-    pA1  = _mm_macc_ps(CA5, x4, CA3);
-    pA0  = _mm_macc_ps(pA0, x4, CA2);
-    pA1  = _mm_macc_ps(pA1, x4, CA1);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_macc_ps(pA0, x4, pA1);
-    /* Constant term must come last for precision reasons */
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_macc_ps(CD4, q, CD3);
-    corr    = _mm_macc_ps(corr, q, CD2);
-    corr    = _mm_macc_ps(corr, q, one);
-    corr    = _mm_macc_ps(corr, q, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_macc_ps(CB9, w2, CB7);
-    pB0  = _mm_macc_ps(CB8, w2, CB6);
-    pB1  = _mm_macc_ps(pB1, w2, CB5);
-    pB0  = _mm_macc_ps(pB0, w2, CB4);
-    pB1  = _mm_macc_ps(pB1, w2, CB3);
-    pB0  = _mm_macc_ps(pB0, w2, CB2);
-    pB1  = _mm_macc_ps(pB1, w2, CB1);
-    pB0  = _mm_macc_ps(pB0, w2, CB0);
-    pB0  = _mm_macc_ps(pB1, w, pB0);
-
-    pC0  = _mm_macc_ps(CC10, t2, CC8);
-    pC1  = _mm_macc_ps(CC9, t2, CC7);
-    pC0  = _mm_macc_ps(pC0, t2, CC6);
-    pC1  = _mm_macc_ps(pC1, t2, CC5);
-    pC0  = _mm_macc_ps(pC0, t2, CC4);
-    pC1  = _mm_macc_ps(pC1, t2, CC3);
-    pC0  = _mm_macc_ps(pC0, t2, CC2);
-    pC1  = _mm_macc_ps(pC1, t2, CC1);
-
-    pC0  = _mm_macc_ps(pC0, t2, CC0);
-    pC0  = _mm_macc_ps(pC1, t, pC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmp_ps(two, y, _CMP_LT_OQ);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
-    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
-    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
-    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
-    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
-    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
-    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
-    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
-    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
-
-    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
-    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
-    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
-    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
-    const __m128  FD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyFD0        = _mm_macc_ps(FD4, z4, FD2);
-    polyFD1        = _mm_macc_ps(FD3, z4, FD1);
-    polyFD0        = _mm_macc_ps(polyFD0, z4, FD0);
-    polyFD0        = _mm_macc_ps(polyFD1, z2, polyFD0);
-
-    polyFD0        = gmx_mm_inv_ps(polyFD0);
-
-    polyFN0        = _mm_macc_ps(FN6, z4, FN4);
-    polyFN1        = _mm_macc_ps(FN5, z4, FN3);
-    polyFN0        = _mm_macc_ps(polyFN0, z4, FN2);
-    polyFN1        = _mm_macc_ps(polyFN1, z4, FN1);
-    polyFN0        = _mm_macc_ps(polyFN0, z4, FN0);
-    polyFN0        = _mm_macc_ps(polyFN1, z2, polyFN0);
-
-    return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- */
-static __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
-    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
-    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
-    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
-    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
-    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
-    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
-    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
-
-    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
-    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
-    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
-    const __m128  VD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyVN0, polyVN1, polyVD0, polyVD1;
+#include "simd_math.h"
  
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyVD1        = _mm_macc_ps(VD3, z4, VD1);
-    polyVD0        = _mm_macc_ps(VD2, z4, VD0);
-    polyVD0        = _mm_macc_ps(polyVD1, z2, polyVD0);
-
-    polyVD0        = gmx_mm_inv_ps(polyVD0);
-
-    polyVN0        = _mm_macc_ps(VN6, z4, VN4);
-    polyVN1        = _mm_macc_ps(VN5, z4, VN3);
-    polyVN0        = _mm_macc_ps(polyVN0, z4, VN2);
-    polyVN1        = _mm_macc_ps(polyVN1, z4, VN1);
-    polyVN0        = _mm_macc_ps(polyVN0, z4, VN0);
-    polyVN0        = _mm_macc_ps(polyVN1, z2, polyVN0);
-
-    return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-
-static int
-gmx_mm_sincos_ps(__m128  x,
-                 __m128 *sinval,
-                 __m128 *cosval)
-{
-    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
-    const __m128  half        = _mm_set1_ps(0.5);
-    const __m128  one         = _mm_set1_ps(1.0);
-
-    const __m128i izero      = _mm_set1_epi32(0);
-    const __m128i ione       = _mm_set1_epi32(1);
-    const __m128i itwo       = _mm_set1_epi32(2);
-    const __m128i ithree     = _mm_set1_epi32(3);
-    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
-    const __m128  CA1         = _mm_set1_ps(1.5703125f);
-    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
-    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
-
-    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
-    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
-    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
-    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
-    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
-    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
-
-    __m128        y, y2;
-    __m128        z;
-    __m128i       iz;
-    __m128i       offset_sin, offset_cos;
-    __m128        tmp1, tmp2;
-    __m128        mask_sin, mask_cos;
-    __m128        tmp_sin, tmp_cos;
-
-    y          = _mm_mul_ps(x, two_over_pi);
-    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
-    iz         = _mm_cvttps_epi32(y);
-    z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
-    offset_sin = _mm_and_si128(iz, ithree);
-    offset_cos = _mm_add_epi32(iz, ione);
-
-    /* Extended precision arithmetic to achieve full precision */
-    y               = _mm_nmacc_ps(z, CA1, x);
-    y               = _mm_nmacc_ps(z, CA2, y);
-    y               = _mm_nmacc_ps(z, CA3, y);
-
-    y2              = _mm_mul_ps(y, y);
-
-    tmp1            = _mm_macc_ps(CC0, y2, CC1);
-    tmp2            = _mm_macc_ps(CS0, y2, CS1);
-    tmp1            = _mm_macc_ps(tmp1, y2, CC2);
-    tmp2            = _mm_macc_ps(tmp2, y2, CS2);
-
-    tmp1            = _mm_macc_ps(tmp1, y2, one);
-
-    tmp2            = _mm_macc_ps(tmp2, _mm_mul_ps(y, y2), y);
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
-    tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
-    tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
-    tmp1            = _mm_xor_ps(signbit, tmp_sin);
-    tmp2            = _mm_xor_ps(signbit, tmp_cos);
-
-    *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
-    *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
-    return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
-    __m128 sinval, cosval;
-    __m128 tanval;
-
-    gmx_mm_sincos_ps(x, &sinval, &cosval);
-
-    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
-    return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limitlow  = _mm_set1_ps(1e-4f);
-    const __m128 half      = _mm_set1_ps(0.5f);
-    const __m128 one       = _mm_set1_ps(1.0f);
-    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
-
-    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
-    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
-    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
-    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
-    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
-
-    __m128       sign;
-    __m128       mask;
-    __m128       xabs;
-    __m128       z, z1, z2, q, q1, q2;
-    __m128       pA, pB;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    xabs  = _mm_and_ps(x, signmask);
-
-    mask  = _mm_cmp_ps(xabs, half, _CMP_GT_OQ);
-
-    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
-    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
-    q1    = _mm_andnot_ps(_mm_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
-
-    q2    = xabs;
-    z2    = _mm_mul_ps(q2, q2);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
-    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
-    z2    = _mm_mul_ps(z, z);
-
-    pA    = _mm_macc_ps(CC5, z2, CC3);
-    pB    = _mm_macc_ps(CC4, z2, CC2);
-
-    pA    = _mm_macc_ps(pA, z2, CC1);
-    pA    = _mm_mul_ps(pA, z);
-
-    z     = _mm_macc_ps(pB, z2, pA);
-
-    z     = _mm_macc_ps(z, q, q);
-
-    q2    = _mm_sub_ps(halfpi, z);
-    q2    = _mm_sub_ps(q2, z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
-    mask  = _mm_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
-    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
-    z = _mm_xor_ps(z, sign);
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 one_ps    = _mm_set1_ps(1.0f);
-    const __m128 half_ps   = _mm_set1_ps(0.5f);
-    const __m128 pi_ps     = _mm_set1_ps(M_PI);
-    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
-    __m128       mask1;
-    __m128       mask2;
-    __m128       xabs;
-    __m128       z, z1, z2, z3;
-
-    xabs  = _mm_and_ps(x, signmask);
-    mask1 = _mm_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
-    mask2 = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_GT_OQ);
-
-    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
-    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
-    z     = _mm_andnot_ps(_mm_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
-
-    z     = _mm_blendv_ps(x, z, mask1);
-    z     = gmx_mm_asin_ps(z);
-
-    z2    = _mm_add_ps(z, z);
-    z1    = _mm_sub_ps(pi_ps, z2);
-    z3    = _mm_sub_ps(halfpi_ps, z);
-
-    z     = _mm_blendv_ps(z1, z2, mask2);
-    z     = _mm_blendv_ps(z3, z, mask1);
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
-    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
-    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
-    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
-    const __m128 mone      = _mm_set1_ps(-1.0f);
-    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
-    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
-    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
-    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
-
-    __m128       sign;
-    __m128       mask1, mask2;
-    __m128       y, z1, z2;
-    __m128       x2, x4;
-    __m128       sum1, sum2;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    x     = _mm_and_ps(x, signmask);
-
-    mask1 = _mm_cmp_ps(x, limit1, _CMP_GT_OQ);
-    mask2 = _mm_cmp_ps(x, limit2, _CMP_GT_OQ);
-
-    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
-    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
-    y     = _mm_and_ps(mask1, quarterpi);
-    y     = _mm_blendv_ps(y, halfpi, mask2);
-
-    x     = _mm_blendv_ps(x, z1, mask1);
-    x     = _mm_blendv_ps(x, z2, mask2);
-
-    x2    = _mm_mul_ps(x, x);
-    x4    = _mm_mul_ps(x2, x2);
-
-    sum1  = _mm_macc_ps(CC9, x4, CC5);
-    sum2  = _mm_macc_ps(CC7, x4, CC3);
-    sum1  = _mm_mul_ps(sum1, x4);
-    sum1  = _mm_macc_ps(sum2, x2, sum1);
-
-    sum1  = _mm_sub_ps(sum1, mone);
-    y     = _mm_macc_ps(sum1, x, y);
-
-    y     = _mm_xor_ps(y, sign);
-
-    return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
-    const __m128 pi          = _mm_set1_ps(M_PI);
-    const __m128 minuspi     = _mm_set1_ps(-M_PI);
-    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
-    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
-    __m128       z, z1, z3, z4;
-    __m128       w;
-    __m128       maskx_lt, maskx_eq;
-    __m128       masky_lt, masky_eq;
-    __m128       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
-    masky_lt  = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_LT_OQ);
-    maskx_eq  = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
-    masky_eq  = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_EQ_OQ);
-
-    z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
-    z         = gmx_mm_atan_ps(z);
-
-    mask1     = _mm_and_ps(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_ps(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
-    z         = _mm_andnot_ps(maskall, z);
-    z1        = _mm_and_ps(mask1, minushalfpi);
-    z3        = _mm_and_ps(mask3, halfpi);
-    z4        = _mm_and_ps(mask4, pi);
-
-    z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
-    mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
-    mask2     = _mm_and_ps(maskx_lt, masky_lt);
-
-    w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
-    w         = _mm_andnot_ps(maskall, w);
-
-    z         = _mm_add_ps(z, w);
-
-    return z;
-}
-
  
+#define gmx_mm_invsqrt_ps   gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps       gmx_simd_inv_f
+#define gmx_mm_log_ps       gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps  gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps  gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps    gmx_simd_sincos_f
  
  #endif
diff --git a/src/gromacs/simd/math_x86_avx_256_double.h b/src/gromacs/simd/math_x86_avx_256_double.h

index 2828505c6fb089f3786b7ad1fff42c6f43f70f04..8eb7941765156707b4f48af3517ed8254609c8ba 100644 (file)
--- a/src/gromacs/simd/math_x86_avx_256_double.h
+++ b/src/gromacs/simd/math_x86_avx_256_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,2802 +35,19 @@
  #ifndef GMX_SIMD_MATH_AVX_256_DOUBLE_H
  #define GMX_SIMD_MATH_AVX_256_DOUBLE_H
  
-#include <math.h>
+#include "simd_math.h"
  
-#include "general_x86_avx_256.h"
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x), 256 bit wide */
-static gmx_inline __m256d
-gmx_mm256_invsqrt_pd(__m256d x)
-{
-    const __m256d half  = _mm256_set1_pd(0.5);
-    const __m256d three = _mm256_set1_pd(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m256d lu = _mm256_cvtps_pd(_mm_rsqrt_ps( _mm256_cvtpd_ps(x)));
-
-    lu = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
-    return _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm256_invsqrt_pair_pd(__m256d x1, __m256d x2, __m256d *invsqrt1, __m256d *invsqrt2)
-{
-    const __m256d half   = _mm256_set1_pd(0.5);
-    const __m256d three  = _mm256_set1_pd(3.0);
-    const __m256  halff  = _mm256_set1_ps(0.5f);
-    const __m256  threef = _mm256_set1_ps(3.0f);
-
-    __m256        xf, luf;
-    __m256d       lu1, lu2;
-
-    /* Do first N-R step in float for 2x throughput */
-    xf  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(x1)), _mm256_cvtpd_ps(x2), 0x1);
-    luf = _mm256_rsqrt_ps(xf);
-
-    luf = _mm256_mul_ps(halff, _mm256_mul_ps(_mm256_sub_ps(threef, _mm256_mul_ps(_mm256_mul_ps(luf, luf), xf)), luf));
-
-    lu2 = _mm256_cvtps_pd(_mm256_extractf128_ps(luf, 0x1));
-    lu1 = _mm256_cvtps_pd(_mm256_castps256_ps128(luf));
-
-    *invsqrt1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), x1)), lu1));
-    *invsqrt2 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* 1.0/sqrt(x), 128 bit wide */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
-    const __m128d half  = _mm_set1_pd(0.5);
-    const __m128d three = _mm_set1_pd(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
-    lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-    return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for two pairs to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
-    const __m128d half   = _mm_set1_pd(0.5);
-    const __m128d three  = _mm_set1_pd(3.0);
-    const __m128  halff  = _mm_set1_ps(0.5f);
-    const __m128  threef = _mm_set1_ps(3.0f);
-
-    __m128        xf, luf;
-    __m128d       lu1, lu2;
-
-    /* Do first N-R step in float for 2x throughput */
-    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
-    luf = _mm_rsqrt_ps(xf);
-    luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
-    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
-    lu1 = _mm_cvtps_pd(luf);
-
-    *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
-    *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* sqrt(x) (256 bit)- Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m256d
-gmx_mm256_sqrt_pd(__m256d x)
-{
-    __m256d mask;
-    __m256d res;
-
-    mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
-    res  = _mm256_andnot_pd(mask, gmx_mm256_invsqrt_pd(x));
-
-    res  = _mm256_mul_pd(x, res);
-
-    return res;
-}
-
-/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
-    __m128d mask;
-    __m128d res;
-
-    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
-    res  = _mm_mul_pd(x, res);
-
-    return res;
-}
-
-
-/* 1.0/x, 256 bit wide */
-static gmx_inline __m256d
-gmx_mm256_inv_pd(__m256d x)
-{
-    const __m256d two  = _mm256_set1_pd(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m256d lu = _mm256_cvtps_pd(_mm_rcp_ps( _mm256_cvtpd_ps(x)));
-
-    /* Perform two N-R steps for double precision */
-    lu         = _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
-    return _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
-}
-
-/* 1.0/x, 128 bit */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
-    const __m128d two  = _mm_set1_pd(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
-    /* Perform two N-R steps for double precision */
-    lu         = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-    return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-
-static gmx_inline __m256d
-gmx_mm256_abs_pd(__m256d x)
-{
-    const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
-                                                                    0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    return _mm256_and_pd(x, signmask);
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function, 256 bit
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m256d
-gmx_mm256_exp2_pd(__m256d x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m256d arglimit = _mm256_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m256d P2       = _mm256_set1_pd(2.30933477057345225087e-2);
-    const __m256d P1       = _mm256_set1_pd(2.02020656693165307700e1);
-    const __m256d P0       = _mm256_set1_pd(1.51390680115615096133e3);
-    /* Q2 == 1.0 */
-    const __m256d Q1       = _mm256_set1_pd(2.33184211722314911771e2);
-    const __m256d Q0       = _mm256_set1_pd(4.36821166879210612817e3);
-    const __m256d one      = _mm256_set1_pd(1.0);
-    const __m256d two      = _mm256_set1_pd(2.0);
-
-    __m256d       valuemask;
-    __m256i       iexppart;
-    __m128i       iexppart128a, iexppart128b;
-    __m256d       fexppart;
-    __m256d       intpart;
-    __m256d       z, z2;
-    __m256d       PolyP, PolyQ;
-
-    iexppart128a  = _mm256_cvtpd_epi32(x);
-    intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Add exponent bias */
-    iexppart128a   = _mm_add_epi32(iexppart128a, expbase);
-
-    /* We now want to shift the exponent 52 positions left, but to achieve this we need
-     * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
-     * shift them, and then merge into a single __m256d.
-     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
-     * It doesnt matter what we put in the 2nd/4th position, since that data will be
-     * shifted out and replaced with zeros.
-     */
-    iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
-    iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
-
-    iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
-    iexppart128a   = _mm_slli_epi64(iexppart128a, 52);
-
-    iexppart  = _mm256_castsi128_si256(iexppart128a);
-    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-
-    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
-    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
-
-    z         = _mm256_sub_pd(x, intpart);
-
-    z2        = _mm256_mul_pd(z, z);
-
-    PolyP     = _mm256_mul_pd(P2, z2);
-    PolyP     = _mm256_add_pd(PolyP, P1);
-    PolyQ     = _mm256_add_pd(z2, Q1);
-    PolyP     = _mm256_mul_pd(PolyP, z2);
-    PolyQ     = _mm256_mul_pd(PolyQ, z2);
-    PolyP     = _mm256_add_pd(PolyP, P0);
-    PolyQ     = _mm256_add_pd(PolyQ, Q0);
-    PolyP     = _mm256_mul_pd(PolyP, z);
-
-    z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
-    z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));
-
-    z         = _mm256_mul_pd(z, fexppart);
-
-    return z;
-}
-
-/* 2^x, 128 bit */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
-    const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
-    const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
-    /* Q2 == 1.0 */
-    const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
-    const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       z, z2;
-    __m128d       PolyP, PolyQ;
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(x, intpart);
-    z2        = _mm_mul_pd(z, z);
-
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(z2, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P0);
-    PolyQ     = _mm_add_pd(PolyQ, Q0);
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-
-/* Exponential function, 256 bit. This could be calculated from 2^x as Exp(x)=2^(y),
- * where y=log2(e)*x, but there will then be a small rounding error since we lose
- * some precision due to the multiplication. This will then be magnified a lot by
- * the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m256d
-gmx_mm256_exp_pd(__m256d exparg)
-{
-    const __m256d argscale = _mm256_set1_pd(1.4426950408889634073599);
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m256d arglimit = _mm256_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m256d invargscale0  = _mm256_set1_pd(6.93145751953125e-1);
-    const __m256d invargscale1  = _mm256_set1_pd(1.42860682030941723212e-6);
-
-    const __m256d P2       = _mm256_set1_pd(1.26177193074810590878e-4);
-    const __m256d P1       = _mm256_set1_pd(3.02994407707441961300e-2);
-    /* P0 == 1.0 */
-    const __m256d Q3       = _mm256_set1_pd(3.00198505138664455042E-6);
-    const __m256d Q2       = _mm256_set1_pd(2.52448340349684104192E-3);
-    const __m256d Q1       = _mm256_set1_pd(2.27265548208155028766E-1);
-    /* Q0 == 2.0 */
-    const __m256d one      = _mm256_set1_pd(1.0);
-    const __m256d two      = _mm256_set1_pd(2.0);
-
-    __m256d       valuemask;
-    __m256i       iexppart;
-    __m128i       iexppart128a, iexppart128b;
-    __m256d       fexppart;
-    __m256d       intpart;
-    __m256d       x, z, z2;
-    __m256d       PolyP, PolyQ;
-
-    x             = _mm256_mul_pd(exparg, argscale);
-
-    iexppart128a  = _mm256_cvtpd_epi32(x);
-    intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Add exponent bias */
-    iexppart128a   = _mm_add_epi32(iexppart128a, expbase);
-
-    /* We now want to shift the exponent 52 positions left, but to achieve this we need
-     * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
-     * shift them, and then merge into a single __m256d.
-     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
-     * It doesnt matter what we put in the 2nd/4th position, since that data will be
-     * shifted out and replaced with zeros.
-     */
-    iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
-    iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
-
-    iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
-    iexppart128a   = _mm_slli_epi64(iexppart128a, 52);
-
-    iexppart  = _mm256_castsi128_si256(iexppart128a);
-    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-
-    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
-    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
-
-    z         = _mm256_sub_pd(exparg, _mm256_mul_pd(invargscale0, intpart));
-    z         = _mm256_sub_pd(z, _mm256_mul_pd(invargscale1, intpart));
-
-    z2        = _mm256_mul_pd(z, z);
-
-    PolyQ     = _mm256_mul_pd(Q3, z2);
-    PolyQ     = _mm256_add_pd(PolyQ, Q2);
-    PolyP     = _mm256_mul_pd(P2, z2);
-    PolyQ     = _mm256_mul_pd(PolyQ, z2);
-    PolyP     = _mm256_add_pd(PolyP, P1);
-    PolyQ     = _mm256_add_pd(PolyQ, Q1);
-    PolyP     = _mm256_mul_pd(PolyP, z2);
-    PolyQ     = _mm256_mul_pd(PolyQ, z2);
-    PolyP     = _mm256_add_pd(PolyP, one);
-    PolyQ     = _mm256_add_pd(PolyQ, two);
-
-    PolyP     = _mm256_mul_pd(PolyP, z);
-
-    z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
-    z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));
-
-    z         = _mm256_mul_pd(z, fexppart);
-
-    return z;
-}
-
-/* exp(), 128 bit */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
-    const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
-    const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
-
-    const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
-    const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
-    /* P0 == 1.0 */
-    const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
-    const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
-    const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
-    /* Q0 == 2.0 */
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       x, z, z2;
-    __m128d       PolyP, PolyQ;
-
-    x             = _mm_mul_pd(exparg, argscale);
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
-    z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
-    z2        = _mm_mul_pd(z, z);
-
-    PolyQ     = _mm_mul_pd(Q3, z2);
-    PolyQ     = _mm_add_pd(PolyQ, Q2);
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(PolyQ, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, one);
-    PolyQ     = _mm_add_pd(PolyQ, two);
-
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-
-static __m256d
-gmx_mm256_log_pd(__m256d x)
-{
-    /* Same algorithm as cephes library */
-    const __m256d expmask    = _mm256_castsi256_pd( _mm256_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000,
-                                                                     0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
-    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
-    const __m256d half       = _mm256_set1_pd(0.5);
-    const __m256d one        = _mm256_set1_pd(1.0);
-    const __m256d two        = _mm256_set1_pd(2.0);
-    const __m256d invsq2     = _mm256_set1_pd(1.0/sqrt(2.0));
-
-    const __m256d corr1      = _mm256_set1_pd(-2.121944400546905827679e-4);
-    const __m256d corr2      = _mm256_set1_pd(0.693359375);
-
-    const __m256d P5         = _mm256_set1_pd(1.01875663804580931796e-4);
-    const __m256d P4         = _mm256_set1_pd(4.97494994976747001425e-1);
-    const __m256d P3         = _mm256_set1_pd(4.70579119878881725854e0);
-    const __m256d P2         = _mm256_set1_pd(1.44989225341610930846e1);
-    const __m256d P1         = _mm256_set1_pd(1.79368678507819816313e1);
-    const __m256d P0         = _mm256_set1_pd(7.70838733755885391666e0);
-
-    const __m256d Q4         = _mm256_set1_pd(1.12873587189167450590e1);
-    const __m256d Q3         = _mm256_set1_pd(4.52279145837532221105e1);
-    const __m256d Q2         = _mm256_set1_pd(8.29875266912776603211e1);
-    const __m256d Q1         = _mm256_set1_pd(7.11544750618563894466e1);
-    const __m256d Q0         = _mm256_set1_pd(2.31251620126765340583e1);
-
-    const __m256d R2         = _mm256_set1_pd(-7.89580278884799154124e-1);
-    const __m256d R1         = _mm256_set1_pd(1.63866645699558079767e1);
-    const __m256d R0         = _mm256_set1_pd(-6.41409952958715622951e1);
-
-    const __m256d S2         = _mm256_set1_pd(-3.56722798256324312549E1);
-    const __m256d S1         = _mm256_set1_pd(3.12093766372244180303E2);
-    const __m256d S0         = _mm256_set1_pd(-7.69691943550460008604E2);
-
-    __m256d       fexp;
-    __m256i       iexp;
-    __m128i       iexp128a, iexp128b;
-
-    __m256d       mask1, mask2;
-    __m256d       corr, t1, t2, q;
-    __m256d       zA, yA, xA, zB, yB, xB, z;
-    __m256d       polyR, polyS;
-    __m256d       polyP1, polyP2, polyQ1, polyQ2;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp     = _mm256_and_pd(x, expmask);
-
-    iexp     = _mm256_castpd_si256(fexp);
-    iexp128b = _mm256_extractf128_si256(iexp, 0x1);
-    iexp128a = _mm256_castsi256_si128(iexp);
-
-    iexp128a  = _mm_srli_epi64(iexp128a, 52);
-    iexp128b  = _mm_srli_epi64(iexp128b, 52);
-    /* Merge into a single register */
-    iexp128a  = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));
-    iexp128b  = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));
-    iexp128a  = _mm_or_si128(iexp128a, iexp128b);
-    iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
-
-    fexp      = _mm256_cvtepi32_pd(iexp128a);
-
-    x         = _mm256_andnot_pd(expmask, x); /* Get mantissa */
-    x         = _mm256_or_pd(x, one);
-    x         = _mm256_mul_pd(x, half);
-
-    mask1     = _mm256_cmp_pd(gmx_mm256_abs_pd(fexp), two, _CMP_GT_OQ);
-    mask2     = _mm256_cmp_pd(x, invsq2, _CMP_LT_OQ);
-
-    fexp      = _mm256_sub_pd(fexp, _mm256_and_pd(mask2, one));
-
-    /* If mask1 is set ('A') */
-    zA     = _mm256_sub_pd(x, half);
-    t1     = _mm256_blendv_pd( zA, x, mask2 );
-    zA     = _mm256_sub_pd(t1, half);
-    t2     = _mm256_blendv_pd( x, zA, mask2 );
-    yA     = _mm256_mul_pd(half, _mm256_add_pd(t2, one));
-
-    xA     = _mm256_mul_pd(zA, gmx_mm256_inv_pd(yA));
-    zA     = _mm256_mul_pd(xA, xA);
-
-    /* EVALUATE POLY */
-    polyR  = _mm256_mul_pd(R2, zA);
-    polyR  = _mm256_add_pd(polyR, R1);
-    polyR  = _mm256_mul_pd(polyR, zA);
-    polyR  = _mm256_add_pd(polyR, R0);
-
-    polyS  = _mm256_add_pd(zA, S2);
-    polyS  = _mm256_mul_pd(polyS, zA);
-    polyS  = _mm256_add_pd(polyS, S1);
-    polyS  = _mm256_mul_pd(polyS, zA);
-    polyS  = _mm256_add_pd(polyS, S0);
-
-    q      = _mm256_mul_pd(polyR, gmx_mm256_inv_pd(polyS));
-    zA     = _mm256_mul_pd(_mm256_mul_pd(xA, zA), q);
-
-    zA     = _mm256_add_pd(zA, _mm256_mul_pd(corr1, fexp));
-    zA     = _mm256_add_pd(zA, xA);
-    zA     = _mm256_add_pd(zA, _mm256_mul_pd(corr2, fexp));
-
-    /* If mask1 is not set ('B') */
-    corr   = _mm256_and_pd(mask2, x);
-    xB     = _mm256_add_pd(x, corr);
-    xB     = _mm256_sub_pd(xB, one);
-    zB     = _mm256_mul_pd(xB, xB);
-
-    polyP1 = _mm256_mul_pd(P5, zB);
-    polyP2 = _mm256_mul_pd(P4, zB);
-    polyP1 = _mm256_add_pd(polyP1, P3);
-    polyP2 = _mm256_add_pd(polyP2, P2);
-    polyP1 = _mm256_mul_pd(polyP1, zB);
-    polyP2 = _mm256_mul_pd(polyP2, zB);
-    polyP1 = _mm256_add_pd(polyP1, P1);
-    polyP2 = _mm256_add_pd(polyP2, P0);
-    polyP1 = _mm256_mul_pd(polyP1, xB);
-    polyP1 = _mm256_add_pd(polyP1, polyP2);
-
-    polyQ2 = _mm256_mul_pd(Q4, zB);
-    polyQ1 = _mm256_add_pd(zB, Q3);
-    polyQ2 = _mm256_add_pd(polyQ2, Q2);
-    polyQ1 = _mm256_mul_pd(polyQ1, zB);
-    polyQ2 = _mm256_mul_pd(polyQ2, zB);
-    polyQ1 = _mm256_add_pd(polyQ1, Q1);
-    polyQ2 = _mm256_add_pd(polyQ2, Q0);
-    polyQ1 = _mm256_mul_pd(polyQ1, xB);
-    polyQ1 = _mm256_add_pd(polyQ1, polyQ2);
-
-    fexp   = _mm256_and_pd(fexp, _mm256_cmp_pd(fexp, _mm256_setzero_pd(), _CMP_NEQ_OQ));
-
-    q      = _mm256_mul_pd(polyP1, gmx_mm256_inv_pd(polyQ1));
-    yB     = _mm256_mul_pd(_mm256_mul_pd(xB, zB), q);
-
-    yB     = _mm256_add_pd(yB, _mm256_mul_pd(corr1, fexp));
-    yB     = _mm256_sub_pd(yB, _mm256_mul_pd(half, zB));
-    zB     = _mm256_add_pd(xB, yB);
-    zB     = _mm256_add_pd(zB, _mm256_mul_pd(corr2, fexp));
-
-    z      = _mm256_blendv_pd( zB, zA, mask1 );
-
-    return z;
-}
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
-    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d two        = _mm_set1_pd(2.0);
-    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
-
-    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
-    const __m128d corr2      = _mm_set1_pd(0.693359375);
-
-    const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
-    const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
-    const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
-    const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
-    const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
-    const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
-
-    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
-    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
-    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
-    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
-    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
-
-    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
-    const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
-    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
-
-    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
-    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
-    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
-
-    __m128d       fexp;
-    __m128i       iexp;
-
-    __m128d       mask1, mask2;
-    __m128d       corr, t1, t2, q;
-    __m128d       zA, yA, xA, zB, yB, xB, z;
-    __m128d       polyR, polyS;
-    __m128d       polyP1, polyP2, polyQ1, polyQ2;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp   = _mm_and_pd(x, expmask);
-    iexp   = gmx_mm_castpd_si128(fexp);
-    iexp   = _mm_srli_epi64(iexp, 52);
-    iexp   = _mm_sub_epi32(iexp, expbase_m1);
-    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
-    fexp   = _mm_cvtepi32_pd(iexp);
-
-    x      = _mm_andnot_pd(expmask, x);
-    x      = _mm_or_pd(x, one);
-    x      = _mm_mul_pd(x, half);
-
-    mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
-    mask2     = _mm_cmplt_pd(x, invsq2);
-
-    fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
-    /* If mask1 is set ('A') */
-    zA     = _mm_sub_pd(x, half);
-    t1     = _mm_blendv_pd( zA, x, mask2 );
-    zA     = _mm_sub_pd(t1, half);
-    t2     = _mm_blendv_pd( x, zA, mask2 );
-    yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
-    xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
-    zA     = _mm_mul_pd(xA, xA);
-
-    /* EVALUATE POLY */
-    polyR  = _mm_mul_pd(R2, zA);
-    polyR  = _mm_add_pd(polyR, R1);
-    polyR  = _mm_mul_pd(polyR, zA);
-    polyR  = _mm_add_pd(polyR, R0);
-
-    polyS  = _mm_add_pd(zA, S2);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S1);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S0);
-
-    q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
-    zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
-    zA     = _mm_add_pd(zA, xA);
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
-    /* If mask1 is not set ('B') */
-    corr   = _mm_and_pd(mask2, x);
-    xB     = _mm_add_pd(x, corr);
-    xB     = _mm_sub_pd(xB, one);
-    zB     = _mm_mul_pd(xB, xB);
-
-    polyP1 = _mm_mul_pd(P5, zB);
-    polyP2 = _mm_mul_pd(P4, zB);
-    polyP1 = _mm_add_pd(polyP1, P3);
-    polyP2 = _mm_add_pd(polyP2, P2);
-    polyP1 = _mm_mul_pd(polyP1, zB);
-    polyP2 = _mm_mul_pd(polyP2, zB);
-    polyP1 = _mm_add_pd(polyP1, P1);
-    polyP2 = _mm_add_pd(polyP2, P0);
-    polyP1 = _mm_mul_pd(polyP1, xB);
-    polyP1 = _mm_add_pd(polyP1, polyP2);
-
-    polyQ2 = _mm_mul_pd(Q4, zB);
-    polyQ1 = _mm_add_pd(zB, Q3);
-    polyQ2 = _mm_add_pd(polyQ2, Q2);
-    polyQ1 = _mm_mul_pd(polyQ1, zB);
-    polyQ2 = _mm_mul_pd(polyQ2, zB);
-    polyQ1 = _mm_add_pd(polyQ1, Q1);
-    polyQ2 = _mm_add_pd(polyQ2, Q0);
-    polyQ1 = _mm_mul_pd(polyQ1, xB);
-    polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
-    fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
-    q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
-    yB     = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
-    yB     = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
-    yB     = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
-    zB     = _mm_add_pd(xB, yB);
-    zB     = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
-    z      = _mm_blendv_pd( zB, zA, mask1 );
-
-    return z;
-}
-
-
-static __m256d
-gmx_mm256_erf_pd(__m256d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m256d CAP4      = _mm256_set1_pd(-0.431780540597889301512e-4);
-    const __m256d CAP3      = _mm256_set1_pd(-0.00578562306260059236059);
-    const __m256d CAP2      = _mm256_set1_pd(-0.028593586920219752446);
-    const __m256d CAP1      = _mm256_set1_pd(-0.315924962948621698209);
-    const __m256d CAP0      = _mm256_set1_pd(0.14952975608477029151);
-
-    const __m256d CAQ5      = _mm256_set1_pd(-0.374089300177174709737e-5);
-    const __m256d CAQ4      = _mm256_set1_pd(0.00015126584532155383535);
-    const __m256d CAQ3      = _mm256_set1_pd(0.00536692680669480725423);
-    const __m256d CAQ2      = _mm256_set1_pd(0.0668686825594046122636);
-    const __m256d CAQ1      = _mm256_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m256d CAoffset  = _mm256_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m256d CBP6      = _mm256_set1_pd(2.49650423685462752497647637088e-10);
-    const __m256d CBP5      = _mm256_set1_pd(0.00119770193298159629350136085658);
-    const __m256d CBP4      = _mm256_set1_pd(0.0164944422378370965881008942733);
-    const __m256d CBP3      = _mm256_set1_pd(0.0984581468691775932063932439252);
-    const __m256d CBP2      = _mm256_set1_pd(0.317364595806937763843589437418);
-    const __m256d CBP1      = _mm256_set1_pd(0.554167062641455850932670067075);
-    const __m256d CBP0      = _mm256_set1_pd(0.427583576155807163756925301060);
-    const __m256d CBQ7      = _mm256_set1_pd(0.00212288829699830145976198384930);
-    const __m256d CBQ6      = _mm256_set1_pd(0.0334810979522685300554606393425);
-    const __m256d CBQ5      = _mm256_set1_pd(0.2361713785181450957579508850717);
-    const __m256d CBQ4      = _mm256_set1_pd(0.955364736493055670530981883072);
-    const __m256d CBQ3      = _mm256_set1_pd(2.36815675631420037315349279199);
-    const __m256d CBQ2      = _mm256_set1_pd(3.55261649184083035537184223542);
-    const __m256d CBQ1      = _mm256_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m256d CCP6      = _mm256_set1_pd(-2.8175401114513378771);
-    const __m256d CCP5      = _mm256_set1_pd(-3.22729451764143718517);
-    const __m256d CCP4      = _mm256_set1_pd(-2.5518551727311523996);
-    const __m256d CCP3      = _mm256_set1_pd(-0.687717681153649930619);
-    const __m256d CCP2      = _mm256_set1_pd(-0.212652252872804219852);
-    const __m256d CCP1      = _mm256_set1_pd(0.0175389834052493308818);
-    const __m256d CCP0      = _mm256_set1_pd(0.00628057170626964891937);
-
-    const __m256d CCQ6      = _mm256_set1_pd(5.48409182238641741584);
-    const __m256d CCQ5      = _mm256_set1_pd(13.5064170191802889145);
-    const __m256d CCQ4      = _mm256_set1_pd(22.9367376522880577224);
-    const __m256d CCQ3      = _mm256_set1_pd(15.930646027911794143);
-    const __m256d CCQ2      = _mm256_set1_pd(11.0567237927800161565);
-    const __m256d CCQ1      = _mm256_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m256d CCoffset  = _mm256_set1_pd(0.5579090118408203125);
-
-    const __m256d one       = _mm256_set1_pd(1.0);
-    const __m256d two       = _mm256_set1_pd(2.0);
-
-    const __m256d signbit   = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
-                                                                    0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m256d xabs, x2, x4, t, t2, w, w2;
-    __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m256d mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm256_abs_pd(x);
-    x2       = _mm256_mul_pd(x, x);
-    x4       = _mm256_mul_pd(x2, x2);
-
-    PolyAP0  = _mm256_mul_pd(CAP4, x4);
-    PolyAP1  = _mm256_mul_pd(CAP3, x4);
-    PolyAP0  = _mm256_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm256_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm256_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm256_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm256_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm256_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm256_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm256_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm256_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm256_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm256_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
-    res_erf  = _mm256_add_pd(CAoffset, res_erf);
-    res_erf  = _mm256_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm256_sub_pd(xabs, one);
-    t2      = _mm256_mul_pd(t, t);
-
-    PolyBP0  = _mm256_mul_pd(CBP6, t2);
-    PolyBP1  = _mm256_mul_pd(CBP5, t2);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm256_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm256_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm256_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm256_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm256_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm256_inv_pd(xabs);
-    w2      = _mm256_mul_pd(w, w);
-
-    PolyCP0  = _mm256_mul_pd(CCP6, w2);
-    PolyCP1  = _mm256_mul_pd(CCP5, w2);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm256_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm256_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm256_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm256_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm256_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm256_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm256_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm256_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm256_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) );
-
-    res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
-    res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm256_mul_pd(res_erfcC, w);
-
-    mask     = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
-    res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask);
-
-    res_erfc = _mm256_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
-    res  = _mm256_blendv_pd(_mm256_sub_pd(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-/* Evaluate erf(x) for both double-precision lanes of an SSE register.
- *
- * Three approximations are computed for every lane and the final value is
- * picked with blends, so there is no data-dependent branching:
- *   - |x| < 1:         erf(x) from rational approximation "A" in x^2.
- *   - 1 <= |x| <= 4.5: 1 - erfc(x), with erfc from rational approximation "B" in (|x|-1).
- *   - |x| > 4.5:       1 - erfc(x), with erfc from rational approximation "C" in 1/|x|.
- * Negative arguments in the erfc branches are handled through
- * erfc(x) = 2 - erfc(|x|).
- *
- * x        Packed arguments.
- * returns  Packed approximations of erf(x), one per lane.
- *
- * NOTE(review): gmx_mm_abs_pd, gmx_mm_inv_pd, gmx_mm_exp_pd and
- * gmx_mm_castsi128_pd are defined elsewhere; from their use here they
- * presumably compute |x|, 1/x, exp(x) and a bit-for-bit integer-to-double
- * cast - confirm against their definitions.
- */
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
-     * NOTE(review): the final select below uses this approximation for all |x| < 1,
-     * not only |x| <= 0.75 - confirm which range the fit was made for.
-     */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); /* only the top bit of each 64-bit lane is set */
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf()
-     * P and Q are polynomials in x^2. Each is evaluated as two independent
-     * Horner chains, one over the even-index and one over the odd-index
-     * coefficients, which are summed at the end.
-     */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5]
-     * P and Q are polynomials in t=|x|-1, again split into even-index and
-     * odd-index Horner chains. res_erfcB ends up as |x|*P(t)/Q(t); the
-     * exp(-x^2) factor is multiplied in after the B/C blend below.
-     */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf]
-     * P and Q are polynomials in w (presumably w=1/|x|, see the note on
-     * gmx_mm_inv_pd above), split into even-index and odd-index Horner
-     * chains. res_erfcC ends up as w*(CCoffset + P(w)/Q(w)); the exp(-x^2)
-     * factor is multiplied in after the B/C blend below. Lanes with
-     * |x| <= 4.5 discard res_erfcC in that blend.
-     */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); /* x2 is a square, so OR-ing in the sign bit passes -x^2 */
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); /* C where |x| > 4.5, B elsewhere */
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc(): lanes with |x| < 1 take res_erf, all others take 1 - res_erfc */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-
-/* Evaluate erfc(x) for the four double-precision lanes of an AVX register.
- *
- * Three approximations are computed for every lane and the final value is
- * picked with blends, so there is no data-dependent branching:
- *   - |x| < 1:         1 - erf(x), with erf from rational approximation "A" in x^2.
- *   - 1 <= |x| <= 4.5: erfc(x) from rational approximation "B" in (|x|-1).
- *   - |x| > 4.5:       erfc(x) from rational approximation "C" in 1/|x|.
- * Negative arguments in the erfc branches are handled through
- * erfc(x) = 2 - erfc(|x|).
- *
- * x        Packed arguments.
- * returns  Packed approximations of erfc(x), one per lane.
- *
- * NOTE(review): gmx_mm256_abs_pd, gmx_mm256_inv_pd and gmx_mm256_exp_pd are
- * defined elsewhere; from their use here they presumably compute |x|, 1/x
- * and exp(x) per lane - confirm against their definitions.
- */
-static __m256d
-gmx_mm256_erfc_pd(__m256d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
-     * NOTE(review): the final select below uses this approximation for all |x| < 1,
-     * not only |x| <= 0.75 - confirm which range the fit was made for.
-     */
-    const __m256d CAP4      = _mm256_set1_pd(-0.431780540597889301512e-4);
-    const __m256d CAP3      = _mm256_set1_pd(-0.00578562306260059236059);
-    const __m256d CAP2      = _mm256_set1_pd(-0.028593586920219752446);
-    const __m256d CAP1      = _mm256_set1_pd(-0.315924962948621698209);
-    const __m256d CAP0      = _mm256_set1_pd(0.14952975608477029151);
-
-    const __m256d CAQ5      = _mm256_set1_pd(-0.374089300177174709737e-5);
-    const __m256d CAQ4      = _mm256_set1_pd(0.00015126584532155383535);
-    const __m256d CAQ3      = _mm256_set1_pd(0.00536692680669480725423);
-    const __m256d CAQ2      = _mm256_set1_pd(0.0668686825594046122636);
-    const __m256d CAQ1      = _mm256_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m256d CAoffset  = _mm256_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m256d CBP6      = _mm256_set1_pd(2.49650423685462752497647637088e-10);
-    const __m256d CBP5      = _mm256_set1_pd(0.00119770193298159629350136085658);
-    const __m256d CBP4      = _mm256_set1_pd(0.0164944422378370965881008942733);
-    const __m256d CBP3      = _mm256_set1_pd(0.0984581468691775932063932439252);
-    const __m256d CBP2      = _mm256_set1_pd(0.317364595806937763843589437418);
-    const __m256d CBP1      = _mm256_set1_pd(0.554167062641455850932670067075);
-    const __m256d CBP0      = _mm256_set1_pd(0.427583576155807163756925301060);
-    const __m256d CBQ7      = _mm256_set1_pd(0.00212288829699830145976198384930);
-    const __m256d CBQ6      = _mm256_set1_pd(0.0334810979522685300554606393425);
-    const __m256d CBQ5      = _mm256_set1_pd(0.2361713785181450957579508850717);
-    const __m256d CBQ4      = _mm256_set1_pd(0.955364736493055670530981883072);
-    const __m256d CBQ3      = _mm256_set1_pd(2.36815675631420037315349279199);
-    const __m256d CBQ2      = _mm256_set1_pd(3.55261649184083035537184223542);
-    const __m256d CBQ1      = _mm256_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m256d CCP6      = _mm256_set1_pd(-2.8175401114513378771);
-    const __m256d CCP5      = _mm256_set1_pd(-3.22729451764143718517);
-    const __m256d CCP4      = _mm256_set1_pd(-2.5518551727311523996);
-    const __m256d CCP3      = _mm256_set1_pd(-0.687717681153649930619);
-    const __m256d CCP2      = _mm256_set1_pd(-0.212652252872804219852);
-    const __m256d CCP1      = _mm256_set1_pd(0.0175389834052493308818);
-    const __m256d CCP0      = _mm256_set1_pd(0.00628057170626964891937);
-
-    const __m256d CCQ6      = _mm256_set1_pd(5.48409182238641741584);
-    const __m256d CCQ5      = _mm256_set1_pd(13.5064170191802889145);
-    const __m256d CCQ4      = _mm256_set1_pd(22.9367376522880577224);
-    const __m256d CCQ3      = _mm256_set1_pd(15.930646027911794143);
-    const __m256d CCQ2      = _mm256_set1_pd(11.0567237927800161565);
-    const __m256d CCQ1      = _mm256_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m256d CCoffset  = _mm256_set1_pd(0.5579090118408203125);
-
-    const __m256d one       = _mm256_set1_pd(1.0);
-    const __m256d two       = _mm256_set1_pd(2.0);
-
-    const __m256d signbit   = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
-                                                                    0x80000000, 0x00000000, 0x80000000, 0x00000000) ); /* only the top bit of each 64-bit lane is set */
-
-    __m256d xabs, x2, x4, t, t2, w, w2;
-    __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m256d mask, expmx2;
-
-    /* Calculate erf()
-     * P and Q are polynomials in x^2. Each is evaluated as two independent
-     * Horner chains, one over the even-index and one over the odd-index
-     * coefficients, which are summed at the end.
-     */
-    xabs     = gmx_mm256_abs_pd(x);
-    x2       = _mm256_mul_pd(x, x);
-    x4       = _mm256_mul_pd(x2, x2);
-
-    PolyAP0  = _mm256_mul_pd(CAP4, x4);
-    PolyAP1  = _mm256_mul_pd(CAP3, x4);
-    PolyAP0  = _mm256_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm256_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm256_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm256_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm256_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm256_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm256_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm256_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm256_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm256_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm256_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm256_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm256_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
-    res_erf  = _mm256_add_pd(CAoffset, res_erf);
-    res_erf  = _mm256_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5]
-     * P and Q are polynomials in t=|x|-1, again split into even-index and
-     * odd-index Horner chains. res_erfcB ends up as |x|*P(t)/Q(t); the
-     * exp(-x^2) factor is multiplied in after the B/C blend below.
-     */
-    t       = _mm256_sub_pd(xabs, one);
-    t2      = _mm256_mul_pd(t, t);
-
-    PolyBP0  = _mm256_mul_pd(CBP6, t2);
-    PolyBP1  = _mm256_mul_pd(CBP5, t2);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm256_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm256_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm256_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm256_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm256_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm256_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm256_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf]
-     * P and Q are polynomials in w (presumably w=1/|x|, see the note on
-     * gmx_mm256_inv_pd above), split into even-index and odd-index Horner
-     * chains. res_erfcC ends up as w*(CCoffset + P(w)/Q(w)); the exp(-x^2)
-     * factor is multiplied in after the B/C blend below. Lanes with
-     * |x| <= 4.5 discard res_erfcC in that blend.
-     */
-    w       = gmx_mm256_inv_pd(xabs);
-    w2      = _mm256_mul_pd(w, w);
-
-    PolyCP0  = _mm256_mul_pd(CCP6, w2);
-    PolyCP1  = _mm256_mul_pd(CCP5, w2);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm256_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm256_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm256_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm256_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm256_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm256_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm256_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm256_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm256_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm256_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm256_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm256_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm256_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm256_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) ); /* x2 is a square, so OR-ing in the sign bit passes -x^2 */
-
-    res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
-    res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm256_mul_pd(res_erfcC, w);
-
-    mask     = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
-    res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask); /* C where |x| > 4.5, B elsewhere */
-
-    res_erfc = _mm256_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc(): lanes with |x| < 1 take 1 - res_erf, all others take res_erfc */
-    mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
-    res  = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(one, res_erf), mask);
-
-    return res;
-}
-
-/* Evaluate erfc(x) for both double-precision lanes of an SSE register.
- *
- * Three approximations are computed for every lane and the final value is
- * picked with blends, so there is no data-dependent branching:
- *   - |x| < 1:         1 - erf(x), with erf from rational approximation "A" in x^2.
- *   - 1 <= |x| <= 4.5: erfc(x) from rational approximation "B" in (|x|-1).
- *   - |x| > 4.5:       erfc(x) from rational approximation "C" in 1/|x|.
- * Negative arguments in the erfc branches are handled through
- * erfc(x) = 2 - erfc(|x|).
- *
- * x        Packed arguments.
- * returns  Packed approximations of erfc(x), one per lane.
- *
- * NOTE(review): gmx_mm_abs_pd, gmx_mm_inv_pd, gmx_mm_exp_pd and
- * gmx_mm_castsi128_pd are defined elsewhere; from their use here they
- * presumably compute |x|, 1/x, exp(x) and a bit-for-bit integer-to-double
- * cast - confirm against their definitions.
- */
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75]
-     * NOTE(review): the final select below uses this approximation for all |x| < 1,
-     * not only |x| <= 0.75 - confirm which range the fit was made for.
-     */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) ); /* only the top bit of each 64-bit lane is set */
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf()
-     * P and Q are polynomials in x^2. Each is evaluated as two independent
-     * Horner chains, one over the even-index and one over the odd-index
-     * coefficients, which are summed at the end.
-     */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5]
-     * P and Q are polynomials in t=|x|-1, again split into even-index and
-     * odd-index Horner chains. res_erfcB ends up as |x|*P(t)/Q(t); the
-     * exp(-x^2) factor is multiplied in after the B/C blend below.
-     */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf]
-     * P and Q are polynomials in w (presumably w=1/|x|, see the note on
-     * gmx_mm_inv_pd above), split into even-index and odd-index Horner
-     * chains. res_erfcC ends up as w*(CCoffset + P(w)/Q(w)); the exp(-x^2)
-     * factor is multiplied in after the B/C blend below. Lanes with
-     * |x| <= 4.5 discard res_erfcC in that blend.
-     */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) ); /* x2 is a square, so OR-ing in the sign bit passes -x^2 */
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask); /* C where |x| > 4.5, B elsewhere */
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc(): lanes with |x| < 1 take 1 - res_erf, all others take res_erfc */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static __m256d
-gmx_mm256_pmecorrF_pd(__m256d z2)
-{
-    const __m256d  FN10     = _mm256_set1_pd(-8.0072854618360083154e-14);
-    const __m256d  FN9      = _mm256_set1_pd(1.1859116242260148027e-11);
-    const __m256d  FN8      = _mm256_set1_pd(-8.1490406329798423616e-10);
-    const __m256d  FN7      = _mm256_set1_pd(3.4404793543907847655e-8);
-    const __m256d  FN6      = _mm256_set1_pd(-9.9471420832602741006e-7);
-    const __m256d  FN5      = _mm256_set1_pd(0.000020740315999115847456);
-    const __m256d  FN4      = _mm256_set1_pd(-0.00031991745139313364005);
-    const __m256d  FN3      = _mm256_set1_pd(0.0035074449373659008203);
-    const __m256d  FN2      = _mm256_set1_pd(-0.031750380176100813405);
-    const __m256d  FN1      = _mm256_set1_pd(0.13884101728898463426);
-    const __m256d  FN0      = _mm256_set1_pd(-0.75225277815249618847);
-
-    const __m256d  FD5      = _mm256_set1_pd(0.000016009278224355026701);
-    const __m256d  FD4      = _mm256_set1_pd(0.00051055686934806966046);
-    const __m256d  FD3      = _mm256_set1_pd(0.0081803507497974289008);
-    const __m256d  FD2      = _mm256_set1_pd(0.077181146026670287235);
-    const __m256d  FD1      = _mm256_set1_pd(0.41543303143712535988);
-    const __m256d  FD0      = _mm256_set1_pd(1.0);
-
-    __m256d        z4;
-    __m256d        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm256_mul_pd(z2, z2);
-
-    polyFD1        = _mm256_mul_pd(FD5, z4);
-    polyFD0        = _mm256_mul_pd(FD4, z4);
-    polyFD1        = _mm256_add_pd(polyFD1, FD3);
-    polyFD0        = _mm256_add_pd(polyFD0, FD2);
-    polyFD1        = _mm256_mul_pd(polyFD1, z4);
-    polyFD0        = _mm256_mul_pd(polyFD0, z4);
-    polyFD1        = _mm256_add_pd(polyFD1, FD1);
-    polyFD0        = _mm256_add_pd(polyFD0, FD0);
-    polyFD1        = _mm256_mul_pd(polyFD1, z2);
-    polyFD0        = _mm256_add_pd(polyFD0, polyFD1);
-
-    polyFD0        = gmx_mm256_inv_pd(polyFD0);
-
-    polyFN0        = _mm256_mul_pd(FN10, z4);
-    polyFN1        = _mm256_mul_pd(FN9, z4);
-    polyFN0        = _mm256_add_pd(polyFN0, FN8);
-    polyFN1        = _mm256_add_pd(polyFN1, FN7);
-    polyFN0        = _mm256_mul_pd(polyFN0, z4);
-    polyFN1        = _mm256_mul_pd(polyFN1, z4);
-    polyFN0        = _mm256_add_pd(polyFN0, FN6);
-    polyFN1        = _mm256_add_pd(polyFN1, FN5);
-    polyFN0        = _mm256_mul_pd(polyFN0, z4);
-    polyFN1        = _mm256_mul_pd(polyFN1, z4);
-    polyFN0        = _mm256_add_pd(polyFN0, FN4);
-    polyFN1        = _mm256_add_pd(polyFN1, FN3);
-    polyFN0        = _mm256_mul_pd(polyFN0, z4);
-    polyFN1        = _mm256_mul_pd(polyFN1, z4);
-    polyFN0        = _mm256_add_pd(polyFN0, FN2);
-    polyFN1        = _mm256_add_pd(polyFN1, FN1);
-    polyFN0        = _mm256_mul_pd(polyFN0, z4);
-    polyFN1        = _mm256_mul_pd(polyFN1, z2);
-    polyFN0        = _mm256_add_pd(polyFN0, FN0);
-    polyFN0        = _mm256_add_pd(polyFN0, polyFN1);
-
-    return _mm256_mul_pd(polyFN0, polyFD0);
-}
-
-/* 128-bit (two doubles per call) PME force correction.
- *
- * Takes z2 and returns the ratio of two polynomials in z2: a degree-10
- * numerator (FN0..FN10) times the inverted degree-5 denominator (FD0..FD5).
- * The coefficients and the order of operations are the same as in
- * gmx_mm256_pmecorrF_pd(); only the vector width and intrinsics differ.
- */
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
-    const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
-    const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
-    const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
-    const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
-    const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
-    const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
-    const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
-    const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
-    const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
-    const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
-    const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
-    /* FNk (above) multiplies z2^k in the numerator, FDk (below) multiplies
-     * z2^k in the denominator. */
-    const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
-    const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
-    const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
-    const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
-    const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
-    const __m128d  FD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
-    /* Even and odd powers of z2 are accumulated in two separate Horner
-     * chains in z4 = z2*z2, interleaved statement by statement. */
-    z4             = _mm_mul_pd(z2, z2);
-    /* Denominator: polyFD0 = even powers, polyFD1 = odd powers (times z2);
-     * the sum is left in polyFD0. */
-    polyFD1        = _mm_mul_pd(FD5, z4);
-    polyFD0        = _mm_mul_pd(FD4, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD3);
-    polyFD0        = _mm_add_pd(polyFD0, FD2);
-    polyFD1        = _mm_mul_pd(polyFD1, z4);
-    polyFD0        = _mm_mul_pd(polyFD0, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD1);
-    polyFD0        = _mm_add_pd(polyFD0, FD0);
-    polyFD1        = _mm_mul_pd(polyFD1, z2);
-    polyFD0        = _mm_add_pd(polyFD0, polyFD1);
-    /* Overwrite the denominator with what gmx_mm_inv_pd() returns for it
-     * (presumably its reciprocal; the helper is defined outside this view). */
-    polyFD0        = gmx_mm_inv_pd(polyFD0);
-    /* Numerator: polyFN0 = even powers (FN10 .. FN0), polyFN1 = odd powers
-     * (FN9 .. FN1, finally multiplied by z2). */
-    polyFN0        = _mm_mul_pd(FN10, z4);
-    polyFN1        = _mm_mul_pd(FN9, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN8);
-    polyFN1        = _mm_add_pd(polyFN1, FN7);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN6);
-    polyFN1        = _mm_add_pd(polyFN1, FN5);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN4);
-    polyFN1        = _mm_add_pd(polyFN1, FN3);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN2);
-    polyFN1        = _mm_add_pd(polyFN1, FN1);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z2);
-    polyFN0        = _mm_add_pd(polyFN0, FN0);
-    polyFN0        = _mm_add_pd(polyFN0, polyFN1);
-    /* Result: numerator multiplied by the inverted denominator. */
-    return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_pd() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- *
- * Implementation note: the value is computed as a ratio of two
- * polynomials in z^2, with coefficients VN0..VN9 (numerator) and
- * VD0..VD5 (denominator) defined in the function body. This is the
- * 256-bit version, working on four doubles per call.
- *
- */
-static __m256d
-gmx_mm256_pmecorrV_pd(__m256d z2)
-{
-    const __m256d  VN9      = _mm256_set1_pd(-9.3723776169321855475e-13);
-    const __m256d  VN8      = _mm256_set1_pd(1.2280156762674215741e-10);
-    const __m256d  VN7      = _mm256_set1_pd(-7.3562157912251309487e-9);
-    const __m256d  VN6      = _mm256_set1_pd(2.6215886208032517509e-7);
-    const __m256d  VN5      = _mm256_set1_pd(-4.9532491651265819499e-6);
-    const __m256d  VN4      = _mm256_set1_pd(0.00025907400778966060389);
-    const __m256d  VN3      = _mm256_set1_pd(0.0010585044856156469792);
-    const __m256d  VN2      = _mm256_set1_pd(0.045247661136833092885);
-    const __m256d  VN1      = _mm256_set1_pd(0.11643931522926034421);
-    const __m256d  VN0      = _mm256_set1_pd(1.1283791671726767970);
-    /* VNk (above) multiplies z2^k in the numerator, VDk (below) multiplies
-     * z2^k in the denominator. */
-    const __m256d  VD5      = _mm256_set1_pd(0.000021784709867336150342);
-    const __m256d  VD4      = _mm256_set1_pd(0.00064293662010911388448);
-    const __m256d  VD3      = _mm256_set1_pd(0.0096311444822588683504);
-    const __m256d  VD2      = _mm256_set1_pd(0.085608012351550627051);
-    const __m256d  VD1      = _mm256_set1_pd(0.43652499166614811084);
-    const __m256d  VD0      = _mm256_set1_pd(1.0);
-
-    __m256d        z4;
-    __m256d        polyVN0, polyVN1, polyVD0, polyVD1;
-    /* Even and odd powers of z2 are accumulated in two separate Horner
-     * chains in z4 = z2*z2, interleaved statement by statement. */
-    z4             = _mm256_mul_pd(z2, z2);
-    /* Denominator: polyVD0 accumulates VD4*z2^4 + VD2*z2^2 + VD0, polyVD1
-     * accumulates (VD5*z2^4 + VD3*z2^2 + VD1)*z2; their sum ends up in polyVD0. */
-    polyVD1        = _mm256_mul_pd(VD5, z4);
-    polyVD0        = _mm256_mul_pd(VD4, z4);
-    polyVD1        = _mm256_add_pd(polyVD1, VD3);
-    polyVD0        = _mm256_add_pd(polyVD0, VD2);
-    polyVD1        = _mm256_mul_pd(polyVD1, z4);
-    polyVD0        = _mm256_mul_pd(polyVD0, z4);
-    polyVD1        = _mm256_add_pd(polyVD1, VD1);
-    polyVD0        = _mm256_add_pd(polyVD0, VD0);
-    polyVD1        = _mm256_mul_pd(polyVD1, z2);
-    polyVD0        = _mm256_add_pd(polyVD0, polyVD1);
-    /* Overwrite the denominator with what gmx_mm256_inv_pd() returns for it
-     * (presumably its reciprocal; the helper is defined outside this view). */
-    polyVD0        = gmx_mm256_inv_pd(polyVD0);
-    /* Numerator: polyVN1 accumulates the odd powers (VN9 .. VN1, finally
-     * multiplied by z2), polyVN0 the even powers (VN8 .. VN0). */
-    polyVN1        = _mm256_mul_pd(VN9, z4);
-    polyVN0        = _mm256_mul_pd(VN8, z4);
-    polyVN1        = _mm256_add_pd(polyVN1, VN7);
-    polyVN0        = _mm256_add_pd(polyVN0, VN6);
-    polyVN1        = _mm256_mul_pd(polyVN1, z4);
-    polyVN0        = _mm256_mul_pd(polyVN0, z4);
-    polyVN1        = _mm256_add_pd(polyVN1, VN5);
-    polyVN0        = _mm256_add_pd(polyVN0, VN4);
-    polyVN1        = _mm256_mul_pd(polyVN1, z4);
-    polyVN0        = _mm256_mul_pd(polyVN0, z4);
-    polyVN1        = _mm256_add_pd(polyVN1, VN3);
-    polyVN0        = _mm256_add_pd(polyVN0, VN2);
-    polyVN1        = _mm256_mul_pd(polyVN1, z4);
-    polyVN0        = _mm256_mul_pd(polyVN0, z4);
-    polyVN1        = _mm256_add_pd(polyVN1, VN1);
-    polyVN0        = _mm256_add_pd(polyVN0, VN0);
-    polyVN1        = _mm256_mul_pd(polyVN1, z2);
-    polyVN0        = _mm256_add_pd(polyVN0, polyVN1);
-    /* Result: numerator multiplied by the inverted denominator. */
-    return _mm256_mul_pd(polyVN0, polyVD0);
-}
-
-/* 128-bit (two doubles per call) PME potential correction.
- *
- * Takes z2 and returns the ratio of two polynomials in z2: a degree-9
- * numerator (VN0..VN9) times the inverted degree-5 denominator (VD0..VD5).
- * The coefficients and the order of operations are the same as in
- * gmx_mm256_pmecorrV_pd(); only the vector width and intrinsics differ.
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
-    const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
-    const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
-    const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
-    const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
-    const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
-    const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
-    const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
-    const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
-    const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
-    const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
-    /* VNk (above) multiplies z2^k in the numerator, VDk (below) multiplies
-     * z2^k in the denominator. */
-    const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
-    const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
-    const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
-    const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
-    const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
-    const __m128d  VD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
-    /* Even and odd powers of z2 are accumulated in two separate Horner
-     * chains in z4 = z2*z2, interleaved statement by statement. */
-    z4             = _mm_mul_pd(z2, z2);
-    /* Denominator: polyVD0 = even powers, polyVD1 = odd powers (times z2);
-     * the sum is left in polyVD0. */
-    polyVD1        = _mm_mul_pd(VD5, z4);
-    polyVD0        = _mm_mul_pd(VD4, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD3);
-    polyVD0        = _mm_add_pd(polyVD0, VD2);
-    polyVD1        = _mm_mul_pd(polyVD1, z4);
-    polyVD0        = _mm_mul_pd(polyVD0, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD1);
-    polyVD0        = _mm_add_pd(polyVD0, VD0);
-    polyVD1        = _mm_mul_pd(polyVD1, z2);
-    polyVD0        = _mm_add_pd(polyVD0, polyVD1);
-    /* Overwrite the denominator with what gmx_mm_inv_pd() returns for it
-     * (presumably its reciprocal; the helper is defined outside this view). */
-    polyVD0        = gmx_mm_inv_pd(polyVD0);
-    /* Numerator: polyVN1 = odd powers (VN9 .. VN1, finally multiplied by
-     * z2), polyVN0 = even powers (VN8 .. VN0). */
-    polyVN1        = _mm_mul_pd(VN9, z4);
-    polyVN0        = _mm_mul_pd(VN8, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN7);
-    polyVN0        = _mm_add_pd(polyVN0, VN6);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN5);
-    polyVN0        = _mm_add_pd(polyVN0, VN4);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN3);
-    polyVN0        = _mm_add_pd(polyVN0, VN2);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN1);
-    polyVN0        = _mm_add_pd(polyVN0, VN0);
-    polyVN1        = _mm_mul_pd(polyVN1, z2);
-    polyVN0        = _mm_add_pd(polyVN0, polyVN1);
-    /* Result: numerator multiplied by the inverted denominator. */
-    return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-/* Compute sin(x) and cos(x) for four doubles at once (256-bit version).
- *
- * Method, as implemented below: |x| is scaled by 32/Pi and rounded to the
- * nearest multiple of Pi/32. The sine and cosine of that table point come
- * from a 17-entry table covering 0..Pi/2, with index folding and sign
- * factors for the rest of the period. The small remainder z is handled with
- * short Taylor polynomials, and the two parts are combined through the
- * angle-addition formulas.
- *
- * x       Input arguments (radians).
- * sinval  Output: receives the sines.
- * cosval  Output: receives the cosines.
- *
- * The return value is always 0.
- */
-static int
-gmx_mm256_sincos_pd(__m256d  x,
-                    __m256d *sinval,
-                    __m256d *cosval)
-{
-#ifdef _MSC_VER
-    __declspec(align(16))
-    const double sintable[34] = /* flat (cos, sin) pairs for the angles k*Pi/32, k = 0..16 */
-    {
-        1.00000000000000000e+00, 0.00000000000000000e+00,
-        9.95184726672196929e-01, 9.80171403295606036e-02,
-        9.80785280403230431e-01, 1.95090322016128248e-01,
-        9.56940335732208824e-01, 2.90284677254462331e-01,
-        9.23879532511286738e-01, 3.82683432365089782e-01,
-        8.81921264348355050e-01, 4.71396736825997642e-01,
-        8.31469612302545236e-01, 5.55570233019602178e-01,
-        7.73010453362736993e-01, 6.34393284163645488e-01,
-        7.07106781186547573e-01, 7.07106781186547462e-01,
-        6.34393284163645599e-01, 7.73010453362736882e-01,
-        5.55570233019602289e-01, 8.31469612302545125e-01,
-        4.71396736825997809e-01, 8.81921264348354939e-01,
-        3.82683432365089837e-01, 9.23879532511286738e-01,
-        2.90284677254462276e-01, 9.56940335732208935e-01,
-        1.95090322016128304e-01, 9.80785280403230431e-01,
-        9.80171403295607702e-02, 9.95184726672196818e-01,
-        0.0, 1.00000000000000000e+00
-    };
-#else
-    const __m128d sintable[17] = /* _mm_set_pd(upper, lower): sine in the upper element, cosine in the lower */
-    {
-        _mm_set_pd( 0.0, 1.0 ),
-        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd(  1.0, 0.0 )
-    };
-#endif
-    /* Despite its name, signmask has every bit set EXCEPT the sign bit of each
-     * double: and-ing with it gives |x|, andnot-ing with it extracts the sign bit. */
-    const __m256d signmask    = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
-                                                                      0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    /* tabscale converts an angle into table steps of Pi/32. invtabscale0 and
-     * invtabscale1 add up to Pi/32, split into a leading part and a small correction. */
-    const __m256d tabscale      = _mm256_set1_pd(32.0/M_PI);
-    const __m256d invtabscale0  = _mm256_set1_pd(9.81747508049011230469e-02);
-    const __m256d invtabscale1  = _mm256_set1_pd(1.96197799156550576057e-08);
-    const __m128i ione          = _mm_set1_epi32(1);
-    const __m128i i32           = _mm_set1_epi32(32);
-    const __m128i i16           = _mm_set1_epi32(16);
-    const __m128i tabmask       = _mm_set1_epi32(0x3F);
-    const __m256d sinP7         = _mm256_set1_pd(-1.0/5040.0); /* sinP*: Taylor coefficients of sin(z) */
-    const __m256d sinP5         = _mm256_set1_pd(1.0/120.0);
-    const __m256d sinP3         = _mm256_set1_pd(-1.0/6.0);
-    const __m256d sinP1         = _mm256_set1_pd(1.0);
-    /* cosP*: Taylor coefficients of cos(z) */
-    const __m256d cosP6         = _mm256_set1_pd(-1.0/720.0);
-    const __m256d cosP4         = _mm256_set1_pd(1.0/24.0);
-    const __m256d cosP2         = _mm256_set1_pd(-1.0/2.0);
-    const __m256d cosP0         = _mm256_set1_pd(1.0);
-
-    __m256d       scalex;
-    __m128i       tabidx, corridx;
-    __m256d       xabs, z, z2, polySin, polyCos;
-    __m256d       xpoint;
-    __m256d       t1, t2;
-
-    __m256d       sinpoint, cospoint;
-    __m256d       xsign, ssign, csign;
-    __m128i       imask, sswapsign, cswapsign;
-    /* Split the input into its sign bit (xsign) and magnitude (xabs). */
-    xsign    = _mm256_andnot_pd(signmask, x);
-    xabs     = _mm256_and_pd(x, signmask);
-    /* Table position in units of Pi/32, once as 32-bit integers (tabidx) ...
-     * NOTE(review): _mm256_cvtpd_epi32 rounds according to the current MXCSR
-     * rounding mode, while xpoint below is explicitly rounded to nearest; the
-     * two agree only when the default round-to-nearest mode is active. */
-    scalex   = _mm256_mul_pd(tabscale, xabs);
-    tabidx   = _mm256_cvtpd_epi32(scalex);
-    /* ... and once as doubles (xpoint). */
-    xpoint   = _mm256_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Extended precision arithmetics: z = xabs - xpoint*(Pi/32), subtracting
-     * the two parts of Pi/32 one after the other */
-    z        = _mm256_sub_pd(xabs, _mm256_mul_pd(invtabscale0, xpoint));
-    z        = _mm256_sub_pd(z, _mm256_mul_pd(invtabscale1, xpoint));
-
-    /* Range reduction to 0..2*Pi: keep the index modulo 64 table steps */
-    tabidx   = _mm_and_si128(tabidx, tabmask);
-
-    /* tabidx is now in range [0..63]. Indices above 32 are shifted down by
-     * 32 steps (i.e. by Pi), which negates both the sine and the cosine. */
-    imask     = _mm_cmpgt_epi32(tabidx, i32);
-    sswapsign = imask;
-    cswapsign = imask;
-    corridx   = _mm_and_si128(imask, i32);
-    tabidx    = _mm_sub_epi32(tabidx, corridx);
-
-    /* tabidx is now in range [0..32]. Indices above 16 are mirrored about
-     * Pi/2 (tabidx -> 32-tabidx), which negates the cosine only. */
-    imask     = _mm_cmpgt_epi32(tabidx, i16);
-    cswapsign = _mm_xor_si128(cswapsign, imask);
-    corridx   = _mm_sub_epi32(i32, tabidx);
-    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
-    /* tabidx is now in range [0..16]. The swap masks are 0 or -1 per element;
-     * or-ing in 1 turns them into +1 or -1, converted to double sign factors. */
-    ssign     = _mm256_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
-    csign     = _mm256_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-    /* First lookup into table: elements 0 and 2 go to the lower/upper half of
-     * t1, elements 1 and 3 to the lower/upper half of t2, so that the unpack
-     * operations below restore the element order 0,1,2,3. */
-#ifdef _MSC_VER
-    t1       = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0))),
-                                    _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 2)), 0x1);
-    t2       = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1))),
-                                    _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 3)), 0x1);
-#else
-    t1       = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 0)]),
-                                    sintable[_mm_extract_epi32(tabidx, 2)], 0x1);
-    t2       = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 1)]),
-                                    sintable[_mm_extract_epi32(tabidx, 3)], 0x1);
-#endif
-    /* De-interleave: the upper elements of each pair are the sines, the lower ones the cosines. */
-    sinpoint  = _mm256_unpackhi_pd(t1, t2);
-    cospoint  = _mm256_unpacklo_pd(t1, t2);
-    /* Apply the sign factors so that sinpoint/cospoint are the sine/cosine of the table point itself. */
-    sinpoint = _mm256_mul_pd(sinpoint, ssign);
-    cospoint = _mm256_mul_pd(cospoint, csign);
-
-    z2       = _mm256_mul_pd(z, z);
-    /* polySin = z - z^3/6 + z^5/120 - z^7/5040 */
-    polySin  = _mm256_mul_pd(sinP7, z2);
-    polySin  = _mm256_add_pd(polySin, sinP5);
-    polySin  = _mm256_mul_pd(polySin, z2);
-    polySin  = _mm256_add_pd(polySin, sinP3);
-    polySin  = _mm256_mul_pd(polySin, z2);
-    polySin  = _mm256_add_pd(polySin, sinP1);
-    polySin  = _mm256_mul_pd(polySin, z);
-    /* polyCos = 1 - z^2/2 + z^4/24 - z^6/720 */
-    polyCos  = _mm256_mul_pd(cosP6, z2);
-    polyCos  = _mm256_add_pd(polyCos, cosP4);
-    polyCos  = _mm256_mul_pd(polyCos, z2);
-    polyCos  = _mm256_add_pd(polyCos, cosP2);
-    polyCos  = _mm256_mul_pd(polyCos, z2);
-    polyCos  = _mm256_add_pd(polyCos, cosP0);
-    /* Angle addition for table point p and remainder z:
-     *   sin(p+z) = sin(p)*cos(z) + cos(p)*sin(z)
-     *   cos(p+z) = cos(p)*cos(z) - sin(p)*sin(z)
-     * The sine gets the sign bit of x xor-ed back in; the cosine is left as is. */
-    *sinval  = _mm256_xor_pd(_mm256_add_pd( _mm256_mul_pd(sinpoint, polyCos), _mm256_mul_pd(cospoint, polySin) ), xsign);
-    *cosval  = _mm256_sub_pd( _mm256_mul_pd(cospoint, polyCos), _mm256_mul_pd(sinpoint, polySin) );
-
-    return 0;
-}
-/* Compute sin(x) and cos(x) for two doubles at once (128-bit version).
- *
- * Method, as implemented below: |x| is scaled by 32/Pi and rounded to the
- * nearest multiple of Pi/32. The sine and cosine of that table point come
- * from a 17-entry table covering 0..Pi/2, with index folding and sign
- * factors for the rest of the period. The small remainder z is handled with
- * short Taylor polynomials, and the two parts are combined through the
- * angle-addition formulas.
- *
- * x       Input arguments (radians).
- * sinval  Output: receives the sines.
- * cosval  Output: receives the cosines.
- *
- * The return value is always 0.
- */
-static int
-gmx_mm_sincos_pd(__m128d  x,
-                 __m128d *sinval,
-                 __m128d *cosval)
-{
-#ifdef _MSC_VER
-    __declspec(align(16))
-    const double sintable[34] = /* flat (cos, sin) pairs for the angles k*Pi/32, k = 0..16 */
-    {
-        1.00000000000000000e+00, 0.00000000000000000e+00,
-        9.95184726672196929e-01, 9.80171403295606036e-02,
-        9.80785280403230431e-01, 1.95090322016128248e-01,
-        9.56940335732208824e-01, 2.90284677254462331e-01,
-        9.23879532511286738e-01, 3.82683432365089782e-01,
-        8.81921264348355050e-01, 4.71396736825997642e-01,
-        8.31469612302545236e-01, 5.55570233019602178e-01,
-        7.73010453362736993e-01, 6.34393284163645488e-01,
-        7.07106781186547573e-01, 7.07106781186547462e-01,
-        6.34393284163645599e-01, 7.73010453362736882e-01,
-        5.55570233019602289e-01, 8.31469612302545125e-01,
-        4.71396736825997809e-01, 8.81921264348354939e-01,
-        3.82683432365089837e-01, 9.23879532511286738e-01,
-        2.90284677254462276e-01, 9.56940335732208935e-01,
-        1.95090322016128304e-01, 9.80785280403230431e-01,
-        9.80171403295607702e-02, 9.95184726672196818e-01,
-        0.0, 1.00000000000000000e+00
-    };
-#else
-    const __m128d sintable[17] = /* _mm_set_pd(upper, lower): sine in the upper element, cosine in the lower */
-    {
-        _mm_set_pd( 0.0, 1.0 ),
-        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd(  1.0, 0.0 )
-    };
-#endif
-    /* Despite its name, signmask has every bit set EXCEPT the sign bit of each
-     * double: and-ing with it gives |x|, andnot-ing with it extracts the sign bit. */
-    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    /* tabscale converts an angle into table steps of Pi/32. invtabscale0 and
-     * invtabscale1 add up to Pi/32, split into a leading part and a small correction. */
-    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
-    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
-    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
-    const __m128i ione          = _mm_set1_epi32(1);
-    const __m128i i32           = _mm_set1_epi32(32);
-    const __m128i i16           = _mm_set1_epi32(16);
-    const __m128i tabmask       = _mm_set1_epi32(0x3F);
-    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0); /* sinP*: Taylor coefficients of sin(z) */
-    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
-    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
-    const __m128d sinP1         = _mm_set1_pd(1.0);
-    /* cosP*: Taylor coefficients of cos(z) */
-    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
-    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
-    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
-    const __m128d cosP0         = _mm_set1_pd(1.0);
-
-    __m128d       scalex;
-    __m128i       tabidx, corridx;
-    __m128d       xabs, z, z2, polySin, polyCos;
-    __m128d       xpoint;
-    __m128d       ypoint0, ypoint1;
-
-    __m128d       sinpoint, cospoint;
-    __m128d       xsign, ssign, csign;
-    __m128i       imask, sswapsign, cswapsign;
-    /* Split the input into its sign bit (xsign) and magnitude (xabs). */
-    xsign    = _mm_andnot_pd(signmask, x);
-    xabs     = _mm_and_pd(x, signmask);
-    /* Table position in units of Pi/32, once as 32-bit integers (tabidx) ...
-     * NOTE(review): _mm_cvtpd_epi32 rounds according to the current MXCSR
-     * rounding mode, while xpoint below is explicitly rounded to nearest; the
-     * two agree only when the default round-to-nearest mode is active. */
-    scalex   = _mm_mul_pd(tabscale, xabs);
-    tabidx   = _mm_cvtpd_epi32(scalex);
-    /* ... and once as doubles (xpoint). */
-    xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Extended precision arithmetics: z = xabs - xpoint*(Pi/32), subtracting
-     * the two parts of Pi/32 one after the other */
-    z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
-    z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
-    /* Range reduction to 0..2*Pi: keep the index modulo 64 table steps */
-    tabidx   = _mm_and_si128(tabidx, tabmask);
-
-    /* tabidx is now in range [0..63]. Indices above 32 are shifted down by
-     * 32 steps (i.e. by Pi), which negates both the sine and the cosine. */
-    imask     = _mm_cmpgt_epi32(tabidx, i32);
-    sswapsign = imask;
-    cswapsign = imask;
-    corridx   = _mm_and_si128(imask, i32);
-    tabidx    = _mm_sub_epi32(tabidx, corridx);
-
-    /* tabidx is now in range [0..32]. Indices above 16 are mirrored about
-     * Pi/2 (tabidx -> 32-tabidx), which negates the cosine only. */
-    imask     = _mm_cmpgt_epi32(tabidx, i16);
-    cswapsign = _mm_xor_si128(cswapsign, imask);
-    corridx   = _mm_sub_epi32(i32, tabidx);
-    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
-    /* tabidx is now in range [0..16]. The swap masks are 0 or -1 per element;
-     * or-ing in 1 turns them into +1 or -1, converted to double sign factors. */
-    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
-    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-    /* Table lookup: one (cos, sin) pair per input element. */
-#ifdef _MSC_VER
-    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
-    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
-    ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
-    ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
-    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1); /* upper elements: the sines */
-    cospoint = _mm_unpacklo_pd(ypoint0, ypoint1); /* lower elements: the cosines */
-    /* Apply the sign factors so that sinpoint/cospoint are the sine/cosine of the table point itself. */
-    sinpoint = _mm_mul_pd(sinpoint, ssign);
-    cospoint = _mm_mul_pd(cospoint, csign);
-
-    z2       = _mm_mul_pd(z, z);
-    /* polySin = z - z^3/6 + z^5/120 - z^7/5040 */
-    polySin  = _mm_mul_pd(sinP7, z2);
-    polySin  = _mm_add_pd(polySin, sinP5);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP3);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP1);
-    polySin  = _mm_mul_pd(polySin, z);
-    /* polyCos = 1 - z^2/2 + z^4/24 - z^6/720 */
-    polyCos  = _mm_mul_pd(cosP6, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP4);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP2);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP0);
-    /* Angle addition for table point p and remainder z:
-     *   sin(p+z) = sin(p)*cos(z) + cos(p)*sin(z)
-     *   cos(p+z) = cos(p)*cos(z) - sin(p)*sin(z)
-     * The sine gets the sign bit of x xor-ed back in; the cosine is left as is. */
-    *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
-    *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
-    return 0;
-}
-
-
-/*
- * NOTE: when both the sine and the cosine of the same argument are needed,
- * call the sincos() routine directly. Calling sin and cos separately runs
- * sincos() twice and throws half of the work away each time.
- */
-static __m256d
-gmx_mm256_sin_pd(__m256d x)
-{
-    __m256d sinval, cosval;
-
-    /* The cosine is computed as well, but only the sine is handed back. */
-    gmx_mm256_sincos_pd(x, &sinval, &cosval);
-
-    return sinval;
-}
-
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
-    __m128d sinval, cosval;
-
-    /* The cosine is computed as well, but only the sine is handed back. */
-    gmx_mm_sincos_pd(x, &sinval, &cosval);
-
-    return sinval;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m256d
-gmx_mm256_cos_pd(__m256d x)
-{
-    __m256d sinval, cosval;
-
-    /* The sine is computed as well, but only the cosine is handed back. */
-    gmx_mm256_sincos_pd(x, &sinval, &cosval);
-
-    return cosval;
-}
-
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
-    __m128d sinval, cosval;
-
-    /* The sine is computed as well, but only the cosine is handed back. */
-    gmx_mm_sincos_pd(x, &sinval, &cosval);
-
-    return cosval;
-}
-
-
-static __m256d
-gmx_mm256_tan_pd(__m256d x)
-{
-    __m256d s, c;
-
-    /* Tangent from one sincos evaluation: the sine multiplied by whatever
-     * gmx_mm256_inv_pd() returns for the cosine. */
-    gmx_mm256_sincos_pd(x, &s, &c);
-
-    return _mm256_mul_pd(s, gmx_mm256_inv_pd(c));
-}
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
-    __m128d s, c;
-
-    /* Tangent from one sincos evaluation: the sine multiplied by whatever
-     * gmx_mm_inv_pd() returns for the cosine. */
-    gmx_mm_sincos_pd(x, &s, &c);
-
-    return _mm_mul_pd(s, gmx_mm_inv_pd(c));
-}
-
-/* Compute asin(x) for four doubles at once (256-bit version).
- *
- * Two rational approximations are evaluated for every element and the
- * appropriate one is selected with a mask:
- *   |x| >  0.625 : based on zz = 1-|x|, using the R/S coefficients and a square root;
- *   |x| <= 0.625 : based on ww = x*x, using the P/Q coefficients.
- * For |x| <= 1e-8 the result is |x| itself. The sign bit of x is xor-ed
- * onto the result at the end.
- */
-static __m256d
-gmx_mm256_asin_pd(__m256d x)
-{
-    /* Same algorithm as cephes library */
-    const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
-                                                                    0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) ); /* all bits EXCEPT the sign bit, despite the name */
-    const __m256d limit1    = _mm256_set1_pd(0.625);
-    const __m256d limit2    = _mm256_set1_pd(1e-8);
-    const __m256d one       = _mm256_set1_pd(1.0);
-    const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
-    const __m256d morebits  = _mm256_set1_pd(6.123233995736765886130e-17); /* presumably the low-order bits of Pi/2 that do not fit in a double - TODO confirm */
-    /* P/Q: numerator/denominator coefficients for the small-argument branch (polynomials in ww = x*x) */
-    const __m256d P5        = _mm256_set1_pd(4.253011369004428248960e-3);
-    const __m256d P4        = _mm256_set1_pd(-6.019598008014123785661e-1);
-    const __m256d P3        = _mm256_set1_pd(5.444622390564711410273e0);
-    const __m256d P2        = _mm256_set1_pd(-1.626247967210700244449e1);
-    const __m256d P1        = _mm256_set1_pd(1.956261983317594739197e1);
-    const __m256d P0        = _mm256_set1_pd(-8.198089802484824371615e0);
-
-    const __m256d Q4        = _mm256_set1_pd(-1.474091372988853791896e1);
-    const __m256d Q3        = _mm256_set1_pd(7.049610280856842141659e1);
-    const __m256d Q2        = _mm256_set1_pd(-1.471791292232726029859e2);
-    const __m256d Q1        = _mm256_set1_pd(1.395105614657485689735e2);
-    const __m256d Q0        = _mm256_set1_pd(-4.918853881490881290097e1);
-    /* R/S: numerator/denominator coefficients for the large-argument branch (polynomials in zz = 1-|x|) */
-    const __m256d R4        = _mm256_set1_pd(2.967721961301243206100e-3);
-    const __m256d R3        = _mm256_set1_pd(-5.634242780008963776856e-1);
-    const __m256d R2        = _mm256_set1_pd(6.968710824104713396794e0);
-    const __m256d R1        = _mm256_set1_pd(-2.556901049652824852289e1);
-    const __m256d R0        = _mm256_set1_pd(2.853665548261061424989e1);
-
-    const __m256d S3        = _mm256_set1_pd(-2.194779531642920639778e1);
-    const __m256d S2        = _mm256_set1_pd(1.470656354026814941758e2);
-    const __m256d S1        = _mm256_set1_pd(-3.838770957603691357202e2);
-    const __m256d S0        = _mm256_set1_pd(3.424398657913078477438e2);
-
-    __m256d       sign;
-    __m256d       mask;
-    __m256d       xabs;
-    __m256d       zz, ww, z, q, w, zz2, ww2;
-    __m256d       PA, PB;
-    __m256d       QA, QB;
-    __m256d       RA, RB;
-    __m256d       SA, SB;
-    __m256d       nom, denom;
-    /* Split the input into its sign bit and magnitude. */
-    sign  = _mm256_andnot_pd(signmask, x);
-    xabs  = _mm256_and_pd(x, signmask);
-    /* mask is set for elements with |x| > 0.625 (large-argument branch). */
-    mask  = _mm256_cmp_pd(xabs, limit1, _CMP_GT_OQ);
-    /* Arguments of the two branches and their squares; both branches are
-     * evaluated for all elements and blended afterwards. */
-    zz    = _mm256_sub_pd(one, xabs);
-    ww    = _mm256_mul_pd(xabs, xabs);
-    zz2   = _mm256_mul_pd(zz, zz);
-    ww2   = _mm256_mul_pd(ww, ww);
-
-    /* R: RA = R4*zz^4 + R3*zz^3 + R2*zz^2 + R1*zz + R0, built from an even
-     * part (RA) and an odd part (RB) */
-    RA    = _mm256_mul_pd(R4, zz2);
-    RB    = _mm256_mul_pd(R3, zz2);
-    RA    = _mm256_add_pd(RA, R2);
-    RB    = _mm256_add_pd(RB, R1);
-    RA    = _mm256_mul_pd(RA, zz2);
-    RB    = _mm256_mul_pd(RB, zz);
-    RA    = _mm256_add_pd(RA, R0);
-    RA    = _mm256_add_pd(RA, RB);
-
-    /* S, SA = zz2: the leading coefficient is 1, so the even chain starts
-     * from zz2 itself. SA = zz^4 + S3*zz^3 + S2*zz^2 + S1*zz + S0 */
-    SB    = _mm256_mul_pd(S3, zz2);
-    SA    = _mm256_add_pd(zz2, S2);
-    SB    = _mm256_add_pd(SB, S1);
-    SA    = _mm256_mul_pd(SA, zz2);
-    SB    = _mm256_mul_pd(SB, zz);
-    SA    = _mm256_add_pd(SA, S0);
-    SA    = _mm256_add_pd(SA, SB);
-
-    /* P: PA = P5*ww^5 + P4*ww^4 + P3*ww^3 + P2*ww^2 + P1*ww + P0, built from
-     * an odd part (PA) and an even part (PB) */
-    PA    = _mm256_mul_pd(P5, ww2);
-    PB    = _mm256_mul_pd(P4, ww2);
-    PA    = _mm256_add_pd(PA, P3);
-    PB    = _mm256_add_pd(PB, P2);
-    PA    = _mm256_mul_pd(PA, ww2);
-    PB    = _mm256_mul_pd(PB, ww2);
-    PA    = _mm256_add_pd(PA, P1);
-    PB    = _mm256_add_pd(PB, P0);
-    PA    = _mm256_mul_pd(PA, ww);
-    PA    = _mm256_add_pd(PA, PB);
-
-    /* Q, QA = ww2: the leading coefficient is 1, so the odd chain starts
-     * from ww2 itself. QA = ww^5 + Q4*ww^4 + Q3*ww^3 + Q2*ww^2 + Q1*ww + Q0 */
-    QB    = _mm256_mul_pd(Q4, ww2);
-    QA    = _mm256_add_pd(ww2, Q3);
-    QB    = _mm256_add_pd(QB, Q2);
-    QA    = _mm256_mul_pd(QA, ww2);
-    QB    = _mm256_mul_pd(QB, ww2);
-    QA    = _mm256_add_pd(QA, Q1);
-    QB    = _mm256_add_pd(QB, Q0);
-    QA    = _mm256_mul_pd(QA, ww);
-    QA    = _mm256_add_pd(QA, QB);
-    /* Each numerator carries one extra factor of its own argument. */
-    RA    = _mm256_mul_pd(RA, zz);
-    PA    = _mm256_mul_pd(PA, ww);
-    /* Pick R/S where mask is set (|x| > 0.625), P/Q elsewhere. */
-    nom   = _mm256_blendv_pd( PA, RA, mask );
-    denom = _mm256_blendv_pd( QA, SA, mask );
-    /* q = nom/denom, with the division done as a multiplication by the
-     * value gmx_mm256_inv_pd() returns (helper defined outside this view). */
-    q     = _mm256_mul_pd( nom, gmx_mm256_inv_pd(denom) );
-    /* Large-argument result: with s = sqrt(2*(1-|x|)) (via gmx_mm256_sqrt_pd),
-     * z = Pi/4 - s - (s*q - morebits) + Pi/4, evaluated in exactly this order. */
-    zz    = _mm256_add_pd(zz, zz);
-    zz    = gmx_mm256_sqrt_pd(zz);
-    z     = _mm256_sub_pd(quarterpi, zz);
-    zz    = _mm256_mul_pd(zz, q);
-    zz    = _mm256_sub_pd(zz, morebits);
-    z     = _mm256_sub_pd(z, zz);
-    z     = _mm256_add_pd(z, quarterpi);
-    /* Small-argument result: w = |x|*q + |x| */
-    w     = _mm256_mul_pd(xabs, q);
-    w     = _mm256_add_pd(w, xabs);
-    /* Select the branch result per element. */
-    z     = _mm256_blendv_pd( w, z, mask );
-    /* For |x| <= 1e-8 the result is |x| itself. */
-    mask  = _mm256_cmp_pd(xabs, limit2, _CMP_GT_OQ);
-    z     = _mm256_blendv_pd( xabs, z, mask );
-    /* Put the sign of the input back on the result. */
-    z = _mm256_xor_pd(z, sign);
-
-    return z;
-}
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.625);
-    const __m128d limit2    = _mm_set1_pd(1e-8);
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
-
-    const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
-    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
-    const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
-    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
-    const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
-    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
-
-    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
-    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
-    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
-    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
-    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
-
-    const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
-    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
-    const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
-    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
-    const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
-
-    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
-    const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
-    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
-    const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
-
-    __m128d       sign;
-    __m128d       mask;
-    __m128d       xabs;
-    __m128d       zz, ww, z, q, w, zz2, ww2;
-    __m128d       PA, PB;
-    __m128d       QA, QB;
-    __m128d       RA, RB;
-    __m128d       SA, SB;
-    __m128d       nom, denom;
-
-    sign  = _mm_andnot_pd(signmask, x);
-    xabs  = _mm_and_pd(x, signmask);
-
-    mask  = _mm_cmpgt_pd(xabs, limit1);
-
-    zz    = _mm_sub_pd(one, xabs);
-    ww    = _mm_mul_pd(xabs, xabs);
-    zz2   = _mm_mul_pd(zz, zz);
-    ww2   = _mm_mul_pd(ww, ww);
-
-    /* R */
-    RA    = _mm_mul_pd(R4, zz2);
-    RB    = _mm_mul_pd(R3, zz2);
-    RA    = _mm_add_pd(RA, R2);
-    RB    = _mm_add_pd(RB, R1);
-    RA    = _mm_mul_pd(RA, zz2);
-    RB    = _mm_mul_pd(RB, zz);
-    RA    = _mm_add_pd(RA, R0);
-    RA    = _mm_add_pd(RA, RB);
-
-    /* S, SA = zz2 */
-    SB    = _mm_mul_pd(S3, zz2);
-    SA    = _mm_add_pd(zz2, S2);
-    SB    = _mm_add_pd(SB, S1);
-    SA    = _mm_mul_pd(SA, zz2);
-    SB    = _mm_mul_pd(SB, zz);
-    SA    = _mm_add_pd(SA, S0);
-    SA    = _mm_add_pd(SA, SB);
-
-    /* P */
-    PA    = _mm_mul_pd(P5, ww2);
-    PB    = _mm_mul_pd(P4, ww2);
-    PA    = _mm_add_pd(PA, P3);
-    PB    = _mm_add_pd(PB, P2);
-    PA    = _mm_mul_pd(PA, ww2);
-    PB    = _mm_mul_pd(PB, ww2);
-    PA    = _mm_add_pd(PA, P1);
-    PB    = _mm_add_pd(PB, P0);
-    PA    = _mm_mul_pd(PA, ww);
-    PA    = _mm_add_pd(PA, PB);
-
-    /* Q, QA = ww2 */
-    QB    = _mm_mul_pd(Q4, ww2);
-    QA    = _mm_add_pd(ww2, Q3);
-    QB    = _mm_add_pd(QB, Q2);
-    QA    = _mm_mul_pd(QA, ww2);
-    QB    = _mm_mul_pd(QB, ww2);
-    QA    = _mm_add_pd(QA, Q1);
-    QB    = _mm_add_pd(QB, Q0);
-    QA    = _mm_mul_pd(QA, ww);
-    QA    = _mm_add_pd(QA, QB);
-
-    RA    = _mm_mul_pd(RA, zz);
-    PA    = _mm_mul_pd(PA, ww);
-
-    nom   = _mm_blendv_pd( PA, RA, mask );
-    denom = _mm_blendv_pd( QA, SA, mask );
-
-    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
-    zz    = _mm_add_pd(zz, zz);
-    zz    = gmx_mm_sqrt_pd(zz);
-    z     = _mm_sub_pd(quarterpi, zz);
-    zz    = _mm_mul_pd(zz, q);
-    zz    = _mm_sub_pd(zz, morebits);
-    z     = _mm_sub_pd(z, zz);
-    z     = _mm_add_pd(z, quarterpi);
-
-    w     = _mm_mul_pd(xabs, q);
-    w     = _mm_add_pd(w, xabs);
-
-    z     = _mm_blendv_pd( w, z, mask );
-
-    mask  = _mm_cmpgt_pd(xabs, limit2);
-    z     = _mm_blendv_pd( xabs, z, mask );
-
-    z = _mm_xor_pd(z, sign);
-
-    return z;
-}
-
-
-static __m256d
-gmx_mm256_acos_pd(__m256d x)
-{
-    const __m256d one        = _mm256_set1_pd(1.0);
-    const __m256d half       = _mm256_set1_pd(0.5);
-    const __m256d quarterpi0 = _mm256_set1_pd(7.85398163397448309616e-1);
-    const __m256d quarterpi1 = _mm256_set1_pd(6.123233995736765886130e-17);
-
-
-    __m256d mask1;
-
-    __m256d z, z1, z2;
-
-    mask1 = _mm256_cmp_pd(x, half, _CMP_GT_OQ);
-    z1    = _mm256_mul_pd(half, _mm256_sub_pd(one, x));
-    z1    = gmx_mm256_sqrt_pd(z1);
-    z     = _mm256_blendv_pd( x, z1, mask1 );
-
-    z     = gmx_mm256_asin_pd(z);
-
-    z1    = _mm256_add_pd(z, z);
-
-    z2    = _mm256_sub_pd(quarterpi0, z);
-    z2    = _mm256_add_pd(z2, quarterpi1);
-    z2    = _mm256_add_pd(z2, quarterpi0);
-
-    z     = _mm256_blendv_pd(z2, z1, mask1);
-
-    return z;
-}
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
-    const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
-    __m128d mask1;
-
-    __m128d z, z1, z2;
-
-    mask1 = _mm_cmpgt_pd(x, half);
-    z1    = _mm_mul_pd(half, _mm_sub_pd(one, x));
-    z1    = gmx_mm_sqrt_pd(z1);
-    z     = _mm_blendv_pd( x, z1, mask1 );
-
-    z     = gmx_mm_asin_pd(z);
-
-    z1    = _mm_add_pd(z, z);
-
-    z2    = _mm_sub_pd(quarterpi0, z);
-    z2    = _mm_add_pd(z2, quarterpi1);
-    z2    = _mm_add_pd(z2, quarterpi0);
-
-    z     = _mm_blendv_pd(z2, z1, mask1);
-
-    return z;
-}
-
-
-static __m256d
-gmx_mm256_atan_pd(__m256d x)
-{
-    /* Same algorithm as cephes library */
-    const __m256d signmask  = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
-                                                                    0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m256d limit1    = _mm256_set1_pd(0.66);
-    const __m256d limit2    = _mm256_set1_pd(2.41421356237309504880);
-    const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
-    const __m256d halfpi    = _mm256_set1_pd(M_PI/2.0);
-    const __m256d mone      = _mm256_set1_pd(-1.0);
-    const __m256d morebits1 = _mm256_set1_pd(0.5*6.123233995736765886130E-17);
-    const __m256d morebits2 = _mm256_set1_pd(6.123233995736765886130E-17);
-
-    const __m256d P4        = _mm256_set1_pd(-8.750608600031904122785E-1);
-    const __m256d P3        = _mm256_set1_pd(-1.615753718733365076637E1);
-    const __m256d P2        = _mm256_set1_pd(-7.500855792314704667340E1);
-    const __m256d P1        = _mm256_set1_pd(-1.228866684490136173410E2);
-    const __m256d P0        = _mm256_set1_pd(-6.485021904942025371773E1);
-
-    const __m256d Q4        = _mm256_set1_pd(2.485846490142306297962E1);
-    const __m256d Q3        = _mm256_set1_pd(1.650270098316988542046E2);
-    const __m256d Q2        = _mm256_set1_pd(4.328810604912902668951E2);
-    const __m256d Q1        = _mm256_set1_pd(4.853903996359136964868E2);
-    const __m256d Q0        = _mm256_set1_pd(1.945506571482613964425E2);
-
-    __m256d       sign;
-    __m256d       mask1, mask2;
-    __m256d       y, t1, t2;
-    __m256d       z, z2;
-    __m256d       P_A, P_B, Q_A, Q_B;
-
-    sign   = _mm256_andnot_pd(signmask, x);
-    x      = _mm256_and_pd(x, signmask);
-
-    mask1  = _mm256_cmp_pd(x, limit1, _CMP_GT_OQ);
-    mask2  = _mm256_cmp_pd(x, limit2, _CMP_GT_OQ);
-
-    t1     = _mm256_mul_pd(_mm256_add_pd(x, mone), gmx_mm256_inv_pd(_mm256_sub_pd(x, mone)));
-    t2     = _mm256_mul_pd(mone, gmx_mm256_inv_pd(x));
-
-    y      = _mm256_and_pd(mask1, quarterpi);
-    y      = _mm256_or_pd( _mm256_and_pd(mask2, halfpi), _mm256_andnot_pd(mask2, y) );
-
-    x      = _mm256_or_pd( _mm256_and_pd(mask1, t1), _mm256_andnot_pd(mask1, x) );
-    x      = _mm256_or_pd( _mm256_and_pd(mask2, t2), _mm256_andnot_pd(mask2, x) );
-
-    z      = _mm256_mul_pd(x, x);
-    z2     = _mm256_mul_pd(z, z);
-
-    P_A    = _mm256_mul_pd(P4, z2);
-    P_B    = _mm256_mul_pd(P3, z2);
-    P_A    = _mm256_add_pd(P_A, P2);
-    P_B    = _mm256_add_pd(P_B, P1);
-    P_A    = _mm256_mul_pd(P_A, z2);
-    P_B    = _mm256_mul_pd(P_B, z);
-    P_A    = _mm256_add_pd(P_A, P0);
-    P_A    = _mm256_add_pd(P_A, P_B);
-
-    /* Q_A = z2 */
-    Q_B    = _mm256_mul_pd(Q4, z2);
-    Q_A    = _mm256_add_pd(z2, Q3);
-    Q_B    = _mm256_add_pd(Q_B, Q2);
-    Q_A    = _mm256_mul_pd(Q_A, z2);
-    Q_B    = _mm256_mul_pd(Q_B, z2);
-    Q_A    = _mm256_add_pd(Q_A, Q1);
-    Q_B    = _mm256_add_pd(Q_B, Q0);
-    Q_A    = _mm256_mul_pd(Q_A, z);
-    Q_A    = _mm256_add_pd(Q_A, Q_B);
-
-    z      = _mm256_mul_pd(z, P_A);
-    z      = _mm256_mul_pd(z, gmx_mm256_inv_pd(Q_A));
-    z      = _mm256_mul_pd(z, x);
-    z      = _mm256_add_pd(z, x);
-
-    t1     = _mm256_and_pd(mask1, morebits1);
-    t1     = _mm256_or_pd( _mm256_and_pd(mask2, morebits2), _mm256_andnot_pd(mask2, t1) );
-
-    z      = _mm256_add_pd(z, t1);
-    y      = _mm256_add_pd(y, z);
-
-    y      = _mm256_xor_pd(y, sign);
-
-    return y;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.66);
-    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
-    const __m128d mone      = _mm_set1_pd(-1.0);
-    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
-    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
-    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
-    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
-    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
-    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
-    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
-
-    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
-    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
-    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
-    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
-    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
-
-    __m128d       sign;
-    __m128d       mask1, mask2;
-    __m128d       y, t1, t2;
-    __m128d       z, z2;
-    __m128d       P_A, P_B, Q_A, Q_B;
-
-    sign   = _mm_andnot_pd(signmask, x);
-    x      = _mm_and_pd(x, signmask);
-
-    mask1  = _mm_cmpgt_pd(x, limit1);
-    mask2  = _mm_cmpgt_pd(x, limit2);
-
-    t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
-    t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
-    y      = _mm_and_pd(mask1, quarterpi);
-    y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
-    x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
-    x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
-    z      = _mm_mul_pd(x, x);
-    z2     = _mm_mul_pd(z, z);
-
-    P_A    = _mm_mul_pd(P4, z2);
-    P_B    = _mm_mul_pd(P3, z2);
-    P_A    = _mm_add_pd(P_A, P2);
-    P_B    = _mm_add_pd(P_B, P1);
-    P_A    = _mm_mul_pd(P_A, z2);
-    P_B    = _mm_mul_pd(P_B, z);
-    P_A    = _mm_add_pd(P_A, P0);
-    P_A    = _mm_add_pd(P_A, P_B);
-
-    /* Q_A = z2 */
-    Q_B    = _mm_mul_pd(Q4, z2);
-    Q_A    = _mm_add_pd(z2, Q3);
-    Q_B    = _mm_add_pd(Q_B, Q2);
-    Q_A    = _mm_mul_pd(Q_A, z2);
-    Q_B    = _mm_mul_pd(Q_B, z2);
-    Q_A    = _mm_add_pd(Q_A, Q1);
-    Q_B    = _mm_add_pd(Q_B, Q0);
-    Q_A    = _mm_mul_pd(Q_A, z);
-    Q_A    = _mm_add_pd(Q_A, Q_B);
-
-    z      = _mm_mul_pd(z, P_A);
-    z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
-    z      = _mm_mul_pd(z, x);
-    z      = _mm_add_pd(z, x);
-
-    t1     = _mm_and_pd(mask1, morebits1);
-    t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
-    z      = _mm_add_pd(z, t1);
-    y      = _mm_add_pd(y, z);
-
-    y      = _mm_xor_pd(y, sign);
-
-    return y;
-}
-
-
-
-static __m256d
-gmx_mm256_atan2_pd(__m256d y, __m256d x)
-{
-    const __m256d pi          = _mm256_set1_pd(M_PI);
-    const __m256d minuspi     = _mm256_set1_pd(-M_PI);
-    const __m256d halfpi      = _mm256_set1_pd(M_PI/2.0);
-    const __m256d minushalfpi = _mm256_set1_pd(-M_PI/2.0);
-
-    __m256d       z, z1, z3, z4;
-    __m256d       w;
-    __m256d       maskx_lt, maskx_eq;
-    __m256d       masky_lt, masky_eq;
-    __m256d       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
-    masky_lt  = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_LT_OQ);
-    maskx_eq  = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
-    masky_eq  = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_EQ_OQ);
-
-    z         = _mm256_mul_pd(y, gmx_mm256_inv_pd(x));
-    z         = gmx_mm256_atan_pd(z);
-
-    mask1     = _mm256_and_pd(maskx_eq, masky_lt);
-    mask2     = _mm256_andnot_pd(maskx_lt, masky_eq);
-    mask3     = _mm256_andnot_pd( _mm256_or_pd(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm256_and_pd(masky_eq, maskx_lt);
-
-    maskall   = _mm256_or_pd( _mm256_or_pd(mask1, mask2), _mm256_or_pd(mask3, mask4) );
-
-    z         = _mm256_andnot_pd(maskall, z);
-    z1        = _mm256_and_pd(mask1, minushalfpi);
-    z3        = _mm256_and_pd(mask3, halfpi);
-    z4        = _mm256_and_pd(mask4, pi);
-
-    z         = _mm256_or_pd( _mm256_or_pd(z, z1), _mm256_or_pd(z3, z4) );
-
-    w         = _mm256_blendv_pd(pi, minuspi, masky_lt);
-    w         = _mm256_and_pd(w, maskx_lt);
-
-    w         = _mm256_andnot_pd(maskall, w);
-
-    z         = _mm256_add_pd(z, w);
-
-    return z;
-}
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
-    const __m128d pi          = _mm_set1_pd(M_PI);
-    const __m128d minuspi     = _mm_set1_pd(-M_PI);
-    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
-    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
-    __m128d       z, z1, z3, z4;
-    __m128d       w;
-    __m128d       maskx_lt, maskx_eq;
-    __m128d       masky_lt, masky_eq;
-    __m128d       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
-    masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
-    maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
-    z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
-    z         = gmx_mm_atan_pd(z);
-
-    mask1     = _mm_and_pd(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_pd(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
-    z         = _mm_andnot_pd(maskall, z);
-    z1        = _mm_and_pd(mask1, minushalfpi);
-    z3        = _mm_and_pd(mask3, halfpi);
-    z4        = _mm_and_pd(mask4, pi);
-
-    z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
-    w         = _mm_blendv_pd(pi, minuspi, masky_lt);
-    w         = _mm_and_pd(w, maskx_lt);
-
-    w         = _mm_andnot_pd(maskall, w);
-
-    z         = _mm_add_pd(z, w);
  
-    return z;
-}
+#define gmx_mm256_invsqrt_pd   gmx_simd_invsqrt_d
+#define gmx_mm256_inv_pd       gmx_simd_inv_d
+#define gmx_mm256_log_pd       gmx_simd_log_d
+#define gmx_mm256_pmecorrF_pd  gmx_simd_pmecorrF_d
+#define gmx_mm256_pmecorrV_pd  gmx_simd_pmecorrV_d
+#define gmx_mm256_sincos_pd    gmx_simd_sincos_d
  
  #endif
diff --git a/src/gromacs/simd/math_x86_avx_256_single.h b/src/gromacs/simd/math_x86_avx_256_single.h

index 0411130bbdb71e568d39a4a5ed14948435765b4d..b23712a2595d9eff9a19c4e6da4a851ced18b939 100644 (file)
--- a/src/gromacs/simd/math_x86_avx_256_single.h
+++ b/src/gromacs/simd/math_x86_avx_256_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,2189 +35,19 @@
  #ifndef GMX_SIMD_MATH_AVX_256_SINGLE_H
  #define GMX_SIMD_MATH_AVX_256_SINGLE_H
  
-#include <math.h>
+#include "simd_math.h"
  
-#include "general_x86_avx_256.h"
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x), 256-bit wide version */
-static gmx_inline __m256
-gmx_mm256_invsqrt_ps(__m256 x)
-{
-    const __m256 half  = _mm256_set1_ps(0.5f);
-    const __m256 three = _mm256_set1_ps(3.0f);
-
-    __m256       lu = _mm256_rsqrt_ps(x);
-
-    return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), 128-bit wide version */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
-    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
-    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
-    __m128       lu = _mm_rsqrt_ps(x);
-
-    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-
-/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m256
-gmx_mm256_sqrt_ps(__m256 x)
-{
-    __m256 mask;
-    __m256 res;
-
-    mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-    res  = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));
-
-    res  = _mm256_mul_ps(x, res);
-
-    return res;
-}
-
-/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
-    __m128 mask;
-    __m128 res;
-
-    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
-    res  = _mm_mul_ps(x, res);
-
-    return res;
-}
-
-
-/* 1.0/x, 256-bit wide */
-static gmx_inline __m256
-gmx_mm256_inv_ps(__m256 x)
-{
-    const __m256 two = _mm256_set1_ps(2.0f);
-
-    __m256       lu = _mm256_rcp_ps(x);
-
-    return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
-}
-
-/* 1.0/x, 128-bit wide */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
-    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
-    __m128       lu = _mm_rcp_ps(x);
-
-    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-
-static gmx_inline __m256
-gmx_mm256_abs_ps(__m256 x)
-{
-    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
-
-    return _mm256_and_ps(x, signmask);
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
-    return _mm_and_ps(x, signmask);
-}
-
-
-static __m256
-gmx_mm256_log_ps(__m256 x)
-{
-    const __m256  expmask    = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
-    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
-    const __m256  half       = _mm256_set1_ps(0.5f);
-    const __m256  one        = _mm256_set1_ps(1.0f);
-    const __m256  invsq2     = _mm256_set1_ps(1.0f/sqrt(2.0f));
-    const __m256  corr1      = _mm256_set1_ps(-2.12194440e-4f);
-    const __m256  corr2      = _mm256_set1_ps(0.693359375f);
-
-    const __m256  CA_1        = _mm256_set1_ps(0.070376836292f);
-    const __m256  CB_0        = _mm256_set1_ps(1.6714950086782716f);
-    const __m256  CB_1        = _mm256_set1_ps(-2.452088066061482f);
-    const __m256  CC_0        = _mm256_set1_ps(1.5220770854701728f);
-    const __m256  CC_1        = _mm256_set1_ps(-1.3422238433233642f);
-    const __m256  CD_0        = _mm256_set1_ps(1.386218787509749f);
-    const __m256  CD_1        = _mm256_set1_ps(0.35075468953796346f);
-    const __m256  CE_0        = _mm256_set1_ps(1.3429983063133937f);
-    const __m256  CE_1        = _mm256_set1_ps(1.807420826584643f);
-
-    __m256        fexp;
-    __m256i       iexp;
-    __m128i       iexp128a, iexp128b;
-    __m256        mask;
-    __m256i       imask;
-    __m128i       imask128a, imask128b;
-    __m256        x2;
-    __m256        y;
-    __m256        pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp  = _mm256_and_ps(x, expmask);
-    iexp  = _mm256_castps_si256(fexp);
-
-    iexp128b = _mm256_extractf128_si256(iexp, 0x1);
-    iexp128a = _mm256_castsi256_si128(iexp);
-
-    iexp128a  = _mm_srli_epi32(iexp128a, 23);
-    iexp128b  = _mm_srli_epi32(iexp128b, 23);
-    iexp128a  = _mm_sub_epi32(iexp128a, expbase_m1);
-    iexp128b  = _mm_sub_epi32(iexp128b, expbase_m1);
-
-    x     = _mm256_andnot_ps(expmask, x);
-    x     = _mm256_or_ps(x, one);
-    x     = _mm256_mul_ps(x, half);
-
-    mask  = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);
-
-    x     = _mm256_add_ps(x, _mm256_and_ps(mask, x));
-    x     = _mm256_sub_ps(x, one);
-
-    imask = _mm256_castps_si256(mask);
-
-    imask128b = _mm256_extractf128_si256(imask, 0x1);
-    imask128a = _mm256_castsi256_si128(imask);
-
-    iexp128a  = _mm_add_epi32(iexp128a, imask128a);
-    iexp128b  = _mm_add_epi32(iexp128b, imask128b);
-
-    iexp  = _mm256_castsi128_si256(iexp128a);
-    iexp  = _mm256_insertf128_si256(iexp, iexp128b, 0x1);
-
-    x2    = _mm256_mul_ps(x, x);
-
-    pA    = _mm256_mul_ps(CA_1, x);
-    pB    = _mm256_mul_ps(CB_1, x);
-    pC    = _mm256_mul_ps(CC_1, x);
-    pD    = _mm256_mul_ps(CD_1, x);
-    pE    = _mm256_mul_ps(CE_1, x);
-    tB    = _mm256_add_ps(CB_0, x2);
-    tC    = _mm256_add_ps(CC_0, x2);
-    tD    = _mm256_add_ps(CD_0, x2);
-    tE    = _mm256_add_ps(CE_0, x2);
-    pB    = _mm256_add_ps(pB, tB);
-    pC    = _mm256_add_ps(pC, tC);
-    pD    = _mm256_add_ps(pD, tD);
-    pE    = _mm256_add_ps(pE, tE);
-
-    pA    = _mm256_mul_ps(pA, pB);
-    pC    = _mm256_mul_ps(pC, pD);
-    pE    = _mm256_mul_ps(pE, x2);
-    pA    = _mm256_mul_ps(pA, pC);
-    y     = _mm256_mul_ps(pA, pE);
-
-    fexp  = _mm256_cvtepi32_ps(iexp);
-    y     = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));
-
-    y     = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
-    x2    = _mm256_add_ps(x, y);
-
-    x2    = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));
-
-    return x2;
-}
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
-    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
-    const __m128  half       = _mm_set1_ps(0.5f);
-    const __m128  one        = _mm_set1_ps(1.0f);
-    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
-    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
-    const __m128  corr2      = _mm_set1_ps(0.693359375f);
-
-    const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
-    const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
-    const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
-    const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
-    const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
-    const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
-    const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
-    const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
-    const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
-
-    __m128        fexp;
-    __m128i       iexp;
-    __m128        mask;
-    __m128        x2;
-    __m128        y;
-    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp  = _mm_and_ps(x, expmask);
-    iexp  = gmx_mm_castps_si128(fexp);
-    iexp  = _mm_srli_epi32(iexp, 23);
-    iexp  = _mm_sub_epi32(iexp, expbase_m1);
-
-    x     = _mm_andnot_ps(expmask, x);
-    x     = _mm_or_ps(x, one);
-    x     = _mm_mul_ps(x, half);
-
-    mask  = _mm_cmplt_ps(x, invsq2);
-
-    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
-    x     = _mm_sub_ps(x, one);
-    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
-    x2    = _mm_mul_ps(x, x);
-
-    pA    = _mm_mul_ps(CA_1, x);
-    pB    = _mm_mul_ps(CB_1, x);
-    pC    = _mm_mul_ps(CC_1, x);
-    pD    = _mm_mul_ps(CD_1, x);
-    pE    = _mm_mul_ps(CE_1, x);
-    tB    = _mm_add_ps(CB_0, x2);
-    tC    = _mm_add_ps(CC_0, x2);
-    tD    = _mm_add_ps(CD_0, x2);
-    tE    = _mm_add_ps(CE_0, x2);
-    pB    = _mm_add_ps(pB, tB);
-    pC    = _mm_add_ps(pC, tC);
-    pD    = _mm_add_ps(pD, tD);
-    pE    = _mm_add_ps(pE, tE);
-
-    pA    = _mm_mul_ps(pA, pB);
-    pC    = _mm_mul_ps(pC, pD);
-    pE    = _mm_mul_ps(pE, x2);
-    pA    = _mm_mul_ps(pA, pC);
-    y     = _mm_mul_ps(pA, pE);
-
-    fexp  = _mm_cvtepi32_ps(iexp);
-    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
-    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
-    x2    = _mm_add_ps(x, y);
-
-    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
-    return x2;
-}
-
-
-/*
- * 2^x function, 256-bit wide
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m256
-gmx_mm256_exp2_ps(__m256 x)
-{
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m256  arglimit = _mm256_set1_ps(126.0f);
-
-    const __m128i expbase  = _mm_set1_epi32(127);
-    const __m256  CC6      = _mm256_set1_ps(1.535336188319500E-004);
-    const __m256  CC5      = _mm256_set1_ps(1.339887440266574E-003);
-    const __m256  CC4      = _mm256_set1_ps(9.618437357674640E-003);
-    const __m256  CC3      = _mm256_set1_ps(5.550332471162809E-002);
-    const __m256  CC2      = _mm256_set1_ps(2.402264791363012E-001);
-    const __m256  CC1      = _mm256_set1_ps(6.931472028550421E-001);
-    const __m256  CC0      = _mm256_set1_ps(1.0f);
-
-    __m256        p0, p1;
-    __m256        valuemask;
-    __m256i       iexppart;
-    __m128i       iexppart128a, iexppart128b;
-    __m256        fexppart;
-    __m256        intpart;
-    __m256        x2;
-
-
-    iexppart  = _mm256_cvtps_epi32(x);
-    intpart   = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
-
-    iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
-    iexppart128a = _mm256_castsi256_si128(iexppart);
-
-    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
-    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
-
-    iexppart  = _mm256_castsi128_si256(iexppart128a);
-    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-    valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
-    fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
-
-    x         = _mm256_sub_ps(x, intpart);
-    x2        = _mm256_mul_ps(x, x);
-
-    p0        = _mm256_mul_ps(CC6, x2);
-    p1        = _mm256_mul_ps(CC5, x2);
-    p0        = _mm256_add_ps(p0, CC4);
-    p1        = _mm256_add_ps(p1, CC3);
-    p0        = _mm256_mul_ps(p0, x2);
-    p1        = _mm256_mul_ps(p1, x2);
-    p0        = _mm256_add_ps(p0, CC2);
-    p1        = _mm256_add_ps(p1, CC1);
-    p0        = _mm256_mul_ps(p0, x2);
-    p1        = _mm256_mul_ps(p1, x);
-    p0        = _mm256_add_ps(p0, CC0);
-    p0        = _mm256_add_ps(p0, p1);
-    x         = _mm256_mul_ps(p0, fexppart);
-
-    return x;
-}
-
-
-/* 2^x, 128 bit wide.
- *
- * x is split into a rounded integer part and a remainder. 2^integer is formed
- * by adding the bias 127 to the integer and shifting it into the exponent
- * field (bit 23 upward) of a float; 2^remainder is a degree-6 polynomial in
- * the remainder with coefficients CA0..CA6. Lanes that fail the |x| <= 126
- * test have the exponent factor masked to zero, so finite out-of-range
- * inputs produce 0.
- * NOTE(review): gmx_mm_abs_ps() and gmx_mm_castsi128_ps() are defined outside
- * this block; they are assumed to be a lane-wise |x| and a bit-cast - confirm.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126.
-     * The comparison below is made on |x|, so the same limit also masks large positive arguments.
-     */
-    const __m128  arglimit = _mm_set1_ps(126.0f);
-
-    const __m128i expbase  = _mm_set1_epi32(127); /* added to the integer part before it is shifted into the exponent field */
-    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
-    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
-    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
-    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
-    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
-    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
-    const __m128  CA0      = _mm_set1_ps(1.0f);
-
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-    __m128        x2;
-    __m128        p0, p1;
-
-    iexppart  = _mm_cvtps_epi32(x); /* integer part as int32; NOTE(review): rounding follows the current MXCSR mode - confirm it is round-to-nearest */
-    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT); /* the same integer part, kept as float */
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x)); /* all-ones where 126 >= |x| */
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); /* 2^intpart, or 0.0 in masked-out lanes */
-
-    x         = _mm_sub_ps(x, intpart); /* remainder after removing the integer part */
-    x2        = _mm_mul_ps(x, x);
-
-    /* p0 accumulates the even-power terms (CA6, CA4, CA2, CA0), p1 the odd-power terms (CA5, CA3, CA1) */
-    p0        = _mm_mul_ps(CA6, x2);
-    p1        = _mm_mul_ps(CA5, x2);
-    p0        = _mm_add_ps(p0, CA4);
-    p1        = _mm_add_ps(p1, CA3);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_add_ps(p0, CA2);
-    p1        = _mm_add_ps(p1, CA1);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, CA0);
-    p0        = _mm_add_ps(p0, p1);
-    x         = _mm_mul_ps(p0, fexppart); /* polynomial times 2^intpart */
-
-    return x;
-}
-
-
-/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
- * where y=log2(e)*x, but there will then be a small rounding error since we lose some
- * precision due to the multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- *
- * Steps as coded: y = exparg*argscale; 2^round(y) is built in the float exponent
- * field; the remainder r = exparg - round(y)*(invargscale0 + invargscale1) is formed
- * in two subtractions; exp(r) is evaluated as 1 + r + r^2*(CE0 + CE1*r + ... + CE5*r^5).
- * Lanes failing the |y| <= 126 test have the exponent factor masked to zero, so finite
- * out-of-range inputs produce 0.
- * NOTE(review): gmx_mm256_abs_ps() is defined outside this block and is assumed to be a
- * lane-wise absolute value - confirm.
- */
-static __m256
-gmx_mm256_exp_ps(__m256 exparg)
-{
-    const __m256  argscale      = _mm256_set1_ps(1.44269504088896341f); /* log2(e) */
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m256  arglimit      = _mm256_set1_ps(126.0f);
-    const __m128i expbase       = _mm_set1_epi32(127);
-
-    /* invargscale0 + invargscale1 sums to ln(2) (0.693147...), split into a coarse and a small part */
-    const __m256  invargscale0  = _mm256_set1_ps(0.693359375f);
-    const __m256  invargscale1  = _mm256_set1_ps(-2.12194440e-4f);
-
-    const __m256  CE5           = _mm256_set1_ps(1.9875691500e-4f);
-    const __m256  CE4           = _mm256_set1_ps(1.3981999507e-3f);
-    const __m256  CE3           = _mm256_set1_ps(8.3334519073e-3f);
-    const __m256  CE2           = _mm256_set1_ps(4.1665795894e-2f);
-    const __m256  CE1           = _mm256_set1_ps(1.6666665459e-1f);
-    const __m256  CE0           = _mm256_set1_ps(5.0000001201e-1f);
-    const __m256  one           = _mm256_set1_ps(1.0f);
-
-    __m256        exparg2, exp2arg;
-    __m256        pE0, pE1;
-    __m256        valuemask;
-    __m256i       iexppart;
-    __m128i       iexppart128a, iexppart128b;
-    __m256        fexppart;
-    __m256        intpart;
-
-    exp2arg = _mm256_mul_ps(exparg, argscale);
-
-    iexppart  = _mm256_cvtps_epi32(exp2arg); /* NOTE(review): rounding follows the current MXCSR mode - confirm it is round-to-nearest */
-    intpart   = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The integer add and shift are applied to the low (a) and high (b) 128-bit halves separately */
-    iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
-    iexppart128a = _mm256_castsi256_si128(iexppart);
-
-    iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
-    iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
-
-    iexppart  = _mm256_castsi128_si256(iexppart128a);
-    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-    valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ); /* all-ones where 126 >= |y| */
-    fexppart  = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart)); /* 2^intpart, or 0.0 in masked-out lanes */
-
-    /* Extended precision arithmetics */
-    exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
-    exparg    = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));
-
-    exparg2   = _mm256_mul_ps(exparg, exparg);
-
-    /* pE1 carries CE5, CE3, CE1 and is multiplied by exparg once; pE0 carries CE4, CE2, CE0 */
-    pE1       = _mm256_mul_ps(CE5, exparg2);
-    pE0       = _mm256_mul_ps(CE4, exparg2);
-    pE1       = _mm256_add_ps(pE1, CE3);
-    pE0       = _mm256_add_ps(pE0, CE2);
-    pE1       = _mm256_mul_ps(pE1, exparg2);
-    pE0       = _mm256_mul_ps(pE0, exparg2);
-    pE1       = _mm256_add_ps(pE1, CE1);
-    pE0       = _mm256_add_ps(pE0, CE0);
-    pE1       = _mm256_mul_ps(pE1, exparg);
-    pE0       = _mm256_add_ps(pE0, pE1);
-    pE0       = _mm256_mul_ps(pE0, exparg2);
-    exparg    = _mm256_add_ps(exparg, one);
-    exparg    = _mm256_add_ps(exparg, pE0);
-
-    exparg    = _mm256_mul_ps(exparg, fexppart); /* scale the polynomial by 2^intpart */
-
-    return exparg;
-}
-
-
-/* exp(), 128 bit wide.
- *
- * Steps as coded: y = x*argscale; 2^round(y) is built by adding 127 to the rounded
- * integer and shifting it into the float exponent field; the remainder
- * r = x - round(y)*(invargscale0 + invargscale1) is formed in two subtractions;
- * exp(r) is evaluated as 1 + r + r^2*(CC0 + CC1*r + ... + CC5*r^5).
- * Lanes failing the |y| <= 126 test have the exponent factor masked to zero, so
- * finite out-of-range inputs produce 0.
- * NOTE(review): gmx_mm_abs_ps() and gmx_mm_castsi128_ps() are defined outside this
- * block; they are assumed to be a lane-wise |x| and a bit-cast - confirm.
- */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
-    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f); /* log2(e) */
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m128  arglimit      = _mm_set1_ps(126.0f);
-    const __m128i expbase       = _mm_set1_epi32(127);
-
-    /* invargscale0 + invargscale1 sums to ln(2) (0.693147...), split into a coarse and a small part */
-    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
-    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
-
-    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
-    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
-    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
-    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
-    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
-    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
-    const __m128  one           = _mm_set1_ps(1.0f);
-
-    __m128        y, x2;
-    __m128        p0, p1;
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-
-    y = _mm_mul_ps(x, argscale);
-
-    iexppart  = _mm_cvtps_epi32(y); /* NOTE(review): rounding follows the current MXCSR mode - confirm it is round-to-nearest */
-    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y)); /* all-ones where 126 >= |y| */
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart)); /* 2^intpart, or 0.0 in masked-out lanes */
-
-    /* Extended precision arithmetics */
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
-    x2        = _mm_mul_ps(x, x);
-
-    /* p1 carries CC5, CC3, CC1 and is multiplied by x once; p0 carries CC4, CC2, CC0 */
-    p1        = _mm_mul_ps(CC5, x2);
-    p0        = _mm_mul_ps(CC4, x2);
-    p1        = _mm_add_ps(p1, CC3);
-    p0        = _mm_add_ps(p0, CC2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_add_ps(p1, CC1);
-    p0        = _mm_add_ps(p0, CC0);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, p1);
-    p0        = _mm_mul_ps(p0, x2);
-    x         = _mm_add_ps(x, one);
-    x         = _mm_add_ps(x, p0);
-
-    x         = _mm_mul_ps(x, fexppart); /* scale the polynomial by 2^intpart */
-
-    return x;
-}
-
-
-
-/* FULL precision erf(), 256-bit wide. Only errors in LSB
- *
- * An erf() and an erfc() estimate are both computed for every lane and the final
- * blend picks one of them:
- *   |x| <  0.75 : x*P(x^2) with coefficients CA0..CA6
- *   otherwise   : 1 - erfc(x), where erfc(|x|) is exp(-x^2) times the CB polynomial
- *                 (|x| <= 2) or the CC polynomial (|x| > 2), and negative x uses
- *                 erfc(x) = 2 - erfc(|x|).
- * NOTE(review): gmx_mm256_abs_ps() and gmx_mm256_inv_ps() are defined outside this
- * block; they are assumed to be lane-wise |x| and 1/x - confirm.
- */
-static __m256
-gmx_mm256_erf_ps(__m256 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
-    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
-    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
-    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
-    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
-    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
-    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
-     * NOTE(review): the code below evaluates these as a polynomial in w = t-1 (t = inv(|x|)),
-     * using both odd and even powers of w, rather than in (1/(x-1))^2 - confirm the formula.
-     */
-    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
-    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
-    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
-    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
-    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
-    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
-    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
-    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
-    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
-    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
-     * NOTE(review): the code below evaluates these as t*P(t) with t = inv(|x|), using both odd
-     * and even powers of t - confirm the formula.
-     */
-    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
-    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
-    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
-    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
-    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
-    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
-    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
-    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
-    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
-    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
-    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
-    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
-    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
-
-    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) ); /* AND mask that clears the low 12 bits */
-    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); /* only the float sign bit set */
-    const __m256  one      = _mm256_set1_ps(1.0f);
-    const __m256  two      = _mm256_set1_ps(2.0f);
-
-    __m256        x2, x4, y;
-    __m256        z, q, t, t2, w, w2;
-    __m256        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m256        expmx2, corr;
-    __m256        res_erf, res_erfc, res;
-    __m256        mask;
-
-    /* Calculate erf() */
-    x2     = _mm256_mul_ps(x, x);
-    x4     = _mm256_mul_ps(x2, x2);
-
-    /* pA0 carries CA6, CA4, CA2 (steps of x^4); pA1 carries CA5, CA3, CA1 and gets one extra x^2 */
-    pA0  = _mm256_mul_ps(CA6, x4);
-    pA1  = _mm256_mul_ps(CA5, x4);
-    pA0  = _mm256_add_ps(pA0, CA4);
-    pA1  = _mm256_add_ps(pA1, CA3);
-    pA0  = _mm256_mul_ps(pA0, x4);
-    pA1  = _mm256_mul_ps(pA1, x4);
-    pA0  = _mm256_add_ps(pA0, CA2);
-    pA1  = _mm256_add_ps(pA1, CA1);
-    pA0  = _mm256_mul_ps(pA0, x4);
-    pA1  = _mm256_mul_ps(pA1, x2);
-    pA0  = _mm256_add_ps(pA0, pA1);
-    pA0  = _mm256_add_ps(pA0, CA0);
-
-    res_erf = _mm256_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-
-    y       = gmx_mm256_abs_ps(x);
-    t       = gmx_mm256_inv_ps(y);
-    w       = _mm256_sub_ps(t, one);
-    t2      = _mm256_mul_ps(t, t);
-    w2      = _mm256_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     *
-     * In the code the formula is applied to y=|x|, and the "w" of this comment is
-     * the discarded low-order part, not the variable w computed above.
-     * NOTE(review): q below is (z-y)*(z+y) with z <= y, i.e. non-positive, while
-     * the stated expansion range is [0,0.1] - confirm the intended interval.
-     */
-
-    z       = _mm256_and_ps(y, sieve);
-    q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
-
-    /* corr = 1 + q*(1 + q*(CD2 + q*(CD3 + q*CD4))) */
-    corr    = _mm256_mul_ps(CD4, q);
-    corr    = _mm256_add_ps(corr, CD3);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, CD2);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, one);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, one);
-
-    expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) ); /* OR-ing in the sign bit negates z*z */
-    expmx2  = _mm256_mul_ps(expmx2, corr);
-
-    /* pB1 carries the odd CB coefficients and is multiplied by w once; pB0 carries the even ones */
-    pB1  = _mm256_mul_ps(CB9, w2);
-    pB0  = _mm256_mul_ps(CB8, w2);
-    pB1  = _mm256_add_ps(pB1, CB7);
-    pB0  = _mm256_add_ps(pB0, CB6);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB5);
-    pB0  = _mm256_add_ps(pB0, CB4);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB3);
-    pB0  = _mm256_add_ps(pB0, CB2);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB1);
-    pB1  = _mm256_mul_ps(pB1, w);
-    pB0  = _mm256_add_ps(pB0, pB1);
-    pB0  = _mm256_add_ps(pB0, CB0);
-
-    /* pC0 carries the even CC coefficients, pC1 the odd ones; the sum is multiplied by t at the end */
-    pC0  = _mm256_mul_ps(CC10, t2);
-    pC1  = _mm256_mul_ps(CC9, t2);
-    pC0  = _mm256_add_ps(pC0, CC8);
-    pC1  = _mm256_add_ps(pC1, CC7);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC6);
-    pC1  = _mm256_add_ps(pC1, CC5);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC4);
-    pC1  = _mm256_add_ps(pC1, CC3);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC2);
-    pC1  = _mm256_add_ps(pC1, CC1);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t);
-    pC0  = _mm256_add_ps(pC0, pC1);
-    pC0  = _mm256_add_ps(pC0, CC0);
-    pC0  = _mm256_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_ps(pB0, pC0, mask); /* pC0 where 2 < |x|, pB0 elsewhere */
-    res_erfc = _mm256_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
-    res  = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask); /* res_erf where |x| < 0.75, 1-erfc elsewhere */
-
-    return res;
-}
-
-
-/* erf(), 128 bit wide
- *
- * An erf() and an erfc() estimate are both computed for every lane and the final
- * blend picks one of them:
- *   |x| <  0.75 : x*P(x^2) with coefficients CA0..CA6
- *   otherwise   : 1 - erfc(x), where erfc(|x|) is exp(-x^2) times the CB polynomial
- *                 (|x| <= 2) or the CC polynomial (|x| > 2), and negative x uses
- *                 erfc(x) = 2 - erfc(|x|).
- * NOTE(review): gmx_mm_abs_ps(), gmx_mm_inv_ps() and gmx_mm_castsi128_ps() are defined
- * outside this block; they are assumed to be lane-wise |x|, 1/x and a bit-cast - confirm.
- */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
-     * NOTE(review): the code below evaluates these as a polynomial in w = t-1 (t = inv(|x|)),
-     * using both odd and even powers of w, rather than in (1/(x-1))^2 - confirm the formula.
-     */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
-     * NOTE(review): the code below evaluates these as t*P(t) with t = inv(|x|), using both odd
-     * and even powers of t - confirm the formula.
-     */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); /* AND mask that clears the low 12 bits */
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); /* only the float sign bit set */
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    /* pA0 carries CA6, CA4, CA2 (steps of x^4); pA1 carries CA5, CA3, CA1 and gets one extra x^2 */
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     *
-     * In the code the formula is applied to y=|x|, and the "w" of this comment is
-     * the discarded low-order part, not the variable w computed above.
-     * NOTE(review): q below is (z-y)*(z+y) with z <= y, i.e. non-positive, while
-     * the stated expansion range is [0,0.1] - confirm the intended interval.
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    /* corr = 1 + q*(1 + q*(CD2 + q*(CD3 + q*CD4))) */
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); /* OR-ing in the sign bit negates z*z */
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    /* pB1 carries the odd CB coefficients and is multiplied by w once; pB0 carries the even ones */
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    /* pC0 carries the even CC coefficients, pC1 the odd ones; the sum is multiplied by t at the end */
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask); /* pC0 where 2 < |x|, pB0 elsewhere */
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask); /* res_erf where |x| < 0.75, 1-erfc elsewhere */
-
-    return res;
-}
-
-
-
-
-/* FULL precision erfc(), 256 bit wide. Only errors in LSB
- *
- * An erf() and an erfc() estimate are both computed for every lane and the final
- * blend picks one of them:
- *   |x| <  0.75 : 1 - x*P(x^2) with coefficients CA0..CA6
- *   otherwise   : erfc(x), where erfc(|x|) is exp(-x^2) times the CB polynomial
- *                 (|x| <= 2) or the CC polynomial (|x| > 2), and negative x uses
- *                 erfc(x) = 2 - erfc(|x|).
- * NOTE(review): gmx_mm256_abs_ps() and gmx_mm256_inv_ps() are defined outside this
- * block; they are assumed to be lane-wise |x| and 1/x - confirm.
- */
-static __m256
-gmx_mm256_erfc_ps(__m256 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m256  CA6      = _mm256_set1_ps(7.853861353153693e-5f);
-    const __m256  CA5      = _mm256_set1_ps(-8.010193625184903e-4f);
-    const __m256  CA4      = _mm256_set1_ps(5.188327685732524e-3f);
-    const __m256  CA3      = _mm256_set1_ps(-2.685381193529856e-2f);
-    const __m256  CA2      = _mm256_set1_ps(1.128358514861418e-1f);
-    const __m256  CA1      = _mm256_set1_ps(-3.761262582423300e-1f);
-    const __m256  CA0      = _mm256_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
-     * NOTE(review): the code below evaluates these as a polynomial in w = t-1 (t = inv(|x|)),
-     * using both odd and even powers of w, rather than in (1/(x-1))^2 - confirm the formula.
-     */
-    const __m256  CB9      = _mm256_set1_ps(-0.0018629930017603923f);
-    const __m256  CB8      = _mm256_set1_ps(0.003909821287598495f);
-    const __m256  CB7      = _mm256_set1_ps(-0.0052094582210355615f);
-    const __m256  CB6      = _mm256_set1_ps(0.005685614362160572f);
-    const __m256  CB5      = _mm256_set1_ps(-0.0025367682853477272f);
-    const __m256  CB4      = _mm256_set1_ps(-0.010199799682318782f);
-    const __m256  CB3      = _mm256_set1_ps(0.04369575504816542f);
-    const __m256  CB2      = _mm256_set1_ps(-0.11884063474674492f);
-    const __m256  CB1      = _mm256_set1_ps(0.2732120154030589f);
-    const __m256  CB0      = _mm256_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
-     * NOTE(review): the code below evaluates these as t*P(t) with t = inv(|x|), using both odd
-     * and even powers of t - confirm the formula.
-     */
-    const __m256  CC10     = _mm256_set1_ps(-0.0445555913112064f);
-    const __m256  CC9      = _mm256_set1_ps(0.21376355144663348f);
-    const __m256  CC8      = _mm256_set1_ps(-0.3473187200259257f);
-    const __m256  CC7      = _mm256_set1_ps(0.016690861551248114f);
-    const __m256  CC6      = _mm256_set1_ps(0.7560973182491192f);
-    const __m256  CC5      = _mm256_set1_ps(-1.2137903600145787f);
-    const __m256  CC4      = _mm256_set1_ps(0.8411872321232948f);
-    const __m256  CC3      = _mm256_set1_ps(-0.08670413896296343f);
-    const __m256  CC2      = _mm256_set1_ps(-0.27124782687240334f);
-    const __m256  CC1      = _mm256_set1_ps(-0.0007502488047806069f);
-    const __m256  CC0      = _mm256_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m256  CD2      = _mm256_set1_ps(0.5000066608081202f);
-    const __m256  CD3      = _mm256_set1_ps(0.1664795422874624f);
-    const __m256  CD4      = _mm256_set1_ps(0.04379839977652482f);
-
-    const __m256  sieve    = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) ); /* AND mask that clears the low 12 bits */
-    const __m256  signbit  = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); /* only the float sign bit set */
-    const __m256  one      = _mm256_set1_ps(1.0f);
-    const __m256  two      = _mm256_set1_ps(2.0f);
-
-    __m256        x2, x4, y;
-    __m256        z, q, t, t2, w, w2;
-    __m256        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m256        expmx2, corr;
-    __m256        res_erf, res_erfc, res;
-    __m256        mask;
-
-    /* Calculate erf() */
-    x2     = _mm256_mul_ps(x, x);
-    x4     = _mm256_mul_ps(x2, x2);
-
-    /* pA0 carries CA6, CA4, CA2 (steps of x^4); pA1 carries CA5, CA3, CA1 and gets one extra x^2 */
-    pA0  = _mm256_mul_ps(CA6, x4);
-    pA1  = _mm256_mul_ps(CA5, x4);
-    pA0  = _mm256_add_ps(pA0, CA4);
-    pA1  = _mm256_add_ps(pA1, CA3);
-    pA0  = _mm256_mul_ps(pA0, x4);
-    pA1  = _mm256_mul_ps(pA1, x4);
-    pA0  = _mm256_add_ps(pA0, CA2);
-    pA1  = _mm256_add_ps(pA1, CA1);
-    pA0  = _mm256_mul_ps(pA0, x4);
-    pA1  = _mm256_mul_ps(pA1, x2);
-    pA0  = _mm256_add_ps(pA0, pA1);
-    pA0  = _mm256_add_ps(pA0, CA0);
-
-    res_erf = _mm256_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-    y       = gmx_mm256_abs_ps(x);
-    t       = gmx_mm256_inv_ps(y);
-    w       = _mm256_sub_ps(t, one);
-    t2      = _mm256_mul_ps(t, t);
-    w2      = _mm256_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     *
-     * In the code the formula is applied to y=|x|, and the "w" of this comment is
-     * the discarded low-order part, not the variable w computed above.
-     * NOTE(review): q below is (z-y)*(z+y) with z <= y, i.e. non-positive, while
-     * the stated expansion range is [0,0.1] - confirm the intended interval.
-     */
-
-    z       = _mm256_and_ps(y, sieve);
-    q       = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
-
-    /* corr = 1 + q*(1 + q*(CD2 + q*(CD3 + q*CD4))) */
-    corr    = _mm256_mul_ps(CD4, q);
-    corr    = _mm256_add_ps(corr, CD3);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, CD2);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, one);
-    corr    = _mm256_mul_ps(corr, q);
-    corr    = _mm256_add_ps(corr, one);
-
-    expmx2  = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) ); /* OR-ing in the sign bit negates z*z */
-    expmx2  = _mm256_mul_ps(expmx2, corr);
-
-    /* pB1 carries the odd CB coefficients and is multiplied by w once; pB0 carries the even ones */
-    pB1  = _mm256_mul_ps(CB9, w2);
-    pB0  = _mm256_mul_ps(CB8, w2);
-    pB1  = _mm256_add_ps(pB1, CB7);
-    pB0  = _mm256_add_ps(pB0, CB6);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB5);
-    pB0  = _mm256_add_ps(pB0, CB4);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB3);
-    pB0  = _mm256_add_ps(pB0, CB2);
-    pB1  = _mm256_mul_ps(pB1, w2);
-    pB0  = _mm256_mul_ps(pB0, w2);
-    pB1  = _mm256_add_ps(pB1, CB1);
-    pB1  = _mm256_mul_ps(pB1, w);
-    pB0  = _mm256_add_ps(pB0, pB1);
-    pB0  = _mm256_add_ps(pB0, CB0);
-
-    /* pC0 carries the even CC coefficients, pC1 the odd ones; the sum is multiplied by t at the end */
-    pC0  = _mm256_mul_ps(CC10, t2);
-    pC1  = _mm256_mul_ps(CC9, t2);
-    pC0  = _mm256_add_ps(pC0, CC8);
-    pC1  = _mm256_add_ps(pC1, CC7);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC6);
-    pC1  = _mm256_add_ps(pC1, CC5);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC4);
-    pC1  = _mm256_add_ps(pC1, CC3);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t2);
-    pC0  = _mm256_add_ps(pC0, CC2);
-    pC1  = _mm256_add_ps(pC1, CC1);
-    pC0  = _mm256_mul_ps(pC0, t2);
-    pC1  = _mm256_mul_ps(pC1, t);
-    pC0  = _mm256_add_ps(pC0, pC1);
-    pC0  = _mm256_add_ps(pC0, CC0);
-    pC0  = _mm256_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_ps(pB0, pC0, mask); /* pC0 where 2 < |x|, pB0 elsewhere */
-    res_erfc = _mm256_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
-    res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
-    res  = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask); /* 1-erf where |x| < 0.75, res_erfc elsewhere */
-
-    return res;
-}
-
-
-/* erfc(), 128 bit wide
- *
- * An erf() and an erfc() estimate are both computed for every lane and the final
- * blend picks one of them:
- *   |x| <  0.75 : 1 - x*P(x^2) with coefficients CA0..CA6
- *   otherwise   : erfc(x), where erfc(|x|) is exp(-x^2) times the CB polynomial
- *                 (|x| <= 2) or the CC polynomial (|x| > 2), and negative x uses
- *                 erfc(x) = 2 - erfc(|x|).
- * NOTE(review): gmx_mm_abs_ps(), gmx_mm_inv_ps() and gmx_mm_castsi128_ps() are defined
- * outside this block; they are assumed to be lane-wise |x|, 1/x and a bit-cast - confirm.
- */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2]
-     * NOTE(review): the code below evaluates these as a polynomial in w = t-1 (t = inv(|x|)),
-     * using both odd and even powers of w, rather than in (1/(x-1))^2 - confirm the formula.
-     */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19]
-     * NOTE(review): the code below evaluates these as t*P(t) with t = inv(|x|), using both odd
-     * and even powers of t - confirm the formula.
-     */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) ); /* AND mask that clears the low 12 bits */
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); /* only the float sign bit set */
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    /* pA0 carries CA6, CA4, CA2 (steps of x^4); pA1 carries CA5, CA3, CA1 and gets one extra x^2 */
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     *
-     * In the code the formula is applied to y=|x|, and the "w" of this comment is
-     * the discarded low-order part, not the variable w computed above.
-     * NOTE(review): q below is (z-y)*(z+y) with z <= y, i.e. non-positive, while
-     * the stated expansion range is [0,0.1] - confirm the intended interval.
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    /* corr = 1 + q*(1 + q*(CD2 + q*(CD3 + q*CD4))) */
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) ); /* OR-ing in the sign bit negates z*z */
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    /* pB1 carries the odd CB coefficients and is multiplied by w once; pB0 carries the even ones */
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    /* pC0 carries the even CC coefficients, pC1 the odd ones; the sum is multiplied by t at the end */
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask); /* pC0 where 2 < |x|, pB0 elsewhere */
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask); /* 1-erf where |x| < 0.75, res_erfc elsewhere */
-
-    return res;
-}
-
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-/* AVX (8 x float) evaluation of the PME force-correction rational function.
- *
- * Computes N(z2) * (1/D(z2)) for every packed element, where
- *   N(z2) = FN0 + FN1*z2 + FN2*z2^2 + ... + FN6*z2^6
- *   D(z2) = FD0 + FD1*z2 + FD2*z2^2 + FD3*z2^3 + FD4*z2^4
- * Each polynomial is split into its even and odd powers of z2. The two halves
- * are evaluated as independent Horner chains in z4 = z2*z2 (the statements of
- * the two chains are interleaved) and added together at the end.
- *
- * z2 is the squared argument described in the usage notes preceding this
- * function. The reciprocal of D comes from gmx_mm256_inv_ps(), which is
- * defined elsewhere (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m256
-gmx_mm256_pmecorrF_ps(__m256 z2)
-{
-    /* Numerator coefficients; FNk multiplies z2^k */
-    const __m256  FN6      = _mm256_set1_ps(-1.7357322914161492954e-8f);
-    const __m256  FN5      = _mm256_set1_ps(1.4703624142580877519e-6f);
-    const __m256  FN4      = _mm256_set1_ps(-0.000053401640219807709149f);
-    const __m256  FN3      = _mm256_set1_ps(0.0010054721316683106153f);
-    const __m256  FN2      = _mm256_set1_ps(-0.019278317264888380590f);
-    const __m256  FN1      = _mm256_set1_ps(0.069670166153766424023f);
-    const __m256  FN0      = _mm256_set1_ps(-0.75225204789749321333f);
-
-    /* Denominator coefficients; FDk multiplies z2^k */
-    const __m256  FD4      = _mm256_set1_ps(0.0011193462567257629232f);
-    const __m256  FD3      = _mm256_set1_ps(0.014866955030185295499f);
-    const __m256  FD2      = _mm256_set1_ps(0.11583842382862377919f);
-    const __m256  FD1      = _mm256_set1_ps(0.50736591960530292870f);
-    const __m256  FD0      = _mm256_set1_ps(1.0f);
-
-    __m256        z4;
-    __m256        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm256_mul_ps(z2, z2);
-
-    /* Denominator: polyFD0 collects even powers, polyFD1 odd powers of z2 */
-    polyFD0        = _mm256_mul_ps(FD4, z4);
-    polyFD1        = _mm256_mul_ps(FD3, z4);
-    polyFD0        = _mm256_add_ps(polyFD0, FD2);
-    polyFD1        = _mm256_add_ps(polyFD1, FD1);
-    polyFD0        = _mm256_mul_ps(polyFD0, z4);
-    polyFD1        = _mm256_mul_ps(polyFD1, z2);
-    polyFD0        = _mm256_add_ps(polyFD0, FD0);
-    polyFD0        = _mm256_add_ps(polyFD0, polyFD1);
-
-    /* From here on polyFD0 holds 1/D(z2) */
-    polyFD0        = gmx_mm256_inv_ps(polyFD0);
-
-    /* Numerator: polyFN0 collects even powers, polyFN1 odd powers of z2 */
-    polyFN0        = _mm256_mul_ps(FN6, z4);
-    polyFN1        = _mm256_mul_ps(FN5, z4);
-    polyFN0        = _mm256_add_ps(polyFN0, FN4);
-    polyFN1        = _mm256_add_ps(polyFN1, FN3);
-    polyFN0        = _mm256_mul_ps(polyFN0, z4);
-    polyFN1        = _mm256_mul_ps(polyFN1, z4);
-    polyFN0        = _mm256_add_ps(polyFN0, FN2);
-    polyFN1        = _mm256_add_ps(polyFN1, FN1);
-    polyFN0        = _mm256_mul_ps(polyFN0, z4);
-    polyFN1        = _mm256_mul_ps(polyFN1, z2);
-    polyFN0        = _mm256_add_ps(polyFN0, FN0);
-    polyFN0        = _mm256_add_ps(polyFN0, polyFN1);
-
-    return _mm256_mul_ps(polyFN0, polyFD0);
-}
-
-
-/* SSE (4 x float) evaluation of the PME force-correction rational function.
- *
- * Same coefficients and same evaluation order as gmx_mm256_pmecorrF_ps(),
- * using 128-bit registers. Computes N(z2) * (1/D(z2)) per packed element with
- *   N(z2) = FN0 + FN1*z2 + ... + FN6*z2^6
- *   D(z2) = FD0 + FD1*z2 + ... + FD4*z2^4
- * Even and odd powers of z2 are accumulated in two separate Horner chains in
- * z4 = z2*z2 and summed at the end.
- *
- * The reciprocal of D comes from gmx_mm_inv_ps(), defined elsewhere
- * (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
-    /* Numerator coefficients; FNk multiplies z2^k */
-    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
-    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
-    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
-    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
-    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
-    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
-    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
-
-    /* Denominator coefficients; FDk multiplies z2^k */
-    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
-    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
-    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
-    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
-    const __m128  FD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    /* Denominator: polyFD0 collects even powers, polyFD1 odd powers of z2 */
-    polyFD0        = _mm_mul_ps(FD4, z4);
-    polyFD1        = _mm_mul_ps(FD3, z4);
-    polyFD0        = _mm_add_ps(polyFD0, FD2);
-    polyFD1        = _mm_add_ps(polyFD1, FD1);
-    polyFD0        = _mm_mul_ps(polyFD0, z4);
-    polyFD1        = _mm_mul_ps(polyFD1, z2);
-    polyFD0        = _mm_add_ps(polyFD0, FD0);
-    polyFD0        = _mm_add_ps(polyFD0, polyFD1);
-
-    /* From here on polyFD0 holds 1/D(z2) */
-    polyFD0        = gmx_mm_inv_ps(polyFD0);
-
-    /* Numerator: polyFN0 collects even powers, polyFN1 odd powers of z2 */
-    polyFN0        = _mm_mul_ps(FN6, z4);
-    polyFN1        = _mm_mul_ps(FN5, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN4);
-    polyFN1        = _mm_add_ps(polyFN1, FN3);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN2);
-    polyFN1        = _mm_add_ps(polyFN1, FN1);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z2);
-    polyFN0        = _mm_add_ps(polyFN0, FN0);
-    polyFN0        = _mm_add_ps(polyFN0, polyFN1);
-
-    return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- */
-/* AVX (8 x float) evaluation of the PME potential-correction rational function.
- *
- * Computes N(z2) * (1/D(z2)) for every packed element, where
- *   N(z2) = VN0 + VN1*z2 + VN2*z2^2 + ... + VN6*z2^6
- *   D(z2) = VD0 + VD1*z2 + VD2*z2^2 + VD3*z2^3
- * Even and odd powers of z2 are accumulated in two separate Horner chains in
- * z4 = z2*z2 (statements interleaved) and summed at the end.
- *
- * z2 is the squared argument described in the usage notes preceding this
- * function. The reciprocal of D comes from gmx_mm256_inv_ps(), defined
- * elsewhere (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m256
-gmx_mm256_pmecorrV_ps(__m256 z2)
-{
-    /* Numerator coefficients; VNk multiplies z2^k */
-    const __m256  VN6      = _mm256_set1_ps(1.9296833005951166339e-8f);
-    const __m256  VN5      = _mm256_set1_ps(-1.4213390571557850962e-6f);
-    const __m256  VN4      = _mm256_set1_ps(0.000041603292906656984871f);
-    const __m256  VN3      = _mm256_set1_ps(-0.00013134036773265025626f);
-    const __m256  VN2      = _mm256_set1_ps(0.038657983986041781264f);
-    const __m256  VN1      = _mm256_set1_ps(0.11285044772717598220f);
-    const __m256  VN0      = _mm256_set1_ps(1.1283802385263030286f);
-
-    /* Denominator coefficients; VDk multiplies z2^k */
-    const __m256  VD3      = _mm256_set1_ps(0.0066752224023576045451f);
-    const __m256  VD2      = _mm256_set1_ps(0.078647795836373922256f);
-    const __m256  VD1      = _mm256_set1_ps(0.43336185284710920150f);
-    const __m256  VD0      = _mm256_set1_ps(1.0f);
-
-    __m256        z4;
-    __m256        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm256_mul_ps(z2, z2);
-
-    /* Denominator: polyVD1 collects odd powers, polyVD0 even powers of z2 */
-    polyVD1        = _mm256_mul_ps(VD3, z4);
-    polyVD0        = _mm256_mul_ps(VD2, z4);
-    polyVD1        = _mm256_add_ps(polyVD1, VD1);
-    polyVD0        = _mm256_add_ps(polyVD0, VD0);
-    polyVD1        = _mm256_mul_ps(polyVD1, z2);
-    polyVD0        = _mm256_add_ps(polyVD0, polyVD1);
-
-    /* From here on polyVD0 holds 1/D(z2) */
-    polyVD0        = gmx_mm256_inv_ps(polyVD0);
-
-    /* Numerator: polyVN0 collects even powers, polyVN1 odd powers of z2 */
-    polyVN0        = _mm256_mul_ps(VN6, z4);
-    polyVN1        = _mm256_mul_ps(VN5, z4);
-    polyVN0        = _mm256_add_ps(polyVN0, VN4);
-    polyVN1        = _mm256_add_ps(polyVN1, VN3);
-    polyVN0        = _mm256_mul_ps(polyVN0, z4);
-    polyVN1        = _mm256_mul_ps(polyVN1, z4);
-    polyVN0        = _mm256_add_ps(polyVN0, VN2);
-    polyVN1        = _mm256_add_ps(polyVN1, VN1);
-    polyVN0        = _mm256_mul_ps(polyVN0, z4);
-    polyVN1        = _mm256_mul_ps(polyVN1, z2);
-    polyVN0        = _mm256_add_ps(polyVN0, VN0);
-    polyVN0        = _mm256_add_ps(polyVN0, polyVN1);
-
-    return _mm256_mul_ps(polyVN0, polyVD0);
-}
-
-
-/* SSE (4 x float) evaluation of the PME potential-correction rational function.
- *
- * Same coefficients and same evaluation order as gmx_mm256_pmecorrV_ps(),
- * using 128-bit registers. Computes N(z2) * (1/D(z2)) per packed element with
- *   N(z2) = VN0 + VN1*z2 + ... + VN6*z2^6
- *   D(z2) = VD0 + VD1*z2 + VD2*z2^2 + VD3*z2^3
- * Even and odd powers of z2 are accumulated in two separate Horner chains in
- * z4 = z2*z2 and summed at the end.
- *
- * The reciprocal of D comes from gmx_mm_inv_ps(), defined elsewhere
- * (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
-    /* Numerator coefficients; VNk multiplies z2^k */
-    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
-    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
-    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
-    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
-    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
-    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
-    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
-
-    /* Denominator coefficients; VDk multiplies z2^k */
-    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
-    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
-    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
-    const __m128  VD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    /* Denominator: polyVD1 collects odd powers, polyVD0 even powers of z2 */
-    polyVD1        = _mm_mul_ps(VD3, z4);
-    polyVD0        = _mm_mul_ps(VD2, z4);
-    polyVD1        = _mm_add_ps(polyVD1, VD1);
-    polyVD0        = _mm_add_ps(polyVD0, VD0);
-    polyVD1        = _mm_mul_ps(polyVD1, z2);
-    polyVD0        = _mm_add_ps(polyVD0, polyVD1);
-
-    /* From here on polyVD0 holds 1/D(z2) */
-    polyVD0        = gmx_mm_inv_ps(polyVD0);
-
-    /* Numerator: polyVN0 collects even powers, polyVN1 odd powers of z2 */
-    polyVN0        = _mm_mul_ps(VN6, z4);
-    polyVN1        = _mm_mul_ps(VN5, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN4);
-    polyVN1        = _mm_add_ps(polyVN1, VN3);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN2);
-    polyVN1        = _mm_add_ps(polyVN1, VN1);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z2);
-    polyVN0        = _mm_add_ps(polyVN0, VN0);
-    polyVN0        = _mm_add_ps(polyVN0, polyVN1);
-
-    return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-/* Simultaneous sine and cosine of eight packed floats (AVX).
- *
- * x       Input angles; they are scaled by 2/pi to count quadrants, i.e.
- *         they are treated as radians.
- * sinval  Output: receives the sine of each element.
- * cosval  Output: receives the cosine of each element.
- * Always returns 0.
- *
- * Outline:
- *  1. y = x*2/pi is rounded to the nearest integer by adding 0.5 carrying the
- *     sign of y and truncating; the result is kept both as integer (iz) and
- *     as float (z).
- *  2. z*(CA1+CA2+CA3) is subtracted from x in three steps to obtain the
- *     reduced argument y.
- *  3. A cosine-type polynomial (CC*, result in tmp1) and a sine-type
- *     polynomial (CS*, result in tmp2) are evaluated on y.
- *  4. Bit 0 of the quadrant index selects which polynomial feeds each output,
- *     bit 1 selects whether it is negated. The cosine uses quadrant index+1.
- */
-static int
-gmx_mm256_sincos_ps(__m256  x,
-                    __m256 *sinval,
-                    __m256 *cosval)
-{
-    const __m256  two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
-    const __m256  half        = _mm256_set1_ps(0.5f);
-    const __m256  one         = _mm256_set1_ps(1.0f);
-    const __m256  zero        = _mm256_setzero_ps();
-
-    const __m128i ione       = _mm_set1_epi32(1);
-
-    /* Integer bit patterns 1, 2 and 3 stored in float registers for bitwise AND */
-    const __m256  mask_one    = _mm256_castsi256_ps(_mm256_set1_epi32(1));
-    const __m256  mask_two    = _mm256_castsi256_ps(_mm256_set1_epi32(2));
-    const __m256  mask_three  = _mm256_castsi256_ps(_mm256_set1_epi32(3));
-
-    /* CA1 + CA2 + CA3 add up to pi/2, split into three parts */
-    const __m256  CA1         = _mm256_set1_ps(1.5703125f);
-    const __m256  CA2         = _mm256_set1_ps(4.837512969970703125e-4f);
-    const __m256  CA3         = _mm256_set1_ps(7.54978995489188216e-8f);
-
-    /* CC*: cosine polynomial coefficients, CS*: sine polynomial coefficients */
-    const __m256  CC0         = _mm256_set1_ps(-0.0013602249f);
-    const __m256  CC1         = _mm256_set1_ps(0.0416566950f);
-    const __m256  CC2         = _mm256_set1_ps(-0.4999990225f);
-    const __m256  CS0         = _mm256_set1_ps(-0.0001950727f);
-    const __m256  CS1         = _mm256_set1_ps(0.0083320758f);
-    const __m256  CS2         = _mm256_set1_ps(-0.1666665247f);
-
-    const __m256  signbit    = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
-
-    __m256        y, y2;
-    __m256        z;
-    __m256i       iz;
-    __m128i       iz_high, iz_low;
-    __m256        offset_sin, offset_cos;
-    __m256        mask_sin, mask_cos;
-    __m256        tmp1, tmp2;
-    __m256        tmp_sin, tmp_cos;
-
-    y               = _mm256_mul_ps(x, two_over_pi);
-    /* Add 0.5 with the sign of y so that the truncation below rounds to nearest */
-    y               = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));
-
-    iz              = _mm256_cvttps_epi32(y);
-    z               = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);
-
-    /* Low two bits of the quadrant index, kept as raw integer bits */
-    offset_sin      = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);
-
-    /* iz+1, computed separately on the two 128-bit halves and reassembled */
-    iz_high         = _mm256_extractf128_si256(iz, 0x1);
-    iz_low          = _mm256_castsi256_si128(iz);
-    iz_low          = _mm_add_epi32(iz_low, ione);
-    iz_high         = _mm_add_epi32(iz_high, ione);
-    iz              = _mm256_castsi128_si256(iz_low);
-    iz              = _mm256_insertf128_si256(iz, iz_high, 0x1);
-    offset_cos      = _mm256_castsi256_ps(iz);
-
-    /* Extended precision arithmetic to achieve full precision */
-    y               = _mm256_mul_ps(z, CA1);
-    tmp1            = _mm256_mul_ps(z, CA2);
-    tmp2            = _mm256_mul_ps(z, CA3);
-    y               = _mm256_sub_ps(x, y);
-    y               = _mm256_sub_ps(y, tmp1);
-    y               = _mm256_sub_ps(y, tmp2);
-
-    y2              = _mm256_mul_ps(y, y);
-
-    /* tmp1 accumulates the CC (cosine) chain, tmp2 the CS (sine) chain */
-    tmp1            = _mm256_mul_ps(CC0, y2);
-    tmp1            = _mm256_add_ps(tmp1, CC1);
-    tmp2            = _mm256_mul_ps(CS0, y2);
-    tmp2            = _mm256_add_ps(tmp2, CS1);
-    tmp1            = _mm256_mul_ps(tmp1, y2);
-    tmp1            = _mm256_add_ps(tmp1, CC2);
-    tmp2            = _mm256_mul_ps(tmp2, y2);
-    tmp2            = _mm256_add_ps(tmp2, CS2);
-
-    /* tmp1 = 1 + y2*(CC2 + y2*(CC1 + y2*CC0)) */
-    tmp1            = _mm256_mul_ps(tmp1, y2);
-    tmp1            = _mm256_add_ps(tmp1, one);
-
-    /* tmp2 = y + y*y2*(CS2 + y2*(CS1 + y2*CS0)) */
-    tmp2            = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
-    tmp2            = _mm256_add_ps(tmp2, y);
-
-    /* Bit 0 clear selects tmp2 (sine polynomial), bit 0 set selects tmp1.
-     * NOTE(review): in the #else branch the AND result is an integer bit
-     * pattern compared as a float; the nonzero patterns are subnormal floats -
-     * confirm this is safe when denormals-are-zero mode is active.
-     */
-#ifdef __INTEL_COMPILER
-    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
-    mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
-    mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
-#else
-    mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
-    mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
-#endif
-    tmp_sin         = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
-    tmp_cos         = _mm256_blendv_ps(tmp1, tmp2, mask_cos);
-
-    /* Negated candidates: XOR with the sign bit flips the sign */
-    tmp1            = _mm256_xor_ps(signbit, tmp_sin);
-    tmp2            = _mm256_xor_ps(signbit, tmp_cos);
-
-    /* Bit 1 clear keeps the value as is, bit 1 set takes the negated one */
-#ifdef __INTEL_COMPILER
-    /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
-    mask_sin        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
-    mask_cos        = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
-#else
-    mask_sin        = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
-    mask_cos        = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
-
-#endif
-    *sinval         = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
-    *cosval         = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);
-
-    return 0;
-}
-
-/* Simultaneous sine and cosine of four packed floats (SSE4.1).
- *
- * x       Input angles; they are scaled by 2/pi to count quadrants, i.e.
- *         they are treated as radians.
- * sinval  Output: receives the sine of each element.
- * cosval  Output: receives the cosine of each element.
- * Always returns 0.
- *
- * Outline:
- *  1. y = x*2/pi is rounded to the nearest integer by adding 0.5 carrying the
- *     sign of y and truncating; the result is kept both as integer (iz) and
- *     as float (z).
- *  2. z*(CA1+CA2+CA3) is subtracted from x in three steps to obtain the
- *     reduced argument y.
- *  3. A cosine-type polynomial (CC*, result in tmp1) and a sine-type
- *     polynomial (CS*, result in tmp2) are evaluated on y.
- *  4. Bit 0 of the quadrant index selects which polynomial feeds each output,
- *     bit 1 selects whether it is negated. The cosine uses quadrant index+1.
- * The quadrant tests are done with integer compares.
- */
-static int
-gmx_mm_sincos_ps(__m128  x,
-                 __m128 *sinval,
-                 __m128 *cosval)
-{
-    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
-    const __m128  half        = _mm_set1_ps(0.5);
-    const __m128  one         = _mm_set1_ps(1.0);
-
-    const __m128i izero      = _mm_set1_epi32(0);
-    const __m128i ione       = _mm_set1_epi32(1);
-    const __m128i itwo       = _mm_set1_epi32(2);
-    const __m128i ithree     = _mm_set1_epi32(3);
-    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
-    /* CA1 + CA2 + CA3 add up to pi/2, split into three parts */
-    const __m128  CA1         = _mm_set1_ps(1.5703125f);
-    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
-    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
-
-    /* CC*: cosine polynomial coefficients, CS*: sine polynomial coefficients */
-    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
-    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
-    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
-    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
-    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
-    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
-
-    __m128        y, y2;
-    __m128        z;
-    __m128i       iz;
-    __m128i       offset_sin, offset_cos;
-    __m128        tmp1, tmp2;
-    __m128        mask_sin, mask_cos;
-    __m128        tmp_sin, tmp_cos;
-
-    y          = _mm_mul_ps(x, two_over_pi);
-    /* Add 0.5 with the sign of y so that the truncation below rounds to nearest */
-    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
-    iz         = _mm_cvttps_epi32(y);
-    z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
-    /* Quadrant index for the sine (low two bits) and, shifted by one, for the cosine */
-    offset_sin = _mm_and_si128(iz, ithree);
-    offset_cos = _mm_add_epi32(iz, ione);
-
-    /* Extended precision arithmetic to achieve full precision */
-    y               = _mm_mul_ps(z, CA1);
-    tmp1            = _mm_mul_ps(z, CA2);
-    tmp2            = _mm_mul_ps(z, CA3);
-    y               = _mm_sub_ps(x, y);
-    y               = _mm_sub_ps(y, tmp1);
-    y               = _mm_sub_ps(y, tmp2);
-
-    y2              = _mm_mul_ps(y, y);
-
-    /* tmp1 accumulates the CC (cosine) chain, tmp2 the CS (sine) chain */
-    tmp1            = _mm_mul_ps(CC0, y2);
-    tmp1            = _mm_add_ps(tmp1, CC1);
-    tmp2            = _mm_mul_ps(CS0, y2);
-    tmp2            = _mm_add_ps(tmp2, CS1);
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, CC2);
-    tmp2            = _mm_mul_ps(tmp2, y2);
-    tmp2            = _mm_add_ps(tmp2, CS2);
-
-    /* tmp1 = 1 + y2*(CC2 + y2*(CC1 + y2*CC0)) */
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, one);
-
-    /* tmp2 = y + y*y2*(CS2 + y2*(CS1 + y2*CS0)) */
-    tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
-    tmp2            = _mm_add_ps(tmp2, y);
-
-    /* Bit 0 clear selects tmp2 (sine polynomial), bit 0 set selects tmp1 */
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
-    tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
-    tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
-    /* Bit 1 clear keeps the value as is, bit 1 set takes the negated one */
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
-    /* Negated candidates: XOR with the sign bit flips the sign */
-    tmp1            = _mm_xor_ps(signbit, tmp_sin);
-    tmp2            = _mm_xor_ps(signbit, tmp_cos);
-
-    *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
-    *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
-    return 0;
-}
-
-
-
-
-/*
- * Sine of eight packed floats, obtained from gmx_mm256_sincos_ps().
- *
- * IMPORTANT: if you need both the sine and the cosine, call
- * gmx_mm256_sincos_ps() once instead of this routine plus the cosine one;
- * each wrapper runs the full sincos() evaluation, so using both wastes a
- * factor 2 in performance.
- */
-static __m256
-gmx_mm256_sin_ps(__m256 x)
-{
-    __m256 sine;
-    __m256 cosine;   /* filled in by sincos but not used here */
-
-    gmx_mm256_sincos_ps(x, &sine, &cosine);
-
-    return sine;
-}
-
-/* Sine of four packed floats; thin wrapper that calls gmx_mm_sincos_ps()
- * and discards the cosine.
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
-    __m128 sine;
-    __m128 cosine;   /* filled in by sincos but not used here */
-
-    gmx_mm_sincos_ps(x, &sine, &cosine);
-
-    return sine;
-}
-
-
-/*
- * Cosine of eight packed floats, obtained from gmx_mm256_sincos_ps().
- *
- * IMPORTANT: if you need both the sine and the cosine, call
- * gmx_mm256_sincos_ps() once instead of this routine plus the sine one;
- * each wrapper runs the full sincos() evaluation, so using both wastes a
- * factor 2 in performance.
- */
-static __m256
-gmx_mm256_cos_ps(__m256 x)
-{
-    __m256 sine;     /* filled in by sincos but not used here */
-    __m256 cosine;
-
-    gmx_mm256_sincos_ps(x, &sine, &cosine);
-
-    return cosine;
-}
-
-/* Cosine of four packed floats; thin wrapper that calls gmx_mm_sincos_ps()
- * and discards the sine.
- */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
-    __m128 sine;     /* filled in by sincos but not used here */
-    __m128 cosine;
-
-    gmx_mm_sincos_ps(x, &sine, &cosine);
-
-    return cosine;
-}
-
-
-/* Tangent of eight packed floats, formed as sin(x) multiplied by the result
- * of gmx_mm256_inv_ps(cos(x)); both come from one gmx_mm256_sincos_ps() call.
- */
-static __m256
-gmx_mm256_tan_ps(__m256 x)
-{
-    __m256 s, c;
-    __m256 inv_c;
-
-    gmx_mm256_sincos_ps(x, &s, &c);
-    inv_c = gmx_mm256_inv_ps(c);
-
-    return _mm256_mul_ps(s, inv_c);
-}
-
-/* Tangent of four packed floats, formed as sin(x) multiplied by the result
- * of gmx_mm_inv_ps(cos(x)); both come from one gmx_mm_sincos_ps() call.
- */
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
-    __m128 s, c;
-    __m128 inv_c;
-
-    gmx_mm_sincos_ps(x, &s, &c);
-    inv_c = gmx_mm_inv_ps(c);
-
-    return _mm_mul_ps(s, inv_c);
-}
-
-
-/* Arcsine of eight packed floats (AVX), evaluated without branches.
- *
- * The sign of x is stripped, the result is computed for |x| and the sign is
- * put back at the end. Three ranges of |x| are handled by blending:
- *   |x| <= 1e-4        : the result is |x| itself
- *   1e-4 < |x| <= 0.5  : polynomial in z = |x|^2, with q = |x|
- *   |x| >  0.5         : polynomial in z = 0.5*(1-|x|), with q = z*invsqrt(z),
- *                        and the result is pi/2 - 2*(polynomial result)
- * In both polynomial cases the value is
- *   q + q*(CC1*z + CC2*z^2 + CC3*z^3 + CC4*z^4 + CC5*z^5).
- *
- * gmx_mm256_invsqrt_ps() is defined elsewhere (presumably 1/sqrt(x)).
- */
-static __m256
-gmx_mm256_asin_ps(__m256 x)
-{
-    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
-    const __m256 limitlow  = _mm256_set1_ps(1e-4f);
-    const __m256 half      = _mm256_set1_ps(0.5f);
-    const __m256 one       = _mm256_set1_ps(1.0f);
-    const __m256 halfpi    = _mm256_set1_ps((float)M_PI/2.0f);
-
-    const __m256 CC5        = _mm256_set1_ps(4.2163199048E-2f);
-    const __m256 CC4        = _mm256_set1_ps(2.4181311049E-2f);
-    const __m256 CC3        = _mm256_set1_ps(4.5470025998E-2f);
-    const __m256 CC2        = _mm256_set1_ps(7.4953002686E-2f);
-    const __m256 CC1        = _mm256_set1_ps(1.6666752422E-1f);
-
-    __m256       sign;
-    __m256       mask;
-    __m256       xabs;
-    __m256       z, z1, z2, q, q1, q2;
-    __m256       pA, pB;
-
-    /* Split x into its sign bit and its magnitude */
-    sign  = _mm256_andnot_ps(signmask, x);
-    xabs  = _mm256_and_ps(x, signmask);
-
-    mask  = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);
-
-    /* Candidate for |x| > 0.5: z1 = (1-|x|)/2 and q1 = z1*invsqrt(z1).
-     * q1 is forced to zero where |x| == 1, i.e. where z1 is exactly 0.
-     */
-    z1    = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
-    q1    = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
-    q1    = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
-
-    /* Candidate for |x| <= 0.5 */
-    q2    = xabs;
-    z2    = _mm256_mul_ps(q2, q2);
-
-    z     = _mm256_blendv_ps(z2, z1, mask);
-    q     = _mm256_blendv_ps(q2, q1, mask);
-
-    /* z2 is reused from here on as z squared */
-    z2    = _mm256_mul_ps(z, z);
-
-    /* pA gathers the odd powers of z (CC5, CC3, CC1), pB the even ones (CC4, CC2) */
-    pA    = _mm256_mul_ps(CC5, z2);
-    pB    = _mm256_mul_ps(CC4, z2);
-
-    pA    = _mm256_add_ps(pA, CC3);
-    pB    = _mm256_add_ps(pB, CC2);
-
-    pA    = _mm256_mul_ps(pA, z2);
-    pB    = _mm256_mul_ps(pB, z2);
-
-    pA    = _mm256_add_ps(pA, CC1);
-    pA    = _mm256_mul_ps(pA, z);
-
-    z     = _mm256_add_ps(pA, pB);
-    z     = _mm256_mul_ps(z, q);
-    z     = _mm256_add_ps(z, q);
-
-    /* For |x| > 0.5 the result is pi/2 - 2*z */
-    q2    = _mm256_sub_ps(halfpi, z);
-    q2    = _mm256_sub_ps(q2, z);
-
-    z     = _mm256_blendv_ps(z, q2, mask);
-
-    /* For |x| <= 1e-4 return |x| unchanged */
-    mask  = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
-    z     = _mm256_blendv_ps(xabs, z, mask);
-
-    /* Restore the sign of the input */
-    z     = _mm256_xor_ps(z, sign);
-
-    return z;
-}
-
-/* Arcsine of four packed floats (SSE), evaluated without branches.
- *
- * The sign of x is stripped, the result is computed for |x| and the sign is
- * put back at the end. Three ranges of |x| are handled by mask selection:
- *   |x| <= 1e-4        : the result is |x| itself
- *   1e-4 < |x| <= 0.5  : polynomial in z = |x|^2, with q = |x|
- *   |x| >  0.5         : polynomial in z = 0.5*(1-|x|), with q = z*invsqrt(z),
- *                        and the result is pi/2 - 2*(polynomial result)
- * In both polynomial cases the value is
- *   q + q*(CC1*z + CC2*z^2 + CC3*z^3 + CC4*z^4 + CC5*z^5).
- *
- * Selections are written as (mask AND a) OR (NOT mask AND b).
- * gmx_mm_invsqrt_ps() is defined elsewhere (presumably 1/sqrt(x)).
- */
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limitlow  = _mm_set1_ps(1e-4f);
-    const __m128 half      = _mm_set1_ps(0.5f);
-    const __m128 one       = _mm_set1_ps(1.0f);
-    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
-
-    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
-    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
-    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
-    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
-    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
-
-    __m128       sign;
-    __m128       mask;
-    __m128       xabs;
-    __m128       z, z1, z2, q, q1, q2;
-    __m128       pA, pB;
-
-    /* Split x into its sign bit and its magnitude */
-    sign  = _mm_andnot_ps(signmask, x);
-    xabs  = _mm_and_ps(x, signmask);
-
-    mask  = _mm_cmpgt_ps(xabs, half);
-
-    /* Candidate for |x| > 0.5: z1 = (1-|x|)/2 and q1 = z1*invsqrt(z1).
-     * q1 is forced to zero where |x| == 1, i.e. where z1 is exactly 0.
-     */
-    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
-    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
-    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
-    /* Candidate for |x| <= 0.5 */
-    q2    = xabs;
-    z2    = _mm_mul_ps(q2, q2);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
-    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
-    /* z2 is reused from here on as z squared */
-    z2    = _mm_mul_ps(z, z);
-
-    /* pA gathers the odd powers of z (CC5, CC3, CC1), pB the even ones (CC4, CC2) */
-    pA    = _mm_mul_ps(CC5, z2);
-    pB    = _mm_mul_ps(CC4, z2);
-
-    pA    = _mm_add_ps(pA, CC3);
-    pB    = _mm_add_ps(pB, CC2);
-
-    pA    = _mm_mul_ps(pA, z2);
-    pB    = _mm_mul_ps(pB, z2);
-
-    pA    = _mm_add_ps(pA, CC1);
-    pA    = _mm_mul_ps(pA, z);
-
-    z     = _mm_add_ps(pA, pB);
-    z     = _mm_mul_ps(z, q);
-    z     = _mm_add_ps(z, q);
-
-    /* For |x| > 0.5 the result is pi/2 - 2*z */
-    q2    = _mm_sub_ps(halfpi, z);
-    q2    = _mm_sub_ps(q2, z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
-    /* For |x| <= 1e-4 return |x| unchanged */
-    mask  = _mm_cmpgt_ps(xabs, limitlow);
-    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
-    /* Restore the sign of the input */
-    z = _mm_xor_ps(z, sign);
-
-    return z;
-}
-
-
-/* Arccosine of eight packed floats (AVX), built on gmx_mm256_asin_ps().
- *
- * With a = gmx_mm256_asin_ps(arg) the result is selected per element as
- *   |x| <= 0.5        : pi/2 - a,  arg = x
- *   |x| >  0.5, x > 0 : 2*a,       arg = z
- *   |x| >  0.5, x < 0 : pi - 2*a,  arg = z
- * where z = 0.5*(1-|x|) multiplied by gmx_mm256_invsqrt_ps() of itself.
- * gmx_mm256_invsqrt_ps() is defined elsewhere (presumably 1/sqrt(x)).
- */
-static __m256
-gmx_mm256_acos_ps(__m256 x)
-{
-    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
-    const __m256 one_ps    = _mm256_set1_ps(1.0f);
-    const __m256 half_ps   = _mm256_set1_ps(0.5f);
-    const __m256 pi_ps     = _mm256_set1_ps((float)M_PI);
-    const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
-
-    __m256       mask1;
-    __m256       mask2;
-    __m256       xabs;
-    __m256       z, z1, z2, z3;
-
-    xabs  = _mm256_and_ps(x, signmask);
-    mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);           /* |x| > 0.5 */
-    mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);  /* x > 0     */
-
-    /* z = (1-|x|)/2 times its inverse square root; forced to zero where
-     * |x| == 1, i.e. where the first factor is exactly 0.
-     */
-    z     = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
-    z     = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
-    z     = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
-
-    /* Feed asin with z where |x| > 0.5, with x itself otherwise */
-    z     = _mm256_blendv_ps(x, z, mask1);
-    z     = gmx_mm256_asin_ps(z);
-
-    z2    = _mm256_add_ps(z, z);           /* 2*a      */
-    z1    = _mm256_sub_ps(pi_ps, z2);      /* pi - 2*a */
-    z3    = _mm256_sub_ps(halfpi_ps, z);   /* pi/2 - a */
-
-    z     = _mm256_blendv_ps(z1, z2, mask2);
-    z     = _mm256_blendv_ps(z3, z, mask1);
-
-    return z;
-}
-
-/* Arccosine of four packed floats (SSE4.1), built on gmx_mm_asin_ps().
- *
- * With a = gmx_mm_asin_ps(arg) the result is selected per element as
- *   |x| <= 0.5        : pi/2 - a,  arg = x
- *   |x| >  0.5, x > 0 : 2*a,       arg = z
- *   |x| >  0.5, x < 0 : pi - 2*a,  arg = z
- * where z = 0.5*(1-|x|) multiplied by gmx_mm_invsqrt_ps() of itself.
- * gmx_mm_invsqrt_ps() is defined elsewhere (presumably 1/sqrt(x)).
- */
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 one_ps    = _mm_set1_ps(1.0f);
-    const __m128 half_ps   = _mm_set1_ps(0.5f);
-    const __m128 pi_ps     = _mm_set1_ps(M_PI);
-    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
-    __m128       mask1;
-    __m128       mask2;
-    __m128       xabs;
-    __m128       z, z1, z2, z3;
-
-    xabs  = _mm_and_ps(x, signmask);
-    mask1 = _mm_cmpgt_ps(xabs, half_ps);           /* |x| > 0.5 */
-    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());     /* x > 0     */
-
-    /* z = (1-|x|)/2 times its inverse square root; forced to zero where
-     * |x| == 1, i.e. where the first factor is exactly 0.
-     */
-    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
-    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
-    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
-    /* Feed asin with z where |x| > 0.5, with x itself otherwise */
-    z     = _mm_blendv_ps(x, z, mask1);
-    z     = gmx_mm_asin_ps(z);
-
-    z2    = _mm_add_ps(z, z);           /* 2*a      */
-    z1    = _mm_sub_ps(pi_ps, z2);      /* pi - 2*a */
-    z3    = _mm_sub_ps(halfpi_ps, z);   /* pi/2 - a */
-
-    z     = _mm_blendv_ps(z1, z2, mask2);
-    z     = _mm_blendv_ps(z3, z, mask1);
-
-    return z;
-}
-
-
-/* Arctangent of eight packed floats (AVX), evaluated without branches.
- *
- * The sign of x is stripped, the result is computed for |x| and the sign is
- * put back at the end. The argument is reduced in three ranges:
- *   |x| <= limit1           : offset 0,    t = |x|
- *   limit1 < |x| <= limit2  : offset pi/4, t = (|x|-1)/(|x|+1)
- *   |x| >  limit2           : offset pi/2, t = -1/|x|
- * and the result is
- *   offset + t*(1 + CC3*t^2 + CC5*t^4 + CC7*t^6 + CC9*t^8).
- *
- * Divisions are done by multiplying with gmx_mm256_inv_ps(), which is defined
- * elsewhere (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m256
-gmx_mm256_atan_ps(__m256 x)
-{
-    const __m256 signmask  = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
-    const __m256 limit1    = _mm256_set1_ps(0.414213562373095f);
-    const __m256 limit2    = _mm256_set1_ps(2.414213562373095f);
-    const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
-    const __m256 halfpi    = _mm256_set1_ps(1.570796326794896f);
-    const __m256 mone      = _mm256_set1_ps(-1.0f);
-    const __m256 CC3       = _mm256_set1_ps(-3.33329491539E-1f);
-    const __m256 CC5       = _mm256_set1_ps(1.99777106478E-1f);
-    /* Float literal, like every other coefficient (was a double constant) */
-    const __m256 CC7       = _mm256_set1_ps(-1.38776856032E-1f);
-    const __m256 CC9       = _mm256_set1_ps(8.05374449538e-2f);
-
-    __m256       sign;
-    __m256       mask1, mask2;
-    __m256       y, z1, z2;
-    __m256       x2, x4;
-    __m256       sum1, sum2;
-
-    /* Split x into its sign bit and its magnitude; x holds |x| from here on */
-    sign  = _mm256_andnot_ps(signmask, x);
-    x     = _mm256_and_ps(x, signmask);
-
-    mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
-    mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);
-
-    /* z1 = (x-1)/(x+1) and z2 = -1/x; adding or subtracting mone (-1) gives x-1 and x+1 */
-    z1    = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
-    z2    = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));
-
-    /* Offset: 0, pi/4 or pi/2 depending on the range of |x| */
-    y     = _mm256_and_ps(mask1, quarterpi);
-    y     = _mm256_blendv_ps(y, halfpi, mask2);
-
-    /* Reduced argument; mask2 implies mask1, so the second blend wins for large |x| */
-    x     = _mm256_blendv_ps(x, z1, mask1);
-    x     = _mm256_blendv_ps(x, z2, mask2);
-
-    x2    = _mm256_mul_ps(x, x);
-    x4    = _mm256_mul_ps(x2, x2);
-
-    /* sum1 gathers the x^4 and x^8 terms, sum2 the x^2 and x^6 terms */
-    sum1  = _mm256_mul_ps(CC9, x4);
-    sum2  = _mm256_mul_ps(CC7, x4);
-    sum1  = _mm256_add_ps(sum1, CC5);
-    sum2  = _mm256_add_ps(sum2, CC3);
-    sum1  = _mm256_mul_ps(sum1, x4);
-    sum2  = _mm256_mul_ps(sum2, x2);
-
-    sum1  = _mm256_add_ps(sum1, sum2);
-    /* Subtracting -1 adds the leading 1 of the polynomial */
-    sum1  = _mm256_sub_ps(sum1, mone);
-    sum1  = _mm256_mul_ps(sum1, x);
-    y     = _mm256_add_ps(y, sum1);
-
-    /* Restore the sign of the input */
-    y     = _mm256_xor_ps(y, sign);
-
-    return y;
-}
-
-/* Arctangent of four packed floats (SSE4.1), evaluated without branches.
- *
- * The sign of x is stripped, the result is computed for |x| and the sign is
- * put back at the end. The argument is reduced in three ranges:
- *   |x| <= limit1           : offset 0,    t = |x|
- *   limit1 < |x| <= limit2  : offset pi/4, t = (|x|-1)/(|x|+1)
- *   |x| >  limit2           : offset pi/2, t = -1/|x|
- * and the result is
- *   offset + t*(1 + CC3*t^2 + CC5*t^4 + CC7*t^6 + CC9*t^8).
- *
- * Divisions are done by multiplying with gmx_mm_inv_ps(), which is defined
- * elsewhere (presumably a 1/x helper - confirm its accuracy there).
- */
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
-    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
-    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
-    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
-    const __m128 mone      = _mm_set1_ps(-1.0f);
-    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
-    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
-    /* Float literal, like every other coefficient (was a double constant) */
-    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1f);
-    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
-
-    __m128       sign;
-    __m128       mask1, mask2;
-    __m128       y, z1, z2;
-    __m128       x2, x4;
-    __m128       sum1, sum2;
-
-    /* Split x into its sign bit and its magnitude; x holds |x| from here on */
-    sign  = _mm_andnot_ps(signmask, x);
-    x     = _mm_and_ps(x, signmask);
-
-    mask1 = _mm_cmpgt_ps(x, limit1);
-    mask2 = _mm_cmpgt_ps(x, limit2);
-
-    /* z1 = (x-1)/(x+1) and z2 = -1/x; adding or subtracting mone (-1) gives x-1 and x+1 */
-    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
-    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
-    /* Offset: 0, pi/4 or pi/2 depending on the range of |x| */
-    y     = _mm_and_ps(mask1, quarterpi);
-    y     = _mm_blendv_ps(y, halfpi, mask2);
-
-    /* Reduced argument; mask2 implies mask1, so the second blend wins for large |x| */
-    x     = _mm_blendv_ps(x, z1, mask1);
-    x     = _mm_blendv_ps(x, z2, mask2);
-
-    x2    = _mm_mul_ps(x, x);
-    x4    = _mm_mul_ps(x2, x2);
-
-    /* sum1 gathers the x^4 and x^8 terms, sum2 the x^2 and x^6 terms */
-    sum1  = _mm_mul_ps(CC9, x4);
-    sum2  = _mm_mul_ps(CC7, x4);
-    sum1  = _mm_add_ps(sum1, CC5);
-    sum2  = _mm_add_ps(sum2, CC3);
-    sum1  = _mm_mul_ps(sum1, x4);
-    sum2  = _mm_mul_ps(sum2, x2);
-
-    sum1  = _mm_add_ps(sum1, sum2);
-    /* Subtracting -1 adds the leading 1 of the polynomial */
-    sum1  = _mm_sub_ps(sum1, mone);
-    sum1  = _mm_mul_ps(sum1, x);
-    y     = _mm_add_ps(y, sum1);
-
-    /* Restore the sign of the input */
-    y     = _mm_xor_ps(y, sign);
-
-    return y;
-}
-
-
-static __m256
-gmx_mm256_atan2_ps(__m256 y, __m256 x)
-{
-    const __m256 pi          = _mm256_set1_ps( (float) M_PI);
-    const __m256 minuspi     = _mm256_set1_ps( (float) -M_PI);
-    const __m256 halfpi      = _mm256_set1_ps( (float) M_PI/2.0f);
-    const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
-
-    __m256       z, z1, z3, z4;
-    __m256       w;
-    __m256       maskx_lt, maskx_eq;
-    __m256       masky_lt, masky_eq;
-    __m256       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
-    masky_lt  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
-    maskx_eq  = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
-    masky_eq  = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
-    z         = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
-    z         = gmx_mm256_atan_ps(z);
-
-    mask1     = _mm256_and_ps(maskx_eq, masky_lt);
-    mask2     = _mm256_andnot_ps(maskx_lt, masky_eq);
-    mask3     = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm256_and_ps(maskx_lt, masky_eq);
-    maskall   = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );
-
-    z         = _mm256_andnot_ps(maskall, z);
-    z1        = _mm256_and_ps(mask1, minushalfpi);
-    z3        = _mm256_and_ps(mask3, halfpi);
-    z4        = _mm256_and_ps(mask4, pi);
-
-    z         = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );
-
-    w         = _mm256_blendv_ps(pi, minuspi, masky_lt);
-    w         = _mm256_and_ps(w, maskx_lt);
-
-    w         = _mm256_andnot_ps(maskall, w);
-
-    z         = _mm256_add_ps(z, w);
-
-    return z;
-}
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
-    const __m128 pi          = _mm_set1_ps(M_PI);
-    const __m128 minuspi     = _mm_set1_ps(-M_PI);
-    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
-    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
-    __m128       z, z1, z3, z4;
-    __m128       w;
-    __m128       maskx_lt, maskx_eq;
-    __m128       masky_lt, masky_eq;
-    __m128       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
-    masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
-    maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
-    z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
-    z         = gmx_mm_atan_ps(z);
-
-    mask1     = _mm_and_ps(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_ps(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
-    z         = _mm_andnot_ps(maskall, z);
-    z1        = _mm_and_ps(mask1, minushalfpi);
-    z3        = _mm_and_ps(mask3, halfpi);
-    z4        = _mm_and_ps(mask4, pi);
-
-    z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
-    mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
-    mask2     = _mm_and_ps(maskx_lt, masky_lt);
-
-    w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
-    w         = _mm_andnot_ps(maskall, w);
-
-    z         = _mm_add_ps(z, w);
  
-    return z;
-}
+#define gmx_mm256_invsqrt_ps   gmx_simd_invsqrt_f
+#define gmx_mm256_inv_ps       gmx_simd_inv_f
+#define gmx_mm256_log_ps       gmx_simd_log_f
+#define gmx_mm256_pmecorrF_ps  gmx_simd_pmecorrF_f
+#define gmx_mm256_pmecorrV_ps  gmx_simd_pmecorrV_f
+#define gmx_mm256_sincos_ps    gmx_simd_sincos_f
  
  #endif
diff --git a/src/gromacs/simd/math_x86_sse2_double.h b/src/gromacs/simd/math_x86_sse2_double.h

index 3f7a64e7fe87e85fd6b066d6af612fb62f6ba56b..9332dc3d3666a752f3b9a7363082b3e4c77c4532 100644 (file)
--- a/src/gromacs/simd/math_x86_sse2_double.h
+++ b/src/gromacs/simd/math_x86_sse2_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1462 +35,19 @@
  #ifndef GMX_SIMD_MATH_SSE2_DOUBLE_H
  #define GMX_SIMD_MATH_SSE2_DOUBLE_H
  
+#include "simd_math.h"
  
-#include <stdio.h>
-#include <math.h>
-
-#include "general_x86_sse2.h"
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
-    const __m128d half  = _mm_set1_pd(0.5);
-    const __m128d three = _mm_set1_pd(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
-    lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-    return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
-    const __m128d half   = _mm_set1_pd(0.5);
-    const __m128d three  = _mm_set1_pd(3.0);
-    const __m128  halff  = _mm_set1_ps(0.5f);
-    const __m128  threef = _mm_set1_ps(3.0f);
-
-    __m128        xf, luf;
-    __m128d       lu1, lu2;
-
-    /* Do first N-R step in float for 2x throughput */
-    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
-    luf = _mm_rsqrt_ps(xf);
-    luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
-    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
-    lu1 = _mm_cvtps_pd(luf);
-
-    *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
-    *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
-    __m128d mask;
-    __m128d res;
-
-    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
-    res  = _mm_mul_pd(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
-    const __m128d two  = _mm_set1_pd(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
-    /* Perform two N-R steps for double precision */
-    lu         = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-    return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomial on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
-    const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
-    const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
-    /* Q2 == 1.0 */
-    const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
-    const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       z, z2;
-    __m128d       PolyP, PolyQ;
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_cvtepi32_pd(iexppart);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(x, intpart);
-    z2        = _mm_mul_pd(z, z);
-
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(z2, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P0);
-    PolyQ     = _mm_add_pd(PolyQ, Q0);
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended-precision arithmetic to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
-    const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -1022. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
-    const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
-
-    const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
-    const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
-    /* P0 == 1.0 */
-    const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
-    const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
-    const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
-    /* Q0 == 2.0 */
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       x, z, z2;
-    __m128d       PolyP, PolyQ;
-
-    x             = _mm_mul_pd(exparg, argscale);
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_cvtepi32_pd(iexppart);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
-    z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
-    z2        = _mm_mul_pd(z, z);
-
-    PolyQ     = _mm_mul_pd(Q3, z2);
-    PolyQ     = _mm_add_pd(PolyQ, Q2);
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(PolyQ, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, one);
-    PolyQ     = _mm_add_pd(PolyQ, two);
-
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
-    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d two        = _mm_set1_pd(2.0);
-    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
-
-    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
-    const __m128d corr2      = _mm_set1_pd(0.693359375);
-
-    const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
-    const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
-    const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
-    const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
-    const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
-    const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
-
-    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
-    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
-    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
-    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
-    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
-
-    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
-    const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
-    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
-
-    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
-    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
-    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
-
-    __m128d       fexp;
-    __m128i       iexp;
-
-    __m128d       mask1, mask2;
-    __m128d       corr, t1, t2, q;
-    __m128d       zA, yA, xA, zB, yB, xB, z;
-    __m128d       polyR, polyS;
-    __m128d       polyP1, polyP2, polyQ1, polyQ2;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp   = _mm_and_pd(x, expmask);
-    iexp   = gmx_mm_castpd_si128(fexp);
-    iexp   = _mm_srli_epi64(iexp, 52);
-    iexp   = _mm_sub_epi32(iexp, expbase_m1);
-    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
-    fexp   = _mm_cvtepi32_pd(iexp);
-
-    x      = _mm_andnot_pd(expmask, x);
-    x      = _mm_or_pd(x, one);
-    x      = _mm_mul_pd(x, half);
-
-    mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
-    mask2     = _mm_cmplt_pd(x, invsq2);
-
-    fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
-    /* If mask1 is set ('A') */
-    zA     = _mm_sub_pd(x, half);
-    t1     = _mm_or_pd( _mm_andnot_pd(mask2, zA), _mm_and_pd(mask2, x) );
-    zA     = _mm_sub_pd(t1, half);
-    t2     = _mm_or_pd( _mm_andnot_pd(mask2, x), _mm_and_pd(mask2, zA) );
-    yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
-    xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
-    zA     = _mm_mul_pd(xA, xA);
-
-    /* EVALUATE POLY */
-    polyR  = _mm_mul_pd(R2, zA);
-    polyR  = _mm_add_pd(polyR, R1);
-    polyR  = _mm_mul_pd(polyR, zA);
-    polyR  = _mm_add_pd(polyR, R0);
-
-    polyS  = _mm_add_pd(zA, S2);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S1);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S0);
-
-    q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
-    zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
-    zA     = _mm_add_pd(zA, xA);
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
-    /* If mask1 is not set ('B') */
-    corr   = _mm_and_pd(mask2, x);
-    xB     = _mm_add_pd(x, corr);
-    xB     = _mm_sub_pd(xB, one);
-    zB     = _mm_mul_pd(xB, xB);
-
-    polyP1 = _mm_mul_pd(P5, zB);
-    polyP2 = _mm_mul_pd(P4, zB);
-    polyP1 = _mm_add_pd(polyP1, P3);
-    polyP2 = _mm_add_pd(polyP2, P2);
-    polyP1 = _mm_mul_pd(polyP1, zB);
-    polyP2 = _mm_mul_pd(polyP2, zB);
-    polyP1 = _mm_add_pd(polyP1, P1);
-    polyP2 = _mm_add_pd(polyP2, P0);
-    polyP1 = _mm_mul_pd(polyP1, xB);
-    polyP1 = _mm_add_pd(polyP1, polyP2);
-
-    polyQ2 = _mm_mul_pd(Q4, zB);
-    polyQ1 = _mm_add_pd(zB, Q3);
-    polyQ2 = _mm_add_pd(polyQ2, Q2);
-    polyQ1 = _mm_mul_pd(polyQ1, zB);
-    polyQ2 = _mm_mul_pd(polyQ2, zB);
-    polyQ1 = _mm_add_pd(polyQ1, Q1);
-    polyQ2 = _mm_add_pd(polyQ2, Q0);
-    polyQ1 = _mm_mul_pd(polyQ1, xB);
-    polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
-    fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
-    q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
-    yB     = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
-    yB     = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
-    yB     = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
-    zB     = _mm_add_pd(xB, yB);
-    zB     = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
-    z      = _mm_or_pd( _mm_andnot_pd(mask1, zB), _mm_and_pd(mask1, zA) );
-
-    return z;
-}
-
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_or_pd(_mm_andnot_pd(mask, _mm_sub_pd(one, res_erfc)), _mm_and_pd(mask, res_erf));
-
-    return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(one, res_erf)));
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
-    const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
-    const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
-    const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
-    const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
-    const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
-    const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
-    const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
-    const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
-    const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
-    const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
-    const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
-
-    const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
-    const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
-    const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
-    const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
-    const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
-    const __m128d  FD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-
-    polyFD1        = _mm_mul_pd(FD5, z4);
-    polyFD0        = _mm_mul_pd(FD4, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD3);
-    polyFD0        = _mm_add_pd(polyFD0, FD2);
-    polyFD1        = _mm_mul_pd(polyFD1, z4);
-    polyFD0        = _mm_mul_pd(polyFD0, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD1);
-    polyFD0        = _mm_add_pd(polyFD0, FD0);
-    polyFD1        = _mm_mul_pd(polyFD1, z2);
-    polyFD0        = _mm_add_pd(polyFD0, polyFD1);
-
-    polyFD0        = gmx_mm_inv_pd(polyFD0);
-
-    polyFN0        = _mm_mul_pd(FN10, z4);
-    polyFN1        = _mm_mul_pd(FN9, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN8);
-    polyFN1        = _mm_add_pd(polyFN1, FN7);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN6);
-    polyFN1        = _mm_add_pd(polyFN1, FN5);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN4);
-    polyFN1        = _mm_add_pd(polyFN1, FN3);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN2);
-    polyFN1        = _mm_add_pd(polyFN1, FN1);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z2);
-    polyFN0        = _mm_add_pd(polyFN0, FN0);
-    polyFN0        = _mm_add_pd(polyFN0, polyFN1);
-
-    return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm_pmecorrF_pd() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- *
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
-    const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
-    const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
-    const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
-    const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
-    const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
-    const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
-    const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
-    const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
-    const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
-    const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
-
-    const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
-    const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
-    const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
-    const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
-    const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
-    const __m128d  VD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-
-    polyVD1        = _mm_mul_pd(VD5, z4);
-    polyVD0        = _mm_mul_pd(VD4, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD3);
-    polyVD0        = _mm_add_pd(polyVD0, VD2);
-    polyVD1        = _mm_mul_pd(polyVD1, z4);
-    polyVD0        = _mm_mul_pd(polyVD0, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD1);
-    polyVD0        = _mm_add_pd(polyVD0, VD0);
-    polyVD1        = _mm_mul_pd(polyVD1, z2);
-    polyVD0        = _mm_add_pd(polyVD0, polyVD1);
-
-    polyVD0        = gmx_mm_inv_pd(polyVD0);
-
-    polyVN1        = _mm_mul_pd(VN9, z4);
-    polyVN0        = _mm_mul_pd(VN8, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN7);
-    polyVN0        = _mm_add_pd(polyVN0, VN6);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN5);
-    polyVN0        = _mm_add_pd(polyVN0, VN4);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN3);
-    polyVN0        = _mm_add_pd(polyVN0, VN2);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN1);
-    polyVN0        = _mm_add_pd(polyVN0, VN0);
-    polyVN1        = _mm_mul_pd(polyVN1, z2);
-    polyVN0        = _mm_add_pd(polyVN0, polyVN1);
-
-    return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-
-static int
-gmx_mm_sincos_pd(__m128d  x,
-                 __m128d *sinval,
-                 __m128d *cosval)
-{
-#ifdef _MSC_VER
-    __declspec(align(16))
-    const double sintable[34] =
-    {
-        1.00000000000000000e+00, 0.00000000000000000e+00,
-        9.95184726672196929e-01, 9.80171403295606036e-02,
-        9.80785280403230431e-01, 1.95090322016128248e-01,
-        9.56940335732208824e-01, 2.90284677254462331e-01,
-        9.23879532511286738e-01, 3.82683432365089782e-01,
-        8.81921264348355050e-01, 4.71396736825997642e-01,
-        8.31469612302545236e-01, 5.55570233019602178e-01,
-        7.73010453362736993e-01, 6.34393284163645488e-01,
-        7.07106781186547573e-01, 7.07106781186547462e-01,
-        6.34393284163645599e-01, 7.73010453362736882e-01,
-        5.55570233019602289e-01, 8.31469612302545125e-01,
-        4.71396736825997809e-01, 8.81921264348354939e-01,
-        3.82683432365089837e-01, 9.23879532511286738e-01,
-        2.90284677254462276e-01, 9.56940335732208935e-01,
-        1.95090322016128304e-01, 9.80785280403230431e-01,
-        9.80171403295607702e-02, 9.95184726672196818e-01,
-        0.0, 1.00000000000000000e+00
-    };
-#else
-    const __m128d sintable[17] =
-    {
-        _mm_set_pd( 0.0, 1.0 ),
-        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd(  1.0, 0.0 )
-    };
-#endif
-
-    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
-    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
-    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
-    const __m128i ione          = _mm_set1_epi32(1);
-    const __m128i i32           = _mm_set1_epi32(32);
-    const __m128i i16           = _mm_set1_epi32(16);
-    const __m128i tabmask       = _mm_set1_epi32(0x3F);
-    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
-    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
-    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
-    const __m128d sinP1         = _mm_set1_pd(1.0);
-
-    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
-    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
-    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
-    const __m128d cosP0         = _mm_set1_pd(1.0);
-
-    __m128d       scalex;
-    __m128i       tabidx, corridx;
-    __m128d       xabs, z, z2, polySin, polyCos;
-    __m128d       xpoint;
-    __m128d       ypoint0, ypoint1;
-
-    __m128d       sinpoint, cospoint;
-    __m128d       xsign, ssign, csign;
-    __m128i       imask, sswapsign, cswapsign;
-
-    xsign    = _mm_andnot_pd(signmask, x);
-    xabs     = _mm_and_pd(x, signmask);
-
-    scalex   = _mm_mul_pd(tabscale, xabs);
-    tabidx   = _mm_cvtpd_epi32(scalex);
-
-    xpoint   = _mm_cvtepi32_pd(tabidx);
-
-    /* Extended precision arithmetics */
-    z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
-    z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
-    /* Range reduction to 0..2*Pi */
-    tabidx   = _mm_and_si128(tabidx, tabmask);
-
-    /* tabidx is now in range [0..63] */
-    imask     = _mm_cmpgt_epi32(tabidx, i32);
-    sswapsign = imask;
-    cswapsign = imask;
-    corridx   = _mm_and_si128(imask, i32);
-    tabidx    = _mm_sub_epi32(tabidx, corridx);
-
-    /* tabidx is now in range [0..32] */
-    imask     = _mm_cmpgt_epi32(tabidx, i16);
-    cswapsign = _mm_xor_si128(cswapsign, imask);
-    corridx   = _mm_sub_epi32(i32, tabidx);
-    tabidx    = _mm_or_si128( _mm_and_si128(imask, corridx), _mm_andnot_si128(imask, tabidx) );
-    /* tabidx is now in range [0..16] */
-    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
-    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
-    ypoint0  = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 0));
-    ypoint1  = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 1));
-#else
-    ypoint0  = sintable[gmx_mm_extract_epi32(tabidx, 0)];
-    ypoint1  = sintable[gmx_mm_extract_epi32(tabidx, 1)];
-#endif
-    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
-    cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
-    sinpoint = _mm_mul_pd(sinpoint, ssign);
-    cospoint = _mm_mul_pd(cospoint, csign);
-
-    z2       = _mm_mul_pd(z, z);
-
-    polySin  = _mm_mul_pd(sinP7, z2);
-    polySin  = _mm_add_pd(polySin, sinP5);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP3);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP1);
-    polySin  = _mm_mul_pd(polySin, z);
-
-    polyCos  = _mm_mul_pd(cosP6, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP4);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP2);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP0);
-
-    *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
-    *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
-    return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
-    __m128d s, c;
-    gmx_mm_sincos_pd(x, &s, &c);
-    return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
-    __m128d s, c;
-    gmx_mm_sincos_pd(x, &s, &c);
-    return c;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
-    __m128d sinval, cosval;
-    __m128d tanval;
-
-    gmx_mm_sincos_pd(x, &sinval, &cosval);
-
-    tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
-    return tanval;
-}
-
-
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.625);
-    const __m128d limit2    = _mm_set1_pd(1e-8);
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
-
-    const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
-    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
-    const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
-    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
-    const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
-    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
-
-    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
-    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
-    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
-    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
-    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
-
-    const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
-    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
-    const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
-    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
-    const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
-
-    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
-    const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
-    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
-    const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
-
-    __m128d       sign;
-    __m128d       mask;
-    __m128d       xabs;
-    __m128d       zz, ww, z, q, w, zz2, ww2;
-    __m128d       PA, PB;
-    __m128d       QA, QB;
-    __m128d       RA, RB;
-    __m128d       SA, SB;
-    __m128d       nom, denom;
-
-    sign  = _mm_andnot_pd(signmask, x);
-    xabs  = _mm_and_pd(x, signmask);
-
-    mask  = _mm_cmpgt_pd(xabs, limit1);
-
-    zz    = _mm_sub_pd(one, xabs);
-    ww    = _mm_mul_pd(xabs, xabs);
-    zz2   = _mm_mul_pd(zz, zz);
-    ww2   = _mm_mul_pd(ww, ww);
-
-    /* R */
-    RA    = _mm_mul_pd(R4, zz2);
-    RB    = _mm_mul_pd(R3, zz2);
-    RA    = _mm_add_pd(RA, R2);
-    RB    = _mm_add_pd(RB, R1);
-    RA    = _mm_mul_pd(RA, zz2);
-    RB    = _mm_mul_pd(RB, zz);
-    RA    = _mm_add_pd(RA, R0);
-    RA    = _mm_add_pd(RA, RB);
-
-    /* S, SA = zz2 */
-    SB    = _mm_mul_pd(S3, zz2);
-    SA    = _mm_add_pd(zz2, S2);
-    SB    = _mm_add_pd(SB, S1);
-    SA    = _mm_mul_pd(SA, zz2);
-    SB    = _mm_mul_pd(SB, zz);
-    SA    = _mm_add_pd(SA, S0);
-    SA    = _mm_add_pd(SA, SB);
-
-    /* P */
-    PA    = _mm_mul_pd(P5, ww2);
-    PB    = _mm_mul_pd(P4, ww2);
-    PA    = _mm_add_pd(PA, P3);
-    PB    = _mm_add_pd(PB, P2);
-    PA    = _mm_mul_pd(PA, ww2);
-    PB    = _mm_mul_pd(PB, ww2);
-    PA    = _mm_add_pd(PA, P1);
-    PB    = _mm_add_pd(PB, P0);
-    PA    = _mm_mul_pd(PA, ww);
-    PA    = _mm_add_pd(PA, PB);
-
-    /* Q, QA = ww2 */
-    QB    = _mm_mul_pd(Q4, ww2);
-    QA    = _mm_add_pd(ww2, Q3);
-    QB    = _mm_add_pd(QB, Q2);
-    QA    = _mm_mul_pd(QA, ww2);
-    QB    = _mm_mul_pd(QB, ww2);
-    QA    = _mm_add_pd(QA, Q1);
-    QB    = _mm_add_pd(QB, Q0);
-    QA    = _mm_mul_pd(QA, ww);
-    QA    = _mm_add_pd(QA, QB);
-
-    RA    = _mm_mul_pd(RA, zz);
-    PA    = _mm_mul_pd(PA, ww);
-
-    nom   = _mm_or_pd( _mm_andnot_pd(mask, PA), _mm_and_pd(mask, RA) );
-    denom = _mm_or_pd( _mm_andnot_pd(mask, QA), _mm_and_pd(mask, SA) );
-
-    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
-    zz    = _mm_add_pd(zz, zz);
-    zz    = gmx_mm_sqrt_pd(zz);
-    z     = _mm_sub_pd(quarterpi, zz);
-    zz    = _mm_mul_pd(zz, q);
-    zz    = _mm_sub_pd(zz, morebits);
-    z     = _mm_sub_pd(z, zz);
-    z     = _mm_add_pd(z, quarterpi);
-
-    w     = _mm_mul_pd(xabs, q);
-    w     = _mm_add_pd(w, xabs);
-
-    z     = _mm_or_pd( _mm_andnot_pd(mask, w), _mm_and_pd(mask, z) );
-
-    mask  = _mm_cmpgt_pd(xabs, limit2);
-    z     = _mm_or_pd( _mm_andnot_pd(mask, xabs), _mm_and_pd(mask, z) );
-
-    z = _mm_xor_pd(z, sign);
-
-    return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
-    const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
-    __m128d mask1;
-
-    __m128d z, z1, z2;
-
-    mask1 = _mm_cmpgt_pd(x, half);
-    z1    = _mm_mul_pd(half, _mm_sub_pd(one, x));
-    z1    = gmx_mm_sqrt_pd(z1);
-    z     = _mm_or_pd( _mm_andnot_pd(mask1, x), _mm_and_pd(mask1, z1) );
-
-    z     = gmx_mm_asin_pd(z);
-
-    z1    = _mm_add_pd(z, z);
-
-    z2    = _mm_sub_pd(quarterpi0, z);
-    z2    = _mm_add_pd(z2, quarterpi1);
-    z2    = _mm_add_pd(z2, quarterpi0);
-
-    z     = _mm_or_pd(_mm_andnot_pd(mask1, z2), _mm_and_pd(mask1, z1));
-
-    return z;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.66);
-    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
-    const __m128d mone      = _mm_set1_pd(-1.0);
-    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
-    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
-    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
-    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
-    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
-    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
-    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
-
-    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
-    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
-    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
-    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
-    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
-
-    __m128d       sign;
-    __m128d       mask1, mask2;
-    __m128d       y, t1, t2;
-    __m128d       z, z2;
-    __m128d       P_A, P_B, Q_A, Q_B;
-
-    sign   = _mm_andnot_pd(signmask, x);
-    x      = _mm_and_pd(x, signmask);
-
-    mask1  = _mm_cmpgt_pd(x, limit1);
-    mask2  = _mm_cmpgt_pd(x, limit2);
-
-    t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
-    t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
-    y      = _mm_and_pd(mask1, quarterpi);
-    y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
-    x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
-    x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
-    z      = _mm_mul_pd(x, x);
-    z2     = _mm_mul_pd(z, z);
-
-    P_A    = _mm_mul_pd(P4, z2);
-    P_B    = _mm_mul_pd(P3, z2);
-    P_A    = _mm_add_pd(P_A, P2);
-    P_B    = _mm_add_pd(P_B, P1);
-    P_A    = _mm_mul_pd(P_A, z2);
-    P_B    = _mm_mul_pd(P_B, z);
-    P_A    = _mm_add_pd(P_A, P0);
-    P_A    = _mm_add_pd(P_A, P_B);
-
-    /* Q_A = z2 */
-    Q_B    = _mm_mul_pd(Q4, z2);
-    Q_A    = _mm_add_pd(z2, Q3);
-    Q_B    = _mm_add_pd(Q_B, Q2);
-    Q_A    = _mm_mul_pd(Q_A, z2);
-    Q_B    = _mm_mul_pd(Q_B, z2);
-    Q_A    = _mm_add_pd(Q_A, Q1);
-    Q_B    = _mm_add_pd(Q_B, Q0);
-    Q_A    = _mm_mul_pd(Q_A, z);
-    Q_A    = _mm_add_pd(Q_A, Q_B);
-
-    z      = _mm_mul_pd(z, P_A);
-    z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
-    z      = _mm_mul_pd(z, x);
-    z      = _mm_add_pd(z, x);
-
-    t1     = _mm_and_pd(mask1, morebits1);
-    t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
-    z      = _mm_add_pd(z, t1);
-    y      = _mm_add_pd(y, z);
-
-    y      = _mm_xor_pd(y, sign);
-
-    return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
-    const __m128d pi          = _mm_set1_pd(M_PI);
-    const __m128d minuspi     = _mm_set1_pd(-M_PI);
-    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
-    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
-    __m128d       z, z1, z3, z4;
-    __m128d       w;
-    __m128d       maskx_lt, maskx_eq;
-    __m128d       masky_lt, masky_eq;
-    __m128d       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
-    masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
-    maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
-    z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
-    z         = gmx_mm_atan_pd(z);
-
-    mask1     = _mm_and_pd(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_pd(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
-    z         = _mm_andnot_pd(maskall, z);
-    z1        = _mm_and_pd(mask1, minushalfpi);
-    z3        = _mm_and_pd(mask3, halfpi);
-    z4        = _mm_and_pd(mask4, pi);
-
-    z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
-    w         = _mm_or_pd(_mm_andnot_pd(masky_lt, pi), _mm_and_pd(masky_lt, minuspi));
-    w         = _mm_and_pd(w, maskx_lt);
-
-    w         = _mm_andnot_pd(maskall, w);
-
-    z         = _mm_add_pd(z, w);
  
-    return z;
-}
+#define gmx_mm_invsqrt_pd   gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd       gmx_simd_inv_d
+#define gmx_mm_log_pd       gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd  gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd  gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd    gmx_simd_sincos_d
  
  #endif
diff --git a/src/gromacs/simd/math_x86_sse2_single.h b/src/gromacs/simd/math_x86_sse2_single.h

index db373c8412930722dfc4cb3518e1948b1a7d746f..054fd2c1bc8261c1ba27258171635259183706a1 100644 (file)
--- a/src/gromacs/simd/math_x86_sse2_single.h
+++ b/src/gromacs/simd/math_x86_sse2_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1147 +35,19 @@
  #ifndef GMX_SIMD_MATH_SSE2_SINGLE_H
  #define GMX_SIMD_MATH_SSE2_SINGLE_H
  
+#include "simd_math.h"
  
-#include <stdio.h>
-#include <math.h>
-
-#include "general_x86_sse2.h"
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
-    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
-    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
-    __m128       lu = _mm_rsqrt_ps(x);
-
-    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
-    __m128 mask;
-    __m128 res;
-
-    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
-    res  = _mm_mul_ps(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
-    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
-    __m128       lu = _mm_rcp_ps(x);
-
-    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
-    return _mm_and_ps(x, signmask);
-}
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
-    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
-    const __m128  half       = _mm_set1_ps(0.5f);
-    const __m128  one        = _mm_set1_ps(1.0f);
-    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
-    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
-    const __m128  corr2      = _mm_set1_ps(0.693359375f);
-
-    const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
-    const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
-    const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
-    const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
-    const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
-    const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
-    const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
-    const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
-    const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
-
-    __m128        fexp;
-    __m128i       iexp;
-    __m128        mask;
-    __m128        x2;
-    __m128        y;
-    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp  = _mm_and_ps(x, expmask);
-    iexp  = gmx_mm_castps_si128(fexp);
-    iexp  = _mm_srli_epi32(iexp, 23);
-    iexp  = _mm_sub_epi32(iexp, expbase_m1);
-
-    x     = _mm_andnot_ps(expmask, x);
-    x     = _mm_or_ps(x, one);
-    x     = _mm_mul_ps(x, half);
-
-    mask  = _mm_cmplt_ps(x, invsq2);
-
-    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
-    x     = _mm_sub_ps(x, one);
-    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
-    x2    = _mm_mul_ps(x, x);
-
-    pA    = _mm_mul_ps(CA_1, x);
-    pB    = _mm_mul_ps(CB_1, x);
-    pC    = _mm_mul_ps(CC_1, x);
-    pD    = _mm_mul_ps(CD_1, x);
-    pE    = _mm_mul_ps(CE_1, x);
-    tB    = _mm_add_ps(CB_0, x2);
-    tC    = _mm_add_ps(CC_0, x2);
-    tD    = _mm_add_ps(CD_0, x2);
-    tE    = _mm_add_ps(CE_0, x2);
-    pB    = _mm_add_ps(pB, tB);
-    pC    = _mm_add_ps(pC, tC);
-    pD    = _mm_add_ps(pD, tD);
-    pE    = _mm_add_ps(pE, tE);
-
-    pA    = _mm_mul_ps(pA, pB);
-    pC    = _mm_mul_ps(pC, pD);
-    pE    = _mm_mul_ps(pE, x2);
-    pA    = _mm_mul_ps(pA, pC);
-    y     = _mm_mul_ps(pA, pE);
-
-    fexp  = _mm_cvtepi32_ps(iexp);
-    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
-    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
-    x2    = _mm_add_ps(x, y);
-
-    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
-    return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128  arglimit = _mm_set1_ps(126.0f);
-
-    const __m128i expbase  = _mm_set1_epi32(127);
-    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
-    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
-    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
-    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
-    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
-    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
-    const __m128  CA0      = _mm_set1_ps(1.0f);
-
-
-    __m128  valuemask;
-    __m128i iexppart;
-    __m128  fexppart;
-    __m128  intpart;
-    __m128  x2;
-    __m128  p0, p1;
-
-    iexppart  = _mm_cvtps_epi32(x);
-    intpart   = _mm_cvtepi32_ps(iexppart);
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    x         = _mm_sub_ps(x, intpart);
-    x2        = _mm_mul_ps(x, x);
-
-    p0        = _mm_mul_ps(CA6, x2);
-    p1        = _mm_mul_ps(CA5, x2);
-    p0        = _mm_add_ps(p0, CA4);
-    p1        = _mm_add_ps(p1, CA3);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_add_ps(p0, CA2);
-    p1        = _mm_add_ps(p1, CA1);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, CA0);
-    p0        = _mm_add_ps(p0, p1);
-    x         = _mm_mul_ps(p0, fexppart);
-
-    return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will be replaced with a new generic
+ * SIMD version soon anyway.
   */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
-    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m128  arglimit      = _mm_set1_ps(126.0f);
-    const __m128i expbase       = _mm_set1_epi32(127);
-
-    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
-    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
-
-    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
-    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
-    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
-    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
-    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
-    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
-    const __m128  one           = _mm_set1_ps(1.0f);
-
-    __m128        y, x2;
-    __m128        p0, p1;
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-
-    y = _mm_mul_ps(x, argscale);
-
-    iexppart  = _mm_cvtps_epi32(y);
-    intpart   = _mm_cvtepi32_ps(iexppart);
-
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    /* Extended precision arithmetics */
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
-    x2        = _mm_mul_ps(x, x);
-
-    p1        = _mm_mul_ps(CC5, x2);
-    p0        = _mm_mul_ps(CC4, x2);
-    p1        = _mm_add_ps(p1, CC3);
-    p0        = _mm_add_ps(p0, CC2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_add_ps(p1, CC1);
-    p0        = _mm_add_ps(p0, CC0);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, p1);
-    p0        = _mm_mul_ps(p0, x2);
-    x         = _mm_add_ps(x, one);
-    x         = _mm_add_ps(x, p0);
-
-    x         = _mm_mul_ps(x, fexppart);
-
-    return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
-                         _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));
-
-    return res;
-}
-
-
-
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static gmx_inline __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
-    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
-    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
-    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
-    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
-    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
-    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
-    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
-
-    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
-    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
-    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
-    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
-    const __m128  FD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyFD0        = _mm_mul_ps(FD4, z4);
-    polyFD1        = _mm_mul_ps(FD3, z4);
-    polyFD0        = _mm_add_ps(polyFD0, FD2);
-    polyFD1        = _mm_add_ps(polyFD1, FD1);
-    polyFD0        = _mm_mul_ps(polyFD0, z4);
-    polyFD1        = _mm_mul_ps(polyFD1, z2);
-    polyFD0        = _mm_add_ps(polyFD0, FD0);
-    polyFD0        = _mm_add_ps(polyFD0, polyFD1);
-
-    polyFD0        = gmx_mm_inv_ps(polyFD0);
-
-    polyFN0        = _mm_mul_ps(FN6, z4);
-    polyFN1        = _mm_mul_ps(FN5, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN4);
-    polyFN1        = _mm_add_ps(polyFN1, FN3);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN2);
-    polyFN1        = _mm_add_ps(polyFN1, FN1);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z2);
-    polyFN0        = _mm_add_ps(polyFN0, FN0);
-    polyFN0        = _mm_add_ps(polyFN0, polyFN1);
-
-    return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- */
-static gmx_inline __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
-    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
-    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
-    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
-    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
-    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
-    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
-    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
-
-    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
-    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
-    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
-    const __m128  VD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyVD1        = _mm_mul_ps(VD3, z4);
-    polyVD0        = _mm_mul_ps(VD2, z4);
-    polyVD1        = _mm_add_ps(polyVD1, VD1);
-    polyVD0        = _mm_add_ps(polyVD0, VD0);
-    polyVD1        = _mm_mul_ps(polyVD1, z2);
-    polyVD0        = _mm_add_ps(polyVD0, polyVD1);
-
-    polyVD0        = gmx_mm_inv_ps(polyVD0);
-
-    polyVN0        = _mm_mul_ps(VN6, z4);
-    polyVN1        = _mm_mul_ps(VN5, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN4);
-    polyVN1        = _mm_add_ps(polyVN1, VN3);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN2);
-    polyVN1        = _mm_add_ps(polyVN1, VN1);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z2);
-    polyVN0        = _mm_add_ps(polyVN0, VN0);
-    polyVN0        = _mm_add_ps(polyVN0, polyVN1);
-
-    return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_ps(__m128  x,
-                 __m128 *sinval,
-                 __m128 *cosval)
-{
-    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
-    const __m128  half        = _mm_set1_ps(0.5);
-    const __m128  one         = _mm_set1_ps(1.0);
-
-    const __m128i izero      = _mm_set1_epi32(0);
-    const __m128i ione       = _mm_set1_epi32(1);
-    const __m128i itwo       = _mm_set1_epi32(2);
-    const __m128i ithree     = _mm_set1_epi32(3);
-    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
-    const __m128  CA1         = _mm_set1_ps(1.5703125f);
-    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
-    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
-
-    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
-    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
-    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
-    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
-    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
-    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
-
-    __m128        y, y2;
-    __m128        z;
-    __m128i       iz;
-    __m128i       offset_sin, offset_cos;
-    __m128        tmp1, tmp2;
-    __m128        mask_sin, mask_cos;
-    __m128        tmp_sin, tmp_cos;
-
-    y          = _mm_mul_ps(x, two_over_pi);
-    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
-    iz         = _mm_cvttps_epi32(y);
-    z          = _mm_cvtepi32_ps(iz);
-
-    offset_sin = _mm_and_si128(iz, ithree);
-    offset_cos = _mm_add_epi32(iz, ione);
-
-    /* Extended precision arithmethic to achieve full precision */
-    y               = _mm_mul_ps(z, CA1);
-    tmp1            = _mm_mul_ps(z, CA2);
-    tmp2            = _mm_mul_ps(z, CA3);
-    y               = _mm_sub_ps(x, y);
-    y               = _mm_sub_ps(y, tmp1);
-    y               = _mm_sub_ps(y, tmp2);
-
-    y2              = _mm_mul_ps(y, y);
-
-    tmp1            = _mm_mul_ps(CC0, y2);
-    tmp1            = _mm_add_ps(tmp1, CC1);
-    tmp2            = _mm_mul_ps(CS0, y2);
-    tmp2            = _mm_add_ps(tmp2, CS1);
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, CC2);
-    tmp2            = _mm_mul_ps(tmp2, y2);
-    tmp2            = _mm_add_ps(tmp2, CS2);
-
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, one);
-
-    tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
-    tmp2            = _mm_add_ps(tmp2, y);
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
-    tmp_sin         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
-    tmp_cos         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
-    tmp1            = _mm_xor_ps(signbit, tmp_sin);
-    tmp2            = _mm_xor_ps(signbit, tmp_cos);
-
-    *sinval         = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
-    *cosval         = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );
-
-    return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
-    __m128 sinval, cosval;
-    __m128 tanval;
-
-    gmx_mm_sincos_ps(x, &sinval, &cosval);
-
-    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
-    return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limitlow  = _mm_set1_ps(1e-4f);
-    const __m128 half      = _mm_set1_ps(0.5f);
-    const __m128 one       = _mm_set1_ps(1.0f);
-    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
-
-    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
-    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
-    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
-    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
-    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
-
-    __m128       sign;
-    __m128       mask;
-    __m128       xabs;
-    __m128       z, z1, z2, q, q1, q2;
-    __m128       pA, pB;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    xabs  = _mm_and_ps(x, signmask);
-
-    mask  = _mm_cmpgt_ps(xabs, half);
-
-    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
-    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
-    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
-    q2    = xabs;
-    z2    = _mm_mul_ps(q2, q2);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
-    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
-    z2    = _mm_mul_ps(z, z);
-
-    pA    = _mm_mul_ps(CC5, z2);
-    pB    = _mm_mul_ps(CC4, z2);
-
-    pA    = _mm_add_ps(pA, CC3);
-    pB    = _mm_add_ps(pB, CC2);
-
-    pA    = _mm_mul_ps(pA, z2);
-    pB    = _mm_mul_ps(pB, z2);
-
-    pA    = _mm_add_ps(pA, CC1);
-    pA    = _mm_mul_ps(pA, z);
-
-    z     = _mm_add_ps(pA, pB);
-    z     = _mm_mul_ps(z, q);
-    z     = _mm_add_ps(z, q);
-
-    q2    = _mm_sub_ps(halfpi, z);
-    q2    = _mm_sub_ps(q2, z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
-    mask  = _mm_cmpgt_ps(xabs, limitlow);
-    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
-    z = _mm_xor_ps(z, sign);
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 one_ps    = _mm_set1_ps(1.0f);
-    const __m128 half_ps   = _mm_set1_ps(0.5f);
-    const __m128 pi_ps     = _mm_set1_ps(M_PI);
-    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
-    __m128       mask1;
-    __m128       mask2;
-    __m128       xabs;
-    __m128       z, z1, z2, z3;
-
-    xabs  = _mm_and_ps(x, signmask);
-    mask1 = _mm_cmpgt_ps(xabs, half_ps);
-    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
-
-    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
-    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
-    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
-    z     = gmx_mm_asin_ps(z);
-
-    z2    = _mm_add_ps(z, z);
-    z1    = _mm_sub_ps(pi_ps, z2);
-    z3    = _mm_sub_ps(halfpi_ps, z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
-    z     = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
-    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
-    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
-    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
-    const __m128 mone      = _mm_set1_ps(-1.0f);
-    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
-    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
-    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
-    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
-
-    __m128       sign;
-    __m128       mask1, mask2;
-    __m128       y, z1, z2;
-    __m128       x2, x4;
-    __m128       sum1, sum2;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    x     = _mm_and_ps(x, signmask);
-
-    mask1 = _mm_cmpgt_ps(x, limit1);
-    mask2 = _mm_cmpgt_ps(x, limit2);
-
-    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
-    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
-    y     = _mm_and_ps(mask1, quarterpi);
-    y     = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );
-
-    x     = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
-    x     = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );
-
-    x2    = _mm_mul_ps(x, x);
-    x4    = _mm_mul_ps(x2, x2);
-
-    sum1  = _mm_mul_ps(CC9, x4);
-    sum2  = _mm_mul_ps(CC7, x4);
-    sum1  = _mm_add_ps(sum1, CC5);
-    sum2  = _mm_add_ps(sum2, CC3);
-    sum1  = _mm_mul_ps(sum1, x4);
-    sum2  = _mm_mul_ps(sum2, x2);
-
-    sum1  = _mm_add_ps(sum1, sum2);
-    sum1  = _mm_sub_ps(sum1, mone);
-    sum1  = _mm_mul_ps(sum1, x);
-    y     = _mm_add_ps(y, sum1);
-
-    y     = _mm_xor_ps(y, sign);
-
-    return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
-    const __m128 pi          = _mm_set1_ps(M_PI);
-    const __m128 minuspi     = _mm_set1_ps(-M_PI);
-    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
-    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
-    __m128       z, z1, z3, z4;
-    __m128       w;
-    __m128       maskx_lt, maskx_eq;
-    __m128       masky_lt, masky_eq;
-    __m128       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
-    masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
-    maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
-    z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
-    z         = gmx_mm_atan_ps(z);
-
-    mask1     = _mm_and_ps(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_ps(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
-    z         = _mm_andnot_ps(maskall, z);
-    z1        = _mm_and_ps(mask1, minushalfpi);
-    z3        = _mm_and_ps(mask3, halfpi);
-    z4        = _mm_and_ps(mask4, pi);
-
-    z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
-    mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
-    mask2     = _mm_and_ps(maskx_lt, masky_lt);
-
-    w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
-    w         = _mm_andnot_ps(maskall, w);
-
-    z         = _mm_add_ps(z, w);
-
-    return z;
-}
  
+#define gmx_mm_invsqrt_ps   gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps       gmx_simd_inv_f
+#define gmx_mm_log_ps       gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps  gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps  gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps    gmx_simd_sincos_f
  
  #endif
diff --git a/src/gromacs/simd/math_x86_sse4_1_double.h b/src/gromacs/simd/math_x86_sse4_1_double.h

index 5f87caa88b792af53a98a312f5e2e18a79d95276..4d8955a51c1ca87d1603e8b84a7110eed5eedddb 100644 (file)
--- a/src/gromacs/simd/math_x86_sse4_1_double.h
+++ b/src/gromacs/simd/math_x86_sse4_1_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1457 +35,19 @@
  #ifndef GMX_SIMD_MATH_SSE4_1_DOUBLE_H
  #define GMX_SIMD_MATH_SSE4_1_DOUBLE_H
  
-#include <stdio.h>
-#include <math.h>
+#include "simd_math.h"
  
-#include "general_x86_sse4_1.h"
-
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
-    const __m128d half  = _mm_set1_pd(0.5);
-    const __m128d three = _mm_set1_pd(3.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
-    lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-    return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
-    const __m128d half   = _mm_set1_pd(0.5);
-    const __m128d three  = _mm_set1_pd(3.0);
-    const __m128  halff  = _mm_set1_ps(0.5f);
-    const __m128  threef = _mm_set1_ps(3.0f);
-
-    __m128        xf, luf;
-    __m128d       lu1, lu2;
-
-    /* Do first N-R step in float for 2x throughput */
-    xf  = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
-    luf = _mm_rsqrt_ps(xf);
-    luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
-    lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
-    lu1 = _mm_cvtps_pd(luf);
-
-    *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
-    *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
-    __m128d mask;
-    __m128d res;
-
-    mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    res  = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
-    res  = _mm_mul_pd(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
-    const __m128d two  = _mm_set1_pd(2.0);
-
-    /* Lookup instruction only exists in single precision, convert back and forth... */
-    __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
-    /* Perform two N-R steps for double precision */
-    lu         = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-    return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d P2       = _mm_set1_pd(2.30933477057345225087e-2);
-    const __m128d P1       = _mm_set1_pd(2.02020656693165307700e1);
-    const __m128d P0       = _mm_set1_pd(1.51390680115615096133e3);
-    /* Q2 == 1.0 */
-    const __m128d Q1       = _mm_set1_pd(2.33184211722314911771e2);
-    const __m128d Q0       = _mm_set1_pd(4.36821166879210612817e3);
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       z, z2;
-    __m128d       PolyP, PolyQ;
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(x, intpart);
-    z2        = _mm_mul_pd(z, z);
-
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(z2, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P0);
-    PolyQ     = _mm_add_pd(PolyQ, Q0);
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
-    const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128d arglimit = _mm_set1_pd(1022.0);
-    const __m128i expbase  = _mm_set1_epi32(1023);
-
-    const __m128d invargscale0  = _mm_set1_pd(6.93145751953125e-1);
-    const __m128d invargscale1  = _mm_set1_pd(1.42860682030941723212e-6);
-
-    const __m128d P2       = _mm_set1_pd(1.26177193074810590878e-4);
-    const __m128d P1       = _mm_set1_pd(3.02994407707441961300e-2);
-    /* P0 == 1.0 */
-    const __m128d Q3       = _mm_set1_pd(3.00198505138664455042E-6);
-    const __m128d Q2       = _mm_set1_pd(2.52448340349684104192E-3);
-    const __m128d Q1       = _mm_set1_pd(2.27265548208155028766E-1);
-    /* Q0 == 2.0 */
-    const __m128d one      = _mm_set1_pd(1.0);
-    const __m128d two      = _mm_set1_pd(2.0);
-
-    __m128d       valuemask;
-    __m128i       iexppart;
-    __m128d       fexppart;
-    __m128d       intpart;
-    __m128d       x, z, z2;
-    __m128d       PolyP, PolyQ;
-
-    x             = _mm_mul_pd(exparg, argscale);
-
-    iexppart  = _mm_cvtpd_epi32(x);
-    intpart   = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
-    /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
-     * To be able to shift it into the exponent for a double precision number we first need to
-     * shuffle so that the lower half contains the first element, and the upper half the second.
-     * This should really be done as a zero-extension, but since the next instructions will shift
-     * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
-     * (thus we just use element 2 from iexppart).
-     */
-    iexppart  = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
-    /* Do the shift operation on the 64-bit registers */
-    iexppart  = _mm_add_epi32(iexppart, expbase);
-    iexppart  = _mm_slli_epi64(iexppart, 52);
-
-    valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
-    fexppart  = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
-    z         = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
-    z         = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
-    z2        = _mm_mul_pd(z, z);
-
-    PolyQ     = _mm_mul_pd(Q3, z2);
-    PolyQ     = _mm_add_pd(PolyQ, Q2);
-    PolyP     = _mm_mul_pd(P2, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, P1);
-    PolyQ     = _mm_add_pd(PolyQ, Q1);
-    PolyP     = _mm_mul_pd(PolyP, z2);
-    PolyQ     = _mm_mul_pd(PolyQ, z2);
-    PolyP     = _mm_add_pd(PolyP, one);
-    PolyQ     = _mm_add_pd(PolyQ, two);
-
-    PolyP     = _mm_mul_pd(PolyP, z);
-
-    z         = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
-    z         = _mm_add_pd(one, _mm_mul_pd(two, z));
-
-    z         = _mm_mul_pd(z, fexppart);
-
-    return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d expmask    = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
-    const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
-    const __m128d half       = _mm_set1_pd(0.5);
-    const __m128d one        = _mm_set1_pd(1.0);
-    const __m128d two        = _mm_set1_pd(2.0);
-    const __m128d invsq2     = _mm_set1_pd(1.0/sqrt(2.0));
-
-    const __m128d corr1      = _mm_set1_pd(-2.121944400546905827679e-4);
-    const __m128d corr2      = _mm_set1_pd(0.693359375);
-
-    const __m128d P5         = _mm_set1_pd(1.01875663804580931796e-4);
-    const __m128d P4         = _mm_set1_pd(4.97494994976747001425e-1);
-    const __m128d P3         = _mm_set1_pd(4.70579119878881725854e0);
-    const __m128d P2         = _mm_set1_pd(1.44989225341610930846e1);
-    const __m128d P1         = _mm_set1_pd(1.79368678507819816313e1);
-    const __m128d P0         = _mm_set1_pd(7.70838733755885391666e0);
-
-    const __m128d Q4         = _mm_set1_pd(1.12873587189167450590e1);
-    const __m128d Q3         = _mm_set1_pd(4.52279145837532221105e1);
-    const __m128d Q2         = _mm_set1_pd(8.29875266912776603211e1);
-    const __m128d Q1         = _mm_set1_pd(7.11544750618563894466e1);
-    const __m128d Q0         = _mm_set1_pd(2.31251620126765340583e1);
-
-    const __m128d R2         = _mm_set1_pd(-7.89580278884799154124e-1);
-    const __m128d R1         = _mm_set1_pd(1.63866645699558079767e1);
-    const __m128d R0         = _mm_set1_pd(-6.41409952958715622951e1);
-
-    const __m128d S2         = _mm_set1_pd(-3.56722798256324312549E1);
-    const __m128d S1         = _mm_set1_pd(3.12093766372244180303E2);
-    const __m128d S0         = _mm_set1_pd(-7.69691943550460008604E2);
-
-    __m128d       fexp;
-    __m128i       iexp;
-
-    __m128d       mask1, mask2;
-    __m128d       corr, t1, t2, q;
-    __m128d       zA, yA, xA, zB, yB, xB, z;
-    __m128d       polyR, polyS;
-    __m128d       polyP1, polyP2, polyQ1, polyQ2;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp   = _mm_and_pd(x, expmask);
-    iexp   = gmx_mm_castpd_si128(fexp);
-    iexp   = _mm_srli_epi64(iexp, 52);
-    iexp   = _mm_sub_epi32(iexp, expbase_m1);
-    iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
-    fexp   = _mm_cvtepi32_pd(iexp);
-
-    x      = _mm_andnot_pd(expmask, x);
-    x      = _mm_or_pd(x, one);
-    x      = _mm_mul_pd(x, half);
-
-    mask1     = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
-    mask2     = _mm_cmplt_pd(x, invsq2);
-
-    fexp   = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
-    /* If mask1 is set ('A') */
-    zA     = _mm_sub_pd(x, half);
-    t1     = _mm_blendv_pd( zA, x, mask2 );
-    zA     = _mm_sub_pd(t1, half);
-    t2     = _mm_blendv_pd( x, zA, mask2 );
-    yA     = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
-    xA     = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
-    zA     = _mm_mul_pd(xA, xA);
-
-    /* EVALUATE POLY */
-    polyR  = _mm_mul_pd(R2, zA);
-    polyR  = _mm_add_pd(polyR, R1);
-    polyR  = _mm_mul_pd(polyR, zA);
-    polyR  = _mm_add_pd(polyR, R0);
-
-    polyS  = _mm_add_pd(zA, S2);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S1);
-    polyS  = _mm_mul_pd(polyS, zA);
-    polyS  = _mm_add_pd(polyS, S0);
-
-    q      = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
-    zA     = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
-    zA     = _mm_add_pd(zA, xA);
-    zA     = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
-    /* If mask1 is not set ('B') */
-    corr   = _mm_and_pd(mask2, x);
-    xB     = _mm_add_pd(x, corr);
-    xB     = _mm_sub_pd(xB, one);
-    zB     = _mm_mul_pd(xB, xB);
-
-    polyP1 = _mm_mul_pd(P5, zB);
-    polyP2 = _mm_mul_pd(P4, zB);
-    polyP1 = _mm_add_pd(polyP1, P3);
-    polyP2 = _mm_add_pd(polyP2, P2);
-    polyP1 = _mm_mul_pd(polyP1, zB);
-    polyP2 = _mm_mul_pd(polyP2, zB);
-    polyP1 = _mm_add_pd(polyP1, P1);
-    polyP2 = _mm_add_pd(polyP2, P0);
-    polyP1 = _mm_mul_pd(polyP1, xB);
-    polyP1 = _mm_add_pd(polyP1, polyP2);
-
-    polyQ2 = _mm_mul_pd(Q4, zB);
-    polyQ1 = _mm_add_pd(zB, Q3);
-    polyQ2 = _mm_add_pd(polyQ2, Q2);
-    polyQ1 = _mm_mul_pd(polyQ1, zB);
-    polyQ2 = _mm_mul_pd(polyQ2, zB);
-    polyQ1 = _mm_add_pd(polyQ1, Q1);
-    polyQ2 = _mm_add_pd(polyQ2, Q0);
-    polyQ1 = _mm_mul_pd(polyQ1, xB);
-    polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
-    fexp   = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
-    q      = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
-    yB     = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
-    yB     = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
-    yB     = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
-    zB     = _mm_add_pd(xB, yB);
-    zB     = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
-    z      = _mm_blendv_pd( zB, zA, mask1 );
-
-    return z;
-}
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-
-/* Complementary error function erfc(x), evaluated for both doubles in x.
- *
- * All three rational approximations below are computed for every lane and
- * the applicable one is picked with blends at the end:
- *   |x| <  1   : 1 - erf(x), with erf(x) = x*(CAoffset + P(x^2)/Q(x^2))
- *   |x| <= 4.5 : exp(-x^2) * |x| * P(|x|-1)/Q(|x|-1)
- *   |x| >  4.5 : exp(-x^2) / |x| * (CCoffset + P(1/|x|)/Q(1/|x|))
- * For x < 0 the two erfc branches are mapped through erfc(x) = 2 - erfc(|x|).
- *
- * Each polynomial is evaluated as two independent chains (suffix 0 and 1)
- * that are summed at the end.
- *
- * NOTE(review): the coefficient comment below quotes [-0.75,0.75] for the
- * erf() fit, while the final select uses that branch for all |x| < 1 - confirm.
- */
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
-    const __m128d CAP4      = _mm_set1_pd(-0.431780540597889301512e-4);
-    const __m128d CAP3      = _mm_set1_pd(-0.00578562306260059236059);
-    const __m128d CAP2      = _mm_set1_pd(-0.028593586920219752446);
-    const __m128d CAP1      = _mm_set1_pd(-0.315924962948621698209);
-    const __m128d CAP0      = _mm_set1_pd(0.14952975608477029151);
-
-    const __m128d CAQ5      = _mm_set1_pd(-0.374089300177174709737e-5);
-    const __m128d CAQ4      = _mm_set1_pd(0.00015126584532155383535);
-    const __m128d CAQ3      = _mm_set1_pd(0.00536692680669480725423);
-    const __m128d CAQ2      = _mm_set1_pd(0.0668686825594046122636);
-    const __m128d CAQ1      = _mm_set1_pd(0.402604990869284362773);
-    /* CAQ0 == 1.0 */
-    const __m128d CAoffset  = _mm_set1_pd(0.9788494110107421875);
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
-    const __m128d CBP6      = _mm_set1_pd(2.49650423685462752497647637088e-10);
-    const __m128d CBP5      = _mm_set1_pd(0.00119770193298159629350136085658);
-    const __m128d CBP4      = _mm_set1_pd(0.0164944422378370965881008942733);
-    const __m128d CBP3      = _mm_set1_pd(0.0984581468691775932063932439252);
-    const __m128d CBP2      = _mm_set1_pd(0.317364595806937763843589437418);
-    const __m128d CBP1      = _mm_set1_pd(0.554167062641455850932670067075);
-    const __m128d CBP0      = _mm_set1_pd(0.427583576155807163756925301060);
-    const __m128d CBQ7      = _mm_set1_pd(0.00212288829699830145976198384930);
-    const __m128d CBQ6      = _mm_set1_pd(0.0334810979522685300554606393425);
-    const __m128d CBQ5      = _mm_set1_pd(0.2361713785181450957579508850717);
-    const __m128d CBQ4      = _mm_set1_pd(0.955364736493055670530981883072);
-    const __m128d CBQ3      = _mm_set1_pd(2.36815675631420037315349279199);
-    const __m128d CBQ2      = _mm_set1_pd(3.55261649184083035537184223542);
-    const __m128d CBQ1      = _mm_set1_pd(2.93501136050160872574376997993);
-    /* CBQ0 == 1.0 */
-
-    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
-    const __m128d CCP6      = _mm_set1_pd(-2.8175401114513378771);
-    const __m128d CCP5      = _mm_set1_pd(-3.22729451764143718517);
-    const __m128d CCP4      = _mm_set1_pd(-2.5518551727311523996);
-    const __m128d CCP3      = _mm_set1_pd(-0.687717681153649930619);
-    const __m128d CCP2      = _mm_set1_pd(-0.212652252872804219852);
-    const __m128d CCP1      = _mm_set1_pd(0.0175389834052493308818);
-    const __m128d CCP0      = _mm_set1_pd(0.00628057170626964891937);
-
-    const __m128d CCQ6      = _mm_set1_pd(5.48409182238641741584);
-    const __m128d CCQ5      = _mm_set1_pd(13.5064170191802889145);
-    const __m128d CCQ4      = _mm_set1_pd(22.9367376522880577224);
-    const __m128d CCQ3      = _mm_set1_pd(15.930646027911794143);
-    const __m128d CCQ2      = _mm_set1_pd(11.0567237927800161565);
-    const __m128d CCQ1      = _mm_set1_pd(2.79257750980575282228);
-    /* CCQ0 == 1.0 */
-    const __m128d CCoffset  = _mm_set1_pd(0.5579090118408203125);
-
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d two       = _mm_set1_pd(2.0);
-
-    /* Sign bit of each of the two doubles (high 32-bit word of each lane) */
-    const __m128d signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
-    __m128d       xabs, x2, x4, t, t2, w, w2;
-    __m128d       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
-    __m128d       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
-    __m128d       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
-    __m128d       res_erf, res_erfcB, res_erfcC, res_erfc, res;
-    __m128d       mask, expmx2;
-
-    /* Calculate erf() */
-    xabs     = gmx_mm_abs_pd(x);
-    x2       = _mm_mul_pd(x, x);
-    x4       = _mm_mul_pd(x2, x2);
-    /* P(x^2): even powers of x^2 go into PolyAP0, odd powers into PolyAP1 */
-    PolyAP0  = _mm_mul_pd(CAP4, x4);
-    PolyAP1  = _mm_mul_pd(CAP3, x4);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP2);
-    PolyAP1  = _mm_add_pd(PolyAP1, CAP1);
-    PolyAP0  = _mm_mul_pd(PolyAP0, x4);
-    PolyAP1  = _mm_mul_pd(PolyAP1, x2);
-    PolyAP0  = _mm_add_pd(PolyAP0, CAP0);
-    PolyAP0  = _mm_add_pd(PolyAP0, PolyAP1);
-    /* Q(x^2): the constant term CAQ0 is the 'one' added to PolyAQ0 */
-    PolyAQ1  = _mm_mul_pd(CAQ5, x4);
-    PolyAQ0  = _mm_mul_pd(CAQ4, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ3);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, CAQ2);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x4);
-    PolyAQ0  = _mm_mul_pd(PolyAQ0, x4);
-    PolyAQ1  = _mm_add_pd(PolyAQ1, CAQ1);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, one);
-    PolyAQ1  = _mm_mul_pd(PolyAQ1, x2);
-    PolyAQ0  = _mm_add_pd(PolyAQ0, PolyAQ1);
-    /* erf(x) = x*(CAoffset + P/Q); multiplying by the signed x keeps erf odd */
-    res_erf  = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
-    res_erf  = _mm_add_pd(CAoffset, res_erf);
-    res_erf  = _mm_mul_pd(x, res_erf);
-
-    /* Calculate erfc() in range [1,4.5] */
-    t       = _mm_sub_pd(xabs, one);
-    t2      = _mm_mul_pd(t, t);
-
-    PolyBP0  = _mm_mul_pd(CBP6, t2);
-    PolyBP1  = _mm_mul_pd(CBP5, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP4);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP3);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t2);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP2);
-    PolyBP1  = _mm_add_pd(PolyBP1, CBP1);
-    PolyBP0  = _mm_mul_pd(PolyBP0, t2);
-    PolyBP1  = _mm_mul_pd(PolyBP1, t);
-    PolyBP0  = _mm_add_pd(PolyBP0, CBP0);
-    PolyBP0  = _mm_add_pd(PolyBP0, PolyBP1);
-
-    PolyBQ1 = _mm_mul_pd(CBQ7, t2);
-    PolyBQ0 = _mm_mul_pd(CBQ6, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
-    PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
-    PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, one);
-    PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
-    PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
-    res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
-    res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
-    /* Calculate erfc() in range [4.5,inf] */
-    w       = gmx_mm_inv_pd(xabs);
-    w2      = _mm_mul_pd(w, w);
-
-    PolyCP0  = _mm_mul_pd(CCP6, w2);
-    PolyCP1  = _mm_mul_pd(CCP5, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP4);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP3);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w2);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP2);
-    PolyCP1  = _mm_add_pd(PolyCP1, CCP1);
-    PolyCP0  = _mm_mul_pd(PolyCP0, w2);
-    PolyCP1  = _mm_mul_pd(PolyCP1, w);
-    PolyCP0  = _mm_add_pd(PolyCP0, CCP0);
-    PolyCP0  = _mm_add_pd(PolyCP0, PolyCP1);
-
-    PolyCQ0  = _mm_mul_pd(CCQ6, w2);
-    PolyCQ1  = _mm_mul_pd(CCQ5, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ4);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ3);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w2);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, CCQ2);
-    PolyCQ1  = _mm_add_pd(PolyCQ1, CCQ1);
-    PolyCQ0  = _mm_mul_pd(PolyCQ0, w2);
-    PolyCQ1  = _mm_mul_pd(PolyCQ1, w);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, one);
-    PolyCQ0  = _mm_add_pd(PolyCQ0, PolyCQ1);
-    /* exp(-x^2): OR-ing the sign bit into the non-negative x^2 negates it */
-    expmx2   = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
-    res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
-    res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
-    res_erfcC = _mm_mul_pd(res_erfcC, w);
-    /* Pick branch C where |x| > 4.5, branch B otherwise */
-    mask     = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
-    res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-    /* The exp(-x^2) factor is common to branches B and C */
-    res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_pd(x, _mm_setzero_pd());
-    res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_pd(xabs, one);
-    res  = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- * Implementation: the result is N(z^2)/D(z^2), where N uses the FN0..FN10
- * coefficients and D uses FD0..FD5. Each polynomial is evaluated as two
- * independent chains, one for even and one for odd powers of z^2.
- */
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
-    const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
-    const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
-    const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
-    const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
-    const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
-    const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
-    const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
-    const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
-    const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
-    const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
-    const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
-
-    const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
-    const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
-    const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
-    const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
-    const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
-    const __m128d  FD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-    /* Denominator D(z^2): odd powers of z^2 in polyFD1, even powers in polyFD0 */
-    polyFD1        = _mm_mul_pd(FD5, z4);
-    polyFD0        = _mm_mul_pd(FD4, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD3);
-    polyFD0        = _mm_add_pd(polyFD0, FD2);
-    polyFD1        = _mm_mul_pd(polyFD1, z4);
-    polyFD0        = _mm_mul_pd(polyFD0, z4);
-    polyFD1        = _mm_add_pd(polyFD1, FD1);
-    polyFD0        = _mm_add_pd(polyFD0, FD0);
-    polyFD1        = _mm_mul_pd(polyFD1, z2);
-    polyFD0        = _mm_add_pd(polyFD0, polyFD1);
-    /* From here on polyFD0 holds the inverted denominator */
-    polyFD0        = gmx_mm_inv_pd(polyFD0);
-    /* Numerator N(z^2): even powers of z^2 in polyFN0, odd powers in polyFN1 */
-    polyFN0        = _mm_mul_pd(FN10, z4);
-    polyFN1        = _mm_mul_pd(FN9, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN8);
-    polyFN1        = _mm_add_pd(polyFN1, FN7);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN6);
-    polyFN1        = _mm_add_pd(polyFN1, FN5);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN4);
-    polyFN1        = _mm_add_pd(polyFN1, FN3);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z4);
-    polyFN0        = _mm_add_pd(polyFN0, FN2);
-    polyFN1        = _mm_add_pd(polyFN1, FN1);
-    polyFN0        = _mm_mul_pd(polyFN0, z4);
-    polyFN1        = _mm_mul_pd(polyFN1, z2);
-    polyFN0        = _mm_add_pd(polyFN0, FN0);
-    polyFN0        = _mm_add_pd(polyFN0, polyFN1);
-
-    return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm_pmecorrF_pd() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switch back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
- *
- * Implementation: the result is N(z^2)/D(z^2), where N uses the VN0..VN9
- * coefficients and D uses VD0..VD5. Each polynomial is evaluated as two
- * independent chains, one for even and one for odd powers of z^2.
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
-    const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
-    const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
-    const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
-    const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
-    const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
-    const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
-    const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
-    const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
-    const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
-    const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
-
-    const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
-    const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
-    const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
-    const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
-    const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
-    const __m128d  VD0      = _mm_set1_pd(1.0);
-
-    __m128d        z4;
-    __m128d        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_pd(z2, z2);
-    /* Denominator D(z^2): odd powers of z^2 in polyVD1, even powers in polyVD0 */
-    polyVD1        = _mm_mul_pd(VD5, z4);
-    polyVD0        = _mm_mul_pd(VD4, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD3);
-    polyVD0        = _mm_add_pd(polyVD0, VD2);
-    polyVD1        = _mm_mul_pd(polyVD1, z4);
-    polyVD0        = _mm_mul_pd(polyVD0, z4);
-    polyVD1        = _mm_add_pd(polyVD1, VD1);
-    polyVD0        = _mm_add_pd(polyVD0, VD0);
-    polyVD1        = _mm_mul_pd(polyVD1, z2);
-    polyVD0        = _mm_add_pd(polyVD0, polyVD1);
-    /* From here on polyVD0 holds the inverted denominator */
-    polyVD0        = gmx_mm_inv_pd(polyVD0);
-    /* Numerator N(z^2): odd powers of z^2 in polyVN1, even powers in polyVN0 */
-    polyVN1        = _mm_mul_pd(VN9, z4);
-    polyVN0        = _mm_mul_pd(VN8, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN7);
-    polyVN0        = _mm_add_pd(polyVN0, VN6);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN5);
-    polyVN0        = _mm_add_pd(polyVN0, VN4);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN3);
-    polyVN0        = _mm_add_pd(polyVN0, VN2);
-    polyVN1        = _mm_mul_pd(polyVN1, z4);
-    polyVN0        = _mm_mul_pd(polyVN0, z4);
-    polyVN1        = _mm_add_pd(polyVN1, VN1);
-    polyVN0        = _mm_add_pd(polyVN0, VN0);
-    polyVN1        = _mm_mul_pd(polyVN1, z2);
-    polyVN0        = _mm_add_pd(polyVN0, polyVN1);
-
-    return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-/* Compute sin(x) and cos(x) together for both doubles in x.
- *
- * The results are stored through sinval and cosval; the return value is
- * always 0.
- *
- * Method: |x| is scaled by 32/Pi and rounded to the nearest table point.
- * The table holds {cos,sin} pairs at 17 points k*(Pi/2)/16, k=0..16, and
- * the index is folded back into that range with the sign flips tracked in
- * sswapsign/cswapsign. The remainder z to the table point is evaluated with
- * short polynomials in z, and the two are combined with the angle-addition
- * formulas. The sign of x is re-applied to the sine only.
- */
-static int
-gmx_mm_sincos_pd(__m128d  x,
-                 __m128d *sinval,
-                 __m128d *cosval)
-{
-#ifdef _MSC_VER
-    __declspec(align(16))
-    const double sintable[34] =
-    {
-        1.00000000000000000e+00, 0.00000000000000000e+00,
-        9.95184726672196929e-01, 9.80171403295606036e-02,
-        9.80785280403230431e-01, 1.95090322016128248e-01,
-        9.56940335732208824e-01, 2.90284677254462331e-01,
-        9.23879532511286738e-01, 3.82683432365089782e-01,
-        8.81921264348355050e-01, 4.71396736825997642e-01,
-        8.31469612302545236e-01, 5.55570233019602178e-01,
-        7.73010453362736993e-01, 6.34393284163645488e-01,
-        7.07106781186547573e-01, 7.07106781186547462e-01,
-        6.34393284163645599e-01, 7.73010453362736882e-01,
-        5.55570233019602289e-01, 8.31469612302545125e-01,
-        4.71396736825997809e-01, 8.81921264348354939e-01,
-        3.82683432365089837e-01, 9.23879532511286738e-01,
-        2.90284677254462276e-01, 9.56940335732208935e-01,
-        1.95090322016128304e-01, 9.80785280403230431e-01,
-        9.80171403295607702e-02, 9.95184726672196818e-01,
-        0.0, 1.00000000000000000e+00
-    };
-#else
-    const __m128d sintable[17] =
-    {
-        _mm_set_pd( 0.0, 1.0 ),
-        _mm_set_pd( sin(  1.0 * (M_PI/2.0) / 16.0), cos(  1.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  2.0 * (M_PI/2.0) / 16.0), cos(  2.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  3.0 * (M_PI/2.0) / 16.0), cos(  3.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  4.0 * (M_PI/2.0) / 16.0), cos(  4.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  5.0 * (M_PI/2.0) / 16.0), cos(  5.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  6.0 * (M_PI/2.0) / 16.0), cos(  6.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  7.0 * (M_PI/2.0) / 16.0), cos(  7.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  8.0 * (M_PI/2.0) / 16.0), cos(  8.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin(  9.0 * (M_PI/2.0) / 16.0), cos(  9.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
-        _mm_set_pd(  1.0, 0.0 )
-    };
-#endif
-    /* Despite its name, signmask has every bit EXCEPT the sign bit set */
-    const __m128d signmask       = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
-    const __m128d tabscale      = _mm_set1_pd(32.0/M_PI);
-    const __m128d invtabscale0  = _mm_set1_pd(9.81747508049011230469e-02);
-    const __m128d invtabscale1  = _mm_set1_pd(1.96197799156550576057e-08);
-    const __m128i ione          = _mm_set1_epi32(1);
-    const __m128i i32           = _mm_set1_epi32(32);
-    const __m128i i16           = _mm_set1_epi32(16);
-    const __m128i tabmask       = _mm_set1_epi32(0x3F);
-    const __m128d sinP7         = _mm_set1_pd(-1.0/5040.0);
-    const __m128d sinP5         = _mm_set1_pd(1.0/120.0);
-    const __m128d sinP3         = _mm_set1_pd(-1.0/6.0);
-    const __m128d sinP1         = _mm_set1_pd(1.0);
-
-    const __m128d cosP6         = _mm_set1_pd(-1.0/720.0);
-    const __m128d cosP4         = _mm_set1_pd(1.0/24.0);
-    const __m128d cosP2         = _mm_set1_pd(-1.0/2.0);
-    const __m128d cosP0         = _mm_set1_pd(1.0);
-
-    __m128d       scalex;
-    __m128i       tabidx, corridx;
-    __m128d       xabs, z, z2, polySin, polyCos;
-    __m128d       xpoint;
-    __m128d       ypoint0, ypoint1;
-
-    __m128d       sinpoint, cospoint;
-    __m128d       xsign, ssign, csign;
-    __m128i       imask, sswapsign, cswapsign;
-    /* Split x into its sign bit (xsign) and magnitude (xabs) */
-    xsign    = _mm_andnot_pd(signmask, x);
-    xabs     = _mm_and_pd(x, signmask);
-
-    scalex   = _mm_mul_pd(tabscale, xabs);
-    tabidx   = _mm_cvtpd_epi32(scalex);
-
-    xpoint   = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
-    /* Extended precision arithmetics */
-    z        = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
-    z        = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
-    /* Range reduction to 0..2*Pi */
-    tabidx   = _mm_and_si128(tabidx, tabmask);
-
-    /* tabidx is now in range [0..63] (masked with 0x3F) */
-    imask     = _mm_cmpgt_epi32(tabidx, i32);
-    sswapsign = imask;
-    cswapsign = imask;
-    corridx   = _mm_and_si128(imask, i32);
-    tabidx    = _mm_sub_epi32(tabidx, corridx);
-
-    /* tabidx is now in range [0..32] */
-    imask     = _mm_cmpgt_epi32(tabidx, i16);
-    cswapsign = _mm_xor_si128(cswapsign, imask);
-    corridx   = _mm_sub_epi32(i32, tabidx);
-    tabidx    = _mm_blendv_epi8(tabidx, corridx, imask);
-    /* tabidx is now in range [0..16] */
-    ssign     = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) ); /* mask|1 is -1 or +1 */
-    csign     = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
-    ypoint0  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
-    ypoint1  = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
-    ypoint0  = sintable[_mm_extract_epi32(tabidx, 0)];
-    ypoint1  = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
-    sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1); /* high halves hold the sines */
-    cospoint = _mm_unpacklo_pd(ypoint0, ypoint1); /* low halves hold the cosines */
-
-    sinpoint = _mm_mul_pd(sinpoint, ssign);
-    cospoint = _mm_mul_pd(cospoint, csign);
-
-    z2       = _mm_mul_pd(z, z);
-    /* polySin = z - z^3/6 + z^5/120 - z^7/5040 */
-    polySin  = _mm_mul_pd(sinP7, z2);
-    polySin  = _mm_add_pd(polySin, sinP5);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP3);
-    polySin  = _mm_mul_pd(polySin, z2);
-    polySin  = _mm_add_pd(polySin, sinP1);
-    polySin  = _mm_mul_pd(polySin, z);
-    /* polyCos = 1 - z^2/2 + z^4/24 - z^6/720 */
-    polyCos  = _mm_mul_pd(cosP6, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP4);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP2);
-    polyCos  = _mm_mul_pd(polyCos, z2);
-    polyCos  = _mm_add_pd(polyCos, cosP0);
-    /* sin(a+z) = sin(a)cos(z) + cos(a)sin(z);  cos(a+z) = cos(a)cos(z) - sin(a)sin(z) */
-    *sinval  = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
-    *cosval  = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
-    return 0;
-}
-
-/*
- * Sine of both doubles in x.
- *
- * This is a thin wrapper that runs the full gmx_mm_sincos_pd() routine and
- * throws the cosine away. If you need the cosine too, call
- * gmx_mm_sincos_pd() directly instead of pairing this with gmx_mm_cos_pd(),
- * or you pay for the evaluation twice.
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
-    __m128d sine;
-    __m128d unused_cosine;
-
-    gmx_mm_sincos_pd(x, &sine, &unused_cosine);
-
-    return sine;
-}
-
-/*
- * Cosine of both doubles in x.
- *
- * This is a thin wrapper that runs the full gmx_mm_sincos_pd() routine and
- * throws the sine away. If you need the sine too, call
- * gmx_mm_sincos_pd() directly instead of pairing this with gmx_mm_sin_pd(),
- * or you pay for the evaluation twice.
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
-    __m128d unused_sine;
-    __m128d cosine;
-
-    gmx_mm_sincos_pd(x, &unused_sine, &cosine);
-
-    return cosine;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
-    /* tan(x) = sin(x) * (1/cos(x)), with both factors taken from a single
-     * gmx_mm_sincos_pd() evaluation.
-     */
-    __m128d s, c;
-
-    gmx_mm_sincos_pd(x, &s, &c);
-
-    return _mm_mul_pd(s, gmx_mm_inv_pd(c));
-}
-
-
-/* Arcsine of both doubles in x.
- *
- * Both rational approximations are evaluated for every lane and the
- * applicable one is selected with blends:
- *   |x| >  0.625 : R/S rational in zz = 1-|x|, combined with sqrt(2*zz)
- *   |x| <= 0.625 : P/Q rational in ww = x^2, result |x| + |x|*q
- *   |x| <= 1e-8  : |x| is returned unchanged
- * The sign of x is put back on the result with an XOR at the end.
- */
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.625);
-    const __m128d limit2    = _mm_set1_pd(1e-8);
-    const __m128d one       = _mm_set1_pd(1.0);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
-
-    const __m128d P5        = _mm_set1_pd(4.253011369004428248960e-3);
-    const __m128d P4        = _mm_set1_pd(-6.019598008014123785661e-1);
-    const __m128d P3        = _mm_set1_pd(5.444622390564711410273e0);
-    const __m128d P2        = _mm_set1_pd(-1.626247967210700244449e1);
-    const __m128d P1        = _mm_set1_pd(1.956261983317594739197e1);
-    const __m128d P0        = _mm_set1_pd(-8.198089802484824371615e0);
-
-    const __m128d Q4        = _mm_set1_pd(-1.474091372988853791896e1);
-    const __m128d Q3        = _mm_set1_pd(7.049610280856842141659e1);
-    const __m128d Q2        = _mm_set1_pd(-1.471791292232726029859e2);
-    const __m128d Q1        = _mm_set1_pd(1.395105614657485689735e2);
-    const __m128d Q0        = _mm_set1_pd(-4.918853881490881290097e1);
-
-    const __m128d R4        = _mm_set1_pd(2.967721961301243206100e-3);
-    const __m128d R3        = _mm_set1_pd(-5.634242780008963776856e-1);
-    const __m128d R2        = _mm_set1_pd(6.968710824104713396794e0);
-    const __m128d R1        = _mm_set1_pd(-2.556901049652824852289e1);
-    const __m128d R0        = _mm_set1_pd(2.853665548261061424989e1);
-
-    const __m128d S3        = _mm_set1_pd(-2.194779531642920639778e1);
-    const __m128d S2        = _mm_set1_pd(1.470656354026814941758e2);
-    const __m128d S1        = _mm_set1_pd(-3.838770957603691357202e2);
-    const __m128d S0        = _mm_set1_pd(3.424398657913078477438e2);
-
-    __m128d       sign;
-    __m128d       mask;
-    __m128d       xabs;
-    __m128d       zz, ww, z, q, w, zz2, ww2;
-    __m128d       PA, PB;
-    __m128d       QA, QB;
-    __m128d       RA, RB;
-    __m128d       SA, SB;
-    __m128d       nom, denom;
-    /* Split x into its sign bit and magnitude */
-    sign  = _mm_andnot_pd(signmask, x);
-    xabs  = _mm_and_pd(x, signmask);
-    /* mask is set in lanes that take the |x| > 0.625 branch */
-    mask  = _mm_cmpgt_pd(xabs, limit1);
-
-    zz    = _mm_sub_pd(one, xabs);
-    ww    = _mm_mul_pd(xabs, xabs);
-    zz2   = _mm_mul_pd(zz, zz);
-    ww2   = _mm_mul_pd(ww, ww);
-
-    /* R */
-    RA    = _mm_mul_pd(R4, zz2);
-    RB    = _mm_mul_pd(R3, zz2);
-    RA    = _mm_add_pd(RA, R2);
-    RB    = _mm_add_pd(RB, R1);
-    RA    = _mm_mul_pd(RA, zz2);
-    RB    = _mm_mul_pd(RB, zz);
-    RA    = _mm_add_pd(RA, R0);
-    RA    = _mm_add_pd(RA, RB);
-
-    /* S, SA = zz2 */
-    SB    = _mm_mul_pd(S3, zz2);
-    SA    = _mm_add_pd(zz2, S2);
-    SB    = _mm_add_pd(SB, S1);
-    SA    = _mm_mul_pd(SA, zz2);
-    SB    = _mm_mul_pd(SB, zz);
-    SA    = _mm_add_pd(SA, S0);
-    SA    = _mm_add_pd(SA, SB);
-
-    /* P */
-    PA    = _mm_mul_pd(P5, ww2);
-    PB    = _mm_mul_pd(P4, ww2);
-    PA    = _mm_add_pd(PA, P3);
-    PB    = _mm_add_pd(PB, P2);
-    PA    = _mm_mul_pd(PA, ww2);
-    PB    = _mm_mul_pd(PB, ww2);
-    PA    = _mm_add_pd(PA, P1);
-    PB    = _mm_add_pd(PB, P0);
-    PA    = _mm_mul_pd(PA, ww);
-    PA    = _mm_add_pd(PA, PB);
-
-    /* Q, QA = ww2 */
-    QB    = _mm_mul_pd(Q4, ww2);
-    QA    = _mm_add_pd(ww2, Q3);
-    QB    = _mm_add_pd(QB, Q2);
-    QA    = _mm_mul_pd(QA, ww2);
-    QB    = _mm_mul_pd(QB, ww2);
-    QA    = _mm_add_pd(QA, Q1);
-    QB    = _mm_add_pd(QB, Q0);
-    QA    = _mm_mul_pd(QA, ww);
-    QA    = _mm_add_pd(QA, QB);
-
-    RA    = _mm_mul_pd(RA, zz);
-    PA    = _mm_mul_pd(PA, ww);
-    /* One shared division: zz*R/S where mask is set, ww*P/Q elsewhere */
-    nom   = _mm_blendv_pd( PA, RA, mask );
-    denom = _mm_blendv_pd( QA, SA, mask );
-
-    q     = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-    /* Large-|x| result: z = quarterpi - sqrt(2*zz) - (sqrt(2*zz)*q - morebits) + quarterpi */
-    zz    = _mm_add_pd(zz, zz);
-    zz    = gmx_mm_sqrt_pd(zz);
-    z     = _mm_sub_pd(quarterpi, zz);
-    zz    = _mm_mul_pd(zz, q);
-    zz    = _mm_sub_pd(zz, morebits);
-    z     = _mm_sub_pd(z, zz);
-    z     = _mm_add_pd(z, quarterpi);
-    /* Small-|x| result: w = |x| + |x|*q */
-    w     = _mm_mul_pd(xabs, q);
-    w     = _mm_add_pd(w, xabs);
-
-    z     = _mm_blendv_pd( w, z, mask );
-    /* For |x| <= limit2 the result is just |x| */
-    mask  = _mm_cmpgt_pd(xabs, limit2);
-    z     = _mm_blendv_pd( xabs, z, mask );
-
-    z = _mm_xor_pd(z, sign);
-
-    return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
-    /* Arccosine of both doubles in x, built on gmx_mm_asin_pd():
-     *   x >  0.5 : 2*asin(sqrt(0.5*(1-x)))
-     *   x <= 0.5 : (pio4 - asin(x)) + morebits + pio4
-     * Both candidates are formed for every lane and blended at the end.
-     * morebits has the same value as the constant of that name in
-     * gmx_mm_asin_pd().
-     */
-    const __m128d c_one     = _mm_set1_pd(1.0);
-    const __m128d c_half    = _mm_set1_pd(0.5);
-    const __m128d pio4      = _mm_set1_pd(7.85398163397448309616e-1);
-    const __m128d morebits  = _mm_set1_pd(6.123233995736765886130e-17);
-
-    /* Lanes with x > 0.5 feed sqrt(0.5*(1-x)) to asin, the others feed x */
-    const __m128d above     = _mm_cmpgt_pd(x, c_half);
-    const __m128d reduced   = gmx_mm_sqrt_pd(_mm_mul_pd(c_half, _mm_sub_pd(c_one, x)));
-    const __m128d asinval   = gmx_mm_asin_pd(_mm_blendv_pd( x, reduced, above ));
-
-    /* Candidate results for the two ranges */
-    const __m128d doubled   = _mm_add_pd(asinval, asinval);
-    const __m128d shifted   = _mm_add_pd(_mm_add_pd(_mm_sub_pd(pio4, asinval), morebits), pio4);
-
-    return _mm_blendv_pd(shifted, doubled, above);
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
-    /* Same algorithm as cephes library */
-    const __m128d signmask  = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-    const __m128d limit1    = _mm_set1_pd(0.66);
-    const __m128d limit2    = _mm_set1_pd(2.41421356237309504880);
-    const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
-    const __m128d halfpi    = _mm_set1_pd(M_PI/2.0);
-    const __m128d mone      = _mm_set1_pd(-1.0);
-    const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
-    const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
-    const __m128d P4        = _mm_set1_pd(-8.750608600031904122785E-1);
-    const __m128d P3        = _mm_set1_pd(-1.615753718733365076637E1);
-    const __m128d P2        = _mm_set1_pd(-7.500855792314704667340E1);
-    const __m128d P1        = _mm_set1_pd(-1.228866684490136173410E2);
-    const __m128d P0        = _mm_set1_pd(-6.485021904942025371773E1);
-
-    const __m128d Q4        = _mm_set1_pd(2.485846490142306297962E1);
-    const __m128d Q3        = _mm_set1_pd(1.650270098316988542046E2);
-    const __m128d Q2        = _mm_set1_pd(4.328810604912902668951E2);
-    const __m128d Q1        = _mm_set1_pd(4.853903996359136964868E2);
-    const __m128d Q0        = _mm_set1_pd(1.945506571482613964425E2);
-
-    __m128d       sign;
-    __m128d       mask1, mask2;
-    __m128d       y, t1, t2;
-    __m128d       z, z2;
-    __m128d       P_A, P_B, Q_A, Q_B;
-
-    sign   = _mm_andnot_pd(signmask, x);
-    x      = _mm_and_pd(x, signmask);
-
-    mask1  = _mm_cmpgt_pd(x, limit1);
-    mask2  = _mm_cmpgt_pd(x, limit2);
-
-    t1     = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
-    t2     = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
-    y      = _mm_and_pd(mask1, quarterpi);
-    y      = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
-    x      = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
-    x      = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
-    z      = _mm_mul_pd(x, x);
-    z2     = _mm_mul_pd(z, z);
-
-    P_A    = _mm_mul_pd(P4, z2);
-    P_B    = _mm_mul_pd(P3, z2);
-    P_A    = _mm_add_pd(P_A, P2);
-    P_B    = _mm_add_pd(P_B, P1);
-    P_A    = _mm_mul_pd(P_A, z2);
-    P_B    = _mm_mul_pd(P_B, z);
-    P_A    = _mm_add_pd(P_A, P0);
-    P_A    = _mm_add_pd(P_A, P_B);
-
-    /* Q_A = z2 */
-    Q_B    = _mm_mul_pd(Q4, z2);
-    Q_A    = _mm_add_pd(z2, Q3);
-    Q_B    = _mm_add_pd(Q_B, Q2);
-    Q_A    = _mm_mul_pd(Q_A, z2);
-    Q_B    = _mm_mul_pd(Q_B, z2);
-    Q_A    = _mm_add_pd(Q_A, Q1);
-    Q_B    = _mm_add_pd(Q_B, Q0);
-    Q_A    = _mm_mul_pd(Q_A, z);
-    Q_A    = _mm_add_pd(Q_A, Q_B);
-
-    z      = _mm_mul_pd(z, P_A);
-    z      = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
-    z      = _mm_mul_pd(z, x);
-    z      = _mm_add_pd(z, x);
-
-    t1     = _mm_and_pd(mask1, morebits1);
-    t1     = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
-    z      = _mm_add_pd(z, t1);
-    y      = _mm_add_pd(y, z);
-
-    y      = _mm_xor_pd(y, sign);
-
-    return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
-    const __m128d pi          = _mm_set1_pd(M_PI);
-    const __m128d minuspi     = _mm_set1_pd(-M_PI);
-    const __m128d halfpi      = _mm_set1_pd(M_PI/2.0);
-    const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
-    __m128d       z, z1, z3, z4;
-    __m128d       w;
-    __m128d       maskx_lt, maskx_eq;
-    __m128d       masky_lt, masky_eq;
-    __m128d       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_pd(x, _mm_setzero_pd());
-    masky_lt  = _mm_cmplt_pd(y, _mm_setzero_pd());
-    maskx_eq  = _mm_cmpeq_pd(x, _mm_setzero_pd());
-    masky_eq  = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
-    z         = _mm_mul_pd(y, gmx_mm_inv_pd(x));
-    z         = gmx_mm_atan_pd(z);
-
-    mask1     = _mm_and_pd(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_pd(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_pd(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
-    z         = _mm_andnot_pd(maskall, z);
-    z1        = _mm_and_pd(mask1, minushalfpi);
-    z3        = _mm_and_pd(mask3, halfpi);
-    z4        = _mm_and_pd(mask4, pi);
-
-    z         = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
-    w         = _mm_blendv_pd(pi, minuspi, masky_lt);
-    w         = _mm_and_pd(w, maskx_lt);
-
-    w         = _mm_andnot_pd(maskall, w);
-
-    z         = _mm_add_pd(z, w);
  
-    return z;
-}
+#define gmx_mm_invsqrt_pd   gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd       gmx_simd_inv_d
+#define gmx_mm_log_pd       gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd  gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd  gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd    gmx_simd_sincos_d
  
  #endif
diff --git a/src/gromacs/simd/math_x86_sse4_1_single.h b/src/gromacs/simd/math_x86_sse4_1_single.h

index 24fbc21aab28ad394b131b3146f890ee6203bba9..2fbdf23b3aa76f1f4b274c2e92d80cec3abee377 100644 (file)
--- a/src/gromacs/simd/math_x86_sse4_1_single.h
+++ b/src/gromacs/simd/math_x86_sse4_1_single.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,1145 +35,19 @@
  #ifndef GMX_SIMD_MATH_SSE4_1_SINGLE_H
  #define GMX_SIMD_MATH_SSE4_1_SINGLE_H
  
-#include <stdio.h>
-#include <math.h>
+#include "simd_math.h"
  
-#include "general_x86_sse4_1.h"
-
-
-
-#ifndef M_PI
-#  define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-
-/************************
- *                      *
- * Simple math routines *
- *                      *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
-    const __m128 half  = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
-    const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
-    __m128       lu = _mm_rsqrt_ps(x);
-
-    return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
-    __m128 mask;
-    __m128 res;
-
-    mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    res  = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
-    res  = _mm_mul_ps(x, res);
-
-    return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
-    const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
-    __m128       lu = _mm_rcp_ps(x);
-
-    return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
-    return _mm_and_ps(x, signmask);
-}
-
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128  expmask    = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
-    const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
-    const __m128  half       = _mm_set1_ps(0.5f);
-    const __m128  one        = _mm_set1_ps(1.0f);
-    const __m128  invsq2     = _mm_set1_ps(1.0f/sqrt(2.0f));
-    const __m128  corr1      = _mm_set1_ps(-2.12194440e-4f);
-    const __m128  corr2      = _mm_set1_ps(0.693359375f);
-
-    const __m128  CA_1        = _mm_set1_ps(0.070376836292f);
-    const __m128  CB_0        = _mm_set1_ps(1.6714950086782716f);
-    const __m128  CB_1        = _mm_set1_ps(-2.452088066061482f);
-    const __m128  CC_0        = _mm_set1_ps(1.5220770854701728f);
-    const __m128  CC_1        = _mm_set1_ps(-1.3422238433233642f);
-    const __m128  CD_0        = _mm_set1_ps(1.386218787509749f);
-    const __m128  CD_1        = _mm_set1_ps(0.35075468953796346f);
-    const __m128  CE_0        = _mm_set1_ps(1.3429983063133937f);
-    const __m128  CE_1        = _mm_set1_ps(1.807420826584643f);
-
-    __m128        fexp;
-    __m128i       iexp;
-    __m128        mask;
-    __m128        x2;
-    __m128        y;
-    __m128        pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
-    /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
-    fexp  = _mm_and_ps(x, expmask);
-    iexp  = gmx_mm_castps_si128(fexp);
-    iexp  = _mm_srli_epi32(iexp, 23);
-    iexp  = _mm_sub_epi32(iexp, expbase_m1);
-
-    x     = _mm_andnot_ps(expmask, x);
-    x     = _mm_or_ps(x, one);
-    x     = _mm_mul_ps(x, half);
-
-    mask  = _mm_cmplt_ps(x, invsq2);
-
-    x     = _mm_add_ps(x, _mm_and_ps(mask, x));
-    x     = _mm_sub_ps(x, one);
-    iexp  = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
-    x2    = _mm_mul_ps(x, x);
-
-    pA    = _mm_mul_ps(CA_1, x);
-    pB    = _mm_mul_ps(CB_1, x);
-    pC    = _mm_mul_ps(CC_1, x);
-    pD    = _mm_mul_ps(CD_1, x);
-    pE    = _mm_mul_ps(CE_1, x);
-    tB    = _mm_add_ps(CB_0, x2);
-    tC    = _mm_add_ps(CC_0, x2);
-    tD    = _mm_add_ps(CD_0, x2);
-    tE    = _mm_add_ps(CE_0, x2);
-    pB    = _mm_add_ps(pB, tB);
-    pC    = _mm_add_ps(pC, tC);
-    pD    = _mm_add_ps(pD, tD);
-    pE    = _mm_add_ps(pE, tE);
-
-    pA    = _mm_mul_ps(pA, pB);
-    pC    = _mm_mul_ps(pC, pD);
-    pE    = _mm_mul_ps(pE, x2);
-    pA    = _mm_mul_ps(pA, pC);
-    y     = _mm_mul_ps(pA, pE);
-
-    fexp  = _mm_cvtepi32_ps(iexp);
-    y     = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
-    y     = _mm_sub_ps(y, _mm_mul_ps(half, x2));
-    x2    = _mm_add_ps(x, y);
-
-    x2    = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
-    return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) minimax polynomial on the interval
- * [-0.5,0.5]. The coefficients were derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
-    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
-    const __m128  arglimit = _mm_set1_ps(126.0f);
-
-    const __m128i expbase  = _mm_set1_epi32(127);
-    const __m128  CA6      = _mm_set1_ps(1.535336188319500E-004);
-    const __m128  CA5      = _mm_set1_ps(1.339887440266574E-003);
-    const __m128  CA4      = _mm_set1_ps(9.618437357674640E-003);
-    const __m128  CA3      = _mm_set1_ps(5.550332471162809E-002);
-    const __m128  CA2      = _mm_set1_ps(2.402264791363012E-001);
-    const __m128  CA1      = _mm_set1_ps(6.931472028550421E-001);
-    const __m128  CA0      = _mm_set1_ps(1.0f);
-
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-    __m128        x2;
-    __m128        p0, p1;
-
-    iexppart  = _mm_cvtps_epi32(x);
-    intpart   = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    x         = _mm_sub_ps(x, intpart);
-    x2        = _mm_mul_ps(x, x);
-
-    p0        = _mm_mul_ps(CA6, x2);
-    p1        = _mm_mul_ps(CA5, x2);
-    p0        = _mm_add_ps(p0, CA4);
-    p1        = _mm_add_ps(p1, CA3);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_add_ps(p0, CA2);
-    p1        = _mm_add_ps(p1, CA1);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, CA0);
-    p0        = _mm_add_ps(p0, p1);
-    x         = _mm_mul_ps(p0, fexppart);
-
-    return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetic to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
-    const __m128  argscale      = _mm_set1_ps(1.44269504088896341f);
-    /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
-    const __m128  arglimit      = _mm_set1_ps(126.0f);
-    const __m128i expbase       = _mm_set1_epi32(127);
-
-    const __m128  invargscale0  = _mm_set1_ps(0.693359375f);
-    const __m128  invargscale1  = _mm_set1_ps(-2.12194440e-4f);
-
-    const __m128  CC5           = _mm_set1_ps(1.9875691500e-4f);
-    const __m128  CC4           = _mm_set1_ps(1.3981999507e-3f);
-    const __m128  CC3           = _mm_set1_ps(8.3334519073e-3f);
-    const __m128  CC2           = _mm_set1_ps(4.1665795894e-2f);
-    const __m128  CC1           = _mm_set1_ps(1.6666665459e-1f);
-    const __m128  CC0           = _mm_set1_ps(5.0000001201e-1f);
-    const __m128  one           = _mm_set1_ps(1.0f);
-
-    __m128        y, x2;
-    __m128        p0, p1;
-    __m128        valuemask;
-    __m128i       iexppart;
-    __m128        fexppart;
-    __m128        intpart;
-
-    y = _mm_mul_ps(x, argscale);
-
-    iexppart  = _mm_cvtps_epi32(y);
-    intpart   = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
-    iexppart  = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
-    valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
-    fexppart  = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
-    /* Extended precision arithmetics */
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
-    x         = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
-    x2        = _mm_mul_ps(x, x);
-
-    p1        = _mm_mul_ps(CC5, x2);
-    p0        = _mm_mul_ps(CC4, x2);
-    p1        = _mm_add_ps(p1, CC3);
-    p0        = _mm_add_ps(p0, CC2);
-    p1        = _mm_mul_ps(p1, x2);
-    p0        = _mm_mul_ps(p0, x2);
-    p1        = _mm_add_ps(p1, CC1);
-    p0        = _mm_add_ps(p0, CC0);
-    p1        = _mm_mul_ps(p1, x);
-    p0        = _mm_add_ps(p0, p1);
-    p0        = _mm_mul_ps(p0, x2);
-    x         = _mm_add_ps(x, one);
-    x         = _mm_add_ps(x, p0);
-
-    x         = _mm_mul_ps(x, fexppart);
-
-    return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P(1/x-1) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P(1/x) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
-
-    return res;
-}
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
-    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
-    const __m128  CA6      = _mm_set1_ps(7.853861353153693e-5f);
-    const __m128  CA5      = _mm_set1_ps(-8.010193625184903e-4f);
-    const __m128  CA4      = _mm_set1_ps(5.188327685732524e-3f);
-    const __m128  CA3      = _mm_set1_ps(-2.685381193529856e-2f);
-    const __m128  CA2      = _mm_set1_ps(1.128358514861418e-1f);
-    const __m128  CA1      = _mm_set1_ps(-3.761262582423300e-1f);
-    const __m128  CA0      = _mm_set1_ps(1.128379165726710f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P(1/x-1) in range [0.67,2] */
-    const __m128  CB9      = _mm_set1_ps(-0.0018629930017603923f);
-    const __m128  CB8      = _mm_set1_ps(0.003909821287598495f);
-    const __m128  CB7      = _mm_set1_ps(-0.0052094582210355615f);
-    const __m128  CB6      = _mm_set1_ps(0.005685614362160572f);
-    const __m128  CB5      = _mm_set1_ps(-0.0025367682853477272f);
-    const __m128  CB4      = _mm_set1_ps(-0.010199799682318782f);
-    const __m128  CB3      = _mm_set1_ps(0.04369575504816542f);
-    const __m128  CB2      = _mm_set1_ps(-0.11884063474674492f);
-    const __m128  CB1      = _mm_set1_ps(0.2732120154030589f);
-    const __m128  CB0      = _mm_set1_ps(0.42758357702025784f);
-    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P(1/x) in range [2,9.19] */
-    const __m128  CC10     = _mm_set1_ps(-0.0445555913112064f);
-    const __m128  CC9      = _mm_set1_ps(0.21376355144663348f);
-    const __m128  CC8      = _mm_set1_ps(-0.3473187200259257f);
-    const __m128  CC7      = _mm_set1_ps(0.016690861551248114f);
-    const __m128  CC6      = _mm_set1_ps(0.7560973182491192f);
-    const __m128  CC5      = _mm_set1_ps(-1.2137903600145787f);
-    const __m128  CC4      = _mm_set1_ps(0.8411872321232948f);
-    const __m128  CC3      = _mm_set1_ps(-0.08670413896296343f);
-    const __m128  CC2      = _mm_set1_ps(-0.27124782687240334f);
-    const __m128  CC1      = _mm_set1_ps(-0.0007502488047806069f);
-    const __m128  CC0      = _mm_set1_ps(0.5642114853803148f);
-
-    /* Coefficients for expansion of exp(x) in [0,0.1] */
-    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
-    const __m128  CD2      = _mm_set1_ps(0.5000066608081202f);
-    const __m128  CD3      = _mm_set1_ps(0.1664795422874624f);
-    const __m128  CD4      = _mm_set1_ps(0.04379839977652482f);
-
-    const __m128  sieve    = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
-    const __m128  signbit  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-    const __m128  one      = _mm_set1_ps(1.0f);
-    const __m128  two      = _mm_set1_ps(2.0f);
-
-    __m128        x2, x4, y;
-    __m128        z, q, t, t2, w, w2;
-    __m128        pA0, pA1, pB0, pB1, pC0, pC1;
-    __m128        expmx2, corr;
-    __m128        res_erf, res_erfc, res;
-    __m128        mask;
-
-    /* Calculate erf() */
-    x2     = _mm_mul_ps(x, x);
-    x4     = _mm_mul_ps(x2, x2);
-
-    pA0  = _mm_mul_ps(CA6, x4);
-    pA1  = _mm_mul_ps(CA5, x4);
-    pA0  = _mm_add_ps(pA0, CA4);
-    pA1  = _mm_add_ps(pA1, CA3);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x4);
-    pA0  = _mm_add_ps(pA0, CA2);
-    pA1  = _mm_add_ps(pA1, CA1);
-    pA0  = _mm_mul_ps(pA0, x4);
-    pA1  = _mm_mul_ps(pA1, x2);
-    pA0  = _mm_add_ps(pA0, pA1);
-    pA0  = _mm_add_ps(pA0, CA0);
-
-    res_erf = _mm_mul_ps(x, pA0);
-
-    /* Calculate erfc */
-    y       = gmx_mm_abs_ps(x);
-    t       = gmx_mm_inv_ps(y);
-    w       = _mm_sub_ps(t, one);
-    t2      = _mm_mul_ps(t, t);
-    w2      = _mm_mul_ps(w, w);
-    /*
-     * We cannot simply calculate exp(-x2) directly in single precision, since
-     * that will lose a couple of bits of precision due to the multiplication.
-     * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
-     * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
-     *
-     * The only drawback with this is that it requires TWO separate exponential
-     * evaluations, which would be horrible performance-wise. However, the argument
-     * for the second exp() call is always small, so there we simply use a
-     * low-order minimax expansion on [0,0.1].
-     */
-
-    z       = _mm_and_ps(y, sieve);
-    q       = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
-    corr    = _mm_mul_ps(CD4, q);
-    corr    = _mm_add_ps(corr, CD3);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, CD2);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-    corr    = _mm_mul_ps(corr, q);
-    corr    = _mm_add_ps(corr, one);
-
-    expmx2  = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
-    expmx2  = _mm_mul_ps(expmx2, corr);
-
-    pB1  = _mm_mul_ps(CB9, w2);
-    pB0  = _mm_mul_ps(CB8, w2);
-    pB1  = _mm_add_ps(pB1, CB7);
-    pB0  = _mm_add_ps(pB0, CB6);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB5);
-    pB0  = _mm_add_ps(pB0, CB4);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB3);
-    pB0  = _mm_add_ps(pB0, CB2);
-    pB1  = _mm_mul_ps(pB1, w2);
-    pB0  = _mm_mul_ps(pB0, w2);
-    pB1  = _mm_add_ps(pB1, CB1);
-    pB1  = _mm_mul_ps(pB1, w);
-    pB0  = _mm_add_ps(pB0, pB1);
-    pB0  = _mm_add_ps(pB0, CB0);
-
-    pC0  = _mm_mul_ps(CC10, t2);
-    pC1  = _mm_mul_ps(CC9, t2);
-    pC0  = _mm_add_ps(pC0, CC8);
-    pC1  = _mm_add_ps(pC1, CC7);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC6);
-    pC1  = _mm_add_ps(pC1, CC5);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC4);
-    pC1  = _mm_add_ps(pC1, CC3);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t2);
-    pC0  = _mm_add_ps(pC0, CC2);
-    pC1  = _mm_add_ps(pC1, CC1);
-    pC0  = _mm_mul_ps(pC0, t2);
-    pC1  = _mm_mul_ps(pC1, t);
-    pC0  = _mm_add_ps(pC0, pC1);
-    pC0  = _mm_add_ps(pC0, CC0);
-    pC0  = _mm_mul_ps(pC0, t);
-
-    /* SELECT pB0 or pC0 for erfc() */
-    mask     = _mm_cmplt_ps(two, y);
-    res_erfc = _mm_blendv_ps(pB0, pC0, mask);
-    res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
-    /* erfc(x<0) = 2-erfc(|x|) */
-    mask     = _mm_cmplt_ps(x, _mm_setzero_ps());
-    res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
-    /* Select erf() or erfc() */
-    mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
-    res  = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
-
-    return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approximate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simplify things slightly by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *       2*exp(-z^2)     erf(z)
- *       ------------ - --------
- *       sqrt(Pi)*z^2      z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- *       beta^3*2*exp(-z^2)     beta^3*erf(z)
- *       ------------------  - ---------------
- *          sqrt(Pi)*z^2            z^3
- *
- *    or, switching back to r (z=r*beta):
- *
- *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
- *       ----------------------- - -----------
- *            sqrt(Pi)*r^2            r^3
- *
- *
- *    With a bit of math exercise you should be able to confirm that
- *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- *    and you have your force (divided by r). A final multiplication
- *    with the vector connecting the two particles and you have your
- *    vectorial force to add to the particles.
- *
- */
-static gmx_inline __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
-    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
-    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
-    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
-    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
-    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
-    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
-    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);
-
-    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
-    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
-    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
-    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
-    const __m128  FD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyFN0, polyFN1, polyFD0, polyFD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyFD0        = _mm_mul_ps(FD4, z4);
-    polyFD1        = _mm_mul_ps(FD3, z4);
-    polyFD0        = _mm_add_ps(polyFD0, FD2);
-    polyFD1        = _mm_add_ps(polyFD1, FD1);
-    polyFD0        = _mm_mul_ps(polyFD0, z4);
-    polyFD1        = _mm_mul_ps(polyFD1, z2);
-    polyFD0        = _mm_add_ps(polyFD0, FD0);
-    polyFD0        = _mm_add_ps(polyFD0, polyFD1);
-
-    polyFD0        = gmx_mm_inv_ps(polyFD0);
-
-    polyFN0        = _mm_mul_ps(FN6, z4);
-    polyFN1        = _mm_mul_ps(FN5, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN4);
-    polyFN1        = _mm_add_ps(polyFN1, FN3);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z4);
-    polyFN0        = _mm_add_ps(polyFN0, FN2);
-    polyFN1        = _mm_add_ps(polyFN1, FN1);
-    polyFN0        = _mm_mul_ps(polyFN0, z4);
-    polyFN1        = _mm_mul_ps(polyFN1, z2);
-    polyFN0        = _mm_add_ps(polyFN0, FN0);
-    polyFN0        = _mm_add_ps(polyFN0, polyFN1);
-
-    return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- *        erf(z)
- *       --------
- *          z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- *       erf(r*beta)
- *       -----------
- *           r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- *    and you have your potential.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
   */
-static gmx_inline __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
-    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
-    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
-    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
-    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
-    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
-    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
-    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);
-
-    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
-    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
-    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
-    const __m128  VD0      = _mm_set1_ps(1.0f);
-
-    __m128        z4;
-    __m128        polyVN0, polyVN1, polyVD0, polyVD1;
-
-    z4             = _mm_mul_ps(z2, z2);
-
-    polyVD1        = _mm_mul_ps(VD3, z4);
-    polyVD0        = _mm_mul_ps(VD2, z4);
-    polyVD1        = _mm_add_ps(polyVD1, VD1);
-    polyVD0        = _mm_add_ps(polyVD0, VD0);
-    polyVD1        = _mm_mul_ps(polyVD1, z2);
-    polyVD0        = _mm_add_ps(polyVD0, polyVD1);
-
-    polyVD0        = gmx_mm_inv_ps(polyVD0);
-
-    polyVN0        = _mm_mul_ps(VN6, z4);
-    polyVN1        = _mm_mul_ps(VN5, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN4);
-    polyVN1        = _mm_add_ps(polyVN1, VN3);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z4);
-    polyVN0        = _mm_add_ps(polyVN0, VN2);
-    polyVN1        = _mm_add_ps(polyVN1, VN1);
-    polyVN0        = _mm_mul_ps(polyVN0, z4);
-    polyVN1        = _mm_mul_ps(polyVN1, z2);
-    polyVN0        = _mm_add_ps(polyVN0, VN0);
-    polyVN0        = _mm_add_ps(polyVN0, polyVN1);
-
-    return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_ps(__m128  x,
-                 __m128 *sinval,
-                 __m128 *cosval)
-{
-    const __m128  two_over_pi = _mm_set1_ps(2.0/M_PI);
-    const __m128  half        = _mm_set1_ps(0.5);
-    const __m128  one         = _mm_set1_ps(1.0);
-
-    const __m128i izero      = _mm_set1_epi32(0);
-    const __m128i ione       = _mm_set1_epi32(1);
-    const __m128i itwo       = _mm_set1_epi32(2);
-    const __m128i ithree     = _mm_set1_epi32(3);
-    const __m128  signbit    = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
-    const __m128  CA1         = _mm_set1_ps(1.5703125f);
-    const __m128  CA2         = _mm_set1_ps(4.837512969970703125e-4f);
-    const __m128  CA3         = _mm_set1_ps(7.54978995489188216e-8f);
-
-    const __m128  CC0         = _mm_set1_ps(-0.0013602249f);
-    const __m128  CC1         = _mm_set1_ps(0.0416566950f);
-    const __m128  CC2         = _mm_set1_ps(-0.4999990225f);
-    const __m128  CS0         = _mm_set1_ps(-0.0001950727f);
-    const __m128  CS1         = _mm_set1_ps(0.0083320758f);
-    const __m128  CS2         = _mm_set1_ps(-0.1666665247f);
-
-    __m128        y, y2;
-    __m128        z;
-    __m128i       iz;
-    __m128i       offset_sin, offset_cos;
-    __m128        tmp1, tmp2;
-    __m128        mask_sin, mask_cos;
-    __m128        tmp_sin, tmp_cos;
-
-    y          = _mm_mul_ps(x, two_over_pi);
-    y          = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
-    iz         = _mm_cvttps_epi32(y);
-    z          = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
-    offset_sin = _mm_and_si128(iz, ithree);
-    offset_cos = _mm_add_epi32(iz, ione);
-
-    /* Extended precision arithmethic to achieve full precision */
-    y               = _mm_mul_ps(z, CA1);
-    tmp1            = _mm_mul_ps(z, CA2);
-    tmp2            = _mm_mul_ps(z, CA3);
-    y               = _mm_sub_ps(x, y);
-    y               = _mm_sub_ps(y, tmp1);
-    y               = _mm_sub_ps(y, tmp2);
-
-    y2              = _mm_mul_ps(y, y);
-
-    tmp1            = _mm_mul_ps(CC0, y2);
-    tmp1            = _mm_add_ps(tmp1, CC1);
-    tmp2            = _mm_mul_ps(CS0, y2);
-    tmp2            = _mm_add_ps(tmp2, CS1);
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, CC2);
-    tmp2            = _mm_mul_ps(tmp2, y2);
-    tmp2            = _mm_add_ps(tmp2, CS2);
-
-    tmp1            = _mm_mul_ps(tmp1, y2);
-    tmp1            = _mm_add_ps(tmp1, one);
-
-    tmp2            = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
-    tmp2            = _mm_add_ps(tmp2, y);
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
-    tmp_sin         = _mm_blendv_ps(tmp1, tmp2, mask_sin);
-    tmp_cos         = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
-    mask_sin        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
-    mask_cos        = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
-    tmp1            = _mm_xor_ps(signbit, tmp_sin);
-    tmp2            = _mm_xor_ps(signbit, tmp_cos);
-
-    *sinval         = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
-    *cosval         = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
-    return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
-    __m128 s, c;
-    gmx_mm_sincos_ps(x, &s, &c);
-    return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
-    __m128 sinval, cosval;
-    __m128 tanval;
-
-    gmx_mm_sincos_ps(x, &sinval, &cosval);
-
-    tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
-    return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limitlow  = _mm_set1_ps(1e-4f);
-    const __m128 half      = _mm_set1_ps(0.5f);
-    const __m128 one       = _mm_set1_ps(1.0f);
-    const __m128 halfpi    = _mm_set1_ps(M_PI/2.0f);
-
-    const __m128 CC5        = _mm_set1_ps(4.2163199048E-2f);
-    const __m128 CC4        = _mm_set1_ps(2.4181311049E-2f);
-    const __m128 CC3        = _mm_set1_ps(4.5470025998E-2f);
-    const __m128 CC2        = _mm_set1_ps(7.4953002686E-2f);
-    const __m128 CC1        = _mm_set1_ps(1.6666752422E-1f);
-
-    __m128       sign;
-    __m128       mask;
-    __m128       xabs;
-    __m128       z, z1, z2, q, q1, q2;
-    __m128       pA, pB;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    xabs  = _mm_and_ps(x, signmask);
-
-    mask  = _mm_cmpgt_ps(xabs, half);
-
-    z1    = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
-    q1    = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
-    q1    = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
-    q2    = xabs;
-    z2    = _mm_mul_ps(q2, q2);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
-    q     = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
-    z2    = _mm_mul_ps(z, z);
-
-    pA    = _mm_mul_ps(CC5, z2);
-    pB    = _mm_mul_ps(CC4, z2);
-
-    pA    = _mm_add_ps(pA, CC3);
-    pB    = _mm_add_ps(pB, CC2);
-
-    pA    = _mm_mul_ps(pA, z2);
-    pB    = _mm_mul_ps(pB, z2);
-
-    pA    = _mm_add_ps(pA, CC1);
-    pA    = _mm_mul_ps(pA, z);
-
-    z     = _mm_add_ps(pA, pB);
-    z     = _mm_mul_ps(z, q);
-    z     = _mm_add_ps(z, q);
-
-    q2    = _mm_sub_ps(halfpi, z);
-    q2    = _mm_sub_ps(q2, z);
-
-    z     = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
-    mask  = _mm_cmpgt_ps(xabs, limitlow);
-    z     = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
-    z = _mm_xor_ps(z, sign);
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 one_ps    = _mm_set1_ps(1.0f);
-    const __m128 half_ps   = _mm_set1_ps(0.5f);
-    const __m128 pi_ps     = _mm_set1_ps(M_PI);
-    const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
-    __m128       mask1;
-    __m128       mask2;
-    __m128       xabs;
-    __m128       z, z1, z2, z3;
-
-    xabs  = _mm_and_ps(x, signmask);
-    mask1 = _mm_cmpgt_ps(xabs, half_ps);
-    mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
-
-    z     = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
-    z     = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
-    z     = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
-    z     = _mm_blendv_ps(x, z, mask1);
-    z     = gmx_mm_asin_ps(z);
-
-    z2    = _mm_add_ps(z, z);
-    z1    = _mm_sub_ps(pi_ps, z2);
-    z3    = _mm_sub_ps(halfpi_ps, z);
-
-    z     = _mm_blendv_ps(z1, z2, mask2);
-    z     = _mm_blendv_ps(z3, z, mask1);
-
-    return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
-    /* Same algorithm as cephes library */
-    const __m128 signmask  = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-    const __m128 limit1    = _mm_set1_ps(0.414213562373095f);
-    const __m128 limit2    = _mm_set1_ps(2.414213562373095f);
-    const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
-    const __m128 halfpi    = _mm_set1_ps(1.570796326794896f);
-    const __m128 mone      = _mm_set1_ps(-1.0f);
-    const __m128 CC3       = _mm_set1_ps(-3.33329491539E-1f);
-    const __m128 CC5       = _mm_set1_ps(1.99777106478E-1f);
-    const __m128 CC7       = _mm_set1_ps(-1.38776856032E-1);
-    const __m128 CC9       = _mm_set1_ps(8.05374449538e-2f);
-
-    __m128       sign;
-    __m128       mask1, mask2;
-    __m128       y, z1, z2;
-    __m128       x2, x4;
-    __m128       sum1, sum2;
-
-    sign  = _mm_andnot_ps(signmask, x);
-    x     = _mm_and_ps(x, signmask);
-
-    mask1 = _mm_cmpgt_ps(x, limit1);
-    mask2 = _mm_cmpgt_ps(x, limit2);
-
-    z1    = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
-    z2    = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
-    y     = _mm_and_ps(mask1, quarterpi);
-    y     = _mm_blendv_ps(y, halfpi, mask2);
-
-    x     = _mm_blendv_ps(x, z1, mask1);
-    x     = _mm_blendv_ps(x, z2, mask2);
-
-    x2    = _mm_mul_ps(x, x);
-    x4    = _mm_mul_ps(x2, x2);
-
-    sum1  = _mm_mul_ps(CC9, x4);
-    sum2  = _mm_mul_ps(CC7, x4);
-    sum1  = _mm_add_ps(sum1, CC5);
-    sum2  = _mm_add_ps(sum2, CC3);
-    sum1  = _mm_mul_ps(sum1, x4);
-    sum2  = _mm_mul_ps(sum2, x2);
-
-    sum1  = _mm_add_ps(sum1, sum2);
-    sum1  = _mm_sub_ps(sum1, mone);
-    sum1  = _mm_mul_ps(sum1, x);
-    y     = _mm_add_ps(y, sum1);
-
-    y     = _mm_xor_ps(y, sign);
-
-    return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
-    const __m128 pi          = _mm_set1_ps(M_PI);
-    const __m128 minuspi     = _mm_set1_ps(-M_PI);
-    const __m128 halfpi      = _mm_set1_ps(M_PI/2.0);
-    const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
-    __m128       z, z1, z3, z4;
-    __m128       w;
-    __m128       maskx_lt, maskx_eq;
-    __m128       masky_lt, masky_eq;
-    __m128       mask1, mask2, mask3, mask4, maskall;
-
-    maskx_lt  = _mm_cmplt_ps(x, _mm_setzero_ps());
-    masky_lt  = _mm_cmplt_ps(y, _mm_setzero_ps());
-    maskx_eq  = _mm_cmpeq_ps(x, _mm_setzero_ps());
-    masky_eq  = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
-    z         = _mm_mul_ps(y, gmx_mm_inv_ps(x));
-    z         = gmx_mm_atan_ps(z);
-
-    mask1     = _mm_and_ps(maskx_eq, masky_lt);
-    mask2     = _mm_andnot_ps(maskx_lt, masky_eq);
-    mask3     = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
-    mask4     = _mm_and_ps(masky_eq, maskx_lt);
-
-    maskall   = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
-    z         = _mm_andnot_ps(maskall, z);
-    z1        = _mm_and_ps(mask1, minushalfpi);
-    z3        = _mm_and_ps(mask3, halfpi);
-    z4        = _mm_and_ps(mask4, pi);
-
-    z         = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
-    mask1     = _mm_andnot_ps(masky_lt, maskx_lt);
-    mask2     = _mm_and_ps(maskx_lt, masky_lt);
-
-    w         = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
-    w         = _mm_andnot_ps(maskall, w);
-
-    z         = _mm_add_ps(z, w);
-
-    return z;
-}
-
  
+#define gmx_mm_invsqrt_ps   gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps       gmx_simd_inv_f
+#define gmx_mm_log_ps       gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps  gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps  gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps    gmx_simd_sincos_f
  
  #endif
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h

new file mode 100644 (file)

index 0000000..0e8716a
--- /dev/null
+++ b/src/gromacs/simd/simd.h
@@ -0,0 +1,1585 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal
+ *  \defgroup module_simd SIMD intrinsics interface (simd)
+ *  \ingroup group_utilitymodules
+ *
+ *  \brief Provides an architecture-independent way of doing SIMD coding.
+ *
+ *  Start by consulting the overview Doxygen SIMD module documentation which is
+ *  available in the internal library documentation (but not the public API),
+ *  and then the details are documented in simd.h and the reference
+ *  implementation impl_reference.h.
+ *
+ *  \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ */
+
+#ifndef GMX_SIMD_SIMD_H
+#define GMX_SIMD_SIMD_H
+
+/*! \libinternal \file
+ *
+ * \brief Definitions, capabilities, and wrappers for SIMD module.
+ *
+ * The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code.
+ * To support a new architecture, adding a new sub-include with macros here
+ * should be (nearly) all that is needed.
+ *
+ * The defines in this top-level file will set default Gromacs real precision
+ * operations to either single or double precision based on whether
+ * GMX_DOUBLE is defined. The actual implementation - including e.g.
+ * conversion operations specifically between single and double - is documented
+ * in impl_reference.h.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stddef.h>
+#include "gromacs/legacyheaders/types/simple.h"
+
+/* Forward declarations so memory allocation can be used in implementations */
+static gmx_inline float *  gmx_simd_align_f(float *p);
+static gmx_inline double * gmx_simd_align_d(double *p);
+static gmx_inline int *    gmx_simd_align_fi(int *p);
+static gmx_inline int *    gmx_simd_align_di(int *p);
+static gmx_inline float *  gmx_simd4_align_f(float *p);
+static gmx_inline double * gmx_simd4_align_d(double *p);
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name SIMD predefined macros to describe high-level capabilities
+ *
+ *  These macros are used to describe the features available in default
+ *  Gromacs real precision. They are set from the lower-level implementation
+ *  files that have macros describing single and double precision individually,
+ *  as well as the implementation details.
+ *  \{
+ */
+
+/*! \brief
+ *  GMX_SIMD indicates that some sort of SIMD support is present in software.
+ *
+ * It is disabled if no architecture, not even reference SIMD, has been selected.
+ */
+#define GMX_SIMD
+
+
+/* Intel MIC is a bit special since it is a co-processor. This means the rest
+ * of GROMACS (which runs on the CPU) should use a default SIMD set like AVX,
+ * while the part running on the coprocessor defines __MIC__. All functions in
+ * this SIMD module are static, so it will work perfectly fine to include this
+ * file with different SIMD definitions for different files.
+ */
+#if defined __MIC__
+#    include "gromacs/simd/impl_intel_mic/impl_intel_mic.h"
+#elif defined GMX_SIMD_X86_AVX2_256
+#    include "gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h"
+#elif defined GMX_SIMD_X86_AVX_256
+#    include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h"
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+#    include "gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#elif defined GMX_SIMD_X86_SSE4_1
+#    include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h"
+#elif defined GMX_SIMD_X86_SSE2
+#    include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
+#elif defined GMX_SIMD_IBM_QPX
+#    include "gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h"
+#elif (defined GMX_SIMD_REFERENCE) || (defined DOXYGEN)
+/* Plain C SIMD reference implementation, also serves as documentation.
+ * For now this code path will also be taken for Sparc64_HPC_ACE since we have
+ * not yet added the verlet kernel extensions there. The group kernels do not
+ * depend on this file, so they will still be accelerated with SIMD.
+ */
+#    include "gromacs/simd/impl_reference/impl_reference.h"
+#else
+/* Turn off the GMX_SIMD flag if we do not even have reference support */
+#    undef GMX_SIMD
+#endif
+
+/*! \brief
+ * SIMD4 width is always 4, but use this for clarity in definitions.
+ *
+ * It improves code readability to allocate e.g. 2*GMX_SIMD4_WIDTH instead of 8.
+ */
+#define GMX_SIMD4_WIDTH    4
+
+/*! \} */
+
+/*! \name SIMD memory alignment operations
+ *  \{
+ */
+
+/*! \brief
+ * Align a float pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want single precision even when GMX_DOUBLE is set), but use the
+ * \ref gmx_simd_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD_FLOAT_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float fp SIMD.
+ *         If \ref GMX_SIMD_HAVE_FLOAT is not set, p will be returned unchanged.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_FLOAT_WIDTH float elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ */
+static gmx_inline float *
+gmx_simd_align_f(float *p)
+{
+#    ifdef GMX_SIMD_HAVE_FLOAT
+    return (float *)(((size_t)((p)+GMX_SIMD_FLOAT_WIDTH-1)) & (~((size_t)(GMX_SIMD_FLOAT_WIDTH*sizeof(float)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*!  \brief
+ * Align a double pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want double precision even when GMX_DOUBLE is not set), but use the
+ * \ref gmx_simd_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD_DOUBLE_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double fp SIMD.
+ *         If \ref GMX_SIMD_HAVE_DOUBLE is not set, p will be returned unchanged.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_DOUBLE_WIDTH double elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ */
+static gmx_inline double *
+gmx_simd_align_d(double *p)
+{
+#    ifdef GMX_SIMD_HAVE_DOUBLE
+    return (double *)(((size_t)((p)+GMX_SIMD_DOUBLE_WIDTH-1)) & (~((size_t)(GMX_SIMD_DOUBLE_WIDTH*sizeof(double)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*! \brief
+ * Align a (float) integer pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want integers corresponding to single precision even when GMX_DOUBLE is
+ * set), but use the \ref gmx_simd_align_i macro to align integer memory
+ * corresponding to Gromacs default floating-point precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD_FINT32_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float-integer SIMD.
+ *         If \ref GMX_SIMD_HAVE_FINT32 is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd_fint32_t. You
+ * should have allocated an extra \ref GMX_SIMD_FINT32_WIDTH * sizeof(int) bytes. The
+ * reason why we need to separate float-integer vs. double-integer is that the
+ * width of registers after conversions from the floating-point types might not
+ * be identical, or even supported, in both cases.
+ */
+static gmx_inline int *
+gmx_simd_align_fi(int *p)
+{
+#    ifdef GMX_SIMD_HAVE_FINT32
+    return (int *)(((size_t)((p)+GMX_SIMD_FINT32_WIDTH-1)) & (~((size_t)(GMX_SIMD_FINT32_WIDTH*sizeof(int)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*! \brief
+ * Align a (double) integer pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want integers corresponding to double precision even when GMX_DOUBLE is
+ * not set), but use the \ref gmx_simd_align_i macro to align integer memory
+ * corresponding to Gromacs default floating-point precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD_DINT32_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double-integer SIMD.
+ *         If \ref GMX_SIMD_HAVE_DINT32 is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd_dint32_t. You
+ * should have allocated an extra \ref GMX_SIMD_DINT32_WIDTH*sizeof(int) bytes. The
+ * reason why we need to separate float-integer vs. double-integer is that the
+ * width of registers after conversions from the floating-point types might not
+ * be identical, or even supported, in both cases.
+ */
+static gmx_inline int *
+gmx_simd_align_di(int *p)
+{
+#    ifdef GMX_SIMD_HAVE_DINT32
+    return (int *)(((size_t)((p)+GMX_SIMD_DINT32_WIDTH-1)) & (~((size_t)(GMX_SIMD_DINT32_WIDTH*sizeof(int)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*! \brief
+ * Align a float pointer for usage with SIMD4 instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want single precision even when GMX_DOUBLE is set), but use the
+ * \ref gmx_simd4_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD4_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float SIMD.
+ *         If \ref GMX_SIMD4_HAVE_FLOAT is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd4_float_t. You
+ * should have allocated an extra \ref GMX_SIMD4_WIDTH * sizeof(float) bytes.
+ */
+static gmx_inline float *
+gmx_simd4_align_f(float *p)
+{
+#    ifdef GMX_SIMD4_HAVE_FLOAT
+    return (float *)(((size_t)((p)+GMX_SIMD4_WIDTH-1)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(float)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*! \brief
+ * Align a double pointer for usage with SIMD4 instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want double precision even when GMX_DOUBLE is not set), but use the
+ * \ref gmx_simd4_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param  p Pointer to memory, allocate at least \ref GMX_SIMD4_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double SIMD.
+ *         If \ref GMX_SIMD4_HAVE_DOUBLE is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd4_double_t. You
+ * should have allocated an extra \ref GMX_SIMD4_WIDTH * sizeof(double) bytes.
+ */
+static gmx_inline double *
+gmx_simd4_align_d(double *p)
+{
+#    ifdef GMX_SIMD4_HAVE_DOUBLE
+    return (double *)(((size_t)((p)+GMX_SIMD4_WIDTH-1)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(double)-1))));
+#    else
+    return p;
+#    endif
+}
+
+/*! \} */
+
+
+/* Define Gromacs "real" precision macros depending on Gromacs config. Note
+ * that conversions float-to-double and v.v. are not included here since they
+ * are not precision-dependent - find them in the implementation files.
+ */
+#ifdef GMX_DOUBLE
+/* Double floating-point. The documentation is in the float part below */
+#    define gmx_simd_real_t                  gmx_simd_double_t
+#    define gmx_simd_load_r                  gmx_simd_load_d
+#    define gmx_simd_load1_r                 gmx_simd_load1_d
+#    define gmx_simd_set1_r                  gmx_simd_set1_d
+#    define gmx_simd_store_r                 gmx_simd_store_d
+#    define gmx_simd_loadu_r                 gmx_simd_loadu_d
+#    define gmx_simd_storeu_r                gmx_simd_storeu_d
+#    define gmx_simd_setzero_r               gmx_simd_setzero_d
+#    define gmx_simd_add_r                   gmx_simd_add_d
+#    define gmx_simd_sub_r                   gmx_simd_sub_d
+#    define gmx_simd_mul_r                   gmx_simd_mul_d
+#    define gmx_simd_fmadd_r                 gmx_simd_fmadd_d
+#    define gmx_simd_fmsub_r                 gmx_simd_fmsub_d
+#    define gmx_simd_fnmadd_r                gmx_simd_fnmadd_d
+#    define gmx_simd_fnmsub_r                gmx_simd_fnmsub_d
+#    define gmx_simd_and_r                   gmx_simd_and_d
+#    define gmx_simd_andnot_r                gmx_simd_andnot_d
+#    define gmx_simd_or_r                    gmx_simd_or_d
+#    define gmx_simd_xor_r                   gmx_simd_xor_d
+#    define gmx_simd_rsqrt_r                 gmx_simd_rsqrt_d
+#    define gmx_simd_rcp_r                   gmx_simd_rcp_d
+#    define gmx_simd_fabs_r                  gmx_simd_fabs_d
+#    define gmx_simd_fneg_r                  gmx_simd_fneg_d
+#    define gmx_simd_max_r                   gmx_simd_max_d
+#    define gmx_simd_min_r                   gmx_simd_min_d
+#    define gmx_simd_round_r                 gmx_simd_round_d
+#    define gmx_simd_trunc_r                 gmx_simd_trunc_d
+#    define gmx_simd_fraction_r              gmx_simd_fraction_d
+#    define gmx_simd_get_exponent_r          gmx_simd_get_exponent_d
+#    define gmx_simd_get_mantissa_r          gmx_simd_get_mantissa_d
+#    define gmx_simd_set_exponent_r          gmx_simd_set_exponent_d
+/* Double integer and conversions */
+#    define gmx_simd_int32_t                 gmx_simd_dint32_t
+#    define gmx_simd_load_i                  gmx_simd_load_di
+#    define gmx_simd_set1_i                  gmx_simd_set1_di
+#    define gmx_simd_store_i                 gmx_simd_store_di
+#    define gmx_simd_loadu_i                 gmx_simd_loadu_di
+#    define gmx_simd_storeu_i                gmx_simd_storeu_di
+#    define gmx_simd_setzero_i               gmx_simd_setzero_di
+#    define gmx_simd_cvt_r2i                 gmx_simd_cvt_d2i
+#    define gmx_simd_cvtt_r2i                gmx_simd_cvtt_d2i
+#    define gmx_simd_cvt_i2r                 gmx_simd_cvt_i2d
+#    define gmx_simd_extract_i               gmx_simd_extract_di
+#    define gmx_simd_slli_i                  gmx_simd_slli_di
+#    define gmx_simd_srli_i                  gmx_simd_srli_di
+#    define gmx_simd_and_i                   gmx_simd_and_di
+#    define gmx_simd_andnot_i                gmx_simd_andnot_di
+#    define gmx_simd_or_i                    gmx_simd_or_di
+#    define gmx_simd_xor_i                   gmx_simd_xor_di
+#    define gmx_simd_add_i                   gmx_simd_add_di
+#    define gmx_simd_sub_i                   gmx_simd_sub_di
+#    define gmx_simd_mul_i                   gmx_simd_mul_di
+/* Double booleans and selection.
+ *
+ * gmx_simd_bool_t and gmx_simd_ibool_t, together with their comparison,
+ * logical, blend and reduction aliases, are mapped to the double precision
+ * variants (the _d, _db, _di and _dib routines).
+ */
+#    define gmx_simd_bool_t                  gmx_simd_dbool_t
+#    define gmx_simd_cmpeq_r                 gmx_simd_cmpeq_d
+#    define gmx_simd_cmplt_r                 gmx_simd_cmplt_d
+#    define gmx_simd_cmple_r                 gmx_simd_cmple_d
+#    define gmx_simd_and_b                   gmx_simd_and_db
+#    define gmx_simd_or_b                    gmx_simd_or_db
+#    define gmx_simd_anytrue_b               gmx_simd_anytrue_db
+#    define gmx_simd_blendzero_r             gmx_simd_blendzero_d
+#    define gmx_simd_blendnotzero_r          gmx_simd_blendnotzero_d
+#    define gmx_simd_blendv_r                gmx_simd_blendv_d
+#    define gmx_simd_reduce_r                gmx_simd_reduce_d
+#    define gmx_simd_ibool_t                 gmx_simd_dibool_t
+#    define gmx_simd_cmpeq_i                 gmx_simd_cmpeq_di
+#    define gmx_simd_cmplt_i                 gmx_simd_cmplt_di
+#    define gmx_simd_and_ib                  gmx_simd_and_dib
+#    define gmx_simd_or_ib                   gmx_simd_or_dib
+#    define gmx_simd_anytrue_ib              gmx_simd_anytrue_dib
+#    define gmx_simd_blendzero_i             gmx_simd_blendzero_di
+#    define gmx_simd_blendnotzero_i          gmx_simd_blendnotzero_di
+#    define gmx_simd_blendv_i                gmx_simd_blendv_di
+/* Conversions between integer and double floating-point booleans */
+#    define gmx_simd_cvt_b2ib                gmx_simd_cvt_db2dib
+#    define gmx_simd_cvt_ib2b                gmx_simd_cvt_dib2db
+
+/* SIMD4 double fp - we only support a subset of SIMD instructions for SIMD4 */
+#    define gmx_simd4_real_t                 gmx_simd4_double_t
+#    define gmx_simd4_load_r                 gmx_simd4_load_d
+#    define gmx_simd4_load1_r                gmx_simd4_load1_d
+#    define gmx_simd4_set1_r                 gmx_simd4_set1_d
+#    define gmx_simd4_store_r                gmx_simd4_store_d
+#    define gmx_simd4_loadu_r                gmx_simd4_loadu_d
+#    define gmx_simd4_storeu_r               gmx_simd4_storeu_d
+#    define gmx_simd4_setzero_r              gmx_simd4_setzero_d
+#    define gmx_simd4_add_r                  gmx_simd4_add_d
+#    define gmx_simd4_sub_r                  gmx_simd4_sub_d
+#    define gmx_simd4_mul_r                  gmx_simd4_mul_d
+#    define gmx_simd4_fmadd_r                gmx_simd4_fmadd_d
+#    define gmx_simd4_fmsub_r                gmx_simd4_fmsub_d
+#    define gmx_simd4_fnmadd_r               gmx_simd4_fnmadd_d
+#    define gmx_simd4_fnmsub_r               gmx_simd4_fnmsub_d
+#    define gmx_simd4_and_r                  gmx_simd4_and_d
+#    define gmx_simd4_andnot_r               gmx_simd4_andnot_d
+#    define gmx_simd4_or_r                   gmx_simd4_or_d
+#    define gmx_simd4_xor_r                  gmx_simd4_xor_d
+#    define gmx_simd4_rsqrt_r                gmx_simd4_rsqrt_d
+#    define gmx_simd4_fabs_r                 gmx_simd4_fabs_d
+#    define gmx_simd4_fneg_r                 gmx_simd4_fneg_d
+#    define gmx_simd4_max_r                  gmx_simd4_max_d
+#    define gmx_simd4_min_r                  gmx_simd4_min_d
+#    define gmx_simd4_round_r                gmx_simd4_round_d
+#    define gmx_simd4_trunc_r                gmx_simd4_trunc_d
+#    define gmx_simd4_dotproduct3_r          gmx_simd4_dotproduct3_d
+#    define gmx_simd4_bool_t                 gmx_simd4_dbool_t
+#    define gmx_simd4_cmpeq_r                gmx_simd4_cmpeq_d
+#    define gmx_simd4_cmplt_r                gmx_simd4_cmplt_d
+#    define gmx_simd4_cmple_r                gmx_simd4_cmple_d
+#    define gmx_simd4_and_b                  gmx_simd4_and_db
+#    define gmx_simd4_or_b                   gmx_simd4_or_db
+#    define gmx_simd4_anytrue_b              gmx_simd4_anytrue_db
+#    define gmx_simd4_blendzero_r            gmx_simd4_blendzero_d
+#    define gmx_simd4_blendnotzero_r         gmx_simd4_blendnotzero_d
+#    define gmx_simd4_blendv_r               gmx_simd4_blendv_d
+#    define gmx_simd4_reduce_r               gmx_simd4_reduce_d
+
+/* Memory alignment (not allocation): aliases for the routines that align a
+ * pointer for SIMD usage, mapped to the double precision variants.
+ */
+#    define gmx_simd_align_r                 gmx_simd_align_d
+#    define gmx_simd_align_i                 gmx_simd_align_di
+#    define gmx_simd4_align_r                gmx_simd4_align_d
+
+#    ifdef GMX_SIMD_HAVE_DOUBLE
+#        define GMX_SIMD_HAVE_REAL
+#        define GMX_SIMD_REAL_WIDTH          GMX_SIMD_DOUBLE_WIDTH
+#    endif
+#    ifdef GMX_SIMD_HAVE_DINT32
+#        define GMX_SIMD_HAVE_INT32
+#        define GMX_SIMD_INT32_WIDTH         GMX_SIMD_DINT32_WIDTH
+#    endif
+#    ifdef GMX_SIMD_HAVE_DINT32_EXTRACT
+#        define GMX_SIMD_HAVE_INT32_EXTRACT
+#    endif
+#    ifdef GMX_SIMD_HAVE_DINT32_LOGICAL
+#        define GMX_SIMD_HAVE_INT32_LOGICAL
+#    endif
+#    ifdef GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#        define GMX_SIMD_HAVE_INT32_ARITHMETICS
+#    endif
+#    ifdef GMX_SIMD4_HAVE_DOUBLE
+#        define GMX_SIMD4_HAVE_REAL
+#    endif
+
+#else /* GMX_DOUBLE */
+
+/*! \name SIMD data types
+ *
+ *  The actual storage of these types is implementation dependent. The
+ *  documentation is generated from the reference implementation, but for
+ *  normal usage this will likely not be what you are using.
+ * \{
+ */
+/*! \brief Real precision floating-point SIMD datatype.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_REAL is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_double_t
+ * internally, otherwise \ref gmx_simd_float_t.
+ */
+#    define gmx_simd_real_t                  gmx_simd_float_t
+
+/*! \brief 32-bit integer SIMD type.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_INT32 is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dint32_t
+ * internally, otherwise \ref gmx_simd_fint32_t. This might seem a strange
+ * implementation detail, but it is because some SIMD implementations use
+ * different types/widths of integer registers when converting from
+ * double vs. single precision floating point. As long as you just use
+ * this type you will not have to worry about precision.
+ */
+#    define gmx_simd_int32_t                 gmx_simd_fint32_t
+
+/*! \brief Boolean SIMD type for usage with \ref gmx_simd_real_t.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_REAL is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dbool_t
+ * internally, otherwise \ref gmx_simd_fbool_t. This is necessary since some
+ * SIMD implementations use bitpatterns for marking truth, so single-
+ * vs. double precision booleans are not necessarily exchangeable.
+ * As long as you just use this type you will not have to worry about precision.
+ *
+ * See \ref gmx_simd_ibool_t for an explanation of real vs. integer booleans.
+ */
+#    define gmx_simd_bool_t                  gmx_simd_fbool_t
+
+/*! \brief Boolean SIMD type for usage with \ref gmx_simd_int32_t.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_INT32 is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dibool_t
+ * internally, otherwise \ref gmx_simd_fibool_t. This is necessary since some
+ * SIMD implementations use bitpatterns for marking truth, so single-
+ * vs. double precision booleans are not necessarily exchangeable, and while
+ * a double-precision boolean might be represented with a 64-bit mask, the
+ * corresponding integer might only use a 32-bit mask.
+ *
+ * We provide conversion routines for these cases, so the only thing you need to
+ * keep in mind is to use \ref gmx_simd_bool_t when working with
+ * \ref gmx_simd_real_t while you pick \ref gmx_simd_ibool_t when working with
+ * \ref gmx_simd_int32_t.
+ *
+ * To convert between them, use \ref gmx_simd_cvt_b2ib and \ref gmx_simd_cvt_ib2b.
+ */
+#    define gmx_simd_ibool_t                 gmx_simd_fibool_t
+
+
+/*! \}
+ *  \name SIMD load/store operations on gmx_simd_real_t
+ *
+ *  \note Unaligned load/stores are only available when
+ *  \ref GMX_SIMD_HAVE_LOADU and \ref GMX_SIMD_HAVE_STOREU are set, respectively.
+ *  \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_REAL_WIDTH values from aligned memory to \ref gmx_simd_real_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load_d,
+ * otherwise \ref gmx_simd_load_f.
+ *
+ * \copydetails gmx_simd_load_f
+ */
+#    define gmx_simd_load_r                  gmx_simd_load_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t from single value in memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load1_d,
+ * otherwise \ref gmx_simd_load1_f.
+ *
+ * \copydetails gmx_simd_load1_f
+ */
+#    define gmx_simd_load1_r                 gmx_simd_load1_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t from a scalar.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set1_d,
+ * otherwise \ref gmx_simd_set1_f.
+ *
+ * \copydetails gmx_simd_set1_f
+ */
+#    define gmx_simd_set1_r                  gmx_simd_set1_f
+
+/*! \brief Store \ref GMX_SIMD_REAL_WIDTH values from \ref gmx_simd_real_t to aligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_store_d,
+ * otherwise \ref gmx_simd_store_f.
+ *
+ * \copydetails gmx_simd_store_f
+ */
+#    define gmx_simd_store_r                 gmx_simd_store_f
+
+/*! \brief Load \ref GMX_SIMD_REAL_WIDTH values from unaligned memory to \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_loadu_d,
+ * otherwise \ref gmx_simd_loadu_f.
+ *
+ * \copydetails gmx_simd_loadu_f
+ */
+#    define gmx_simd_loadu_r                 gmx_simd_loadu_f
+
+/*! \brief Store \ref GMX_SIMD_REAL_WIDTH values from \ref gmx_simd_real_t to unaligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_storeu_d,
+ * otherwise \ref gmx_simd_storeu_f.
+ *
+ * \copydetails gmx_simd_storeu_f
+ */
+#    define gmx_simd_storeu_r                gmx_simd_storeu_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t to 0.0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_setzero_d,
+ * otherwise \ref gmx_simd_setzero_f.
+ *
+ * \copydetails gmx_simd_setzero_f
+ */
+#    define gmx_simd_setzero_r               gmx_simd_setzero_f
+
+/*! \}
+ *  \name SIMD load/store operations on gmx_simd_int32_t
+ *
+ *  \note Unaligned load/stores are only available when
+ *  \ref GMX_SIMD_HAVE_LOADU and \ref GMX_SIMD_HAVE_STOREU are set, respectively.
+ *  \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_INT32_WIDTH values from aligned memory to \ref gmx_simd_int32_t .
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load_di ,
+ * otherwise \ref gmx_simd_load_fi .
+ *
+ * \copydetails gmx_simd_load_fi
+ */
+#    define gmx_simd_load_i                  gmx_simd_load_fi
+
+/*! \brief Set all elements in \ref gmx_simd_int32_t from a single integer.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set1_di ,
+ * otherwise \ref gmx_simd_set1_fi .
+ *
+ * \copydetails gmx_simd_set1_fi
+ */
+#    define gmx_simd_set1_i                  gmx_simd_set1_fi
+
+/*! \brief Store \ref GMX_SIMD_INT32_WIDTH values from \ref gmx_simd_int32_t to aligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_store_di ,
+ * otherwise \ref gmx_simd_store_fi .
+ *
+ * \copydetails gmx_simd_store_fi
+ */
+#    define gmx_simd_store_i                 gmx_simd_store_fi
+
+/*! \brief Load \ref GMX_SIMD_INT32_WIDTH values from unaligned memory to \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_loadu_di ,
+ * otherwise \ref gmx_simd_loadu_fi .
+ *
+ * \copydetails gmx_simd_loadu_fi
+ */
+#    define gmx_simd_loadu_i                 gmx_simd_loadu_fi
+
+/*! \brief Store \ref GMX_SIMD_INT32_WIDTH values from \ref gmx_simd_int32_t to unaligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_storeu_di ,
+ * otherwise \ref gmx_simd_storeu_fi .
+ *
+ * \copydetails gmx_simd_storeu_fi
+ */
+#    define gmx_simd_storeu_i                gmx_simd_storeu_fi
+
+/*! \brief Extract single integer from \ref gmx_simd_int32_t element.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_extract_di ,
+ * otherwise \ref gmx_simd_extract_fi .
+ *
+ * \copydetails gmx_simd_extract_fi
+ */
+#    define gmx_simd_extract_i               gmx_simd_extract_fi
+
+/*! \brief Set all elements in \ref gmx_simd_int32_t to 0.
+ *
+ * If GMX_DOUBLE is defined, it will be aliased to \ref gmx_simd_setzero_di ,
+ * otherwise \ref gmx_simd_setzero_fi .
+ *
+ * \copydetails gmx_simd_setzero_fi
+ */
+#    define gmx_simd_setzero_i               gmx_simd_setzero_fi
+
+
+/*! \}
+ *  \name SIMD floating-point logical operations on gmx_simd_real_t
+ *
+ *  These instructions are available if \ref GMX_SIMD_HAVE_LOGICAL is defined.
+ *  \{
+ */
+
+/*! \brief Bitwise \a and on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_d,
+ * otherwise \ref gmx_simd_and_f.
+ *
+ * \copydetails gmx_simd_and_f
+ */
+#    define gmx_simd_and_r                   gmx_simd_and_f
+
+/*! \brief Bitwise \a and-not on two \ref gmx_simd_real_t; 1st arg is complemented.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_andnot_d,
+ * otherwise \ref gmx_simd_andnot_f.
+ *
+ * \copydetails gmx_simd_andnot_f
+ */
+#    define gmx_simd_andnot_r                gmx_simd_andnot_f
+
+/*! \brief Bitwise \a or on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_d,
+ * otherwise \ref gmx_simd_or_f.
+ *
+ * \copydetails gmx_simd_or_f
+ */
+#    define gmx_simd_or_r                    gmx_simd_or_f
+
+/*! \brief Bitwise \a exclusive-or on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_xor_d,
+ * otherwise \ref gmx_simd_xor_f.
+ *
+ * \copydetails gmx_simd_xor_f
+ */
+#    define gmx_simd_xor_r                   gmx_simd_xor_f
+
+/*! \}
+ *  \name SIMD floating-point arithmetic operations on gmx_simd_real_t
+ *  \{
+ */
+
+/*! \brief SIMD a+b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_add_d,
+ * otherwise \ref gmx_simd_add_f.
+ *
+ * \copydetails gmx_simd_add_f
+ */
+#    define gmx_simd_add_r                   gmx_simd_add_f
+
+/*! \brief SIMD a-b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_sub_d,
+ * otherwise \ref gmx_simd_sub_f.
+ *
+ * \copydetails gmx_simd_sub_f
+ */
+#    define gmx_simd_sub_r                   gmx_simd_sub_f
+
+/*! \brief SIMD a*b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_mul_d,
+ * otherwise \ref gmx_simd_mul_f.
+ *
+ * \copydetails gmx_simd_mul_f
+ */
+#    define gmx_simd_mul_r                   gmx_simd_mul_f
+
+/*! \brief SIMD a*b+c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fmadd_d,
+ * otherwise \ref gmx_simd_fmadd_f.
+ *
+ * \copydetails gmx_simd_fmadd_f
+ */
+#    define gmx_simd_fmadd_r                 gmx_simd_fmadd_f
+
+/*! \brief SIMD a*b-c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fmsub_d,
+ * otherwise \ref gmx_simd_fmsub_f.
+ *
+ * \copydetails gmx_simd_fmsub_f
+ */
+#    define gmx_simd_fmsub_r                 gmx_simd_fmsub_f
+
+/*! \brief SIMD -a*b+c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fnmadd_d,
+ * otherwise \ref gmx_simd_fnmadd_f.
+ *
+ * \copydetails gmx_simd_fnmadd_f
+ */
+#    define gmx_simd_fnmadd_r                gmx_simd_fnmadd_f
+
+/*! \brief SIMD -a*b-c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fnmsub_d,
+ * otherwise \ref gmx_simd_fnmsub_f.
+ *
+ * \copydetails gmx_simd_fnmsub_f
+ */
+#    define gmx_simd_fnmsub_r                gmx_simd_fnmsub_f
+
+/*! \brief SIMD table lookup for 1/sqrt(x) approximation.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_rsqrt_d,
+ * otherwise \ref gmx_simd_rsqrt_f.
+ *
+ * \copydetails gmx_simd_rsqrt_f
+ */
+#    define gmx_simd_rsqrt_r                 gmx_simd_rsqrt_f
+
+/*! \brief SIMD table lookup for 1/x approximation.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_rcp_d,
+ * otherwise \ref gmx_simd_rcp_f.
+ *
+ * \copydetails gmx_simd_rcp_f
+ */
+#    define gmx_simd_rcp_r                   gmx_simd_rcp_f
+
+/*! \brief SIMD fabs(x) for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fabs_d,
+ * otherwise \ref gmx_simd_fabs_f.
+ *
+ * \copydetails gmx_simd_fabs_f
+ */
+#    define gmx_simd_fabs_r                  gmx_simd_fabs_f
+
+/*! \brief SIMD -x for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fneg_d,
+ * otherwise \ref gmx_simd_fneg_f.
+ *
+ * \copydetails gmx_simd_fneg_f
+ */
+#    define gmx_simd_fneg_r                  gmx_simd_fneg_f
+
+/*! \brief SIMD max(a,b) for each element in \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_max_d,
+ * otherwise \ref gmx_simd_max_f.
+ *
+ * \copydetails gmx_simd_max_f
+ */
+#    define gmx_simd_max_r                   gmx_simd_max_f
+
+/*! \brief SIMD min(a,b) for each element in \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_min_d,
+ * otherwise \ref gmx_simd_min_f.
+ *
+ * \copydetails gmx_simd_min_f
+ */
+#    define gmx_simd_min_r                   gmx_simd_min_f
+
+/*! \brief Round \ref gmx_simd_real_t to nearest int, return \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_round_d,
+ * otherwise \ref gmx_simd_round_f.
+ *
+ * \copydetails gmx_simd_round_f
+ */
+#    define gmx_simd_round_r                 gmx_simd_round_f
+
+/*! \brief Truncate \ref gmx_simd_real_t towards 0, return \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_trunc_d,
+ * otherwise \ref gmx_simd_trunc_f.
+ *
+ * \copydetails gmx_simd_trunc_f
+ */
+#    define gmx_simd_trunc_r                 gmx_simd_trunc_f
+
+/*! \brief SIMD Fraction, i.e. x-trunc(x) for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fraction_d,
+ * otherwise \ref gmx_simd_fraction_f.
+ *
+ * \copydetails gmx_simd_fraction_f
+ */
+#    define gmx_simd_fraction_r              gmx_simd_fraction_f
+
+/*! \brief Return the FP exponent of a SIMD \ref gmx_simd_real_t as a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_get_exponent_d,
+ * otherwise \ref gmx_simd_get_exponent_f.
+ *
+ * \copydetails gmx_simd_get_exponent_f
+ */
+#    define gmx_simd_get_exponent_r          gmx_simd_get_exponent_f
+
+/*! \brief Return the FP mantissa of a SIMD \ref gmx_simd_real_t as a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_get_mantissa_d,
+ * otherwise \ref gmx_simd_get_mantissa_f.
+ *
+ * \copydetails gmx_simd_get_mantissa_f
+ */
+#    define gmx_simd_get_mantissa_r          gmx_simd_get_mantissa_f
+
+/*! \brief Set the exponent of a SIMD \ref gmx_simd_real_t from a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set_exponent_d,
+ * otherwise \ref gmx_simd_set_exponent_f.
+ *
+ * \copydetails gmx_simd_set_exponent_f
+ */
+#    define gmx_simd_set_exponent_r          gmx_simd_set_exponent_f
+
+/*! \}
+ *  \name SIMD comparison, boolean, and select operations for gmx_simd_real_t
+ *  \{
+ */
+
+/*! \brief SIMD a==b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmpeq_d,
+ * otherwise \ref gmx_simd_cmpeq_f.
+ *
+ * \copydetails gmx_simd_cmpeq_f
+ */
+#    define gmx_simd_cmpeq_r                 gmx_simd_cmpeq_f
+
+/*! \brief SIMD a<b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmplt_d,
+ * otherwise \ref gmx_simd_cmplt_f.
+ *
+ * \copydetails gmx_simd_cmplt_f
+ */
+#    define gmx_simd_cmplt_r                 gmx_simd_cmplt_f
+
+/*! \brief SIMD a<=b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmple_d,
+ * otherwise \ref gmx_simd_cmple_f.
+ *
+ * \copydetails gmx_simd_cmple_f
+ */
+#    define gmx_simd_cmple_r                 gmx_simd_cmple_f
+
+/*! \brief For each element, the result boolean is true if both arguments are true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_db,
+ * otherwise \ref gmx_simd_and_fb.
+ *
+ * \copydetails gmx_simd_and_fb
+ */
+#    define gmx_simd_and_b                   gmx_simd_and_fb
+
+/*! \brief For each element, the result boolean is true if either argument is true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_db,
+ * otherwise \ref gmx_simd_or_fb.
+ *
+ * \copydetails gmx_simd_or_fb
+ */
+#    define gmx_simd_or_b                    gmx_simd_or_fb
+
+/*! \brief Return nonzero if any element in gmx_simd_bool_t is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_anytrue_db,
+ * otherwise \ref gmx_simd_anytrue_fb.
+ *
+ * \copydetails gmx_simd_anytrue_fb
+ */
+#    define gmx_simd_anytrue_b               gmx_simd_anytrue_fb
+
+/*! \brief Selects elements from \ref gmx_simd_real_t where boolean is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendzero_d,
+ * otherwise \ref gmx_simd_blendzero_f.
+ *
+ * \copydetails gmx_simd_blendzero_f
+ *
+ * \sa gmx_simd_blendzero_i
+ */
+#    define gmx_simd_blendzero_r             gmx_simd_blendzero_f
+
+/*! \brief Selects elements from \ref gmx_simd_real_t where boolean is false, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendnotzero_d,
+ * otherwise \ref gmx_simd_blendnotzero_f.
+ *
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+#    define gmx_simd_blendnotzero_r          gmx_simd_blendnotzero_f
+
+/*! \brief Selects from 2nd real SIMD arg where boolean is true, otherwise 1st arg.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendv_d,
+ * otherwise \ref gmx_simd_blendv_f.
+ *
+ * \copydetails gmx_simd_blendv_f
+ */
+#    define gmx_simd_blendv_r                gmx_simd_blendv_f
+
+/*! \brief Return sum of all elements in SIMD floating-point variable.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_reduce_d,
+ * otherwise \ref gmx_simd_reduce_f.
+ *
+ * \copydetails gmx_simd_reduce_f
+ */
+#    define gmx_simd_reduce_r                gmx_simd_reduce_f
+
+/*! \}
+ *  \name SIMD integer logical operations on gmx_simd_int32_t
+ *
+ *  These instructions are available if \ref GMX_SIMD_HAVE_INT32_LOGICAL is defined.
+ *  \{
+ */
+
+/*! \brief Shift each element in \ref gmx_simd_int32_t left by immediate
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_slli_di,
+ * otherwise \ref gmx_simd_slli_fi.
+ *
+ * \copydetails gmx_simd_slli_fi
+ */
+#    define gmx_simd_slli_i                  gmx_simd_slli_fi
+
+/*! \brief Shift each element in \ref gmx_simd_int32_t right by immediate
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_srli_di,
+ * otherwise \ref gmx_simd_srli_fi.
+ *
+ * \copydetails gmx_simd_srli_fi
+ */
+#    define gmx_simd_srli_i                  gmx_simd_srli_fi
+
+/*! \brief Bitwise \a and on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_di,
+ * otherwise \ref gmx_simd_and_fi.
+ *
+ * \copydetails gmx_simd_and_fi
+ */
+#    define gmx_simd_and_i                   gmx_simd_and_fi
+
+/*! \brief Bitwise \a and-not on two \ref gmx_simd_int32_t; 1st arg is complemented.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_andnot_di,
+ * otherwise \ref gmx_simd_andnot_fi.
+ *
+ * \copydetails gmx_simd_andnot_fi
+ */
+#    define gmx_simd_andnot_i                gmx_simd_andnot_fi
+
+/*! \brief Bitwise \a or on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_di,
+ * otherwise \ref gmx_simd_or_fi.
+ *
+ * \copydetails gmx_simd_or_fi
+ */
+#    define gmx_simd_or_i                    gmx_simd_or_fi
+
+/*! \brief Bitwise \a xor on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_xor_di,
+ * otherwise \ref gmx_simd_xor_fi.
+ *
+ * \copydetails gmx_simd_xor_fi
+ */
+#    define gmx_simd_xor_i                   gmx_simd_xor_fi
+
+/*! \}
+ *  \name SIMD integer arithmetic operations on gmx_simd_int32_t
+ *
+ *  These instructions are available if \ref GMX_SIMD_HAVE_INT32_ARITHMETICS is defined.
+ *  \{
+ */
+
+/*! \brief SIMD a+b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_add_di,
+ * otherwise \ref gmx_simd_add_fi.
+ *
+ * \copydetails gmx_simd_add_fi
+ */
+#    define gmx_simd_add_i                   gmx_simd_add_fi
+
+/*! \brief SIMD a-b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_sub_di,
+ * otherwise \ref gmx_simd_sub_fi.
+ *
+ * \copydetails gmx_simd_sub_fi
+ */
+#    define gmx_simd_sub_i                   gmx_simd_sub_fi
+
+/*! \brief SIMD a*b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_mul_di,
+ * otherwise \ref gmx_simd_mul_fi.
+ *
+ * \copydetails gmx_simd_mul_fi
+ */
+#    define gmx_simd_mul_i                   gmx_simd_mul_fi
+
+/*! \}
+ *  \name SIMD integer comparison, booleans, and selection on gmx_simd_int32_t
+ *
+ *  These instructions are available if \ref GMX_SIMD_HAVE_INT32_ARITHMETICS is defined.
+ *  \{
+ */
+
+/*! \brief Returns boolean describing whether a==b, for \ref gmx_simd_int32_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmpeq_di,
+ * otherwise \ref gmx_simd_cmpeq_fi.
+ *
+ * \copydetails gmx_simd_cmpeq_fi
+ */
+#    define gmx_simd_cmpeq_i                 gmx_simd_cmpeq_fi
+
+/*! \brief Returns boolean describing whether a<b, for \ref gmx_simd_int32_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmplt_di,
+ * otherwise \ref gmx_simd_cmplt_fi.
+ *
+ * \copydetails gmx_simd_cmplt_fi
+ */
+#    define gmx_simd_cmplt_i                 gmx_simd_cmplt_fi
+
+/*! \brief For each element, the result boolean is true if both arguments are true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_dib,
+ * otherwise \ref gmx_simd_and_fib.
+ *
+ * \copydetails gmx_simd_and_fib
+ */
+#    define gmx_simd_and_ib                  gmx_simd_and_fib
+
+/*! \brief For each element, the result boolean is true if either argument is true.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_dib,
+ * otherwise \ref gmx_simd_or_fib.
+ *
+ * \copydetails gmx_simd_or_fib
+ */
+#    define gmx_simd_or_ib                   gmx_simd_or_fib
+
+/*! \brief Return nonzero if any element in gmx_simd_ibool_t is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_anytrue_dib,
+ * otherwise \ref gmx_simd_anytrue_fib.
+ *
+ * \copydetails gmx_simd_anytrue_fib
+ */
+#    define gmx_simd_anytrue_ib              gmx_simd_anytrue_fib
+
+/*! \brief Selects elements from \ref gmx_simd_int32_t where boolean is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendzero_di,
+ * otherwise \ref gmx_simd_blendzero_fi.
+ *
+ * \copydetails gmx_simd_blendzero_fi
+ */
+#    define gmx_simd_blendzero_i             gmx_simd_blendzero_fi
+
+/*! \brief Selects elements from \ref gmx_simd_int32_t where boolean is false, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendnotzero_di,
+ * otherwise \ref gmx_simd_blendnotzero_fi.
+ *
+ * \copydetails gmx_simd_blendnotzero_fi
+ */
+#    define gmx_simd_blendnotzero_i          gmx_simd_blendnotzero_fi
+
+/*! \brief Selects from 2nd int SIMD arg where boolean is true, otherwise 1st arg.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendv_di,
+ * otherwise \ref gmx_simd_blendv_fi.
+ *
+ * \copydetails gmx_simd_blendv_fi
+ */
+#    define gmx_simd_blendv_i                gmx_simd_blendv_fi
+
+/*! \}
+ *  \name SIMD conversion operations
+ *
+ *  These instructions are available when both types involved in the conversion
+ *  are defined, e.g. \ref GMX_SIMD_HAVE_REAL and \ref GMX_SIMD_HAVE_INT32
+ *  for real-to-integer conversion.
+ *  \{
+ */
+
+/*! \brief Convert gmx_simd_real_t to gmx_simd_int32_t, round to nearest integer.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_d2i,
+ * otherwise \ref gmx_simd_cvt_f2i.
+ *
+ * \copydetails gmx_simd_cvt_f2i
+ */
+#    define gmx_simd_cvt_r2i                 gmx_simd_cvt_f2i
+
+/*! \brief Convert gmx_simd_real_t to gmx_simd_int32_t, truncate towards zero
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvtt_d2i,
+ * otherwise \ref gmx_simd_cvtt_f2i.
+ *
+ * \copydetails gmx_simd_cvtt_f2i
+ */
+#    define gmx_simd_cvtt_r2i                gmx_simd_cvtt_f2i
+
+/*! \brief Convert gmx_simd_int32_t to gmx_simd_real_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_i2d,
+ * otherwise \ref gmx_simd_cvt_i2f.
+ *
+ * \copydetails gmx_simd_cvt_i2f
+ */
+#    define gmx_simd_cvt_i2r                 gmx_simd_cvt_i2f
+
+/*! \brief Convert from gmx_simd_bool_t to gmx_simd_ibool_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_db2dib,
+ * otherwise \ref gmx_simd_cvt_fb2fib.
+ *
+ * \copydetails gmx_simd_cvt_fb2fib
+ */
+#    define gmx_simd_cvt_b2ib                gmx_simd_cvt_fb2fib
+
+/*! \brief Convert from gmx_simd_ibool_t to gmx_simd_bool_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_dib2db,
+ * otherwise \ref gmx_simd_cvt_fib2fb.
+ *
+ * \copydetails gmx_simd_cvt_fib2fb
+ */
+#    define gmx_simd_cvt_ib2b                gmx_simd_cvt_fib2fb
+
+
+/*! \}
+ *  \name SIMD memory alignment operations
+ *  \{
+ */
+
+/*! \brief Align real memory for SIMD usage.
+ *
+ * This routine will only align memory if \ref GMX_SIMD_HAVE_REAL is defined.
+ * Otherwise the original pointer will be returned.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_REAL_WIDTH real elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_align_d,
+ * otherwise \ref gmx_simd_align_f. For detailed documentation, see the
+ * precision-specific implementation routines.
+ */
+#    define gmx_simd_align_r                 gmx_simd_align_f
+
+/*! \brief Align integer memory for SIMD usage.
+ *
+ * This routine will only align memory if \ref GMX_SIMD_HAVE_INT32 is defined.
+ * Otherwise the original pointer will be returned.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_INT32_WIDTH elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_align_di,
+ * otherwise \ref gmx_simd_align_fi. For detailed documentation, see the
+ * precision-specific implementation routines.
+ */
+#    define gmx_simd_align_i                 gmx_simd_align_fi
+
+/*! \} */
+
+/*! \name SIMD4 - constant width-four SIMD datatypes
+ *
+ * These operations are only meant to be used for a few coordinate
+ * manipulation and grid interpolation routines, so we only support a subset
+ * of operations for SIMD4. To avoid repeating all the documentation from
+ * the generic width SIMD routines, we only provide brief documentation for
+ * these operations. Follow the link to the implementation documentation or the
+ * reference to the corresponding generic SIMD routine. The format will be
+ * exactly the same, but they have SIMD replaced with SIMD4.
+ *  \{
+ */
+
+/*! \brief SIMD real datatype guaranteed to be 4 elements wide, if available.
+ *
+ * All the SIMD4 datatypes and operations behave like their counterparts for
+ * the generic SIMD implementation, but they might be implemented with different
+ * registers, or not supported at all. It is important that you check the
+ * define \ref GMX_SIMD4_HAVE_REAL before using it.
+ *
+ * Just as the normal SIMD operations, all SIMD4 types and routines will
+ * be aliased to either single or double precision ones based on whether
+ * GMX_DOUBLE is defined.
+ *
+ * \note There is no support for integer or math operations in SIMD4.
+ */
+#    define gmx_simd4_real_t                 gmx_simd4_float_t
+
+/*! \brief Boolean for \ref gmx_simd4_real_t comparison/selection */
+#    define gmx_simd4_bool_t                 gmx_simd4_fbool_t
+
+/*! \brief Load aligned data to gmx_simd4_real_t.
+ *
+ * \copydetails gmx_simd4_load_f
+ */
+#    define gmx_simd4_load_r                 gmx_simd4_load_f
+
+/*! \brief Load single element to gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_load1_f
+ */
+#    define gmx_simd4_load1_r                gmx_simd4_load1_f
+
+/*! \brief Set gmx_simd4_real_t from scalar value
+ *
+ * \copydetails gmx_simd4_set1_f
+ */
+#    define gmx_simd4_set1_r                 gmx_simd4_set1_f
+
+/*! \brief Store aligned data from gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_store_f
+ */
+#    define gmx_simd4_store_r                gmx_simd4_store_f
+
+/*! \brief Load unaligned data to gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_loadu_f
+ */
+#    define gmx_simd4_loadu_r                gmx_simd4_loadu_f
+
+/*! \brief Store unaligned data from gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_storeu_f
+ */
+#    define gmx_simd4_storeu_r               gmx_simd4_storeu_f
+
+/*! \brief Set all elements in gmx_simd4_real_t to 0.0
+ *
+ * \copydetails gmx_simd4_setzero_f
+ */
+#    define gmx_simd4_setzero_r              gmx_simd4_setzero_f
+
+/*! \brief Bitwise and for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_and_f
+ */
+#    define gmx_simd4_and_r                  gmx_simd4_and_f
+
+/*! \brief Bitwise and-not for two gmx_simd4_real_t. 1st arg is complemented.
+ *
+ * \copydetails gmx_simd4_andnot_f
+ */
+#    define gmx_simd4_andnot_r               gmx_simd4_andnot_f
+
+/*! \brief Bitwise or for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_or_f
+ */
+#    define gmx_simd4_or_r                   gmx_simd4_or_f
+
+/*! \brief Bitwise xor for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_xor_f
+ */
+#    define gmx_simd4_xor_r                  gmx_simd4_xor_f
+
+/*! \brief a+b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_add_f
+ */
+#    define gmx_simd4_add_r                  gmx_simd4_add_f
+
+/*! \brief a-b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_sub_f
+ */
+#    define gmx_simd4_sub_r                  gmx_simd4_sub_f
+
+/*! \brief a*b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_mul_f
+ */
+#    define gmx_simd4_mul_r                  gmx_simd4_mul_f
+
+/*! \brief a*b+c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fmadd_f
+ */
+#    define gmx_simd4_fmadd_r                gmx_simd4_fmadd_f
+
+/*! \brief a*b-c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fmsub_f
+ */
+#    define gmx_simd4_fmsub_r                gmx_simd4_fmsub_f
+
+/*! \brief -a*b+c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fnmadd_f
+ */
+#    define gmx_simd4_fnmadd_r               gmx_simd4_fnmadd_f
+
+/*! \brief -a*b-c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fnmsub_f
+ */
+#    define gmx_simd4_fnmsub_r               gmx_simd4_fnmsub_f
+
+/*! \brief 1/sqrt(x) approximate lookup for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_rsqrt_f
+ */
+#    define gmx_simd4_rsqrt_r                gmx_simd4_rsqrt_f
+
+/*! \brief fabs(x) for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fabs_f
+ */
+#    define gmx_simd4_fabs_r                 gmx_simd4_fabs_f
+
+/*! \brief Change sign (-x) for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fneg_f
+ */
+#    define gmx_simd4_fneg_r                 gmx_simd4_fneg_f
+
+/*! \brief Select maximum of each pair of elements from args for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_max_f
+ */
+#    define gmx_simd4_max_r                  gmx_simd4_max_f
+
+/*! \brief Select minimum of each pair of elements from args for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_min_f
+ */
+#    define gmx_simd4_min_r                  gmx_simd4_min_f
+
+/*! \brief Round \ref gmx_simd4_real_t to nearest integer, return \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_round_f
+ */
+#    define gmx_simd4_round_r                gmx_simd4_round_f
+
+/*! \brief Truncate \ref gmx_simd4_real_t towards zero, return \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_trunc_f
+ */
+#    define gmx_simd4_trunc_r                gmx_simd4_trunc_f
+
+/*! \brief Scalar product of first three elements of two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_dotproduct3_f
+ */
+#    define gmx_simd4_dotproduct3_r          gmx_simd4_dotproduct3_f
+
+/*! \brief Return booleans whether a==b for each element of two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmpeq_f
+ */
+#    define gmx_simd4_cmpeq_r                gmx_simd4_cmpeq_f
+/*! \brief Return booleans whether a<b for each element of two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmplt_f
+ */
+#    define gmx_simd4_cmplt_r                gmx_simd4_cmplt_f
+/*! \brief Return booleans whether a<=b for each element of two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmple_f
+ */
+#    define gmx_simd4_cmple_r                gmx_simd4_cmple_f
+
+/*! \brief Logical and for two \ref gmx_simd4_bool_t
+ *
+ * \copydetails gmx_simd4_and_fb
+ */
+#    define gmx_simd4_and_b                  gmx_simd4_and_fb
+/*! \brief Logical or for two \ref gmx_simd4_bool_t
+ *
+ * \copydetails gmx_simd4_or_fb
+ */
+#    define gmx_simd4_or_b                   gmx_simd4_or_fb
+
+/*! \brief Return nonzero if any element in \ref gmx_simd4_bool_t is true, otherwise 0
+ *
+ * \copydetails gmx_simd4_anytrue_fb
+ */
+#    define gmx_simd4_anytrue_b              gmx_simd4_anytrue_fb
+
+/*! \brief Return 1st (real SIMD4) arg where boolean in 2nd arg is true, otherwise 0.0
+ *
+ * \copydetails gmx_simd4_blendzero_f
+ */
+#    define gmx_simd4_blendzero_r            gmx_simd4_blendzero_f
+
+/*! \brief Return 1st (real SIMD4) arg where boolean in 2nd arg is false, otherwise 0.0
+ *
+ * \copydetails gmx_simd4_blendnotzero_f
+ */
+#    define gmx_simd4_blendnotzero_r            gmx_simd4_blendnotzero_f
+
+/*! \brief Selects from 2nd real SIMD4 arg where boolean is true, otherwise 1st arg
+ *
+ * \copydetails gmx_simd4_blendv_f
+ */
+#    define gmx_simd4_blendv_r               gmx_simd4_blendv_f
+
+/*! \brief Return sum of all elements in SIMD4 floating-point variable.
+ *
+ * \copydetails gmx_simd4_reduce_f
+ */
+#    define gmx_simd4_reduce_r               gmx_simd4_reduce_f
+
+/*! \brief Align real memory for SIMD4 usage.
+ *
+ * \copydetails gmx_simd4_align_f
+ */
+#    define gmx_simd4_align_r                gmx_simd4_align_f
+
+/*! \} */
+
+/*! \name SIMD predefined macros to describe high-level capabilities
+ *  \{
+ */
+
+#    if (defined GMX_SIMD_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_real_t is available.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_HAVE_DOUBLE, otherwise GMX_SIMD_HAVE_FLOAT.
+ */
+#        define GMX_SIMD_HAVE_REAL
+/*! \brief Width of gmx_simd_real_t.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_DOUBLE_WIDTH, otherwise GMX_SIMD_FLOAT_WIDTH.
+ */
+#        define GMX_SIMD_REAL_WIDTH          GMX_SIMD_FLOAT_WIDTH
+#    endif
+#    if (defined GMX_SIMD_HAVE_FINT32) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_int32_t is available.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_HAVE_DINT32, otherwise GMX_SIMD_HAVE_FINT32.
+ */
+#        define GMX_SIMD_HAVE_INT32
+/*! \brief Width of gmx_simd_int32_t.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_DINT32_WIDTH, otherwise GMX_SIMD_FINT32_WIDTH.
+ */
+#        define GMX_SIMD_INT32_WIDTH         GMX_SIMD_FINT32_WIDTH
+#    endif
+#    if (defined GMX_SIMD_HAVE_FINT32_EXTRACT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_extract_i() is available.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_HAVE_DINT32_EXTRACT, otherwise GMX_SIMD_HAVE_FINT32_EXTRACT.
+ */
+#        define GMX_SIMD_HAVE_INT32_EXTRACT
+#    endif
+#    if (defined GMX_SIMD_HAVE_FINT32_LOGICAL) || (defined DOXYGEN)
+/*! \brief Defined if logical ops are supported on gmx_simd_int32_t.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_HAVE_DINT32_LOGICAL, otherwise GMX_SIMD_HAVE_FINT32_LOGICAL.
+ */
+#        define GMX_SIMD_HAVE_INT32_LOGICAL
+#    endif
+#    if (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) || (defined DOXYGEN)
+/*! \brief Defined if arithmetic ops are supported on gmx_simd_int32_t.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS, otherwise GMX_SIMD_HAVE_FINT32_ARITHMETICS.
+ */
+#        define GMX_SIMD_HAVE_INT32_ARITHMETICS
+#    endif
+#    if (defined GMX_SIMD4_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd4_real_t is available.
+ *
+ *  if GMX_DOUBLE is defined, this will be aliased to
+ *  \ref GMX_SIMD4_HAVE_DOUBLE, otherwise GMX_SIMD4_HAVE_FLOAT.
+ */
+#        define GMX_SIMD4_HAVE_REAL
+#    endif
+
+/*! \} */
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_SIMD_H */
diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h

new file mode 100644 (file)

index 0000000..5013ab5
--- /dev/null
+++ b/src/gromacs/simd/simd_math.h
@@ -0,0 +1,2916 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_SIMD_MATH_H_
+#define GMX_SIMD_SIMD_MATH_H_
+
+/*! \libinternal \file
+ *
+ * \brief Math functions for SIMD datatypes.
+ *
+ * \attention This file is generic for all SIMD architectures, so you cannot
+ * assume that any of the optional SIMD features (as defined in simd.h) are
+ * present. In particular, this means you cannot assume support for integers,
+ * logical operations (neither on floating-point nor integer values), shifts,
+ * and the architecture might only have SIMD for either float or double.
+ * Second, to keep this file clean and general, any additions to this file
+ * must work for all possible SIMD architectures in both single and double
+ * precision (if they support it), and you cannot make any assumptions about
+ * SIMD width.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
+ */
+
+#include <math.h>
+
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name Implementation accuracy settings
+ *  \{
+ */
+
+/*! \brief We accept lsb errors for 1/sqrt(x) and 1/x, so float target is 22 bits */
+#define GMX_SIMD_MATH_TARGET_SINGLE_BITS 22
+
+/*! \brief We accept "double" that has 2x single precision - 44 bits.
+ *
+ * This way two Newton-Raphson iterations will suffice in double precision.
+ */
+#define GMX_SIMD_MATH_TARGET_DOUBLE_BITS 44
+
+/*! \} */
+
+#ifdef GMX_SIMD_HAVE_FLOAT
+
+/*! \name Single precision SIMD math functions
+ *
+ *  \note In most cases you should use the real-precision functions instead.
+ *  \{
+ */
+
+/****************************************
+ * SINGLE PRECISION SIMD MATH FUNCTIONS *
+ ****************************************/
+
+/*! \brief Add four SIMD float variables element by element.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
+ *
+ * The terms are combined pairwise, i.e. the result is formed as (a+b)+(c+d).
+ *
+ * \param a term 1 (multiple values)
+ * \param b term 2 (multiple values)
+ * \param c term 3 (multiple values)
+ * \param d term 4 (multiple values)
+ * \return sum of terms 1-4 (multiple values)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
+                gmx_simd_float_t c, gmx_simd_float_t d)
+{
+    gmx_simd_float_t sum_ab = gmx_simd_add_f(a, b);
+    gmx_simd_float_t sum_cd = gmx_simd_add_f(c, d);
+
+    return gmx_simd_add_f(sum_ab, sum_cd);
+}
+
+/*! \brief Flip the sign of a wherever b is negative, SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
+ *
+ * \param a Values to set sign for
+ * \param b Values used to set sign
+ * \return a with its sign changed in every element where b is negative,
+ *         a unchanged elsewhere.
+ *
+ * When \ref GMX_SIMD_HAVE_LOGICAL is set this is an xor of a with the sign
+ * bit of b. Without logical operations the routine falls back to a
+ * comparison of b against zero, which means that negative zero in b is
+ * not treated as negative on such architectures.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    /* -0.0 has only the sign bit set, so the and extracts the sign of b */
+    gmx_simd_float_t signbit_of_b = gmx_simd_and_f(gmx_simd_set1_f(-0.0), b);
+
+    return gmx_simd_xor_f(a, signbit_of_b);
+#else
+    /* No bitwise operations available: select -a where b compares below zero */
+    gmx_simd_fbool_t b_below_zero = gmx_simd_cmplt_f(b, gmx_simd_setzero_f());
+
+    return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), b_below_zero);
+#endif
+}
+
+/*! \brief One Newton-Raphson refinement step of 1/sqrt(x) for SIMD float.
+ *
+ * Low-level building block; it is only meant to be called from the SIMD
+ * math routine that evaluates the inverse square root.
+ *
+ *  \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
+ *  \param x  The reference (starting) value x for which we want 1/sqrt(x).
+ *  \return   An improved approximation with roughly twice as many bits of accuracy.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
+{
+#    ifdef GMX_SIMD_HAVE_FMA
+    /* Evaluated as lu + (1 - x*lu*lu) * (0.5*lu) */
+    gmx_simd_float_t lu_squared = gmx_simd_mul_f(lu, lu);
+    gmx_simd_float_t residual   = gmx_simd_fnmadd_f(x, lu_squared, gmx_simd_set1_f(1.0f));
+    gmx_simd_float_t half_lu    = gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f));
+
+    return gmx_simd_fmadd_f(residual, half_lu, lu);
+#    else
+    /* Evaluated as 0.5 * ((3 - (lu*lu)*x) * lu) */
+    gmx_simd_float_t lu_squared = gmx_simd_mul_f(lu, lu);
+    gmx_simd_float_t lu2_x      = gmx_simd_mul_f(lu_squared, x);
+    gmx_simd_float_t bracket    = gmx_simd_sub_f(gmx_simd_set1_f(3.0f), lu2_x);
+
+    return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(bracket, lu));
+#    endif
+}
+
+/*! \brief Inverse square root, 1/sqrt(x), for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
+ *
+ * The result starts from gmx_simd_rsqrt_f() and is refined with zero to
+ * three calls to gmx_simd_rsqrt_iter_f(). How many are compiled in is decided
+ * by the preprocessor from GMX_SIMD_RSQRT_BITS (doubled for each step) and
+ * \ref GMX_SIMD_MATH_TARGET_SINGLE_BITS.
+ *
+ *  \param x Argument that must be >0. This routine does not check arguments.
+ *  \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_invsqrt_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t estimate = gmx_simd_rsqrt_f(x);
+
+    /* First refinement: needed when the lookup alone misses the target */
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_f(estimate, x);
+#endif
+    /* Second refinement: needed when twice the lookup bits is still too few */
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_f(estimate, x);
+#endif
+    /* Third refinement: needed when four times the lookup bits is still too few */
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_f(estimate, x);
+#endif
+    return estimate;
+}
+
+/*! \brief Inverse square root of two SIMD float variables in one call.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
+ *
+ * \param x0  First set of arguments, x0 must be positive - no argument checking.
+ * \param x1  Second set of arguments, x1 must be positive - no argument checking.
+ * \param[out] out0  Result 1/sqrt(x0)
+ * \param[out] out1  Result 1/sqrt(x1)
+ *
+ *  The pair interface exists because in particular for double precision we
+ *  can sometimes calculate square root pairs slightly faster by using single
+ *  precision until the very last step. In single precision this routine
+ *  simply evaluates gmx_simd_invsqrt_f() once per argument.
+ */
+static gmx_inline void
+gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
+                        gmx_simd_float_t *out0, gmx_simd_float_t *out1)
+{
+    gmx_simd_float_t invsqrt0 = gmx_simd_invsqrt_f(x0);
+    gmx_simd_float_t invsqrt1 = gmx_simd_invsqrt_f(x1);
+
+    *out0 = invsqrt0;
+    *out1 = invsqrt1;
+}
+
+/*! \brief One Newton-Raphson refinement step of 1/x for SIMD float.
+ *
+ * Low-level building block; it is only meant to be called from the SIMD
+ * math routine that evaluates the reciprocal.
+ *
+ *  \param lu Approximation of 1/x, typically obtained from lookup.
+ *  \param x  The reference (starting) value x for which we want 1/x.
+ *  \return   An improved approximation with roughly twice as many bits of accuracy.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
+{
+    /* Evaluated as lu * (2 - lu*x) */
+    gmx_simd_float_t two_minus_lux = gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f));
+
+    return gmx_simd_mul_f(lu, two_minus_lux);
+}
+
+/*! \brief Reciprocal, 1/x, for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_inv_r.
+ *
+ * The result starts from gmx_simd_rcp_f() and is refined with zero to three
+ * calls to gmx_simd_rcp_iter_f(). How many are compiled in is decided by the
+ * preprocessor from GMX_SIMD_RCP_BITS (doubled for each step) and
+ * \ref GMX_SIMD_MATH_TARGET_SINGLE_BITS.
+ *
+ *  \param x Argument that must be nonzero. This routine does not check arguments.
+ *  \return 1/x. Result is undefined if your argument was invalid.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_inv_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t estimate = gmx_simd_rcp_f(x);
+
+    /* First refinement: needed when the lookup alone misses the target */
+#if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rcp_iter_f(estimate, x);
+#endif
+    /* Second refinement: needed when twice the lookup bits is still too few */
+#if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rcp_iter_f(estimate, x);
+#endif
+    /* Third refinement: needed when four times the lookup bits is still too few */
+#if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd_rcp_iter_f(estimate, x);
+#endif
+    return estimate;
+}
+
+/*! \brief Square root for SIMD floats that also handles the argument 0.0.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
+ *
+ * The value is formed as x*(1/sqrt(x)). Since the inverse square root is
+ * not valid for x=0, that factor is replaced by zero in all elements where
+ * x compares equal to zero before the multiplication.
+ *
+ *  \param x Argument that must be >=0.
+ *  \return sqrt(x). If x=0, the result will correctly be set to 0.
+ *          The result is undefined if the input value is negative.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sqrt_f(gmx_simd_float_t x)
+{
+    gmx_simd_fbool_t  x_is_zero    = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
+    gmx_simd_float_t  masked_rsqrt = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_f(x), x_is_zero);
+
+    return gmx_simd_mul_f(masked_rsqrt, x);
+}
+
+/*! \brief Natural logarithm, log(x), for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_log_r.
+ *
+ * The argument is split into exponent and mantissa. The mantissa m is
+ * halved (and the exponent incremented) where it exceeds sqrt(2), and the
+ * logarithm of the mantissa is then evaluated as an odd polynomial in
+ * z=(m-1)/(m+1). Finally the exponent times a constant (numerically ln 2)
+ * is added.
+ *
+ * \param x Argument, should be >0.
+ * \result The natural logarithm of x. Undefined if argument is invalid.
+ */
+#ifndef gmx_simd_log_f
+static gmx_inline gmx_simd_float_t
+gmx_simd_log_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t  c_half     = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t  c_one      = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t  c_sqrt2    = gmx_simd_set1_f(sqrt(2.0f));
+    const gmx_simd_float_t  c_ln2      = gmx_simd_set1_f(0.693147180559945286226764f);
+    /* Polynomial coefficients, highest order first */
+    const gmx_simd_float_t  c_p9       = gmx_simd_set1_f(0.2371599674224853515625f);
+    const gmx_simd_float_t  c_p7       = gmx_simd_set1_f(0.285279005765914916992188f);
+    const gmx_simd_float_t  c_p5       = gmx_simd_set1_f(0.400005519390106201171875f);
+    const gmx_simd_float_t  c_p3       = gmx_simd_set1_f(0.666666567325592041015625f);
+    const gmx_simd_float_t  c_p1       = gmx_simd_set1_f(2.0f);
+    gmx_simd_float_t        expo, mant, z, z2, poly;
+    gmx_simd_fbool_t        above_sqrt2;
+
+    expo        = gmx_simd_get_exponent_f(x);
+    mant        = gmx_simd_get_mantissa_f(x);
+
+    /* Where mantissa > sqrt(2): exponent += 1 and mantissa *= 0.5 */
+    above_sqrt2 = gmx_simd_cmplt_f(c_sqrt2, mant);
+    expo        = gmx_simd_add_f(expo, gmx_simd_blendzero_f(c_one, above_sqrt2));
+    mant        = gmx_simd_mul_f(mant, gmx_simd_blendv_f(c_one, c_half, above_sqrt2));
+
+    /* z = (m-1) * 1/(m+1) */
+    z           = gmx_simd_mul_f( gmx_simd_sub_f(mant, c_one), gmx_simd_inv_f( gmx_simd_add_f(mant, c_one) ) );
+    z2          = gmx_simd_mul_f(z, z);
+
+    /* Horner evaluation in z^2, then one multiplication by z and the exponent term */
+    poly        = gmx_simd_fmadd_f(c_p9, z2, c_p7);
+    poly        = gmx_simd_fmadd_f(poly, z2, c_p5);
+    poly        = gmx_simd_fmadd_f(poly, z2, c_p3);
+    poly        = gmx_simd_fmadd_f(poly, z2, c_p1);
+    poly        = gmx_simd_fmadd_f(poly, z, gmx_simd_mul_f(c_ln2, expo));
+
+    return poly;
+}
+#endif
+
+#ifndef gmx_simd_exp2_f
+/*! \brief Base-2 exponential, 2^x, for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
+ *
+ * The argument is split into its nearest integer and a remainder. The power
+ * of two for the integer part comes from gmx_simd_set_exponent_f(), and the
+ * remainder is handled by a sixth-order polynomial. In elements where
+ * |x| exceeds 126 the power-of-two factor, and thereby the result, is set
+ * to zero.
+ *
+ * \param x Argument.
+ * \result 2^x. Undefined if input argument caused overflow.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_exp2_f(gmx_simd_float_t x)
+{
+    /* Magnitude limit: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
+    const gmx_simd_float_t  maxabsarg = gmx_simd_set1_f(126.0f);
+    /* Polynomial coefficients, highest order first */
+    const gmx_simd_float_t  c_p6      = gmx_simd_set1_f(0.0001534581200287996416911311);
+    const gmx_simd_float_t  c_p5      = gmx_simd_set1_f(0.001339993121934088894618990);
+    const gmx_simd_float_t  c_p4      = gmx_simd_set1_f(0.009618488957115180159497841);
+    const gmx_simd_float_t  c_p3      = gmx_simd_set1_f(0.05550328776964726865751735);
+    const gmx_simd_float_t  c_p2      = gmx_simd_set1_f(0.2402264689063408646490722);
+    const gmx_simd_float_t  c_p1      = gmx_simd_set1_f(0.6931472057372680777553816);
+    const gmx_simd_float_t  c_one     = gmx_simd_set1_f(1.0f);
+
+    gmx_simd_float_t        pow2int;
+    gmx_simd_float_t        nearest;
+    gmx_simd_float_t        frac;
+    gmx_simd_float_t        poly;
+    gmx_simd_fbool_t        in_range;
+
+    pow2int   = gmx_simd_set_exponent_f(x);
+    nearest   = gmx_simd_round_f(x);
+    /* Zero the scale factor for arguments outside the allowed magnitude */
+    in_range  = gmx_simd_cmple_f(gmx_simd_fabs_f(x), maxabsarg);
+    pow2int   = gmx_simd_blendzero_f(pow2int, in_range);
+    frac      = gmx_simd_sub_f(x, nearest);
+
+    /* Horner evaluation of the polynomial in the remainder */
+    poly      = gmx_simd_fmadd_f(c_p6, frac, c_p5);
+    poly      = gmx_simd_fmadd_f(poly, frac, c_p4);
+    poly      = gmx_simd_fmadd_f(poly, frac, c_p3);
+    poly      = gmx_simd_fmadd_f(poly, frac, c_p2);
+    poly      = gmx_simd_fmadd_f(poly, frac, c_p1);
+    poly      = gmx_simd_fmadd_f(poly, frac, c_one);
+
+    return gmx_simd_mul_f(poly, pow2int);
+}
+#endif
+
+#ifndef gmx_simd_exp_f
+/*! \brief Natural exponential, exp(x), for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_exp_r.
+ *
+ * The argument is scaled to base 2 to obtain the integer power of two, and
+ * the remainder is then formed in extended precision by subtracting the
+ * integer times a two-part constant from the original argument. In elements
+ * where the scaled argument has a magnitude above 126 the power-of-two
+ * factor, and thereby the result, is set to zero.
+ *
+ * \param x Argument.
+ * \result exp(x). Undefined if input argument caused overflow.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_exp_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t  to_base2     = gmx_simd_set1_f(1.44269504088896341f);
+    /* Magnitude limit: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
+    const gmx_simd_float_t  maxabsarg    = gmx_simd_set1_f(126.0f);
+    /* Two-part constant used to remove the integer part from the argument */
+    const gmx_simd_float_t  from_base2_0 = gmx_simd_set1_f(0.693145751953125f);
+    const gmx_simd_float_t  from_base2_1 = gmx_simd_set1_f(1.428606765330187045e-06f);
+    /* Polynomial coefficients, highest order first */
+    const gmx_simd_float_t  c_p4         = gmx_simd_set1_f(0.00136324646882712841033936f);
+    const gmx_simd_float_t  c_p3         = gmx_simd_set1_f(0.00836596917361021041870117f);
+    const gmx_simd_float_t  c_p2         = gmx_simd_set1_f(0.0416710823774337768554688f);
+    const gmx_simd_float_t  c_p1         = gmx_simd_set1_f(0.166665524244308471679688f);
+    const gmx_simd_float_t  c_p0         = gmx_simd_set1_f(0.499999850988388061523438f);
+    const gmx_simd_float_t  c_one        = gmx_simd_set1_f(1.0f);
+    gmx_simd_float_t        scaled;
+    gmx_simd_float_t        pow2int;
+    gmx_simd_float_t        nearest;
+    gmx_simd_float_t        rem, rem2, poly;
+    gmx_simd_fbool_t        in_range;
+
+    scaled    = gmx_simd_mul_f(x, to_base2);
+    pow2int   = gmx_simd_set_exponent_f(scaled);  /* rounds to nearest int internally */
+    nearest   = gmx_simd_round_f(scaled);         /* use same rounding algorithm here */
+    /* Zero the scale factor for arguments outside the allowed magnitude */
+    in_range  = gmx_simd_cmple_f(gmx_simd_fabs_f(scaled), maxabsarg);
+    pow2int   = gmx_simd_blendzero_f(pow2int, in_range);
+
+    /* Extended precision arithmetics: subtract the integer part in two steps */
+    rem       = gmx_simd_fnmadd_f(from_base2_0, nearest, x);
+    rem       = gmx_simd_fnmadd_f(from_base2_1, nearest, rem);
+
+    /* exp(rem) is evaluated as 1 + rem + rem^2 * P(rem) */
+    poly      = gmx_simd_fmadd_f(c_p4, rem, c_p3);
+    poly      = gmx_simd_fmadd_f(poly, rem, c_p2);
+    poly      = gmx_simd_fmadd_f(poly, rem, c_p1);
+    poly      = gmx_simd_fmadd_f(poly, rem, c_p0);
+    rem2      = gmx_simd_mul_f(rem, rem);
+    poly      = gmx_simd_fmadd_f(rem2, poly, rem);
+    poly      = gmx_simd_add_f(poly, c_one);
+
+    return gmx_simd_mul_f(poly, pow2int);
+}
+#endif
+
+/*! \brief SIMD float erf(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_erf_r.
+ *
+ * \param x The value to calculate erf(x) for.
+ * \result erf(x)
+ *
+ * This routine achieves very close to full precision, but we do not care about
+ * the last bit or the subnormal result range.
+ *
+ * Implementation outline: every element is evaluated along all paths and
+ * the results are combined with blend operations at the end, so there is
+ * no branching on the data.
+ *  - For |x|<0.75 the result is x*P(x^2).
+ *  - Otherwise the result is 1-erfc(x), where erfc(|x|) is exp(-x^2) times
+ *    one of two polynomial approximations (selected on |x|>2), and
+ *    erfc(x)=2-erfc(|x|) is used for negative arguments.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_erf_f(gmx_simd_float_t x)
+{
+    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
+    const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
+    const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
+    const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
+    const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
+    const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
+    const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
+    const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P(1/x-1) in range [0.67,2].
+     * P contains both even and odd powers of w=1/x-1 (see the pB0/pB1 evaluation below).
+     */
+    const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
+    const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
+    const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
+    const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
+    const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
+    const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
+    const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
+    const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
+    const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
+    const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P(1/x) in range [2,9.19].
+     * P contains both even and odd powers of t=1/x (see the pC0/pC1 evaluation below).
+     */
+    const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
+    const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
+    const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
+    const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
+    const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
+    const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
+    const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
+    const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
+    const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
+    const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
+    const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
+    const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
+
+    gmx_simd_float_t        x2, x4, y;
+    gmx_simd_float_t        t, t2, w, w2;
+    gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
+    gmx_simd_float_t        expmx2;
+    gmx_simd_float_t        res_erf, res_erfc, res;
+    gmx_simd_fbool_t        mask;
+
+    /* Calculate erf() */
+    x2   = gmx_simd_mul_f(x, x);
+    x4   = gmx_simd_mul_f(x2, x2);
+
+    /* P(x^2) is split into two independent chains in x^4: pA0 collects
+     * the even-index coefficients and pA1 the odd-index ones. They are
+     * merged through a final multiplication of pA1 by x^2.
+     */
+    pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
+    pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
+    pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
+    pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
+    pA0  = gmx_simd_mul_f(pA0, x4);
+    pA0  = gmx_simd_fmadd_f(pA1, x2, pA0);
+    /* Constant term must come last for precision reasons */
+    pA0  = gmx_simd_add_f(pA0, CA0);
+
+    res_erf = gmx_simd_mul_f(x, pA0);
+
+    /* Calculate erfc */
+    /* y=|x|, t=1/|x|, w=1/|x|-1. Note that 1/|x| is not meaningful for
+     * elements where x is zero, but all elements with |x|<0.75 take
+     * res_erf in the final selection, so those values are never used.
+     */
+    y       = gmx_simd_fabs_f(x);
+    t       = gmx_simd_inv_f(y);
+    w       = gmx_simd_sub_f(t, one);
+    t2      = gmx_simd_mul_f(t, t);
+    w2      = gmx_simd_mul_f(w, w);
+
+    /* No need for a floating-point sieve here (as in erfc), since erf()
+     * will never return values that are extremely small for large args.
+     */
+    expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
+
+    /* Polynomial in w for the intermediate range: pB0 holds the even
+     * powers and pB1 the odd powers (both evaluated in w^2), merged
+     * as pB1*w+pB0.
+     */
+    pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
+    pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
+    pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
+
+    /* Polynomial in t for large arguments: pC0 holds the even powers and
+     * pC1 the odd powers (both evaluated in t^2), merged as pC1*t+pC0
+     * and finally multiplied by t.
+     */
+    pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
+    pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
+
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
+    pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
+    pC0  = gmx_simd_mul_f(pC0, t);
+
+    /* SELECT pB0 or pC0 for erfc(): pC0 where |x|>2, pB0 otherwise */
+    mask     = gmx_simd_cmplt_f(two, y);
+    res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
+    res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
+
+    /* erfc(x<0) = 2-erfc(|x|) */
+    mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
+
+    /* Select erf() or erfc(): res_erf where |x|<0.75, 1-erfc(x) otherwise */
+    mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
+    res  = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, mask);
+
+    return res;
+}
+
+/*! \brief SIMD float erfc(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
+ *
+ * \param x The value to calculate erfc(x) for.
+ * \result erfc(x)
+ *
+ * This routine achieves full precision (bar the last bit) over most of the
+ * input range, but for large arguments where the result is getting close
+ * to the minimum representable numbers we accept slightly larger errors
+ * (think results that are in the ballpark of 10^-30 for single precision,
+ * or 10^-200 for double) since that is not relevant for MD.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_erfc_f(gmx_simd_float_t x)
+{
+    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
+    const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
+    const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
+    const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
+    const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
+    const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
+    const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
+    const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
+    const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
+    const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
+    const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
+    const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
+    const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
+    const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
+    const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
+    const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
+    const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
+    const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
+    const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
+    const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
+    const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
+    const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
+    const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
+    const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
+    const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
+    const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
+    const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
+    const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
+    const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
+    /* Coefficients for expansion of exp(x) in [0,0.1] */
+    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
+    const gmx_simd_float_t  CD2      = gmx_simd_set1_f(0.5000066608081202f);
+    const gmx_simd_float_t  CD3      = gmx_simd_set1_f(0.1664795422874624f);
+    const gmx_simd_float_t  CD4      = gmx_simd_set1_f(0.04379839977652482f);
+    const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
+
+    /* We need to use a small trick here, since we cannot assume all SIMD
+     * architectures support integers, and the flag we want (0xfffff000) would
+     * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
+     * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
+     * fp numbers, and perform a logical or. Since the expression is constant,
+     * we can at least hope it is evaluated at compile-time.
+     */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    const gmx_simd_float_t  sieve    = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
+#else
+    const int               isieve   = 0xFFFFF000;
+    float                   mem[GMX_SIMD_REAL_WIDTH*2];
+    float *                 pmem = gmx_simd_align_f(mem);
+    union {
+        float f; int i;
+    } conv;
+    int                     i;
+#endif
+
+    gmx_simd_float_t        x2, x4, y;
+    gmx_simd_float_t        q, z, t, t2, w, w2;
+    gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
+    gmx_simd_float_t        expmx2, corr;
+    gmx_simd_float_t        res_erf, res_erfc, res;
+    gmx_simd_fbool_t        mask;
+
+    /* Calculate erf() */
+    x2     = gmx_simd_mul_f(x, x);
+    x4     = gmx_simd_mul_f(x2, x2);
+
+    pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
+    pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
+    pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
+    pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
+    pA1  = gmx_simd_mul_f(pA1, x2);
+    pA0  = gmx_simd_fmadd_f(pA0, x4, pA1);
+    /* Constant term must come last for precision reasons */
+    pA0  = gmx_simd_add_f(pA0, CA0);
+
+    res_erf = gmx_simd_mul_f(x, pA0);
+
+    /* Calculate erfc */
+    y       = gmx_simd_fabs_f(x);
+    t       = gmx_simd_inv_f(y);
+    w       = gmx_simd_sub_f(t, one);
+    t2      = gmx_simd_mul_f(t, t);
+    w2      = gmx_simd_mul_f(w, w);
+    /*
+     * We cannot simply calculate exp(-y2) directly in single precision, since
+     * that will lose a couple of bits of precision due to the multiplication.
+     * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
+     * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
+     *
+     * The only drawback with this is that it requires TWO separate exponential
+     * evaluations, which would be horrible performance-wise. However, the argument
+     * for the second exp() call is always small, so there we simply use a
+     * low-order minimax expansion on [0,0.1].
+     *
+     * However, this neat idea requires support for logical ops (and) on
+     * FP numbers, which some vendors decided isn't necessary in their SIMD
+     * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
+     * in double, but we still need memory as a backup when that is not available,
+     * and this case is rare enough that we go directly there...
+     */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    z       = gmx_simd_and_f(y, sieve);
+#else
+    gmx_simd_store_f(pmem, y);
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        conv.f  = pmem[i];
+        conv.i  = conv.i & isieve;
+        pmem[i] = conv.f;
+    }
+    z = gmx_simd_load_f(pmem);
+#endif
+    q       = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
+    corr    = gmx_simd_fmadd_f(CD4, q, CD3);
+    corr    = gmx_simd_fmadd_f(corr, q, CD2);
+    corr    = gmx_simd_fmadd_f(corr, q, one);
+    corr    = gmx_simd_fmadd_f(corr, q, one);
+
+    expmx2  = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
+    expmx2  = gmx_simd_mul_f(expmx2, corr);
+
+    pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
+    pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
+    pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
+
+    pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
+    pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
+
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
+    pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
+    pC0  = gmx_simd_mul_f(pC0, t);
+
+    /* SELECT pB0 or pC0 for erfc() */
+    mask     = gmx_simd_cmplt_f(two, y);
+    res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
+    res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
+
+    /* erfc(x<0) = 2-erfc(|x|) */
+    mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
+
+    /* Select erf() or erfc() */
+    mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
+    res  = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), mask);
+
+    return res;
+}
+
+/*! \brief SIMD float sin \& cos.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
+ *
+ * \param x The argument to evaluate sin/cos for
+ * \param[out] sinval Sin(x)
+ * \param[out] cosval Cos(x)
+ *
+ * This version achieves close to machine precision, but for very large
+ * magnitudes of the argument we inherently begin to lose accuracy due to the
+ * argument reduction, despite using extended precision arithmetics internally.
+ */
+static gmx_inline void
+gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
+{
+    /* Constants (together approximating Pi/2) used to subtract y*Pi/2 from x
+     * in several steps while minimizing precision loss.
+     */
+    const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
+    const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
+    const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
+    const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
+    const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
+    /* Polynomial coefficients for sin and cos of the reduced argument */
+    const gmx_simd_float_t  const_sin2      = gmx_simd_set1_f(-1.9515295891e-4f);
+    const gmx_simd_float_t  const_sin1      = gmx_simd_set1_f( 8.3321608736e-3f);
+    const gmx_simd_float_t  const_sin0      = gmx_simd_set1_f(-1.6666654611e-1f);
+    const gmx_simd_float_t  const_cos2      = gmx_simd_set1_f( 2.443315711809948e-5f);
+    const gmx_simd_float_t  const_cos1      = gmx_simd_set1_f(-1.388731625493765e-3f);
+    const gmx_simd_float_t  const_cos0      = gmx_simd_set1_f( 4.166664568298827e-2f);
+    const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t  one             = gmx_simd_set1_f(1.0f);
+    gmx_simd_float_t        ssign, csign;
+    gmx_simd_float_t        x2, y, z, psin, pcos, sss, ccc;
+    gmx_simd_fbool_t        mask;
+#if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    const gmx_simd_fint32_t ione            = gmx_simd_set1_fi(1);
+    const gmx_simd_fint32_t itwo            = gmx_simd_set1_fi(2);
+    gmx_simd_fint32_t       iy;
+
+    /* Integer path: iy/y is the multiple of Pi/2 removed from x */
+    z       = gmx_simd_mul_f(x, two_over_pi);
+    iy      = gmx_simd_cvt_f2i(z);
+    y       = gmx_simd_round_f(z);
+
+    /* mask is true for even iy. The sign masks hold -0.0 (only the sign bit)
+     * where bit 1 of iy (for sin) or of iy+1 (for cos) is set.
+     */
+    mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
+    ssign   = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
+    csign   = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
+#else
+    const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
+    const gmx_simd_float_t  minusquarter    = gmx_simd_set1_f(-0.25f);
+    gmx_simd_float_t        q;
+    gmx_simd_fbool_t        m1, m2, m3;
+
+    /* The most obvious way to find the arguments quadrant in the unit circle
+     * to calculate the sign is to use integer arithmetic, but that is not
+     * present in all SIMD implementations. As an alternative, we have devised a
+     * pure floating-point algorithm that uses truncation for argument reduction
+     * so that we get a new value 0<=q<1 over the unit circle, and then
+     * do floating-point comparisons with fractions. This is likely to be
+     * slightly slower (~10%) due to the longer latencies of floating-point, so
+     * we only use it when integer SIMD arithmetic is not present.
+     */
+    ssign   = x;
+    x       = gmx_simd_fabs_f(x);
+    /* It is critical that half-way cases are rounded down */
+    z       = gmx_simd_fmadd_f(x, two_over_pi, half);
+    y       = gmx_simd_trunc_f(z);
+    q       = gmx_simd_mul_f(z, quarter);
+    q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
+    /* q now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
+     * then increases towards 1.0 as x increases by 2*Pi, when it resets to 0.0.
+     * This removes the 2*Pi periodicity without using any integer arithmetic.
+     * First check if y had the value 2 or 3, set csign if true.
+     */
+    q       = gmx_simd_sub_f(q, half);
+    /* If we have logical operations we can work directly on the signbit, which
+     * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
+     * Thus, if you are altering defines to debug alternative code paths, the
+     * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
+     * active or inactive - you will get errors if only one is used.
+     */
+#    ifdef GMX_SIMD_HAVE_LOGICAL
+    ssign   = gmx_simd_and_f(ssign, gmx_simd_set1_f(-0.0f));
+    csign   = gmx_simd_andnot_f(q, gmx_simd_set1_f(-0.0f));
+    ssign   = gmx_simd_xor_f(ssign, csign);
+#    else
+    csign   = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
+    /* ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q)); */
+
+    ssign   = gmx_simd_xor_sign_f(ssign, csign);    /* swap ssign if csign was set. */
+#    endif
+    /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
+    m1      = gmx_simd_cmplt_f(q, minusquarter);
+    m2      = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
+    m3      = gmx_simd_cmplt_f(q, quarter);
+    m2      = gmx_simd_and_fb(m2, m3);
+    mask    = gmx_simd_or_fb(m1, m2);
+    /* where mask is FALSE, set sign. */
+    csign   = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
+#endif
+    /* Argument reduction: x -= y*Pi/2, one piece of the constant at a time */
+    x       = gmx_simd_fnmadd_f(y, argred0, x);
+    x       = gmx_simd_fnmadd_f(y, argred1, x);
+    x       = gmx_simd_fnmadd_f(y, argred2, x);
+    x       = gmx_simd_fnmadd_f(y, argred3, x);
+    x2      = gmx_simd_mul_f(x, x);
+
+    /* Both polynomials are evaluated on the reduced argument */
+    psin    = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
+    psin    = gmx_simd_fmadd_f(psin, x2, const_sin0);
+    psin    = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
+    pcos    = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
+    pcos    = gmx_simd_fmadd_f(pcos, x2, const_cos0);
+    pcos    = gmx_simd_fmsub_f(pcos, x2, half);
+    pcos    = gmx_simd_fmadd_f(pcos, x2, one);
+
+    /* Where mask is true sin comes from psin and cos from pcos; otherwise
+     * the two polynomials swap roles.
+     */
+    sss     = gmx_simd_blendv_f(pcos, psin, mask);
+    ccc     = gmx_simd_blendv_f(psin, pcos, mask);
+    /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    *sinval = gmx_simd_xor_f(sss, ssign);
+    *cosval = gmx_simd_xor_f(ccc, csign);
+#else
+    *sinval = gmx_simd_xor_sign_f(sss, ssign);
+    *cosval = gmx_simd_xor_sign_f(ccc, csign);
+#endif
+}
+
+/*! \brief SIMD float sin(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sin_r.
+ *
+ * \param x The argument to evaluate sin for
+ * \result Sin(x)
+ *
+ * \attention This is a thin wrapper that evaluates both sine and cosine and
+ * discards the cosine. If you need both values, do NOT call sin and cos
+ * separately - use \ref gmx_simd_sincos_r instead, or you will waste a
+ * factor 2 in performance.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sin_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t sinx;
+    gmx_simd_float_t cosx;   /* computed but not used */
+
+    gmx_simd_sincos_f(x, &sinx, &cosx);
+
+    return sinx;
+}
+
+/*! \brief SIMD float cos(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_cos_r.
+ *
+ * \param x The argument to evaluate cos for
+ * \result Cos(x)
+ *
+ * \attention This is a thin wrapper that evaluates both sine and cosine and
+ * discards the sine. If you need both values, do NOT call sin and cos
+ * separately - use \ref gmx_simd_sincos_r instead, or you will waste a
+ * factor 2 in performance.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cos_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t sinx;   /* computed but not used */
+    gmx_simd_float_t cosx;
+
+    gmx_simd_sincos_f(x, &sinx, &cosx);
+
+    return cosx;
+}
+
+/*! \brief SIMD float tan(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_tan_r.
+ *
+ * \param x The argument to evaluate tan for
+ * \result Tan(x)
+ *
+ * The argument is reduced by a multiple y of Pi/2, a polynomial is evaluated
+ * on the reduced argument, and for the elements flagged in the mask (odd y)
+ * the sign of the reduced argument is flipped and the reciprocal of the
+ * polynomial value is returned.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_tan_f(gmx_simd_float_t x)
+{
+    /* Constants (together approximating Pi/2) for stepwise argument reduction */
+    const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
+    const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
+    const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
+    const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
+    const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
+    /* Polynomial coefficients for tan of the reduced argument */
+    const gmx_simd_float_t  CT6             = gmx_simd_set1_f(0.009498288995810566122993911);
+    const gmx_simd_float_t  CT5             = gmx_simd_set1_f(0.002895755790837379295226923);
+    const gmx_simd_float_t  CT4             = gmx_simd_set1_f(0.02460087336161924491836265);
+    const gmx_simd_float_t  CT3             = gmx_simd_set1_f(0.05334912882656359828045988);
+    const gmx_simd_float_t  CT2             = gmx_simd_set1_f(0.1333989091464957704418495);
+    const gmx_simd_float_t  CT1             = gmx_simd_set1_f(0.3333307599244198227797507);
+
+    gmx_simd_float_t        x2, p, y, z;
+    gmx_simd_fbool_t        mask;
+
+#if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    gmx_simd_fint32_t  iy;
+    gmx_simd_fint32_t  ione = gmx_simd_set1_fi(1);
+
+    /* Integer path: mask is true where the removed multiple of Pi/2 is odd */
+    z       = gmx_simd_mul_f(x, two_over_pi);
+    iy      = gmx_simd_cvt_f2i(z);
+    y       = gmx_simd_round_f(z);
+    mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
+
+    x       = gmx_simd_fnmadd_f(y, argred0, x);
+    x       = gmx_simd_fnmadd_f(y, argred1, x);
+    x       = gmx_simd_fnmadd_f(y, argred2, x);
+    x       = gmx_simd_fnmadd_f(y, argred3, x);
+    /* Flip the sign bit of the reduced argument where mask is set */
+    x       = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), mask), x);
+#else
+    const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
+    const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t  threequarter    = gmx_simd_set1_f(0.75f);
+    gmx_simd_float_t        w, q;
+    gmx_simd_fbool_t        m1, m2, m3;
+
+    /* Pure floating-point path working on |x|. q is the fractional part of
+     * z/4, and mask is set when 0.25<=q<0.5 or q>=0.75.
+     */
+    w       = gmx_simd_fabs_f(x);
+    z       = gmx_simd_fmadd_f(w, two_over_pi, half);
+    y       = gmx_simd_trunc_f(z);
+    q       = gmx_simd_mul_f(z, quarter);
+    q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
+    m1      = gmx_simd_cmple_f(quarter, q);
+    m2      = gmx_simd_cmplt_f(q, half);
+    m3      = gmx_simd_cmple_f(threequarter, q);
+    m1      = gmx_simd_and_fb(m1, m2);
+    mask    = gmx_simd_or_fb(m1, m3);
+    w       = gmx_simd_fnmadd_f(y, argred0, w);
+    w       = gmx_simd_fnmadd_f(y, argred1, w);
+    w       = gmx_simd_fnmadd_f(y, argred2, w);
+    w       = gmx_simd_fnmadd_f(y, argred3, w);
+
+    /* Negate the reduced argument where mask is set, then combine with the
+     * original x through gmx_simd_xor_sign_f (presumably transferring the
+     * sign of x, since the reduction was done on |x| - confirm).
+     */
+    w       = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
+    x       = gmx_simd_xor_sign_f(w, x);
+#endif
+    /* p = x + x^3*(CT1 + CT2*x^2 + ... + CT6*x^10) */
+    x2      = gmx_simd_mul_f(x, x);
+    p       = gmx_simd_fmadd_f(CT6, x2, CT5);
+    p       = gmx_simd_fmadd_f(p, x2, CT4);
+    p       = gmx_simd_fmadd_f(p, x2, CT3);
+    p       = gmx_simd_fmadd_f(p, x2, CT2);
+    p       = gmx_simd_fmadd_f(p, x2, CT1);
+    p       = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
+
+    /* Return 1/p for the elements flagged in mask */
+    p       = gmx_simd_blendv_f( p, gmx_simd_inv_f(p), mask);
+    return p;
+}
+
+/*! \brief SIMD float asin(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_asin_r.
+ *
+ * \param x The argument to evaluate asin for
+ * \result Asin(x)
+ *
+ * The routine works on |x| in three ranges: below 1e-4 the argument itself
+ * is returned, up to 0.5 a polynomial in x^2 is used directly, and above 0.5
+ * the same polynomial is applied to (1-|x|)/2 and the result is formed as
+ * Pi/2 minus twice that value. The original x is combined back in at the end
+ * with gmx_simd_xor_sign_f.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_asin_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t limitlow   = gmx_simd_set1_f(1e-4f);
+    const gmx_simd_float_t half       = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t one        = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t halfpi     = gmx_simd_set1_f((float)M_PI/2.0f);
+    /* Polynomial coefficients */
+    const gmx_simd_float_t CC5        = gmx_simd_set1_f(4.2163199048E-2f);
+    const gmx_simd_float_t CC4        = gmx_simd_set1_f(2.4181311049E-2f);
+    const gmx_simd_float_t CC3        = gmx_simd_set1_f(4.5470025998E-2f);
+    const gmx_simd_float_t CC2        = gmx_simd_set1_f(7.4953002686E-2f);
+    const gmx_simd_float_t CC1        = gmx_simd_set1_f(1.6666752422E-1f);
+    gmx_simd_float_t       xabs;
+    gmx_simd_float_t       z, z1, z2, q, q1, q2;
+    gmx_simd_float_t       pA, pB;
+    gmx_simd_fbool_t       mask;
+
+    xabs  = gmx_simd_fabs_f(x);
+    /* mask selects the large-argument branch, |x|>0.5 */
+    mask  = gmx_simd_cmplt_f(half, xabs);
+    /* Large branch: z1=(1-|x|)/2 and q1=z1*invsqrt(z1), i.e. sqrt(z1).
+     * q1 is forced to zero where |x|==1 (z1 is 0 there), presumably to
+     * avoid a non-finite product from the inverse square root of zero.
+     */
+    z1    = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
+    q1    = gmx_simd_mul_f(z1, gmx_simd_invsqrt_f(z1));
+    q1    = gmx_simd_blendnotzero_f(q1, gmx_simd_cmpeq_f(xabs, one));
+    /* Small branch: q2=|x| and z2=x^2 */
+    q2    = xabs;
+    z2    = gmx_simd_mul_f(q2, q2);
+    z     = gmx_simd_blendv_f(z2, z1, mask);
+    q     = gmx_simd_blendv_f(q2, q1, mask);
+
+    /* Evaluate q + q*(CC1*z + CC2*z^2 + ... + CC5*z^5), with the powers of
+     * z split over the two accumulators pA and pB.
+     */
+    z2    = gmx_simd_mul_f(z, z);
+    pA    = gmx_simd_fmadd_f(CC5, z2, CC3);
+    pB    = gmx_simd_fmadd_f(CC4, z2, CC2);
+    pA    = gmx_simd_fmadd_f(pA, z2, CC1);
+    pA    = gmx_simd_mul_f(pA, z);
+    z     = gmx_simd_fmadd_f(pB, z2, pA);
+    z     = gmx_simd_fmadd_f(z, q, q);
+    /* Large branch result is Pi/2 - 2*z, computed as two subtractions */
+    q2    = gmx_simd_sub_f(halfpi, z);
+    q2    = gmx_simd_sub_f(q2, z);
+    z     = gmx_simd_blendv_f(z, q2, mask);
+
+    /* For |x| not above 1e-4, return |x| itself instead of the polynomial */
+    mask  = gmx_simd_cmplt_f(limitlow, xabs);
+    z     = gmx_simd_blendv_f( xabs, z, mask );
+    /* The evaluation used |x|; combine with the original x (presumably
+     * restoring its sign - confirm against gmx_simd_xor_sign_f).
+     */
+    z     = gmx_simd_xor_sign_f(z, x);
+
+    return z;
+}
+
+/*! \brief SIMD float acos(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_acos_r.
+ *
+ * \param x The argument to evaluate acos for
+ * \result Acos(x)
+ *
+ * The result is built from \ref gmx_simd_asin_f. For |x|<=0.5 it is
+ * Pi/2-asin(x). For |x|>0.5 the value a=asin(sqrt((1-|x|)/2)) is used,
+ * giving 2*a for positive x and Pi-2*a otherwise.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_acos_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t c_half      = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t c_one       = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t c_halfpi    = gmx_simd_set1_f((float)M_PI/2.0f);
+    const gmx_simd_float_t c_pi        = gmx_simd_set1_f((float)M_PI);
+    gmx_simd_float_t       absx, halfdiff, root, arg, a, twice_a;
+    gmx_simd_float_t       res_inner, res_outer_neg, res_outer;
+    gmx_simd_fbool_t       is_outer, is_positive;
+
+    absx        = gmx_simd_fabs_f(x);
+    is_positive = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
+    is_outer    = gmx_simd_cmplt_f(c_half, absx);
+
+    /* sqrt((1-|x|)/2), forced to zero where |x|==1 */
+    halfdiff    = gmx_simd_mul_f(c_half, gmx_simd_sub_f(c_one, absx));
+    root        = gmx_simd_mul_f(halfdiff, gmx_simd_invsqrt_f(halfdiff));
+    root        = gmx_simd_blendnotzero_f(root, gmx_simd_cmpeq_f(absx, c_one));
+
+    /* Feed asin with the square root in the outer range, with x otherwise */
+    arg         = gmx_simd_blendv_f(x, root, is_outer);
+    a           = gmx_simd_asin_f(arg);
+
+    /* Candidate results for the inner and outer ranges */
+    res_inner     = gmx_simd_sub_f(c_halfpi, a);
+    twice_a       = gmx_simd_add_f(a, a);
+    res_outer_neg = gmx_simd_sub_f(c_pi, twice_a);
+    res_outer     = gmx_simd_blendv_f(res_outer_neg, twice_a, is_positive);
+
+    return gmx_simd_blendv_f(res_inner, res_outer, is_outer);
+}
+
+/*! \brief SIMD float atan(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_atan_r.
+ *
+ * \param x The argument to evaluate atan for
+ * \result Atan(x), same argument/value range as standard math library.
+ *
+ * The polynomial is evaluated on |x|, or on 1/|x| when |x|>1 in which case
+ * the result is Pi/2 minus the polynomial value. The result is negated for
+ * negative input.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_atan_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t halfpi    = gmx_simd_set1_f(M_PI/2);
+    /* Polynomial coefficients, numbered by the power of x they multiply */
+    const gmx_simd_float_t CA17      = gmx_simd_set1_f(0.002823638962581753730774f);
+    const gmx_simd_float_t CA15      = gmx_simd_set1_f(-0.01595690287649631500244f);
+    const gmx_simd_float_t CA13      = gmx_simd_set1_f(0.04250498861074447631836f);
+    const gmx_simd_float_t CA11      = gmx_simd_set1_f(-0.07489009201526641845703f);
+    const gmx_simd_float_t CA9       = gmx_simd_set1_f(0.1063479334115982055664f);
+    const gmx_simd_float_t CA7       = gmx_simd_set1_f(-0.1420273631811141967773f);
+    const gmx_simd_float_t CA5       = gmx_simd_set1_f(0.1999269574880599975585f);
+    const gmx_simd_float_t CA3       = gmx_simd_set1_f(-0.3333310186862945556640f);
+    gmx_simd_float_t       x2, x3, x4, pA, pB;
+    gmx_simd_fbool_t       mask, mask2;
+
+    /* mask: input was negative. mask2: |x|>1, where 1/|x| is used instead */
+    mask  = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    x     = gmx_simd_fabs_f(x);
+    mask2 = gmx_simd_cmplt_f(gmx_simd_set1_f(1.0f), x);
+    x     = gmx_simd_blendv_f(x, gmx_simd_inv_f(x), mask2);
+
+    /* x + x^3*(CA3 + CA5*x^2 + ... + CA17*x^14), with the coefficients
+     * split over two accumulators in powers of x^4.
+     */
+    x2    = gmx_simd_mul_f(x, x);
+    x3    = gmx_simd_mul_f(x2, x);
+    x4    = gmx_simd_mul_f(x2, x2);
+    pA    = gmx_simd_fmadd_f(CA17, x4, CA13);
+    pB    = gmx_simd_fmadd_f(CA15, x4, CA11);
+    pA    = gmx_simd_fmadd_f(pA, x4, CA9);
+    pB    = gmx_simd_fmadd_f(pB, x4, CA7);
+    pA    = gmx_simd_fmadd_f(pA, x4, CA5);
+    pB    = gmx_simd_fmadd_f(pB, x4, CA3);
+    pA    = gmx_simd_fmadd_f(pA, x2, pB);
+    pA    = gmx_simd_fmadd_f(pA, x3, x);
+
+    /* Undo the inversion (Pi/2 - value) and then restore the sign */
+    pA    = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
+    pA    = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
+
+    return pA;
+}
+
+/*! \brief SIMD float atan2(y,x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
+ *
+ * \param y Y component of vector, any quadrant
+ * \param x X component of vector, any quadrant
+ * \result Atan2(y,x), same argument/value range as standard math library.
+ *
+ * \note This routine should provide correct results for all finite
+ * non-zero or positive-zero arguments. However, negative zero arguments will
+ * be treated as positive zero, which means the return value will deviate from
+ * the standard math library atan2(y,x) for those cases. That should not be
+ * of any concern in Gromacs, and in particular it will not affect calculations
+ * of angles from vectors.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
+{
+    const gmx_simd_float_t c_halfpi    = gmx_simd_set1_f(M_PI/2.0);
+    const gmx_simd_float_t c_pi        = gmx_simd_set1_f(M_PI);
+    const gmx_simd_float_t zero        = gmx_simd_setzero_f();
+    gmx_simd_float_t       inv_x, ratio, angle, offset;
+    gmx_simd_fbool_t       x_is_zero, y_is_zero, x_is_neg, y_is_neg;
+
+    x_is_zero = gmx_simd_cmpeq_f(x, zero);
+    x_is_neg  = gmx_simd_cmplt_f(x, zero);
+    y_is_zero = gmx_simd_cmpeq_f(y, zero);
+    y_is_neg  = gmx_simd_cmplt_f(y, zero);
+
+    /* atan(y/x), with 1/x replaced by zero where x is zero */
+    inv_x     = gmx_simd_blendnotzero_f(gmx_simd_inv_f(x), x_is_zero);
+    ratio     = gmx_simd_mul_f(y, inv_x);
+    angle     = gmx_simd_atan_f(ratio);
+
+    /* Offset: Pi/2 where x is zero (unless y is zero too), Pi where x is
+     * negative, and the whole offset negated where y is negative.
+     */
+    offset    = gmx_simd_blendzero_f(c_halfpi, x_is_zero);
+    offset    = gmx_simd_blendnotzero_f(offset, y_is_zero);
+    offset    = gmx_simd_blendv_f(offset, c_pi, x_is_neg);
+    offset    = gmx_simd_blendv_f(offset, gmx_simd_fneg_f(offset), y_is_neg);
+
+    return gmx_simd_add_f(angle, offset);
+}
+
+/*! \brief Calculate the force correction due to PME analytically in SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
+ *
+ * \param z2 \f$(r \beta)^2\f$ - see below for details.
+ * \result Correction factor to coulomb force - see below for details.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
+ * \f[
+ * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
+ * \f]
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument \f$\beta r\f$ will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
+ * in this range!
+ *
+ * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
+ * then only use even powers. This is another minor optimization, since
+ * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate \f$r^2\f$.
+ * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
+ * 3. Evaluate this routine with \f$z^2\f$ as the argument.
+ * 4. The return value is the expression:
+ *
+ * \f[
+ *    \frac{2 \exp(-z^2)}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
+ * \f]
+ *
+ * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
+ *
+ *  \f[
+ *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
+ *  \f]
+ *
+ *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
+ *
+ *  \f[
+ *    \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
+ *  \f]
+ *
+ *    With a bit of math exercise you should be able to confirm that
+ *    this is exactly
+ *
+ *  \f[
+ *   \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
+ *  \f]
+ *
+ * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
+ *    and you have your force (divided by \f$r\f$). A final multiplication
+ *    with the vector connecting the two particles and you have your
+ *    vectorial force to add to the particles.
+ *
+ * This approximation achieves an accuracy slightly lower than 1e-6; when
+ * added to \f$1/r\f$ the error will be insignificant.
+ *
+ */
+static gmx_simd_float_t
+gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
+{
+    /* Numerator coefficients, FNk multiplies z2^k */
+    const gmx_simd_float_t  FN0      = gmx_simd_set1_f(-0.75225204789749321333f);
+    const gmx_simd_float_t  FN1      = gmx_simd_set1_f(0.069670166153766424023f);
+    const gmx_simd_float_t  FN2      = gmx_simd_set1_f(-0.019278317264888380590f);
+    const gmx_simd_float_t  FN3      = gmx_simd_set1_f(0.0010054721316683106153f);
+    const gmx_simd_float_t  FN4      = gmx_simd_set1_f(-0.000053401640219807709149f);
+    const gmx_simd_float_t  FN5      = gmx_simd_set1_f(1.4703624142580877519e-6f);
+    const gmx_simd_float_t  FN6      = gmx_simd_set1_f(-1.7357322914161492954e-8f);
+
+    /* Denominator coefficients, FDk multiplies z2^k */
+    const gmx_simd_float_t  FD0      = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t  FD1      = gmx_simd_set1_f(0.50736591960530292870f);
+    const gmx_simd_float_t  FD2      = gmx_simd_set1_f(0.11583842382862377919f);
+    const gmx_simd_float_t  FD3      = gmx_simd_set1_f(0.014866955030185295499f);
+    const gmx_simd_float_t  FD4      = gmx_simd_set1_f(0.0011193462567257629232f);
+
+    gmx_simd_float_t        z4;
+    gmx_simd_float_t        num_even, num_odd, den_even, den_odd, den_inv;
+
+    z4       = gmx_simd_mul_f(z2, z2);
+
+    /* Numerator: even and odd powers of z2 are accumulated separately in
+     * powers of z4 and merged at the end.
+     */
+    num_even = gmx_simd_fmadd_f(FN6, z4, FN4);
+    num_odd  = gmx_simd_fmadd_f(FN5, z4, FN3);
+    num_even = gmx_simd_fmadd_f(num_even, z4, FN2);
+    num_odd  = gmx_simd_fmadd_f(num_odd, z4, FN1);
+    num_even = gmx_simd_fmadd_f(num_even, z4, FN0);
+    num_even = gmx_simd_fmadd_f(num_odd, z2, num_even);
+
+    /* Denominator, evaluated the same way and then inverted */
+    den_even = gmx_simd_fmadd_f(FD4, z4, FD2);
+    den_odd  = gmx_simd_fmadd_f(FD3, z4, FD1);
+    den_even = gmx_simd_fmadd_f(den_even, z4, FD0);
+    den_even = gmx_simd_fmadd_f(den_odd, z2, den_even);
+    den_inv  = gmx_simd_inv_f(den_even);
+
+    return gmx_simd_mul_f(num_even, den_inv);
+}
+
+
+
+/*! \brief Calculate the potential correction due to PME analytically in SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
+ *
+ * \param z2 \f$(r \beta)^2\f$ - see below for details.
+ * \result Correction factor to coulomb potential - see below for details.
+ *
+ * See \ref gmx_simd_pmecorrF_f for details about the approximation.
+ *
+ * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate \f$r^2\f$.
+ * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
+ * 3. Evaluate this routine with \f$z^2\f$ as the argument.
+ * 4. The return value is the expression:
+ *
+ *  \f[
+ *   \frac{\mbox{erf}(z)}{z}
+ *  \f]
+ *
+ * 5. Multiply the entire expression by \f$\beta\f$ and switch back to \f$r\f$ (since \f$z=r \beta\f$) to get:
+ *
+ *  \f[
+ *    \frac{\mbox{erf}(r \beta)}{r}
+ *  \f]
+ *
+ * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
+ *    and you have your potential.
+ *
+ * This approximation achieves an accuracy slightly lower than 1e-6; when
+ * added to \f$1/r\f$ the error will be insignificant.
+ */
+static gmx_simd_float_t
+gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
+{
+    const gmx_simd_float_t  VN6      = gmx_simd_set1_f(1.9296833005951166339e-8f);
+    const gmx_simd_float_t  VN5      = gmx_simd_set1_f(-1.4213390571557850962e-6f);
+    const gmx_simd_float_t  VN4      = gmx_simd_set1_f(0.000041603292906656984871f);
+    const gmx_simd_float_t  VN3      = gmx_simd_set1_f(-0.00013134036773265025626f);
+    const gmx_simd_float_t  VN2      = gmx_simd_set1_f(0.038657983986041781264f);
+    const gmx_simd_float_t  VN1      = gmx_simd_set1_f(0.11285044772717598220f);
+    const gmx_simd_float_t  VN0      = gmx_simd_set1_f(1.1283802385263030286f);
+
+    const gmx_simd_float_t  VD3      = gmx_simd_set1_f(0.0066752224023576045451f);
+    const gmx_simd_float_t  VD2      = gmx_simd_set1_f(0.078647795836373922256f);
+    const gmx_simd_float_t  VD1      = gmx_simd_set1_f(0.43336185284710920150f);
+    const gmx_simd_float_t  VD0      = gmx_simd_set1_f(1.0f);
+
+    gmx_simd_float_t        z4;
+    gmx_simd_float_t        polyVN0, polyVN1, polyVD0, polyVD1;
+
+    z4             = gmx_simd_mul_f(z2, z2);
+
+    polyVD1        = gmx_simd_fmadd_f(VD3, z4, VD1);
+    polyVD0        = gmx_simd_fmadd_f(VD2, z4, VD0);
+    polyVD0        = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
+
+    polyVD0        = gmx_simd_inv_f(polyVD0);
+
+    polyVN0        = gmx_simd_fmadd_f(VN6, z4, VN4);
+    polyVN1        = gmx_simd_fmadd_f(VN5, z4, VN3);
+    polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN2);
+    polyVN1        = gmx_simd_fmadd_f(polyVN1, z4, VN1);
+    polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN0);
+    polyVN0        = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
+
+    return gmx_simd_mul_f(polyVN0, polyVD0);
+}
+#endif
+
+/*! \} */
+
+#ifdef GMX_SIMD_HAVE_DOUBLE
+
+/*! \name Double precision SIMD math functions
+ *
+ *  \note In most cases you should use the real-precision functions instead.
+ *  \{
+ */
+
+/****************************************
+ * DOUBLE PRECISION SIMD MATH FUNCTIONS *
+ ****************************************/
+
+/*! \brief Add four SIMD double variables, returning a+b+c+d.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
+                gmx_simd_double_t c, gmx_simd_double_t d)
+{
+    /* Pairwise summation: the two partial sums do not depend on each other */
+    gmx_simd_double_t sum_ab = gmx_simd_add_d(a, b);
+    gmx_simd_double_t sum_cd = gmx_simd_add_d(c, d);
+
+    return gmx_simd_add_d(sum_ab, sum_cd);
+}
+
+/*! \brief Flip the sign of a wherever b is negative, SIMD double.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
+ *
+ * \param a Values whose sign may be changed
+ * \param b Values whose sign decides whether a is negated
+ * \return a with its sign changed in every element where b is negative.
+ *
+ * When \ref GMX_SIMD_HAVE_LOGICAL is set this is an xor of a with the sign
+ * bit of b. Otherwise a compare-and-blend is used, in which case negative
+ * zero in b is not treated as negative.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    /* -0.0 has only the sign bit set, so the and-operation isolates the sign of b */
+    const gmx_simd_double_t signmask = gmx_simd_set1_d(-0.0);
+    gmx_simd_double_t       signbit  = gmx_simd_and_d(signmask, b);
+
+    return gmx_simd_xor_d(a, signbit);
+#else
+    /* No bitwise operations available: pick between a and -a using b < 0 */
+    gmx_simd_dbool_t  b_negative = gmx_simd_cmplt_d(b, gmx_simd_setzero_d());
+    gmx_simd_double_t a_negated  = gmx_simd_fneg_d(a);
+
+    return gmx_simd_blendv_d(a, a_negated, b_negative);
+#endif
+}
+
+/*! \brief One Newton-Raphson refinement step for 1/sqrt(x), SIMD double.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
+{
+#ifdef GMX_SIMD_HAVE_FMA
+    /* lu + (1 - x*lu*lu) * (lu*0.5), written with fused multiply-add */
+    gmx_simd_double_t lu_squared = gmx_simd_mul_d(lu, lu);
+    gmx_simd_double_t residual   = gmx_simd_fnmadd_d(x, lu_squared, gmx_simd_set1_d(1.0));
+    gmx_simd_double_t half_lu    = gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5));
+
+    return gmx_simd_fmadd_d(residual, half_lu, lu);
+#else
+    /* 0.5 * ((3 - lu*lu*x) * lu) */
+    gmx_simd_double_t lu_squared_x = gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x);
+    gmx_simd_double_t correction   = gmx_simd_sub_d(gmx_simd_set1_d(3.0), lu_squared_x);
+    gmx_simd_double_t scaled       = gmx_simd_mul_d(correction, lu);
+
+    return gmx_simd_mul_d(gmx_simd_set1_d(0.5), scaled);
+#endif
+}
+
+
+/*! \brief Compute 1/sqrt(x) for SIMD double.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_invsqrt_d(gmx_simd_double_t x)
+{
+    /* Start from the table lookup, which provides GMX_SIMD_RSQRT_BITS bits */
+    gmx_simd_double_t estimate = gmx_simd_rsqrt_d(x);
+
+    /* The number of refinement steps is fixed at compile time: a step is
+     * added for each of 1x, 2x, 4x and 8x the lookup bits that still falls
+     * short of the double precision target.
+     */
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rsqrt_iter_d(estimate, x);
+#endif
+    return estimate;
+}
+
+/*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
+ *
+ * \copydetails gmx_simd_invsqrt_pair_f
+ *
+ * When single precision SIMD is available with exactly twice the double
+ * width, and the lookup provides fewer than 22 bits, both arguments are
+ * packed into one float register so that the lookup and the first
+ * iterations are done once for both; the remaining iterations are done
+ * in double. Otherwise this is simply two calls to gmx_simd_invsqrt_d.
+ */
+static gmx_inline void
+gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
+                        gmx_simd_double_t *out0, gmx_simd_double_t *out1)
+{
+#if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
+    /* Pack both double arguments into a single float variable */
+    gmx_simd_float_t  xf  = gmx_simd_cvt_dd2f(x0, x1);
+    gmx_simd_float_t  luf = gmx_simd_rsqrt_f(xf);
+    gmx_simd_double_t lu0, lu1;
+    /* Intermediate target is single - mantissa+1 bits */
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+    /* Unpack the single precision estimate into the two double variables */
+    gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
+    /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
+#if (GMX_SIMD_MATH_TARGET_SINGLE_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
+    lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
+#endif
+#if (GMX_SIMD_MATH_TARGET_SINGLE_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
+    lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
+#endif
+    *out0 = lu0;
+    *out1 = lu1;
+#else
+    /* No usable float shortcut: evaluate the two arguments independently */
+    *out0 = gmx_simd_invsqrt_d(x0);
+    *out1 = gmx_simd_invsqrt_d(x1);
+#endif
+}
+
+/*! \brief One Newton-Raphson refinement step for 1/x, SIMD double.
+ *
+ * \copydetails gmx_simd_rcp_iter_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
+{
+    /* Improved estimate is lu*(2 - lu*x) */
+    gmx_simd_double_t correction = gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0));
+
+    return gmx_simd_mul_d(lu, correction);
+}
+
+/*! \brief Compute 1/x for SIMD double.
+ *
+ * \copydetails gmx_simd_inv_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_inv_d(gmx_simd_double_t x)
+{
+    /* Start from the table lookup, which provides GMX_SIMD_RCP_BITS bits */
+    gmx_simd_double_t estimate = gmx_simd_rcp_d(x);
+
+    /* The number of refinement steps is fixed at compile time: a step is
+     * added for each of 1x, 2x, 4x and 8x the lookup bits that still falls
+     * short of the double precision target.
+     */
+#if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rcp_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rcp_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rcp_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd_rcp_iter_d(estimate, x);
+#endif
+    return estimate;
+}
+
+/*! \brief Compute sqrt(x) for SIMD doubles, giving the correct result for argument 0.0 too.
+ *
+ * \copydetails gmx_simd_sqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sqrt_d(gmx_simd_double_t x)
+{
+    gmx_simd_dbool_t   is_zero   = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
+    gmx_simd_double_t  inv_sqrtx = gmx_simd_invsqrt_d(x);
+
+    /* sqrt(x) = x * 1/sqrt(x); the 1/sqrt(x) factor is cleared for
+     * elements where x == 0 before the multiplication.
+     */
+    return gmx_simd_mul_d(gmx_simd_blendnotzero_d(inv_sqrtx, is_zero), x);
+}
+
+/*! \brief SIMD double log(x). This is the natural logarithm.
+ *
+ * \copydetails gmx_simd_log_f
+ *
+ * The argument is split into exponent and mantissa, the mantissa m is
+ * mapped to y=(m-1)/(m+1), and the result is assembled as an odd polynomial
+ * in y plus the exponent multiplied by ln(2).
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_log_d(gmx_simd_double_t x)
+{
+    const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t  one        = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t  sqrt2      = gmx_simd_set1_d(sqrt(2.0));
+    /* corr is ln(2); it converts the binary exponent into a natural logarithm */
+    const gmx_simd_double_t  corr       = gmx_simd_set1_d(0.693147180559945286226764);
+    /* CLn is the coefficient of y^n in the odd polynomial evaluated below */
+    const gmx_simd_double_t  CL15       = gmx_simd_set1_d(0.148197055177935105296783);
+    const gmx_simd_double_t  CL13       = gmx_simd_set1_d(0.153108178020442575739679);
+    const gmx_simd_double_t  CL11       = gmx_simd_set1_d(0.181837339521549679055568);
+    const gmx_simd_double_t  CL9        = gmx_simd_set1_d(0.22222194152736701733275);
+    const gmx_simd_double_t  CL7        = gmx_simd_set1_d(0.285714288030134544449368);
+    const gmx_simd_double_t  CL5        = gmx_simd_set1_d(0.399999999989941956712869);
+    const gmx_simd_double_t  CL3        = gmx_simd_set1_d(0.666666666666685503450651);
+    const gmx_simd_double_t  CL1        = gmx_simd_set1_d(2.0);
+    gmx_simd_double_t        fexp, x2, p;
+    gmx_simd_dbool_t         mask;
+
+    /* From here on x holds the mantissa and fexp the exponent of the argument */
+    fexp  = gmx_simd_get_exponent_d(x);
+    x     = gmx_simd_get_mantissa_d(x);
+
+    mask  = gmx_simd_cmplt_d(sqrt2, x);
+    /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
+    fexp  = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
+    x     = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
+
+    /* Substitute x <- (x-1)/(x+1); the division is done as a multiplication by the reciprocal */
+    x     = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
+    x2    = gmx_simd_mul_d(x, x);
+
+    /* Horner evaluation in x2 of CL1 + CL3*x2 + ... + CL15*x2^7 */
+    p     = gmx_simd_fmadd_d(CL15, x2, CL13);
+    p     = gmx_simd_fmadd_d(p, x2, CL11);
+    p     = gmx_simd_fmadd_d(p, x2, CL9);
+    p     = gmx_simd_fmadd_d(p, x2, CL7);
+    p     = gmx_simd_fmadd_d(p, x2, CL5);
+    p     = gmx_simd_fmadd_d(p, x2, CL3);
+    p     = gmx_simd_fmadd_d(p, x2, CL1);
+    /* Multiply by x to get the odd polynomial, and add the exponent contribution ln(2)*fexp */
+    p     = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
+
+    return p;
+}
+
+/*! \brief SIMD double 2^x.
+ *
+ * \copydetails gmx_simd_exp2_f
+ *
+ * The argument is split into its nearest integer and the remaining
+ * fraction; a polynomial is evaluated for the fraction and the result is
+ * multiplied by the factor built from the integer part.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_exp2_d(gmx_simd_double_t x)
+{
+    /* Arguments whose magnitude exceeds this limit get their exponent factor set to zero */
+    const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
+    /* CEn is the coefficient of (fraction)^n in the polynomial; the constant term is 1 */
+    const gmx_simd_double_t  CE11          = gmx_simd_set1_d(4.435280790452730022081181e-10);
+    const gmx_simd_double_t  CE10          = gmx_simd_set1_d(7.074105630863314448024247e-09);
+    const gmx_simd_double_t  CE9           = gmx_simd_set1_d(1.017819803432096698472621e-07);
+    const gmx_simd_double_t  CE8           = gmx_simd_set1_d(1.321543308956718799557863e-06);
+    const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.00001525273348995851746990884);
+    const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.0001540353046251466849082632);
+    const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.001333355814678995257307880);
+    const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.009618129107588335039176502);
+    const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.05550410866481992147457793);
+    const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.2402265069591015620470894);
+    const gmx_simd_double_t  CE1           = gmx_simd_set1_d(0.6931471805599453304615075);
+    const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
+    gmx_simd_double_t        fexppart;
+    gmx_simd_double_t        intpart;
+    gmx_simd_double_t        p;
+    gmx_simd_dbool_t         valuemask;
+
+    fexppart  = gmx_simd_set_exponent_d(x);  /* rounds to nearest int internally */
+    intpart   = gmx_simd_round_d(x);         /* use same rounding mode here */
+    /* Keep the exponent factor only where |x| <= arglimit; elsewhere it becomes zero */
+    valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
+    fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
+    /* x now holds the fractional remainder */
+    x         = gmx_simd_sub_d(x, intpart);
+
+    /* Horner evaluation of 1 + CE1*x + CE2*x^2 + ... + CE11*x^11 */
+    p         = gmx_simd_fmadd_d(CE11, x, CE10);
+    p         = gmx_simd_fmadd_d(p, x, CE9);
+    p         = gmx_simd_fmadd_d(p, x, CE8);
+    p         = gmx_simd_fmadd_d(p, x, CE7);
+    p         = gmx_simd_fmadd_d(p, x, CE6);
+    p         = gmx_simd_fmadd_d(p, x, CE5);
+    p         = gmx_simd_fmadd_d(p, x, CE4);
+    p         = gmx_simd_fmadd_d(p, x, CE3);
+    p         = gmx_simd_fmadd_d(p, x, CE2);
+    p         = gmx_simd_fmadd_d(p, x, CE1);
+    p         = gmx_simd_fmadd_d(p, x, one);
+    /* Combine the polynomial for the fraction with the exponent factor */
+    x         = gmx_simd_mul_d(p, fexppart);
+    return x;
+}
+
+/*! \brief SIMD double exp(x).
+ *
+ * \copydetails gmx_simd_exp_f
+ *
+ * The argument is scaled by 1/ln(2) and rounded to an integer n; the
+ * remainder x - n*ln(2) is fed to a polynomial, and the result is
+ * multiplied by the factor built from n.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_exp_d(gmx_simd_double_t x)
+{
+    /* argscale is 1/ln(2) */
+    const gmx_simd_double_t  argscale      = gmx_simd_set1_d(1.44269504088896340735992468100);
+    /* Scaled arguments whose magnitude exceeds this limit get their exponent factor set to zero */
+    const gmx_simd_double_t  arglimit      = gmx_simd_set1_d(1022.0);
+    /* ln(2) split into a leading part and a small remainder; the two sum to ln(2) */
+    const gmx_simd_double_t  invargscale0  = gmx_simd_set1_d(0.69314718055966295651160180568695068359375);
+    const gmx_simd_double_t  invargscale1  = gmx_simd_set1_d(2.8235290563031577122588448175013436025525412068e-13);
+    /* CEn is the coefficient of x^n in the polynomial; the x^1 and x^0 coefficients are both 1 */
+    const gmx_simd_double_t  CE12          = gmx_simd_set1_d(2.078375306791423699350304e-09);
+    const gmx_simd_double_t  CE11          = gmx_simd_set1_d(2.518173854179933105218635e-08);
+    const gmx_simd_double_t  CE10          = gmx_simd_set1_d(2.755842049600488770111608e-07);
+    const gmx_simd_double_t  CE9           = gmx_simd_set1_d(2.755691815216689746619849e-06);
+    const gmx_simd_double_t  CE8           = gmx_simd_set1_d(2.480158383706245033920920e-05);
+    const gmx_simd_double_t  CE7           = gmx_simd_set1_d(0.0001984127043518048611841321);
+    const gmx_simd_double_t  CE6           = gmx_simd_set1_d(0.001388888889360258341755930);
+    const gmx_simd_double_t  CE5           = gmx_simd_set1_d(0.008333333332907368102819109);
+    const gmx_simd_double_t  CE4           = gmx_simd_set1_d(0.04166666666663836745814631);
+    const gmx_simd_double_t  CE3           = gmx_simd_set1_d(0.1666666666666796929434570);
+    const gmx_simd_double_t  CE2           = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t  one           = gmx_simd_set1_d(1.0);
+    gmx_simd_double_t        fexppart;
+    gmx_simd_double_t        intpart;
+    gmx_simd_double_t        y, p;
+    gmx_simd_dbool_t         valuemask;
+
+    /* y = x/ln(2), so that exp(x) = 2^y */
+    y         = gmx_simd_mul_d(x, argscale);
+    fexppart  = gmx_simd_set_exponent_d(y);  /* rounds to nearest int internally */
+    intpart   = gmx_simd_round_d(y);         /* use same rounding mode here */
+    /* Keep the exponent factor only where |y| <= arglimit; elsewhere it becomes zero */
+    valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
+    fexppart  = gmx_simd_blendzero_d(fexppart, valuemask);
+
+    /* Extended precision arithmetics */
+    /* x <- x - intpart*ln(2), subtracting the two parts of ln(2) one after the other */
+    x         = gmx_simd_fnmadd_d(invargscale0, intpart, x);
+    x         = gmx_simd_fnmadd_d(invargscale1, intpart, x);
+
+    /* Horner evaluation of CE2 + CE3*x + ... + CE12*x^10 */
+    p         = gmx_simd_fmadd_d(CE12, x, CE11);
+    p         = gmx_simd_fmadd_d(p, x, CE10);
+    p         = gmx_simd_fmadd_d(p, x, CE9);
+    p         = gmx_simd_fmadd_d(p, x, CE8);
+    p         = gmx_simd_fmadd_d(p, x, CE7);
+    p         = gmx_simd_fmadd_d(p, x, CE6);
+    p         = gmx_simd_fmadd_d(p, x, CE5);
+    p         = gmx_simd_fmadd_d(p, x, CE4);
+    p         = gmx_simd_fmadd_d(p, x, CE3);
+    p         = gmx_simd_fmadd_d(p, x, CE2);
+    /* Final step: p*x^2 + (x + 1) */
+    p         = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
+    /* Combine the polynomial for the remainder with the exponent factor */
+    x         = gmx_simd_mul_d(p, fexppart);
+    return x;
+}
+
+/*! \brief SIMD double erf(x).
+ *
+ * \copydetails gmx_simd_erf_f
+ *
+ * Three rational approximations are evaluated for every element: one for
+ * erf() directly (A), and two for erfc() (B and C). Blend operations then
+ * select the applicable result per element, so there is no branching.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_erf_d(gmx_simd_double_t x)
+{
+    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
+    /* NOTE(review): the result of this approximation is selected for |x| < 1 below,
+     * which is wider than the range stated above - confirm the fitted range.
+     */
+    const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
+    const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
+    const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
+    const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
+    const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
+
+    const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
+    const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
+    const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
+    const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
+    const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
+    /* CAQ0 == 1.0 */
+    const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
+
+    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
+    const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
+    const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
+    const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
+    const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
+    const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
+    const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
+    const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
+    const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
+    const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
+    const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
+    const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
+    const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
+    const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
+    const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
+    /* CBQ0 == 1.0 */
+
+    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
+    /* NOTE(review): the code below adds CCoffset to P/Q before scaling by 1/x,
+     * which the expression above does not show - confirm the intended form.
+     */
+    const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
+    const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
+    const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
+    const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
+    const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
+    const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
+    const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
+
+    const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
+    const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
+    const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
+    const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
+    const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
+    const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
+    /* CCQ0 == 1.0 */
+    const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
+
+    const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
+
+    gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
+    gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
+    gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
+    gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
+    gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
+    gmx_simd_double_t       expmx2;
+    gmx_simd_dbool_t        mask;
+
+    /* Calculate erf() */
+    xabs     = gmx_simd_fabs_d(x);
+    x2       = gmx_simd_mul_d(x, x);
+    x4       = gmx_simd_mul_d(x2, x2);
+
+    /* Each polynomial below is split into two independent chains (suffix 0
+     * and 1) holding the even and odd powers of its variable; the chains
+     * are merged in the last step of each polynomial.
+     */
+    PolyAP0  = gmx_simd_mul_d(CAP4, x4);
+    PolyAP1  = gmx_simd_mul_d(CAP3, x4);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
+    PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
+    PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
+    PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
+
+    PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
+    PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
+    PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
+    PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
+    PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
+    PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
+    PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
+
+    /* res_erf = x*(CAoffset + P/Q) */
+    res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
+    res_erf  = gmx_simd_add_d(CAoffset, res_erf);
+    res_erf  = gmx_simd_mul_d(x, res_erf);
+
+    /* Calculate erfc() in range [1,4.5] */
+    t       = gmx_simd_sub_d(xabs, one);
+    t2      = gmx_simd_mul_d(t, t);
+
+    PolyBP0  = gmx_simd_mul_d(CBP6, t2);
+    PolyBP1  = gmx_simd_mul_d(CBP5, t2);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
+    PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
+    PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
+    PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
+    PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
+    PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
+    PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
+
+    PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
+    PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+    PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+    PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
+
+    res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
+
+    /* The exp(-x^2) factor is applied later, after B and C have been blended */
+    res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
+
+    /* Calculate erfc() in range [4.5,inf] */
+    w       = gmx_simd_inv_d(xabs);
+    w2      = gmx_simd_mul_d(w, w);
+
+    PolyCP0  = gmx_simd_mul_d(CCP6, w2);
+    PolyCP1  = gmx_simd_mul_d(CCP5, w2);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
+    PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
+    PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
+    PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
+    PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
+    PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
+    PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
+
+    PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
+    PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
+    PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
+    PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
+    PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
+    PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
+    PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
+    PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
+
+    /* exp(-x^2), shared by both erfc() approximations */
+    expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
+
+    /* res_erfcC = (P/Q + CCoffset)*w, with w = 1/|x| */
+    res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
+    res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
+    res_erfcC = gmx_simd_mul_d(res_erfcC, w);
+
+    /* Use approximation C where |x| > 4.5, otherwise B */
+    mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
+    res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
+
+    res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
+
+    /* erfc(x<0) = 2-erfc(|x|) */
+    mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
+    res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
+
+    /* Select erf() or erfc() */
+    /* For |x| < 1 use res_erf directly, otherwise erf(x) = 1 - erfc(x) */
+    mask = gmx_simd_cmplt_d(xabs, one);
+    res  = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask);
+
+    return res;
+}
+
+/*! \brief SIMD double erfc(x).
+ *
+ * \copydetails gmx_simd_erfc_f
+ *
+ * Three rational approximations are evaluated for every element: one for
+ * erf() (A), and two for erfc() directly (B and C). Blend operations then
+ * select the applicable result per element, so there is no branching.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_erfc_d(gmx_simd_double_t x)
+{
+    /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
+    /* NOTE(review): the result of this approximation is selected for |x| < 1 below,
+     * which is wider than the range stated above - confirm the fitted range.
+     */
+    const gmx_simd_double_t CAP4      = gmx_simd_set1_d(-0.431780540597889301512e-4);
+    const gmx_simd_double_t CAP3      = gmx_simd_set1_d(-0.00578562306260059236059);
+    const gmx_simd_double_t CAP2      = gmx_simd_set1_d(-0.028593586920219752446);
+    const gmx_simd_double_t CAP1      = gmx_simd_set1_d(-0.315924962948621698209);
+    const gmx_simd_double_t CAP0      = gmx_simd_set1_d(0.14952975608477029151);
+
+    const gmx_simd_double_t CAQ5      = gmx_simd_set1_d(-0.374089300177174709737e-5);
+    const gmx_simd_double_t CAQ4      = gmx_simd_set1_d(0.00015126584532155383535);
+    const gmx_simd_double_t CAQ3      = gmx_simd_set1_d(0.00536692680669480725423);
+    const gmx_simd_double_t CAQ2      = gmx_simd_set1_d(0.0668686825594046122636);
+    const gmx_simd_double_t CAQ1      = gmx_simd_set1_d(0.402604990869284362773);
+    /* CAQ0 == 1.0 */
+    const gmx_simd_double_t CAoffset  = gmx_simd_set1_d(0.9788494110107421875);
+
+    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
+    const gmx_simd_double_t CBP6      = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
+    const gmx_simd_double_t CBP5      = gmx_simd_set1_d(0.00119770193298159629350136085658);
+    const gmx_simd_double_t CBP4      = gmx_simd_set1_d(0.0164944422378370965881008942733);
+    const gmx_simd_double_t CBP3      = gmx_simd_set1_d(0.0984581468691775932063932439252);
+    const gmx_simd_double_t CBP2      = gmx_simd_set1_d(0.317364595806937763843589437418);
+    const gmx_simd_double_t CBP1      = gmx_simd_set1_d(0.554167062641455850932670067075);
+    const gmx_simd_double_t CBP0      = gmx_simd_set1_d(0.427583576155807163756925301060);
+    const gmx_simd_double_t CBQ7      = gmx_simd_set1_d(0.00212288829699830145976198384930);
+    const gmx_simd_double_t CBQ6      = gmx_simd_set1_d(0.0334810979522685300554606393425);
+    const gmx_simd_double_t CBQ5      = gmx_simd_set1_d(0.2361713785181450957579508850717);
+    const gmx_simd_double_t CBQ4      = gmx_simd_set1_d(0.955364736493055670530981883072);
+    const gmx_simd_double_t CBQ3      = gmx_simd_set1_d(2.36815675631420037315349279199);
+    const gmx_simd_double_t CBQ2      = gmx_simd_set1_d(3.55261649184083035537184223542);
+    const gmx_simd_double_t CBQ1      = gmx_simd_set1_d(2.93501136050160872574376997993);
+    /* CBQ0 == 1.0 */
+
+    /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
+    /* NOTE(review): the code below adds CCoffset to P/Q before scaling by 1/x,
+     * which the expression above does not show - confirm the intended form.
+     */
+    const gmx_simd_double_t CCP6      = gmx_simd_set1_d(-2.8175401114513378771);
+    const gmx_simd_double_t CCP5      = gmx_simd_set1_d(-3.22729451764143718517);
+    const gmx_simd_double_t CCP4      = gmx_simd_set1_d(-2.5518551727311523996);
+    const gmx_simd_double_t CCP3      = gmx_simd_set1_d(-0.687717681153649930619);
+    const gmx_simd_double_t CCP2      = gmx_simd_set1_d(-0.212652252872804219852);
+    const gmx_simd_double_t CCP1      = gmx_simd_set1_d(0.0175389834052493308818);
+    const gmx_simd_double_t CCP0      = gmx_simd_set1_d(0.00628057170626964891937);
+
+    const gmx_simd_double_t CCQ6      = gmx_simd_set1_d(5.48409182238641741584);
+    const gmx_simd_double_t CCQ5      = gmx_simd_set1_d(13.5064170191802889145);
+    const gmx_simd_double_t CCQ4      = gmx_simd_set1_d(22.9367376522880577224);
+    const gmx_simd_double_t CCQ3      = gmx_simd_set1_d(15.930646027911794143);
+    const gmx_simd_double_t CCQ2      = gmx_simd_set1_d(11.0567237927800161565);
+    const gmx_simd_double_t CCQ1      = gmx_simd_set1_d(2.79257750980575282228);
+    /* CCQ0 == 1.0 */
+    const gmx_simd_double_t CCoffset  = gmx_simd_set1_d(0.5579090118408203125);
+
+    const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t two       = gmx_simd_set1_d(2.0);
+
+    gmx_simd_double_t       xabs, x2, x4, t, t2, w, w2;
+    gmx_simd_double_t       PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
+    gmx_simd_double_t       PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
+    gmx_simd_double_t       PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
+    gmx_simd_double_t       res_erf, res_erfcB, res_erfcC, res_erfc, res;
+    gmx_simd_double_t       expmx2;
+    gmx_simd_dbool_t        mask;
+
+    /* Calculate erf() */
+    xabs     = gmx_simd_fabs_d(x);
+    x2       = gmx_simd_mul_d(x, x);
+    x4       = gmx_simd_mul_d(x2, x2);
+
+    /* Each polynomial below is split into two independent chains (suffix 0
+     * and 1) holding the even and odd powers of its variable; the chains
+     * are merged in the last step of each polynomial.
+     */
+    PolyAP0  = gmx_simd_mul_d(CAP4, x4);
+    PolyAP1  = gmx_simd_mul_d(CAP3, x4);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, CAP2);
+    PolyAP1  = gmx_simd_add_d(PolyAP1, CAP1);
+    PolyAP0  = gmx_simd_mul_d(PolyAP0, x4);
+    PolyAP1  = gmx_simd_mul_d(PolyAP1, x2);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, CAP0);
+    PolyAP0  = gmx_simd_add_d(PolyAP0, PolyAP1);
+
+    PolyAQ1  = gmx_simd_mul_d(CAQ5, x4);
+    PolyAQ0  = gmx_simd_mul_d(CAQ4, x4);
+    PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ3);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, CAQ2);
+    PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x4);
+    PolyAQ0  = gmx_simd_mul_d(PolyAQ0, x4);
+    PolyAQ1  = gmx_simd_add_d(PolyAQ1, CAQ1);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, one);
+    PolyAQ1  = gmx_simd_mul_d(PolyAQ1, x2);
+    PolyAQ0  = gmx_simd_add_d(PolyAQ0, PolyAQ1);
+
+    /* res_erf = x*(CAoffset + P/Q) */
+    res_erf  = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
+    res_erf  = gmx_simd_add_d(CAoffset, res_erf);
+    res_erf  = gmx_simd_mul_d(x, res_erf);
+
+    /* Calculate erfc() in range [1,4.5] */
+    t       = gmx_simd_sub_d(xabs, one);
+    t2      = gmx_simd_mul_d(t, t);
+
+    PolyBP0  = gmx_simd_mul_d(CBP6, t2);
+    PolyBP1  = gmx_simd_mul_d(CBP5, t2);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP4);
+    PolyBP1  = gmx_simd_add_d(PolyBP1, CBP3);
+    PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
+    PolyBP1  = gmx_simd_mul_d(PolyBP1, t2);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP2);
+    PolyBP1  = gmx_simd_add_d(PolyBP1, CBP1);
+    PolyBP0  = gmx_simd_mul_d(PolyBP0, t2);
+    PolyBP1  = gmx_simd_mul_d(PolyBP1, t);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, CBP0);
+    PolyBP0  = gmx_simd_add_d(PolyBP0, PolyBP1);
+
+    PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
+    PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+    PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+    PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+    PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
+    PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
+    PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
+
+    res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
+
+    /* The exp(-x^2) factor is applied later, after B and C have been blended */
+    res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
+
+    /* Calculate erfc() in range [4.5,inf] */
+    w       = gmx_simd_inv_d(xabs);
+    w2      = gmx_simd_mul_d(w, w);
+
+    PolyCP0  = gmx_simd_mul_d(CCP6, w2);
+    PolyCP1  = gmx_simd_mul_d(CCP5, w2);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP4);
+    PolyCP1  = gmx_simd_add_d(PolyCP1, CCP3);
+    PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
+    PolyCP1  = gmx_simd_mul_d(PolyCP1, w2);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP2);
+    PolyCP1  = gmx_simd_add_d(PolyCP1, CCP1);
+    PolyCP0  = gmx_simd_mul_d(PolyCP0, w2);
+    PolyCP1  = gmx_simd_mul_d(PolyCP1, w);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, CCP0);
+    PolyCP0  = gmx_simd_add_d(PolyCP0, PolyCP1);
+
+    PolyCQ0  = gmx_simd_mul_d(CCQ6, w2);
+    PolyCQ1  = gmx_simd_mul_d(CCQ5, w2);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ4);
+    PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ3);
+    PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
+    PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w2);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, CCQ2);
+    PolyCQ1  = gmx_simd_add_d(PolyCQ1, CCQ1);
+    PolyCQ0  = gmx_simd_mul_d(PolyCQ0, w2);
+    PolyCQ1  = gmx_simd_mul_d(PolyCQ1, w);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, one);
+    PolyCQ0  = gmx_simd_add_d(PolyCQ0, PolyCQ1);
+
+    /* exp(-x^2), shared by both erfc() approximations */
+    expmx2   = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
+
+    /* res_erfcC = (P/Q + CCoffset)*w, with w = 1/|x| */
+    res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
+    res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
+    res_erfcC = gmx_simd_mul_d(res_erfcC, w);
+
+    /* Use approximation C where |x| > 4.5, otherwise B */
+    mask     = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
+    res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
+
+    res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
+
+    /* erfc(x<0) = 2-erfc(|x|) */
+    mask     = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
+    res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
+
+    /* Select erf() or erfc() */
+    /* For |x| < 1 use erfc(x) = 1 - erf(x), otherwise res_erfc directly */
+    mask = gmx_simd_cmplt_d(xabs, one);
+    res  = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask);
+
+    return res;
+}
+
+/*! \brief SIMD double sin \& cos.
+ *
+ * \copydetails gmx_simd_sincos_f
+ *
+ * Implementation notes: a multiple y of Pi/2 is removed from the argument
+ * using the four argred constants, after which polynomials in the reduced
+ * argument are evaluated for both the sine (psin) and the cosine (pcos).
+ * The quadrant then decides which polynomial is written to which output
+ * (mask) and which sign each output gets (ssign, csign).
+ *
+ * When integer SIMD arithmetic and logical operations are available, the
+ * quadrant is taken from the integer iy: mask is set where iy is even, ssign
+ * is derived from the test (iy & 2) == 2 and csign from ((iy+1) & 2) == 2.
+ * NOTE(review): this relies on gmx_simd_cvt_d2i() and gmx_simd_round_d()
+ * rounding z identically - confirm against the SIMD implementation headers.
+ * Otherwise a floating-point-only fallback is used (described in the body).
+ */
+static gmx_inline void
+gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
+{
+    /* Constants to subtract Pi/4*x from y while minimizing precision loss.
+     * Each constant is multiplied by 2 below, so taken together the four
+     * values represent Pi/2 split into parts of decreasing magnitude.
+     */
+    const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
+    const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
+    const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
+    const gmx_simd_double_t  argred3         = gmx_simd_set1_d(2*1.7607799325916000908e-27);
+    const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
+    const gmx_simd_double_t  const_sin5      = gmx_simd_set1_d( 1.58938307283228937328511e-10);
+    const gmx_simd_double_t  const_sin4      = gmx_simd_set1_d(-2.50506943502539773349318e-08);
+    const gmx_simd_double_t  const_sin3      = gmx_simd_set1_d( 2.75573131776846360512547e-06);
+    const gmx_simd_double_t  const_sin2      = gmx_simd_set1_d(-0.000198412698278911770864914);
+    const gmx_simd_double_t  const_sin1      = gmx_simd_set1_d( 0.0083333333333191845961746);
+    const gmx_simd_double_t  const_sin0      = gmx_simd_set1_d(-0.166666666666666130709393);
+
+    const gmx_simd_double_t  const_cos7      = gmx_simd_set1_d(-1.13615350239097429531523e-11);
+    const gmx_simd_double_t  const_cos6      = gmx_simd_set1_d( 2.08757471207040055479366e-09);
+    const gmx_simd_double_t  const_cos5      = gmx_simd_set1_d(-2.75573144028847567498567e-07);
+    const gmx_simd_double_t  const_cos4      = gmx_simd_set1_d( 2.48015872890001867311915e-05);
+    const gmx_simd_double_t  const_cos3      = gmx_simd_set1_d(-0.00138888888888714019282329);
+    const gmx_simd_double_t  const_cos2      = gmx_simd_set1_d( 0.0416666666666665519592062);
+    const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t  one             = gmx_simd_set1_d(1.0);
+    gmx_simd_double_t        ssign, csign;
+    gmx_simd_double_t        x2, y, z, psin, pcos, sss, ccc;
+    gmx_simd_dbool_t         mask;
+#if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    const gmx_simd_dint32_t  ione            = gmx_simd_set1_di(1);
+    const gmx_simd_dint32_t  itwo            = gmx_simd_set1_di(2);
+    gmx_simd_dint32_t        iy;
+
+    z       = gmx_simd_mul_d(x, two_over_pi);
+    iy      = gmx_simd_cvt_d2i(z);
+    y       = gmx_simd_round_d(z);
+
+    mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
+    ssign   = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
+    csign   = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
+#else
+    const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
+    const gmx_simd_double_t  minusquarter    = gmx_simd_set1_d(-0.25);
+    gmx_simd_double_t        q;
+    gmx_simd_dbool_t         m1, m2, m3;
+
+    /* The most obvious way to find the argument's quadrant in the unit circle
+     * to calculate the sign is to use integer arithmetic, but that is not
+     * present in all SIMD implementations. As an alternative, we have devised a
+     * pure floating-point algorithm that uses truncation for argument reduction
+     * so that we get a new value 0<=q<1 over the unit circle, and then
+     * do floating-point comparisons with fractions. This is likely to be
+     * slightly slower (~10%) due to the longer latencies of floating-point, so
+     * we only use it when integer SIMD arithmetic is not present.
+     */
+    ssign   = x;
+    x       = gmx_simd_fabs_d(x);
+    /* It is critical that half-way cases are rounded down */
+    z       = gmx_simd_fmadd_d(x, two_over_pi, half);
+    y       = gmx_simd_trunc_d(z);
+    q       = gmx_simd_mul_d(z, quarter);
+    q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
+    /* q now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
+     * then increases by 1.0 as x increases by 2*Pi, when it resets to 0.0.
+     * This removes the 2*Pi periodicity without using any integer arithmetic.
+     * First check if y had the value 2 or 3, set csign if true.
+     */
+    q       = gmx_simd_sub_d(q, half);
+    /* If we have logical operations we can work directly on the signbit, which
+     * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
+     * Thus, if you are altering defines to debug alternative code paths, the
+     * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
+     * active or inactive - you will get errors if only one is used.
+     */
+#    ifdef GMX_SIMD_HAVE_LOGICAL
+    ssign   = gmx_simd_and_d(ssign, gmx_simd_set1_d(-0.0));
+    csign   = gmx_simd_andnot_d(q, gmx_simd_set1_d(-0.0));
+    ssign   = gmx_simd_xor_d(ssign, csign);
+#    else
+    csign   = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
+    ssign   = gmx_simd_xor_sign_d(ssign, csign);    /* swap ssign if csign was set. */
+#    endif
+    /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q).
+     * After the subtraction q lies in [-0.5,0.5), and mask is set where
+     * q < -0.25 or 0 <= q < 0.25.
+     */
+    m1      = gmx_simd_cmplt_d(q, minusquarter);
+    m2      = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
+    m3      = gmx_simd_cmplt_d(q, quarter);
+    m2      = gmx_simd_and_db(m2, m3);
+    mask    = gmx_simd_or_db(m1, m2);
+    /* where mask is FALSE, set sign. */
+    csign   = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
+#endif
+    x       = gmx_simd_fnmadd_d(y, argred0, x);
+    x       = gmx_simd_fnmadd_d(y, argred1, x);
+    x       = gmx_simd_fnmadd_d(y, argred2, x);
+    x       = gmx_simd_fnmadd_d(y, argred3, x);
+    x2      = gmx_simd_mul_d(x, x);
+
+    psin    = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
+    psin    = gmx_simd_fmadd_d(psin, x2, const_sin3);
+    psin    = gmx_simd_fmadd_d(psin, x2, const_sin2);
+    psin    = gmx_simd_fmadd_d(psin, x2, const_sin1);
+    psin    = gmx_simd_fmadd_d(psin, x2, const_sin0);
+    psin    = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
+
+    pcos    = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
+    pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos5);
+    pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos4);
+    pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos3);
+    pcos    = gmx_simd_fmadd_d(pcos, x2, const_cos2);
+    pcos    = gmx_simd_fmsub_d(pcos, x2, half);
+    pcos    = gmx_simd_fmadd_d(pcos, x2, one);
+
+    sss     = gmx_simd_blendv_d(pcos, psin, mask);
+    ccc     = gmx_simd_blendv_d(psin, pcos, mask);
+    /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    *sinval = gmx_simd_xor_d(sss, ssign);
+    *cosval = gmx_simd_xor_d(ccc, csign);
+#else
+    *sinval = gmx_simd_xor_sign_d(sss, ssign);
+    *cosval = gmx_simd_xor_sign_d(ccc, csign);
+#endif
+}
+
+/*! \brief SIMD double sin(x).
+ *
+ * \copydetails gmx_simd_sin_f
+ *
+ * Thin wrapper: calls gmx_simd_sincos_d() and returns only the sine.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sin_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t sine;
+    gmx_simd_double_t cosine_unused;
+
+    /* The combined routine always writes both outputs; the cosine is dropped. */
+    gmx_simd_sincos_d(x, &sine, &cosine_unused);
+
+    return sine;
+}
+
+/*! \brief SIMD double cos(x).
+ *
+ * \copydetails gmx_simd_cos_f
+ *
+ * Thin wrapper: calls gmx_simd_sincos_d() and returns only the cosine.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cos_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t sine_unused;
+    gmx_simd_double_t cosine;
+
+    /* The combined routine always writes both outputs; the sine is dropped. */
+    gmx_simd_sincos_d(x, &sine_unused, &cosine);
+
+    return cosine;
+}
+
+/*! \brief SIMD double tan(x).
+ *
+ * \copydetails gmx_simd_tan_f
+ *
+ * Implementation notes: a multiple y of Pi/2 is removed from the argument
+ * using the four argred constants (each is written as 2 times a part of
+ * Pi/4). A polynomial in the reduced argument with coefficients CT1..CT15
+ * is then evaluated, and in the lanes flagged by mask the result is replaced
+ * by its reciprocal.
+ *
+ * With integer SIMD arithmetic and logical operations, mask marks the lanes
+ * where the integer iy is odd, and the reduced argument is xor'ed with -0.0
+ * in those lanes (assuming gmx_simd_blendzero_d() zeroes the lanes where the
+ * mask is false - confirm against the SIMD implementation headers).
+ * Otherwise mask is derived from floating-point comparisons of the fraction
+ * q with 0.25, 0.5 and 0.75, the reduction is done on w=|x|, w is negated in
+ * the flagged lanes, and the sign of the original x is applied afterwards.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_tan_d(gmx_simd_double_t x)
+{
+    const gmx_simd_double_t  argred0         = gmx_simd_set1_d(2*0.78539816290140151978);
+    const gmx_simd_double_t  argred1         = gmx_simd_set1_d(2*4.9604678871439933374e-10);
+    const gmx_simd_double_t  argred2         = gmx_simd_set1_d(2*1.1258708853173288931e-18);
+    const gmx_simd_double_t  argred3         = gmx_simd_set1_d(2*1.7607799325916000908e-27);
+    const gmx_simd_double_t  two_over_pi     = gmx_simd_set1_d(2.0/M_PI);
+    const gmx_simd_double_t  CT15            = gmx_simd_set1_d(1.01419718511083373224408e-05);
+    const gmx_simd_double_t  CT14            = gmx_simd_set1_d(-2.59519791585924697698614e-05);
+    const gmx_simd_double_t  CT13            = gmx_simd_set1_d(5.23388081915899855325186e-05);
+    const gmx_simd_double_t  CT12            = gmx_simd_set1_d(-3.05033014433946488225616e-05);
+    const gmx_simd_double_t  CT11            = gmx_simd_set1_d(7.14707504084242744267497e-05);
+    const gmx_simd_double_t  CT10            = gmx_simd_set1_d(8.09674518280159187045078e-05);
+    const gmx_simd_double_t  CT9             = gmx_simd_set1_d(0.000244884931879331847054404);
+    const gmx_simd_double_t  CT8             = gmx_simd_set1_d(0.000588505168743587154904506);
+    const gmx_simd_double_t  CT7             = gmx_simd_set1_d(0.00145612788922812427978848);
+    const gmx_simd_double_t  CT6             = gmx_simd_set1_d(0.00359208743836906619142924);
+    const gmx_simd_double_t  CT5             = gmx_simd_set1_d(0.00886323944362401618113356);
+    const gmx_simd_double_t  CT4             = gmx_simd_set1_d(0.0218694882853846389592078);
+    const gmx_simd_double_t  CT3             = gmx_simd_set1_d(0.0539682539781298417636002);
+    const gmx_simd_double_t  CT2             = gmx_simd_set1_d(0.133333333333125941821962);
+    const gmx_simd_double_t  CT1             = gmx_simd_set1_d(0.333333333333334980164153);
+
+    gmx_simd_double_t        x2, p, y, z;
+    gmx_simd_dbool_t         mask;
+
+#if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    gmx_simd_dint32_t  iy;
+    gmx_simd_dint32_t  ione = gmx_simd_set1_di(1);
+
+    z       = gmx_simd_mul_d(x, two_over_pi);
+    iy      = gmx_simd_cvt_d2i(z);
+    y       = gmx_simd_round_d(z);
+    mask    = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
+
+    x       = gmx_simd_fnmadd_d(y, argred0, x);
+    x       = gmx_simd_fnmadd_d(y, argred1, x);
+    x       = gmx_simd_fnmadd_d(y, argred2, x);
+    x       = gmx_simd_fnmadd_d(y, argred3, x);
+    x       = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask), x);
+#else
+    const gmx_simd_double_t  quarter         = gmx_simd_set1_d(0.25);
+    const gmx_simd_double_t  half            = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t  threequarter    = gmx_simd_set1_d(0.75);
+    gmx_simd_double_t        w, q;
+    gmx_simd_dbool_t         m1, m2, m3;
+
+    w       = gmx_simd_fabs_d(x);
+    z       = gmx_simd_fmadd_d(w, two_over_pi, half);
+    y       = gmx_simd_trunc_d(z);
+    q       = gmx_simd_mul_d(z, quarter);
+    q       = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
+    m1      = gmx_simd_cmple_d(quarter, q);
+    m2      = gmx_simd_cmplt_d(q, half);
+    m3      = gmx_simd_cmple_d(threequarter, q);
+    m1      = gmx_simd_and_db(m1, m2);
+    mask    = gmx_simd_or_db(m1, m3);    /* 0.25 <= q < 0.5, or q >= 0.75 */
+    w       = gmx_simd_fnmadd_d(y, argred0, w);
+    w       = gmx_simd_fnmadd_d(y, argred1, w);
+    w       = gmx_simd_fnmadd_d(y, argred2, w);
+    w       = gmx_simd_fnmadd_d(y, argred3, w);
+
+    w       = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
+    x       = gmx_simd_xor_sign_d(w, x);
+#endif
+    x2      = gmx_simd_mul_d(x, x);
+    p       = gmx_simd_fmadd_d(CT15, x2, CT14);
+    p       = gmx_simd_fmadd_d(p, x2, CT13);
+    p       = gmx_simd_fmadd_d(p, x2, CT12);
+    p       = gmx_simd_fmadd_d(p, x2, CT11);
+    p       = gmx_simd_fmadd_d(p, x2, CT10);
+    p       = gmx_simd_fmadd_d(p, x2, CT9);
+    p       = gmx_simd_fmadd_d(p, x2, CT8);
+    p       = gmx_simd_fmadd_d(p, x2, CT7);
+    p       = gmx_simd_fmadd_d(p, x2, CT6);
+    p       = gmx_simd_fmadd_d(p, x2, CT5);
+    p       = gmx_simd_fmadd_d(p, x2, CT4);
+    p       = gmx_simd_fmadd_d(p, x2, CT3);
+    p       = gmx_simd_fmadd_d(p, x2, CT2);
+    p       = gmx_simd_fmadd_d(p, x2, CT1);
+    p       = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
+
+    p       = gmx_simd_blendv_d( p, gmx_simd_inv_d(p), mask);    /* reciprocal in the flagged lanes */
+    return p;
+}
+
+/*! \brief SIMD double asin(x).
+ *
+ * \copydetails gmx_simd_asin_f
+ *
+ * Implementation notes: the routine works on xabs=|x| and applies the sign of
+ * x to the result at the end. Both candidate results are computed for every
+ * lane and the right one is selected with masks:
+ *  - xabs > 0.625 (limit1): rational function R/S in zz=1-xabs, combined with
+ *    sqrt(2*zz) and Pi/4 added in two steps, with morebits as a small
+ *    correction term.
+ *  - otherwise: rational function P/Q in ww=xabs*xabs, giving xabs*q+xabs.
+ *  - xabs <= 1e-8 (limit2): xabs itself is returned.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_asin_d(gmx_simd_double_t x)
+{
+    /* Same algorithm as cephes library */
+    const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.625);
+    const gmx_simd_double_t limit2    = gmx_simd_set1_d(1e-8);
+    const gmx_simd_double_t one       = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
+    const gmx_simd_double_t morebits  = gmx_simd_set1_d(6.123233995736765886130e-17);
+
+    const gmx_simd_double_t P5        = gmx_simd_set1_d(4.253011369004428248960e-3);
+    const gmx_simd_double_t P4        = gmx_simd_set1_d(-6.019598008014123785661e-1);
+    const gmx_simd_double_t P3        = gmx_simd_set1_d(5.444622390564711410273e0);
+    const gmx_simd_double_t P2        = gmx_simd_set1_d(-1.626247967210700244449e1);
+    const gmx_simd_double_t P1        = gmx_simd_set1_d(1.956261983317594739197e1);
+    const gmx_simd_double_t P0        = gmx_simd_set1_d(-8.198089802484824371615e0);
+
+    const gmx_simd_double_t Q4        = gmx_simd_set1_d(-1.474091372988853791896e1);
+    const gmx_simd_double_t Q3        = gmx_simd_set1_d(7.049610280856842141659e1);
+    const gmx_simd_double_t Q2        = gmx_simd_set1_d(-1.471791292232726029859e2);
+    const gmx_simd_double_t Q1        = gmx_simd_set1_d(1.395105614657485689735e2);
+    const gmx_simd_double_t Q0        = gmx_simd_set1_d(-4.918853881490881290097e1);
+
+    const gmx_simd_double_t R4        = gmx_simd_set1_d(2.967721961301243206100e-3);
+    const gmx_simd_double_t R3        = gmx_simd_set1_d(-5.634242780008963776856e-1);
+    const gmx_simd_double_t R2        = gmx_simd_set1_d(6.968710824104713396794e0);
+    const gmx_simd_double_t R1        = gmx_simd_set1_d(-2.556901049652824852289e1);
+    const gmx_simd_double_t R0        = gmx_simd_set1_d(2.853665548261061424989e1);
+
+    const gmx_simd_double_t S3        = gmx_simd_set1_d(-2.194779531642920639778e1);
+    const gmx_simd_double_t S2        = gmx_simd_set1_d(1.470656354026814941758e2);
+    const gmx_simd_double_t S1        = gmx_simd_set1_d(-3.838770957603691357202e2);
+    const gmx_simd_double_t S0        = gmx_simd_set1_d(3.424398657913078477438e2);
+
+    gmx_simd_double_t       xabs;
+    gmx_simd_double_t       zz, ww, z, q, w, zz2, ww2;
+    gmx_simd_double_t       PA, PB;
+    gmx_simd_double_t       QA, QB;
+    gmx_simd_double_t       RA, RB;
+    gmx_simd_double_t       SA, SB;
+    gmx_simd_double_t       nom, denom;
+    gmx_simd_dbool_t        mask;
+
+    xabs  = gmx_simd_fabs_d(x);
+
+    mask  = gmx_simd_cmplt_d(limit1, xabs);
+
+    zz    = gmx_simd_sub_d(one, xabs);
+    ww    = gmx_simd_mul_d(xabs, xabs);
+    zz2   = gmx_simd_mul_d(zz, zz);
+    ww2   = gmx_simd_mul_d(ww, ww);
+
+    /* R: degree-4 polynomial in zz. RA collects the even powers and RB the
+     * odd powers, so the two chains are independent until the final add.
+     */
+    RA    = gmx_simd_mul_d(R4, zz2);
+    RB    = gmx_simd_mul_d(R3, zz2);
+    RA    = gmx_simd_add_d(RA, R2);
+    RB    = gmx_simd_add_d(RB, R1);
+    RA    = gmx_simd_mul_d(RA, zz2);
+    RB    = gmx_simd_mul_d(RB, zz);
+    RA    = gmx_simd_add_d(RA, R0);
+    RA    = gmx_simd_add_d(RA, RB);
+
+    /* S, SA = zz2. S is a degree-4 polynomial in zz whose leading
+     * coefficient is 1, so the even chain SA starts from zz2 itself.
+     */
+    SB    = gmx_simd_mul_d(S3, zz2);
+    SA    = gmx_simd_add_d(zz2, S2);
+    SB    = gmx_simd_add_d(SB, S1);
+    SA    = gmx_simd_mul_d(SA, zz2);
+    SB    = gmx_simd_mul_d(SB, zz);
+    SA    = gmx_simd_add_d(SA, S0);
+    SA    = gmx_simd_add_d(SA, SB);
+
+    /* P: degree-5 polynomial in ww. PA collects the odd powers and PB the
+     * even powers.
+     */
+    PA    = gmx_simd_mul_d(P5, ww2);
+    PB    = gmx_simd_mul_d(P4, ww2);
+    PA    = gmx_simd_add_d(PA, P3);
+    PB    = gmx_simd_add_d(PB, P2);
+    PA    = gmx_simd_mul_d(PA, ww2);
+    PB    = gmx_simd_mul_d(PB, ww2);
+    PA    = gmx_simd_add_d(PA, P1);
+    PB    = gmx_simd_add_d(PB, P0);
+    PA    = gmx_simd_mul_d(PA, ww);
+    PA    = gmx_simd_add_d(PA, PB);
+
+    /* Q, QA = ww2. Q is a degree-5 polynomial in ww whose leading
+     * coefficient is 1, so the odd chain QA starts from ww2 itself.
+     */
+    QB    = gmx_simd_mul_d(Q4, ww2);
+    QA    = gmx_simd_add_d(ww2, Q3);
+    QB    = gmx_simd_add_d(QB, Q2);
+    QA    = gmx_simd_mul_d(QA, ww2);
+    QB    = gmx_simd_mul_d(QB, ww2);
+    QA    = gmx_simd_add_d(QA, Q1);
+    QB    = gmx_simd_add_d(QB, Q0);
+    QA    = gmx_simd_mul_d(QA, ww);
+    QA    = gmx_simd_add_d(QA, QB);
+
+    RA    = gmx_simd_mul_d(RA, zz);
+    PA    = gmx_simd_mul_d(PA, ww);
+
+    nom   = gmx_simd_blendv_d( PA, RA, mask );
+    denom = gmx_simd_blendv_d( QA, SA, mask );
+
+    q     = gmx_simd_mul_d( nom, gmx_simd_inv_d(denom) );
+
+    zz    = gmx_simd_add_d(zz, zz);
+    zz    = gmx_simd_sqrt_d(zz);
+    z     = gmx_simd_sub_d(quarterpi, zz);
+    zz    = gmx_simd_mul_d(zz, q);
+    zz    = gmx_simd_sub_d(zz, morebits);
+    z     = gmx_simd_sub_d(z, zz);
+    z     = gmx_simd_add_d(z, quarterpi);
+
+    w     = gmx_simd_mul_d(xabs, q);
+    w     = gmx_simd_add_d(w, xabs);
+
+    z     = gmx_simd_blendv_d( w, z, mask );
+
+    mask  = gmx_simd_cmplt_d(limit2, xabs);
+    z     = gmx_simd_blendv_d( xabs, z, mask );
+
+    z = gmx_simd_xor_sign_d(z, x);
+
+    return z;
+}
+
+/*! \brief SIMD double acos(x).
+ *
+ * \copydetails gmx_simd_acos_f
+ *
+ * The result is built from a single call to gmx_simd_asin_d(): lanes with
+ * x > 0.5 use twice the arcsine of sqrt(0.5*(1-x)), all other lanes use
+ * Pi/2 minus the arcsine of x, with Pi/2 added as two Pi/4 terms plus a
+ * small correction term.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_acos_d(gmx_simd_double_t x)
+{
+    const gmx_simd_double_t half       = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t one        = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t pio4_main  = gmx_simd_set1_d(7.85398163397448309616e-1);
+    const gmx_simd_double_t pio4_extra = gmx_simd_set1_d(6.123233995736765886130e-17);
+
+    gmx_simd_dbool_t        above_half;
+    gmx_simd_double_t       root, asin_val, doubled, shifted;
+
+    above_half = gmx_simd_cmplt_d(half, x);
+
+    /* Argument handed to asin(): sqrt(0.5*(1-x)) where x > 0.5, else x. */
+    root       = gmx_simd_sqrt_d(gmx_simd_mul_d(half, gmx_simd_sub_d(one, x)));
+    asin_val   = gmx_simd_asin_d(gmx_simd_blendv_d(x, root, above_half));
+
+    /* Candidate for x > 0.5. */
+    doubled    = gmx_simd_add_d(asin_val, asin_val);
+
+    /* Candidate for the remaining lanes; the order of the additions is kept. */
+    shifted    = gmx_simd_sub_d(pio4_main, asin_val);
+    shifted    = gmx_simd_add_d(shifted, pio4_extra);
+    shifted    = gmx_simd_add_d(shifted, pio4_main);
+
+    return gmx_simd_blendv_d(shifted, doubled, above_half);
+}
+
+/*! \brief SIMD double atan(x).
+ *
+ * \copydetails gmx_simd_atan_f
+ *
+ * Implementation notes: the routine works on xabs=|x| and applies the sign of
+ * x to the result at the end. The argument is mapped into a small range in
+ * one of three ways, selected per lane with masks:
+ *  - xabs > 2.414... (limit2): argument -1/xabs, offset Pi/2.
+ *  - xabs > 0.66 (limit1): argument (xabs-1)/(xabs+1), offset Pi/4.
+ *  - otherwise: argument xabs, offset 0.
+ * A rational function P/Q in z=(mapped argument)^2 is then evaluated, and
+ * the offset plus a small correction (morebits1 or morebits2) is added.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_atan_d(gmx_simd_double_t x)
+{
+    /* Same algorithm as cephes library */
+    const gmx_simd_double_t limit1    = gmx_simd_set1_d(0.66);
+    const gmx_simd_double_t limit2    = gmx_simd_set1_d(2.41421356237309504880);
+    const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
+    const gmx_simd_double_t halfpi    = gmx_simd_set1_d(M_PI/2.0);
+    const gmx_simd_double_t mone      = gmx_simd_set1_d(-1.0);
+    const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
+    const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
+
+    const gmx_simd_double_t P4        = gmx_simd_set1_d(-8.750608600031904122785E-1);
+    const gmx_simd_double_t P3        = gmx_simd_set1_d(-1.615753718733365076637E1);
+    const gmx_simd_double_t P2        = gmx_simd_set1_d(-7.500855792314704667340E1);
+    const gmx_simd_double_t P1        = gmx_simd_set1_d(-1.228866684490136173410E2);
+    const gmx_simd_double_t P0        = gmx_simd_set1_d(-6.485021904942025371773E1);
+
+    const gmx_simd_double_t Q4        = gmx_simd_set1_d(2.485846490142306297962E1);
+    const gmx_simd_double_t Q3        = gmx_simd_set1_d(1.650270098316988542046E2);
+    const gmx_simd_double_t Q2        = gmx_simd_set1_d(4.328810604912902668951E2);
+    const gmx_simd_double_t Q1        = gmx_simd_set1_d(4.853903996359136964868E2);
+    const gmx_simd_double_t Q0        = gmx_simd_set1_d(1.945506571482613964425E2);
+
+    gmx_simd_double_t       y, xabs, t1, t2;
+    gmx_simd_double_t       z, z2;
+    gmx_simd_double_t       P_A, P_B, Q_A, Q_B;
+    gmx_simd_dbool_t        mask1, mask2;
+
+    xabs   = gmx_simd_fabs_d(x);
+
+    mask1  = gmx_simd_cmplt_d(limit1, xabs);
+    mask2  = gmx_simd_cmplt_d(limit2, xabs);
+
+    t1     = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone), gmx_simd_inv_d(gmx_simd_sub_d(xabs, mone)));
+    t2     = gmx_simd_mul_d(mone, gmx_simd_inv_d(xabs));
+
+    y      = gmx_simd_blendzero_d(quarterpi, mask1);
+    y      = gmx_simd_blendv_d(y, halfpi, mask2);
+    xabs   = gmx_simd_blendv_d(xabs, t1, mask1);
+    xabs   = gmx_simd_blendv_d(xabs, t2, mask2);
+
+    z      = gmx_simd_mul_d(xabs, xabs);
+    z2     = gmx_simd_mul_d(z, z);
+
+    P_A    = gmx_simd_mul_d(P4, z2);
+    P_B    = gmx_simd_mul_d(P3, z2);
+    P_A    = gmx_simd_add_d(P_A, P2);
+    P_B    = gmx_simd_add_d(P_B, P1);
+    P_A    = gmx_simd_mul_d(P_A, z2);
+    P_B    = gmx_simd_mul_d(P_B, z);
+    P_A    = gmx_simd_add_d(P_A, P0);
+    P_A    = gmx_simd_add_d(P_A, P_B);
+
+    /* Q_A = z2. Q is a degree-5 polynomial in z whose leading coefficient
+     * is 1, so the odd-power chain Q_A starts from z2 itself; Q_B collects
+     * the even powers.
+     */
+    Q_B    = gmx_simd_mul_d(Q4, z2);
+    Q_A    = gmx_simd_add_d(z2, Q3);
+    Q_B    = gmx_simd_add_d(Q_B, Q2);
+    Q_A    = gmx_simd_mul_d(Q_A, z2);
+    Q_B    = gmx_simd_mul_d(Q_B, z2);
+    Q_A    = gmx_simd_add_d(Q_A, Q1);
+    Q_B    = gmx_simd_add_d(Q_B, Q0);
+    Q_A    = gmx_simd_mul_d(Q_A, z);
+    Q_A    = gmx_simd_add_d(Q_A, Q_B);
+
+    z      = gmx_simd_mul_d(z, P_A);
+    z      = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
+    z      = gmx_simd_mul_d(z, xabs);
+    z      = gmx_simd_add_d(z, xabs);
+
+    t1     = gmx_simd_blendzero_d(morebits1, mask1);
+    t1     = gmx_simd_blendv_d(t1, morebits2, mask2);
+
+    z      = gmx_simd_add_d(z, t1);
+    y      = gmx_simd_add_d(y, z);
+
+    y      = gmx_simd_xor_sign_d(y, x);
+
+    return y;
+}
+
+/*! \brief SIMD double atan2(y,x).
+ *
+ * \copydetails gmx_simd_atan2_f
+ *
+ * Computed as gmx_simd_atan_d(y/x) plus a per-lane offset that depends on
+ * whether x and y are zero or negative. In lanes where x is zero the
+ * reciprocal of x is masked out before it is multiplied with y.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
+{
+    const gmx_simd_double_t full_pi   = gmx_simd_set1_d(M_PI);
+    const gmx_simd_double_t half_pi   = gmx_simd_set1_d(M_PI/2.0);
+    const gmx_simd_double_t zero      = gmx_simd_setzero_d();
+    gmx_simd_dbool_t        x_is_zero, y_is_zero, x_is_neg, y_is_neg;
+    gmx_simd_double_t       offset, ratio;
+
+    x_is_zero = gmx_simd_cmpeq_d(x, zero);
+    y_is_zero = gmx_simd_cmpeq_d(y, zero);
+    x_is_neg  = gmx_simd_cmplt_d(x, zero);
+    y_is_neg  = gmx_simd_cmplt_d(y, zero);
+
+    /* Offset selection, applied in this order:
+     * Pi/2 where x==0, cleared again where y==0, replaced by Pi where x<0,
+     * and finally negated where y<0.
+     * NOTE(review): assumes blendzero keeps and blendnotzero clears the lanes
+     * where the mask is true - confirm against the SIMD implementation headers.
+     */
+    offset    = gmx_simd_blendzero_d(half_pi, x_is_zero);
+    offset    = gmx_simd_blendnotzero_d(offset, y_is_zero);
+    offset    = gmx_simd_blendv_d(offset, full_pi, x_is_neg);
+    offset    = gmx_simd_blendv_d(offset, gmx_simd_fneg_d(offset), y_is_neg);
+
+    /* y/x, with 1/x masked in the lanes where x==0. */
+    ratio     = gmx_simd_mul_d(y, gmx_simd_blendnotzero_d(gmx_simd_inv_d(x), x_is_zero));
+
+    return gmx_simd_add_d(gmx_simd_atan_d(ratio), offset);
+}
+
+
+/*! \brief Calculate the force correction due to PME analytically for SIMD double.
+ *
+ * \copydetails gmx_simd_pmecorrF_f
+ *
+ * The value returned is a rational function of z2: a numerator polynomial
+ * with coefficients FN0..FN10 divided by a denominator polynomial with
+ * coefficients FD0..FD5. Both are evaluated as separate even and odd chains
+ * in z4=z2*z2 that are combined at the end.
+ *
+ * Declared gmx_inline like the other routines in this header, so translation
+ * units that include the header without calling the function do not get an
+ * unused static function.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
+{
+    const gmx_simd_double_t  FN10     = gmx_simd_set1_d(-8.0072854618360083154e-14);
+    const gmx_simd_double_t  FN9      = gmx_simd_set1_d(1.1859116242260148027e-11);
+    const gmx_simd_double_t  FN8      = gmx_simd_set1_d(-8.1490406329798423616e-10);
+    const gmx_simd_double_t  FN7      = gmx_simd_set1_d(3.4404793543907847655e-8);
+    const gmx_simd_double_t  FN6      = gmx_simd_set1_d(-9.9471420832602741006e-7);
+    const gmx_simd_double_t  FN5      = gmx_simd_set1_d(0.000020740315999115847456);
+    const gmx_simd_double_t  FN4      = gmx_simd_set1_d(-0.00031991745139313364005);
+    const gmx_simd_double_t  FN3      = gmx_simd_set1_d(0.0035074449373659008203);
+    const gmx_simd_double_t  FN2      = gmx_simd_set1_d(-0.031750380176100813405);
+    const gmx_simd_double_t  FN1      = gmx_simd_set1_d(0.13884101728898463426);
+    const gmx_simd_double_t  FN0      = gmx_simd_set1_d(-0.75225277815249618847);
+
+    const gmx_simd_double_t  FD5      = gmx_simd_set1_d(0.000016009278224355026701);
+    const gmx_simd_double_t  FD4      = gmx_simd_set1_d(0.00051055686934806966046);
+    const gmx_simd_double_t  FD3      = gmx_simd_set1_d(0.0081803507497974289008);
+    const gmx_simd_double_t  FD2      = gmx_simd_set1_d(0.077181146026670287235);
+    const gmx_simd_double_t  FD1      = gmx_simd_set1_d(0.41543303143712535988);
+    const gmx_simd_double_t  FD0      = gmx_simd_set1_d(1.0);
+
+    gmx_simd_double_t        z4;
+    gmx_simd_double_t        polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4             = gmx_simd_mul_d(z2, z2);
+
+    /* Denominator: odd chain (FD5, FD3, FD1) times z2, plus even chain (FD4, FD2, FD0). */
+    polyFD1        = gmx_simd_fmadd_d(FD5, z4, FD3);
+    polyFD1        = gmx_simd_fmadd_d(polyFD1, z4, FD1);
+    polyFD1        = gmx_simd_mul_d(polyFD1, z2);
+    polyFD0        = gmx_simd_fmadd_d(FD4, z4, FD2);
+    polyFD0        = gmx_simd_fmadd_d(polyFD0, z4, FD0);
+    polyFD0        = gmx_simd_add_d(polyFD0, polyFD1);
+
+    /* From here on polyFD0 holds the reciprocal of the denominator. */
+    polyFD0        = gmx_simd_inv_d(polyFD0);
+
+    /* Numerator: even chain (FN10..FN0) and odd chain (FN9..FN1), joined with z2. */
+    polyFN0        = gmx_simd_fmadd_d(FN10, z4, FN8);
+    polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN6);
+    polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN4);
+    polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN2);
+    polyFN0        = gmx_simd_fmadd_d(polyFN0, z4, FN0);
+    polyFN1        = gmx_simd_fmadd_d(FN9, z4, FN7);
+    polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN5);
+    polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN3);
+    polyFN1        = gmx_simd_fmadd_d(polyFN1, z4, FN1);
+    polyFN0        = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
+
+
+    return gmx_simd_mul_d(polyFN0, polyFD0);
+}
+
+
+
+/*! \brief Calculate the potential correction due to PME analytically for SIMD double.
+ *
+ * \copydetails gmx_simd_pmecorrV_f
+ *
+ * The value returned is a rational function of z2: a numerator polynomial
+ * with coefficients VN0..VN9 divided by a denominator polynomial with
+ * coefficients VD0..VD5. Both are evaluated as interleaved even and odd
+ * chains in z4=z2*z2 that are combined at the end.
+ *
+ * Declared gmx_inline like the other routines in this header, so translation
+ * units that include the header without calling the function do not get an
+ * unused static function.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
+{
+    const gmx_simd_double_t  VN9      = gmx_simd_set1_d(-9.3723776169321855475e-13);
+    const gmx_simd_double_t  VN8      = gmx_simd_set1_d(1.2280156762674215741e-10);
+    const gmx_simd_double_t  VN7      = gmx_simd_set1_d(-7.3562157912251309487e-9);
+    const gmx_simd_double_t  VN6      = gmx_simd_set1_d(2.6215886208032517509e-7);
+    const gmx_simd_double_t  VN5      = gmx_simd_set1_d(-4.9532491651265819499e-6);
+    const gmx_simd_double_t  VN4      = gmx_simd_set1_d(0.00025907400778966060389);
+    const gmx_simd_double_t  VN3      = gmx_simd_set1_d(0.0010585044856156469792);
+    const gmx_simd_double_t  VN2      = gmx_simd_set1_d(0.045247661136833092885);
+    const gmx_simd_double_t  VN1      = gmx_simd_set1_d(0.11643931522926034421);
+    const gmx_simd_double_t  VN0      = gmx_simd_set1_d(1.1283791671726767970);
+
+    const gmx_simd_double_t  VD5      = gmx_simd_set1_d(0.000021784709867336150342);
+    const gmx_simd_double_t  VD4      = gmx_simd_set1_d(0.00064293662010911388448);
+    const gmx_simd_double_t  VD3      = gmx_simd_set1_d(0.0096311444822588683504);
+    const gmx_simd_double_t  VD2      = gmx_simd_set1_d(0.085608012351550627051);
+    const gmx_simd_double_t  VD1      = gmx_simd_set1_d(0.43652499166614811084);
+    const gmx_simd_double_t  VD0      = gmx_simd_set1_d(1.0);
+
+    gmx_simd_double_t        z4;
+    gmx_simd_double_t        polyVN0, polyVN1, polyVD0, polyVD1;
+
+    z4             = gmx_simd_mul_d(z2, z2);
+
+    /* Denominator: odd chain polyVD1 and even chain polyVD0, joined with z2. */
+    polyVD1        = gmx_simd_fmadd_d(VD5, z4, VD3);
+    polyVD0        = gmx_simd_fmadd_d(VD4, z4, VD2);
+    polyVD1        = gmx_simd_fmadd_d(polyVD1, z4, VD1);
+    polyVD0        = gmx_simd_fmadd_d(polyVD0, z4, VD0);
+    polyVD0        = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
+
+    /* From here on polyVD0 holds the reciprocal of the denominator. */
+    polyVD0        = gmx_simd_inv_d(polyVD0);
+
+    /* Numerator: odd chain polyVN1 and even chain polyVN0, joined with z2. */
+    polyVN1        = gmx_simd_fmadd_d(VN9, z4, VN7);
+    polyVN0        = gmx_simd_fmadd_d(VN8, z4, VN6);
+    polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN5);
+    polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN4);
+    polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN3);
+    polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN2);
+    polyVN1        = gmx_simd_fmadd_d(polyVN1, z4, VN1);
+    polyVN0        = gmx_simd_fmadd_d(polyVN0, z4, VN0);
+    polyVN0        = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
+
+    return gmx_simd_mul_d(polyVN0, polyVD0);
+}
+
+/*! \} */
+
+#endif
+
+
+/*! \name SIMD4 math functions
+ *
+ * \note Only a subset of the math functions are implemented for SIMD4.
+ *  \{
+ */
+
+
+#ifdef GMX_SIMD4_HAVE_FLOAT
+
+/*************************************************************************
+ * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
+ *************************************************************************/
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
+ *
+ * \copydetails gmx_simd_sum4_f
+ *
+ * The additions are grouped as (a+b)+(c+d).
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
+                 gmx_simd4_float_t c, gmx_simd4_float_t d)
+{
+    gmx_simd4_float_t sum_ab;
+    gmx_simd4_float_t sum_cd;
+
+    sum_ab = gmx_simd4_add_f(a, b);
+    sum_cd = gmx_simd4_add_f(c, d);
+
+    return gmx_simd4_add_f(sum_ab, sum_cd);
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ *
+ * With fused multiply-add the update is written as lu + (1-x*lu*lu)*(lu*0.5);
+ * without it the form 0.5*((3-lu*lu*x)*lu) is used.
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
+{
+    gmx_simd4_float_t lu_squared;
+    gmx_simd4_float_t factor;
+
+    lu_squared = gmx_simd4_mul_f(lu, lu);
+#ifdef GMX_SIMD_HAVE_FMA
+    /* factor = 1 - x*lu^2, then lu + factor*(lu/2) in one fused operation. */
+    factor     = gmx_simd4_fnmadd_f(x, lu_squared, gmx_simd4_set1_f(1.0f));
+    return gmx_simd4_fmadd_f(factor, gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
+#else
+    /* factor = 3 - lu^2*x, then 0.5*(factor*lu). */
+    factor     = gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(lu_squared, x));
+    return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(factor, lu));
+#endif
+}
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 float.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ *
+ * Starts from the gmx_simd4_rsqrt_f() lookup and applies up to three
+ * refinement steps. Each step is compiled in only while GMX_SIMD_RSQRT_BITS,
+ * scaled by 1, 2 and 4 respectively, is below
+ * GMX_SIMD_MATH_TARGET_SINGLE_BITS.
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
+{
+    gmx_simd4_float_t estimate;
+
+    estimate = gmx_simd4_rsqrt_f(x);
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_f(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_f(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_f(estimate, x);
+#endif
+
+    return estimate;
+}
+
+#endif /* GMX_SIMD4_HAVE_FLOAT */
+
+
+
+#ifdef GMX_SIMD4_HAVE_DOUBLE
+/*************************************************************************
+ * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
+ *************************************************************************/
+
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
+ *
+ * \copydetails gmx_simd_sum4_f
+ *
+ * The additions are grouped as (a+b)+(c+d).
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
+                 gmx_simd4_double_t c, gmx_simd4_double_t d)
+{
+    gmx_simd4_double_t sum_ab;
+    gmx_simd4_double_t sum_cd;
+
+    sum_ab = gmx_simd4_add_d(a, b);
+    sum_cd = gmx_simd4_add_d(c, d);
+
+    return gmx_simd4_add_d(sum_ab, sum_cd);
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ *
+ * With fused multiply-add the update is written as lu + (1-x*lu*lu)*(lu*0.5);
+ * without it the form 0.5*((3-lu*lu*x)*lu) is used.
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
+{
+    gmx_simd4_double_t lu_squared;
+    gmx_simd4_double_t factor;
+
+    lu_squared = gmx_simd4_mul_d(lu, lu);
+#ifdef GMX_SIMD_HAVE_FMA
+    /* factor = 1 - x*lu^2, then lu + factor*(lu/2) in one fused operation. */
+    factor     = gmx_simd4_fnmadd_d(x, lu_squared, gmx_simd4_set1_d(1.0));
+    return gmx_simd4_fmadd_d(factor, gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
+#else
+    /* factor = 3 - lu^2*x, then 0.5*(factor*lu). */
+    factor     = gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(lu_squared, x));
+    return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(factor, lu));
+#endif
+}
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 double.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
+{
+    /* Begin with the gmx_simd4_rsqrt_d() estimate. Each preprocessor test
+     * below adds one refinement step while the accuracy assumed so far
+     * (GMX_SIMD_RSQRT_BITS, then 2x, 4x and 8x that) is still below
+     * GMX_SIMD_MATH_TARGET_DOUBLE_BITS.
+     */
+    gmx_simd4_double_t estimate = gmx_simd4_rsqrt_d(x);
+
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_d(estimate, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    estimate = gmx_simd4_rsqrt_iter_d(estimate, x);
+#endif
+
+    return estimate;
+}
+#endif /* GMX_SIMD4_HAVE_DOUBLE */
+
+/*! \} */
+
+
+/* Map the precision-neutral *_r names onto the double (_d) implementations
+ * when GMX_DOUBLE is defined, and onto the single (_f) ones otherwise.
+ */
+#ifdef GMX_DOUBLE
+/* Each alias is documented at its counterpart in the single-precision branch below */
+#    define gmx_simd_sum4_r           gmx_simd_sum4_d
+#    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_d
+#    define gmx_simd_invsqrt_r        gmx_simd_invsqrt_d
+#    define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_d
+#    define gmx_simd_sqrt_r           gmx_simd_sqrt_d
+#    define gmx_simd_inv_r            gmx_simd_inv_d
+#    define gmx_simd_log_r            gmx_simd_log_d
+#    define gmx_simd_exp2_r           gmx_simd_exp2_d
+#    define gmx_simd_exp_r            gmx_simd_exp_d
+#    define gmx_simd_erf_r            gmx_simd_erf_d
+#    define gmx_simd_erfc_r           gmx_simd_erfc_d
+#    define gmx_simd_sincos_r         gmx_simd_sincos_d
+#    define gmx_simd_sin_r            gmx_simd_sin_d
+#    define gmx_simd_cos_r            gmx_simd_cos_d
+#    define gmx_simd_tan_r            gmx_simd_tan_d
+#    define gmx_simd_asin_r           gmx_simd_asin_d
+#    define gmx_simd_acos_r           gmx_simd_acos_d
+#    define gmx_simd_atan_r           gmx_simd_atan_d
+#    define gmx_simd_atan2_r          gmx_simd_atan2_d
+#    define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_d
+#    define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_d
+/* SIMD4 subset */
+#    define gmx_simd4_sum4_r          gmx_simd4_sum4_d
+#    define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_d
+
+#else /* GMX_DOUBLE */
+
+/*! \name Real-precision SIMD math functions
+ *
+ *  These are the ones you should typically call in Gromacs.
+ * \{
+ */
+
+/*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+#    define gmx_simd_sum4_r           gmx_simd_sum4_f
+
+/*! \brief Return -a if b is negative, SIMD real.
+ *
+ * \copydetails gmx_simd_xor_sign_f
+ */
+#    define gmx_simd_xor_sign_r       gmx_simd_xor_sign_f
+
+/*! \brief Calculate 1/sqrt(x) for SIMD real.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+#    define gmx_simd_invsqrt_r        gmx_simd_invsqrt_f
+
+/*! \brief Calculate 1/sqrt(x) for two SIMD reals.
+ *
+ * \copydetails gmx_simd_invsqrt_pair_f
+ */
+#    define gmx_simd_invsqrt_pair_r   gmx_simd_invsqrt_pair_f
+
+/*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
+ *
+ * \copydetails gmx_simd_sqrt_f
+ */
+#    define gmx_simd_sqrt_r           gmx_simd_sqrt_f
+
+/*! \brief Calculate 1/x for SIMD real.
+ *
+ * \copydetails gmx_simd_inv_f
+ */
+#    define gmx_simd_inv_r            gmx_simd_inv_f
+
+/*! \brief SIMD real log(x). This is the natural logarithm.
+ *
+ * \copydetails gmx_simd_log_f
+ */
+#    define gmx_simd_log_r            gmx_simd_log_f
+
+/*! \brief SIMD real 2^x.
+ *
+ * \copydetails gmx_simd_exp2_f
+ */
+#    define gmx_simd_exp2_r           gmx_simd_exp2_f
+
+/*! \brief SIMD real e^x.
+ *
+ * \copydetails gmx_simd_exp_f
+ */
+#    define gmx_simd_exp_r            gmx_simd_exp_f
+
+/*! \brief SIMD real erf(x).
+ *
+ * \copydetails gmx_simd_erf_f
+ */
+#    define gmx_simd_erf_r            gmx_simd_erf_f
+
+/*! \brief SIMD real erfc(x).
+ *
+ * \copydetails gmx_simd_erfc_f
+ */
+#    define gmx_simd_erfc_r           gmx_simd_erfc_f
+
+/*! \brief SIMD real sin \& cos.
+ *
+ * \copydetails gmx_simd_sincos_f
+ */
+#    define gmx_simd_sincos_r         gmx_simd_sincos_f
+
+/*! \brief SIMD real sin(x).
+ *
+ * \copydetails gmx_simd_sin_f
+ */
+#    define gmx_simd_sin_r            gmx_simd_sin_f
+
+/*! \brief SIMD real cos(x).
+ *
+ * \copydetails gmx_simd_cos_f
+ */
+#    define gmx_simd_cos_r            gmx_simd_cos_f
+
+/*! \brief SIMD real tan(x).
+ *
+ * \copydetails gmx_simd_tan_f
+ */
+#    define gmx_simd_tan_r            gmx_simd_tan_f
+
+/*! \brief SIMD real asin(x).
+ *
+ * \copydetails gmx_simd_asin_f
+ */
+#    define gmx_simd_asin_r           gmx_simd_asin_f
+
+/*! \brief SIMD real acos(x).
+ *
+ * \copydetails gmx_simd_acos_f
+ */
+#    define gmx_simd_acos_r           gmx_simd_acos_f
+
+/*! \brief SIMD real atan(x).
+ *
+ * \copydetails gmx_simd_atan_f
+ */
+#    define gmx_simd_atan_r           gmx_simd_atan_f
+
+/*! \brief SIMD real atan2(y,x).
+ *
+ * \copydetails gmx_simd_atan2_f
+ */
+#    define gmx_simd_atan2_r          gmx_simd_atan2_f
+
+/*! \brief SIMD Analytic PME force correction.
+ *
+ * \copydetails gmx_simd_pmecorrF_f
+ */
+#    define gmx_simd_pmecorrF_r       gmx_simd_pmecorrF_f
+
+/*! \brief SIMD Analytic PME potential correction.
+ *
+ * \copydetails gmx_simd_pmecorrV_f
+ */
+#    define gmx_simd_pmecorrV_r       gmx_simd_pmecorrV_f
+
+/*! \}
+ * \name SIMD4 math functions
+ * \{
+ */
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+#    define gmx_simd4_sum4_r          gmx_simd4_sum4_f
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 real.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+#    define gmx_simd4_invsqrt_r       gmx_simd4_invsqrt_f
+
+/*! \} */
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_SIMD_MATH_H_ */
diff --git a/src/gromacs/simd/tests/CMakeLists.txt b/src/gromacs/simd/tests/CMakeLists.txt

new file mode 100644 (file)

index 0000000..2d6f7d0
--- /dev/null
+++ b/src/gromacs/simd/tests/CMakeLists.txt
@@ -0,0 +1,49 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Unit tests for the SIMD module. bootstrap_loadstore.cpp is listed first; it
+# checks the load/store primitives that the remaining test utilities use.
+gmx_add_unit_test(SimdUnitTests simd-test
+                  bootstrap_loadstore.cpp
+                  base.cpp
+                  simd.cpp
+                  simd_floatingpoint.cpp
+                  simd_vector_operations.cpp
+                  simd_math.cpp
+                  simd_integer.cpp
+                  simd4.cpp
+                  simd4_floatingpoint.cpp
+                  simd4_vector_operations.cpp
+                  simd4_math.cpp)
+
+
+
diff --git a/src/gromacs/simd/tests/base.cpp b/src/gromacs/simd/tests/base.cpp

new file mode 100644 (file)

index 0000000..c461b28
--- /dev/null
+++ b/src/gromacs/simd/tests/base.cpp
@@ -0,0 +1,130 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include <limits>
+
+#include "testutils/testoptions.h"
+#include "gromacs/options/options.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+namespace
+{
+
+/*! \cond */
+/*! \brief Command-line option to adjust the number of points used to test SIMD math functions.
+ *
+ * Adds an integer option named "npoints" whose value is stored directly in
+ * the static member SimdBaseTest::s_nPoints.
+ */
+GMX_TEST_OPTIONS(SimdBaseTestOptions, options)
+{
+    options->addOption(::gmx::IntegerOption("npoints")
+                           .store(&SimdBaseTest::s_nPoints)
+                           .description("Number of points to test for SIMD math functions"));
+}
+/*! \endcond */
+
+}       // namespace
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+// Default number of test points; the "npoints" test option stores into this variable.
+int  SimdBaseTest::s_nPoints    = 10000;
+
+::testing::AssertionResult
+SimdBaseTest::compareVectorRealUlp(const char * refExpr,   const char * tstExpr,
+                                   const std::vector<real> &ref, const std::vector<real> &tst)
+{
+    std::vector<real>             absDiff(tst.size());
+    std::vector<gmx_int64_t>      ulpDiff(tst.size());
+    bool                          allOk;
+    size_t                        i;
+
+    union {
+#ifdef GMX_DOUBLE
+        double r; gmx_int64_t i;
+#else
+        float  r; gmx_int32_t i;
+#endif
+    } conv0, conv1;
+
+    // Internal test of the test - make sure reference and test have the same length.
+    if (ref.size() != tst.size())
+    {
+        return ::testing::AssertionFailure()
+               << "Internal test error - unequal size vectors in compareVectorRealUlp" << std::endl;
+    }
+
+    for (i = 0, allOk = true; i < tst.size(); i++)
+    {
+        absDiff[i]  = fabs(ref[i]-tst[i]);
+        conv0.r     = ref[i];
+        conv1.r     = tst[i];
+        ulpDiff[i]  = llabs(conv0.i-conv1.i);
+
+        /* Use strict smaller-than for absolute tolerance check, so we disable it with absTol_=0 */
+        allOk       = allOk && ( ( absDiff[i] < absTol_ ) || ( ( ref[i]*tst[i] >= 0 ) && (ulpDiff[i] <= ulpTol_) ) );
+    }
+
+    if (allOk == true)
+    {
+        return ::testing::AssertionSuccess();
+    }
+    else
+    {
+        return ::testing::AssertionFailure()
+               << "Failing comparison between " << refExpr << " and " << tstExpr << std::endl
+               << "Requested abs tolerance: " << absTol_ << std::endl
+               << "Requested ulp tolerance: " << ulpTol_ << std::endl
+               << "(And values should not differ in sign unless within abs tolerance.)" << std::endl
+               << "Reference values: " << ::testing::PrintToString(ref) << std::endl
+               << "SIMD values:      " << ::testing::PrintToString(tst) << std::endl
+               << "Abs. difference:  " << ::testing::PrintToString(absDiff) << std::endl
+               << "Ulp difference:   " << ::testing::PrintToString(ulpDiff) << std::endl;
+    }
+}
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/base.h b/src/gromacs/simd/tests/base.h

new file mode 100644 (file)

index 0000000..9f9f8f5
--- /dev/null
+++ b/src/gromacs/simd/tests/base.h
@@ -0,0 +1,184 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_TESTS_BASE_H
+#define GMX_SIMD_TESTS_BASE_H
+
+/*! \internal \file
+ * \brief
+ * Declares common base class for testing SIMD and SIMD4.
+ *
+ * The base class contains the settings for absolute and ulp tolerances,
+ * as well as testing ranges used for both SIMD and SIMD4 tests, mainly
+ * to keep everything symmetric and clean. The class also defines a couple
+ * of generic tests that compare vectors of elements with arbitrary length for
+ * either exact or approximate matching (in terms of ulp). These are used in
+ * derived classes that convert either SIMD or SIMD4 values to
+ * std::vector<real> and then perform the comparison.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \brief Base class for SIMD test fixtures.
+ *
+ * This class contains settings that are common for SIMD and SIMD4 tests,
+ * and it is thus not used directly for any tests, but derived separately
+ * in simd.h and simd4.h.
+ */
+class SimdBaseTest : public ::testing::Test
+{
+    public:
+        /*! \brief Initialize new SIMD test fixture with default tolerances.
+         *
+         * The default absolute tolerance is set to 0, which means the we always
+         * check the ulp tolerance by default (passing the absolute tolerance
+         * test would otherwise mean we approve the test instantly).
+         * The default ulp tolerance is set to 10 units in single, and 255 units
+         * in double precision.
+         * Most SIMD math functions achieve 2-3 ulp accuracy in single, but by
+         * being a bit liberal we avoid tests failing on aggressive compilers.
+         *
+         * For double precision we only aim to achieve twice the accuracy of
+         * single. This way we can make do with a single extra iteration
+         * in some algorithms, in particular 1/sqrt(x).
+         *
+         * The range is used by derived classes to test math functions. The
+         * default test range will be [1,10], which is intentionally
+         * conservative so it works with (inverse) square root, division,
+         * exponentials, logarithms, and error functions.
+         */
+        SimdBaseTest()
+        {
+#ifdef GMX_DOUBLE
+            ulpTol_       = 255LL; // Aim for roughly twice the precision we have in single.
+#else
+            ulpTol_       = 10LL;  // Be a bit liberal so compiler optimization doesn't bite us.
+#endif
+            absTol_       = 0;
+            range_        = std::pair<real, real>(1, 10);
+        }
+
+        /*! \brief Adjust ulp tolerance from the default 10 (float) or 255 (double). */
+        void setUlpTol(gmx_int64_t newTol)   { ulpTol_ = newTol; }
+
+        /*! \brief Adjust the absolute tolerance from the default 0.
+         *
+         * If values are closer than the absolute tolerance, the test will pass
+         * no matter what their ulp difference is.
+         */
+        void setAbsTol(real newTol)          { absTol_ = newTol; }
+
+        /*! \brief Change math function testing range from the default [1,10]. */
+        void setRange(real low, real high) { range_.first = low; range_.second = high; }
+
+        static int  s_nPoints;    //!< Number of test points to use, settable on command line.
+
+        /*! \brief Compare two std::vector<real> for approximate equality.
+         *
+         * This is an internal implementation routine that will be used by
+         * routines in derived child classes that first convert SIMD or SIMD4
+         * variables to std::vector<real>. Do not call it directly.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the vector test variable is within the class tolerances of the corresponding
+         * reference elements.
+         */
+        ::testing::AssertionResult
+        compareVectorRealUlp(const char * refExpr,  const char * tstExpr,
+                             const std::vector<real> &ref, const std::vector<real> &tst);
+
+        /*! \brief Compare std::vectors for exact equality.
+         *
+         * The template in this class makes it usable for testing both
+         * SIMD floating-point and integers variables, after conversion to
+         * vectors.
+         * This is an internal implementation routine that will be used by
+         * routines in derived child classes that first convert SIMD or SIMD4
+         * variables to std::vector<real>. Do not call it directly.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the vector test variable is within the class tolerances of the corresponding
+         * reference elements.
+         */
+        template <typename T> ::testing::AssertionResult
+        compareVectorEq(const char * refExpr,  const char * tstExpr,
+                        const std::vector<T> &ref, const std::vector<T> &tst)
+        {
+            if (ref == tst)
+            {
+                return ::testing::AssertionSuccess();
+            }
+            else
+            {
+                return ::testing::AssertionFailure()
+                       << "Failing SIMD comparison between " << refExpr << " and " << tstExpr << std::endl
+                       << "Ref. values: " << ::testing::PrintToString(ref) << std::endl
+                       << "Test values: " << ::testing::PrintToString(tst) << std::endl;
+            }
+        }
+
+    protected:
+        gmx_int64_t            ulpTol_;       //!< Current tolerance in units-in-last-position.
+        real                   absTol_;       //!< Current absolute tolerance.
+        std::pair<real, real>  range_;        //!< Range for math function tests.
+};
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+
+#endif // GMX_SIMD_TESTS_BASE_H
diff --git a/src/gromacs/simd/tests/bootstrap_loadstore.cpp b/src/gromacs/simd/tests/bootstrap_loadstore.cpp

new file mode 100644 (file)

index 0000000..d7a38e9
--- /dev/null
+++ b/src/gromacs/simd/tests/bootstrap_loadstore.cpp
@@ -0,0 +1,342 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/*! \internal \file
+ * \brief
+ * Separate test of SIMD load/store, before we use them in the SIMD test classes.
+ *
+ * Simple tests without using any classes/utilities, so we can use load/store
+ * functions inside our test utilities after this has passed.
+ *
+ * This file tests:
+ *
+ * - gmx_simd_align_r(),gmx_simd_align_i(),gmx_simd4_align_r(),
+ * - gmx_simd_load_r(),gmx_simd_store_r(),gmx_simd_loadu_r(),gmx_simd_storeu_r()
+ * - gmx_simd_load_i(),gmx_simd_store_i(), gmx_simd_loadu_i(),gmx_simd_storeu_i()
+ * - gmx_simd4_load_r(),gmx_simd4_store_r(), gmx_simd4_loadu_r(),gmx_simd4_storeu_r()
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+// Checks that the SIMD align routines return suitably aligned pointers.
+// For every start offset within one SIMD width, the address returned by the
+// align routine is masked with (width*sizeof(element) - 1) and must give zero.
+// The returned pointers are only inspected, never dereferenced.
+TEST(SimdBootstrapTest, gmxSimdAlign)
+{
+#ifdef GMX_SIMD_HAVE_REAL
+    // Twice the SIMD width - presumably so the aligned pointer stays inside the buffer.
+    real rdata[GMX_SIMD_REAL_WIDTH*2];
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        EXPECT_EQ(((size_t)gmx_simd_align_r(&rdata[i]) & (GMX_SIMD_REAL_WIDTH*sizeof(real)-1)), (size_t)0);
+    }
+#endif
+#ifdef GMX_SIMD_HAVE_INT32
+    // Same check for the integer align routine.
+    int idata[GMX_SIMD_INT32_WIDTH*2];
+    for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        EXPECT_EQ(((size_t)gmx_simd_align_i(&idata[i]) & (GMX_SIMD_INT32_WIDTH*sizeof(int)-1)), (size_t)0);
+    }
+#endif
+}
+
+/*! \brief Generic routine to test load & store of SIMD, and check for side effects.
+ *
+ * The tests for load, store, unaligned load and unaligned store both for
+ * real and int are pretty much similar, so we use a template function with
+ * additional function pointers for the actual load/store calls. This would
+ * be more hacking to turn into a class, since the SIMD functionality uses
+ * macros rather than functions that can be overloaded.
+ */
+template <typename T, typename TSimd> void
+simdLoadStoreTester(TSimd simdLoadFn(T* mem), void simdStoreFn(T* mem, TSimd),
+                    T * simdAlignFn(T *mem),
+                    const int loadOffset, const int storeOffset, const int simdWidth)
+{
+    /* We want simdWidth elements before the data to check we are not polluting
+     * memory. Then we need 2*simdWidth storage to be able to extract an aligned
+     * pointer, another simdWidth elements so we can create (deliberately)
+     * offset un-aligned pointers, and finally simdWidth elements at the end
+     * to test we are not polluting memory there either. Sum=5*simdWidth!
+     */
+    std::vector<T>   src(simdWidth*5);
+    std::vector<T>   dst(simdWidth*5);
+    // Make sure we have memory to check both before and after the test pointers
+    T *              pCopySrc = simdAlignFn(&src[0]) + simdWidth + loadOffset;
+    T *              pCopyDst = simdAlignFn(&dst[0]) + simdWidth + storeOffset;
+    int              i;
+
+    for (i = 0; i < simdWidth*5; i++)
+    {
+        src[i] =  1+i;
+        dst[i] = -1-i;
+    }
+
+    simdStoreFn(pCopyDst, simdLoadFn(pCopySrc));
+
+    for (i = 0; i < simdWidth; i++)
+    {
+        EXPECT_EQ(pCopySrc[i], pCopyDst[i]) << "SIMD load or store not moving data correctly for element " << i;
+    }
+
+    for (i = 0; i < simdWidth*5; i++)
+    {
+        EXPECT_EQ(src[i], (T)(1+i)) << "Side effect on source memory, i = " << i;
+        if (&dst[0]+i < pCopyDst || &dst[0]+i >= pCopyDst+simdWidth)
+        {
+            EXPECT_EQ(dst[i], (T)(-1-i)) << "Side effect on destination memory, i = " << i;
+        }
+    }
+}
+
+#ifdef GMX_SIMD_HAVE_REAL
+//! Wrapper for SIMD macro to load aligned floating-point data, so it can be passed as a function to simdLoadStoreTester().
+gmx_simd_real_t wrapperSimdLoadR(real *m)
+{
+    return gmx_simd_load_r(m);
+}
+//! Wrapper for SIMD macro to store to aligned floating-point data, so it can be passed as a function to simdLoadStoreTester().
+void            wrapperSimdStoreR(real *m, gmx_simd_real_t s)
+{
+    gmx_simd_store_r(m, s);
+}
+
+// Aligned real load followed by aligned real store; both offsets are zero.
+TEST(SimdBootstrapTest, gmxSimdLoadStoreR)
+{
+    simdLoadStoreTester(wrapperSimdLoadR, wrapperSimdStoreR, gmx_simd_align_r, 0, 0, GMX_SIMD_REAL_WIDTH);
+}
+
+#    ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD macro to load unaligned floating-point data, so it can be passed as a function to simdLoadStoreTester().
+gmx_simd_real_t wrapperSimdLoadUR(real *m)
+{
+    return gmx_simd_loadu_r(m);
+}
+
+// Unaligned real load at every source offset within one SIMD width, aligned store.
+TEST(SimdBootstrapTest, gmxSimdLoadUR)
+{
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        simdLoadStoreTester(wrapperSimdLoadUR, wrapperSimdStoreR, gmx_simd_align_r, i, 0, GMX_SIMD_REAL_WIDTH);
+    }
+}
+#    endif
+
+#    ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD macro to store to unaligned floating-point data, so it can be passed as a function to simdLoadStoreTester().
+void wrapperSimdStoreUR(real *m, gmx_simd_real_t s)
+{
+    gmx_simd_storeu_r(m, s);
+}
+
+// Aligned real load, unaligned store at every destination offset within one SIMD width.
+TEST(SimdBootstrapTest, gmxSimdStoreUR)
+{
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        simdLoadStoreTester(wrapperSimdLoadR, wrapperSimdStoreUR, gmx_simd_align_r, 0, i, GMX_SIMD_REAL_WIDTH);
+    }
+}
+#    endif
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32
+// Tests for gmx_simd_int32_t load & store operations
+
+//! Wrapper for SIMD macro to load aligned integer data, so it can be passed as a function to simdLoadStoreTester().
+gmx_simd_int32_t wrapperSimdLoadI(int *m)
+{
+    return gmx_simd_load_i(m);
+}
+//! Wrapper for SIMD macro to store to aligned integer data, so it can be passed as a function to simdLoadStoreTester().
+void             wrapperSimdStoreI(int *m, gmx_simd_int32_t s)
+{
+    gmx_simd_store_i(m, s);
+}
+
+// Aligned integer load followed by aligned integer store; both offsets are zero.
+TEST(SimdBootstrapTest, gmxSimdLoadStoreI)
+{
+    simdLoadStoreTester(wrapperSimdLoadI, wrapperSimdStoreI, gmx_simd_align_i, 0, 0, GMX_SIMD_INT32_WIDTH);
+}
+
+#    ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD macro to load unaligned integer data, so it can be passed as a function to simdLoadStoreTester().
+gmx_simd_int32_t wrapperSimdLoadUI(int *m)
+{
+    return gmx_simd_loadu_i(m);
+}
+
+// Unaligned integer load at every source offset within one SIMD width, aligned store.
+TEST(SimdBootstrapTest, gmxSimdLoadUI)
+{
+    for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        simdLoadStoreTester(wrapperSimdLoadUI, wrapperSimdStoreI, gmx_simd_align_i, i, 0, GMX_SIMD_INT32_WIDTH);
+    }
+}
+#    endif
+
+#    ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD macro to store to unaligned integer data, so it can be passed as a function to simdLoadStoreTester().
+void wrapperSimdStoreUI(int *m, gmx_simd_int32_t s)
+{
+    gmx_simd_storeu_i(m, s);
+}
+
+// Aligned integer load, unaligned store at every destination offset within one SIMD width.
+TEST(SimdBootstrapTest, gmxSimdStoreUI)
+{
+    for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        simdLoadStoreTester(wrapperSimdLoadI, wrapperSimdStoreUI, gmx_simd_align_i, 0, i, GMX_SIMD_INT32_WIDTH);
+    }
+}
+#    endif
+#endif
+
+#ifdef GMX_SIMD4_HAVE_REAL
+/* Tests for gmx_simd4_real_t load & store operations. Define wrapper functions
+ * for the SIMD instructions that are typically implemented as macros.
+ */
+
+/*! \brief Separate load/store tester function for SIMD4.
+ *
+ * Due to the way SIMD variables
+ * are implemented as deep internal data, some compilers treat them as
+ * float/double with special prefixes. Unfortunately, this means that some C++
+ * compilers think an 8-wide normal real SIMD and a 4-wide SIMD4 real type
+ * cannot be overloaded (e.g. with gcc using 256-bit AVX single precision).
+ */
+template <typename T, typename TSimd> void
+simd4LoadStoreTester(TSimd simd4LoadFn(T* mem), void simd4StoreFn(T* mem, TSimd),
+                     T * simd4AlignFn(T *mem),
+                     const int loadOffset, const int storeOffset)
+{
+    /* We want simdWidth elements before the data to check we are not polluting
+     * memory. Then we need 2*simdWidth storage to be able to extract an aligned
+     * pointer, another simdWidth elements so we can create (deliberately)
+     * offset un-aligned pointers, and finally simdWidth elements at the end
+     * to test we are not polluting memory there either. Sum=5*simdWidth!
+     */
+    T         src[GMX_SIMD4_WIDTH*5];
+    T         dst[GMX_SIMD4_WIDTH*5];
+    // Make sure we have memory to check both before and after the test pointers
+    T *       pCopySrc = simd4AlignFn(src) + GMX_SIMD4_WIDTH + loadOffset;
+    T *       pCopyDst = simd4AlignFn(dst) + GMX_SIMD4_WIDTH + storeOffset;
+    int       i;
+
+    for (i = 0; i < GMX_SIMD4_WIDTH*5; i++)
+    {
+        src[i] =  1+i;
+        dst[i] = -1-i;
+    }
+
+    simd4StoreFn(pCopyDst, simd4LoadFn(pCopySrc));
+
+    for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        EXPECT_EQ(pCopySrc[i], pCopyDst[i]) << "SIMD4 load or store not moving data correctly for element " << i;
+    }
+
+    for (i = 0; i < GMX_SIMD4_WIDTH*5; i++)
+    {
+        EXPECT_EQ(src[i], (T)(1+i)) << "Side effect on source memory, i = " << i;
+        if (dst+i < pCopyDst || dst+i >= pCopyDst+GMX_SIMD4_WIDTH)
+        {
+            EXPECT_EQ(dst[i], (T)(-1-i)) << "Side effect on destination memory, i = " << i;
+        }
+    }
+}
+
+//! Wrapper for SIMD4 macro to load aligned floating-point data, so it can be passed as a function to simd4LoadStoreTester().
+gmx_simd4_real_t wrapperSimd4LoadR(real *m)
+{
+    return gmx_simd4_load_r(m);
+}
+//! Wrapper for SIMD4 macro to store to aligned floating-point data, so it can be passed as a function to simd4LoadStoreTester().
+void             wrapperSimd4StoreR(real *m, gmx_simd4_real_t s)
+{
+    gmx_simd4_store_r(m, s);
+}
+
+// Aligned SIMD4 real load followed by aligned SIMD4 real store; both offsets are zero.
+TEST(SimdBootstrapTest, gmxSimd4LoadStoreR)
+{
+    simd4LoadStoreTester(wrapperSimd4LoadR, wrapperSimd4StoreR, gmx_simd4_align_r, 0, 0);
+}
+
+#    ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD4 macro to load unaligned floating-point data, so it can be passed as a function to simd4LoadStoreTester().
+gmx_simd4_real_t wrapperSimd4LoadUR(real *m)
+{
+    return gmx_simd4_loadu_r(m);
+}
+
+// Unaligned SIMD4 real load at every source offset within one SIMD4 width, aligned store.
+TEST(SimdBootstrapTest, gmxSimd4LoadUR)
+{
+    for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        simd4LoadStoreTester(wrapperSimd4LoadUR, wrapperSimd4StoreR, gmx_simd4_align_r, i, 0);
+    }
+}
+#    endif
+
+#    ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD4 macro to store to unaligned floating-point data, so it can be passed as a function to simd4LoadStoreTester().
+void wrapperSimd4StoreUR(real *m, gmx_simd4_real_t s)
+{
+    gmx_simd4_storeu_r(m, s);
+}
+
+// Aligned SIMD4 real load, unaligned store at every destination offset within one SIMD4 width.
+TEST(SimdBootstrapTest, gmxSimd4StoreUR)
+{
+    for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        simd4LoadStoreTester(wrapperSimd4LoadR, wrapperSimd4StoreUR, gmx_simd4_align_r, 0, i);
+    }
+}
+#    endif
+#endif
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
diff --git a/src/gromacs/simd/tests/simd.cpp b/src/gromacs/simd/tests/simd.cpp

new file mode 100644 (file)

index 0000000..74c5dff
--- /dev/null
+++ b/src/gromacs/simd/tests/simd.cpp
@@ -0,0 +1,235 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/* Unfortunately we cannot keep static SIMD constants in the test fixture class.
+ * The problem is that SIMD memory needs to be aligned, and in particular
+ * this applies to automatic storage of variables in classes. For SSE registers
+ * this means 16-byte alignment (which seems to work), but AVX requires 32-byte
+ * alignment. At least both gcc-4.7.3 and Apple clang-5.0 (OS X 10.9) fail to
+ * align these variables when they are stored as data in a class.
+ *
+ * In theory we could set some of these on-the-fly e.g. with setSimdRealFrom3R()
+ * instead (although that would mean repeating code between tests), but many of
+ * the constants depend on the current precision not to mention they
+ * occasionally have many digits that need to be exactly right, and keeping
+ * them in a single place makes sure they are consistent.
+ */
+#ifdef GMX_SIMD_HAVE_REAL
+const gmx_simd_real_t rSimd_1_2_3    = setSimdRealFrom3R(1, 2, 3);
+const gmx_simd_real_t rSimd_4_5_6    = setSimdRealFrom3R(4, 5, 6);
+const gmx_simd_real_t rSimd_7_8_9    = setSimdRealFrom3R(7, 8, 9);
+const gmx_simd_real_t rSimd_5_7_9    = setSimdRealFrom3R(5, 7, 9);
+const gmx_simd_real_t rSimd_m1_m2_m3 = setSimdRealFrom3R(-1, -2, -3);
+const gmx_simd_real_t rSimd_3_1_4    = setSimdRealFrom3R(3, 1, 4);
+const gmx_simd_real_t rSimd_m3_m1_m4 = setSimdRealFrom3R(-3, -1, -4);
+const gmx_simd_real_t rSimd_2p25     = setSimdRealFrom1R(2.25);
+const gmx_simd_real_t rSimd_3p75     = setSimdRealFrom1R(3.75);
+const gmx_simd_real_t rSimd_m2p25    = setSimdRealFrom1R(-2.25);
+const gmx_simd_real_t rSimd_m3p75    = setSimdRealFrom1R(-3.75);
+const gmx_simd_real_t rSimd_Exp      = setSimdRealFrom3R( 1.4055235171027452623914516e+18,
+                                                          5.3057102734253445623914516e-13,
+                                                          -2.1057102745623934534514516e+16);
+#    if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+const gmx_simd_real_t rSimd_ExpDouble = setSimdRealFrom3R( 6.287393598732017379054414e+176,
+                                                           8.794495252903116023030553e-140,
+                                                           -3.637060701570496477655022e+202);
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd_real_t rSimd_Bits1    = setSimdRealFrom1R(-1.07730874267432137e+236);
+const gmx_simd_real_t rSimd_Bits2    = setSimdRealFrom1R(-9.25596313493178307e+061);
+const gmx_simd_real_t rSimd_Bits3    = setSimdRealFrom1R(-8.57750588235293981e+003);
+const gmx_simd_real_t rSimd_Bits4    = setSimdRealFrom1R( 1.22416778341839096e-250);
+const gmx_simd_real_t rSimd_Bits5    = setSimdRealFrom1R(-1.15711777004554095e+294);
+const gmx_simd_real_t rSimd_Bits6    = setSimdRealFrom1R( 1.53063836115600621e-018);
+#    else
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd_real_t rSimd_Bits1    = setSimdRealFrom1R(-5.9654142337e+29);
+const gmx_simd_real_t rSimd_Bits2    = setSimdRealFrom1R(-1.0737417600e+08);
+const gmx_simd_real_t rSimd_Bits3    = setSimdRealFrom1R(-6.0235290527e+00);
+const gmx_simd_real_t rSimd_Bits4    = setSimdRealFrom1R( 1.0788832913e-31);
+const gmx_simd_real_t rSimd_Bits5    = setSimdRealFrom1R(-1.0508719529e+37);
+const gmx_simd_real_t rSimd_Bits6    = setSimdRealFrom1R( 1.1488970369e-02);
+#    endif
+#endif  // GMX_SIMD_HAVE_REAL
+#ifdef GMX_SIMD_HAVE_INT32
+const gmx_simd_int32_t iSimd_1_2_3      = setSimdIntFrom3I(1, 2, 3);
+const gmx_simd_int32_t iSimd_4_5_6      = setSimdIntFrom3I(4, 5, 6);
+const gmx_simd_int32_t iSimd_7_8_9      = setSimdIntFrom3I(7, 8, 9);
+const gmx_simd_int32_t iSimd_5_7_9      = setSimdIntFrom3I(5, 7, 9);
+const gmx_simd_int32_t iSimd_1M_2M_3M   = setSimdIntFrom3I(1000000, 2000000, 3000000);
+const gmx_simd_int32_t iSimd_4M_5M_6M   = setSimdIntFrom3I(4000000, 5000000, 6000000);
+const gmx_simd_int32_t iSimd_5M_7M_9M   = setSimdIntFrom3I(5000000, 7000000, 9000000);
+const gmx_simd_int32_t iSimd_0xF0F0F0F0 = setSimdIntFrom1I(0xF0F0F0F0);
+const gmx_simd_int32_t iSimd_0xCCCCCCCC = setSimdIntFrom1I(0xCCCCCCCC);
+#endif  // GMX_SIMD_HAVE_INT32
+
+#ifdef GMX_SIMD_HAVE_REAL
+::std::vector<real>
+simdReal2Vector(const gmx_simd_real_t simd)
+{
+    real                mem[GMX_SIMD_REAL_WIDTH*2];
+    real *              p = gmx_simd_align_r(mem);
+
+    gmx_simd_store_r(p, simd);
+    std::vector<real>   v(p, p+GMX_SIMD_REAL_WIDTH);
+
+    return v;
+}
+
+gmx_simd_real_t
+vector2SimdReal(const std::vector<real> &v)
+{
+    real                mem[GMX_SIMD_REAL_WIDTH*2];
+    real *              p = gmx_simd_align_r(mem);
+
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        p[i] = v[i % v.size()];  // repeat vector contents to fill simd width; v must be non-empty (modulo by size)
+    }
+    return gmx_simd_load_r(p);
+}
+
+gmx_simd_real_t
+setSimdRealFrom3R(real r0, real r1, real r2)
+{
+    std::vector<real> v(3);
+    v[0] = r0;
+    v[1] = r1;
+    v[2] = r2;
+    return vector2SimdReal(v);
+}
+
+gmx_simd_real_t
+setSimdRealFrom1R(real value)
+{
+    std::vector<real> v(GMX_SIMD_REAL_WIDTH);
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        v[i] = value;
+    }
+    return vector2SimdReal(v);
+}
+
+testing::AssertionResult
+SimdTest::compareSimdRealUlp(const char *  refExpr,     const char *  tstExpr,
+                             const gmx_simd_real_t ref, const gmx_simd_real_t tst)
+{
+    return compareVectorRealUlp(refExpr, tstExpr, simdReal2Vector(ref), simdReal2Vector(tst));
+}
+
+testing::AssertionResult
+SimdTest::compareSimdRealEq(const char * refExpr, const char * tstExpr,
+                            const gmx_simd_real_t ref, const gmx_simd_real_t tst)
+{
+    return compareVectorEq(refExpr, tstExpr, simdReal2Vector(ref), simdReal2Vector(tst));
+}
+
+#endif  // GMX_SIMD_HAVE_REAL
+
+#ifdef GMX_SIMD_HAVE_INT32
+std::vector<int>
+simdInt2Vector(const gmx_simd_int32_t simd)
+{
+    int                 mem[GMX_SIMD_INT32_WIDTH*2];
+    int *               p = gmx_simd_align_i(mem);
+
+    gmx_simd_store_i(p, simd);
+    std::vector<int>    v(p, p+GMX_SIMD_INT32_WIDTH);
+
+    return v;
+}
+
+gmx_simd_int32_t
+vector2SimdInt(const std::vector<int> &v)
+{
+    int                 mem[GMX_SIMD_INT32_WIDTH*2];
+    int *               p = gmx_simd_align_i(mem);
+
+    for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        p[i] = v[i % v.size()];  // repeat vector contents to fill simd width; v must be non-empty (modulo by size)
+    }
+    return gmx_simd_load_i(p);
+}
+
+gmx_simd_int32_t
+setSimdIntFrom3I(int i0, int i1, int i2)
+{
+    std::vector<int> v(3);
+    v[0] = i0;
+    v[1] = i1;
+    v[2] = i2;
+    return vector2SimdInt(v);
+}
+
+gmx_simd_int32_t
+setSimdIntFrom1I(int value)
+{
+    std::vector<int> v(GMX_SIMD_INT32_WIDTH);
+    for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        v[i] = value;
+    }
+    return vector2SimdInt(v);
+}
+
+::testing::AssertionResult
+SimdTest::compareSimdInt32(const char *  refExpr,      const char *  tstExpr,
+                           const gmx_simd_int32_t ref, const gmx_simd_int32_t tst)
+{
+    return compareVectorEq(refExpr, tstExpr, simdInt2Vector(ref), simdInt2Vector(tst));
+}
+
+#endif  // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd.h b/src/gromacs/simd/tests/simd.h

new file mode 100644 (file)

index 0000000..9904390
--- /dev/null
+++ b/src/gromacs/simd/tests/simd.h
@@ -0,0 +1,290 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_TESTS_SIMD_H
+#define GMX_SIMD_TESTS_SIMD_H
+
+/*! \internal \file
+ * \brief
+ * Declares fixture for testing of normal SIMD (not SIMD4) functionality.
+ *
+ * The SIMD tests are both simple and complicated. The actual testing logic
+ * is \a very straightforward since we just need to test single values against
+ * the math library, and for some math functions we need to do it in a loop.
+ * This could have been achieved in minutes with the default Google Test tools,
+ * if it wasn't for the problem that we cannot access or compare SIMD contents
+ * directly without using lots of other SIMD functionality. For this reason
+ * we have separated the basic testing of load/store operations into a separate
+ * bootstrapping test. Once this works, we use a set of utility routines to
+ * convert SIMD contents to/from std::vector<> and perform the rest of the tests,
+ * which can then be farmed out to the base class SimdBaseTest that is common
+ * to SIMD and SIMD4.
+ *
+ * Another complication is that the width of the SIMD implementation will
+ * depend on the hardware and precision. For some simple operations it is
+ * sufficient to set all SIMD elements to the same value, and check that the
+ * result is present in all elements. However, for a few more complex
+ * instructions that might rely on shuffling under-the-hood it is important
+ * that we can test operations with different elements. We achieve this by
+ * having test code that can initialize a SIMD variable from an std::vector
+ * of arbitrary length; the vector is simply repeated to fill all elements in
+ * the SIMD variable. We also have similar routines to compare a SIMD result
+ * with values in a vector, which returns true iff all elements match.
+ *
+ * This way we can write simple tests that use different values for all SIMD
+ * elements. Personally I like using vectors of length 3, since this means
+ * there are no simple repeated patterns in low/high halves of SIMD variables
+ * that are 2,4,8,or 16 elements wide, and we still don't have to care about
+ * the exact SIMD width of the underlying implementation.
+ *
+ * Note that this utility uses a few SIMD load/store instructions internally -
+ * those have been tested separately in the bootstrap_loadstore.cpp file.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+#include "base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/* Unfortunately we cannot keep static SIMD constants in the test fixture class.
+ * The problem is that SIMD memory needs to be aligned, and in particular
+ * this applies to automatic storage of variables in classes. For SSE registers
+ * this means 16-byte alignment (which seems to work), but AVX requires 32-byte
+ * alignment. At least both gcc-4.7.3 and Apple clang-5.0 (OS X 10.9) fail to
+ * align these variables when they are stored as data in a class.
+ *
+ * In theory we could set some of these on-the-fly e.g. with setSimdRealFrom3R()
+ * instead (although that would mean repeating code between tests), but many of
+ * the constants depend on the current precision not to mention they
+ * occasionally have many digits that need to be exactly right, and keeping
+ * them in a single place makes sure they are consistent.
+ */
+#ifdef GMX_SIMD_HAVE_REAL
+extern const gmx_simd_real_t rSimd_1_2_3;     //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_4_5_6;     //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_7_8_9;     //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_5_7_9;     //!< rSimd_1_2_3 + rSimd_4_5_6.
+extern const gmx_simd_real_t rSimd_m1_m2_m3;  //!< Generic negative floating-point values.
+extern const gmx_simd_real_t rSimd_3_1_4;     //!< Used to test min/max.
+extern const gmx_simd_real_t rSimd_m3_m1_m4;  //!< negative rSimd_3_1_4.
+extern const gmx_simd_real_t rSimd_2p25;      //!< Value that rounds down.
+extern const gmx_simd_real_t rSimd_3p75;      //!< Value that rounds up.
+extern const gmx_simd_real_t rSimd_m2p25;     //!< Negative value that rounds up.
+extern const gmx_simd_real_t rSimd_m3p75;     //!< Negative value that rounds down.
+//! Three large floating-point values whose exponents are >32.
+extern const gmx_simd_real_t rSimd_Exp;
+#    if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+extern const gmx_simd_real_t rSimd_ExpDouble;
+#    endif
+// Magic FP numbers corresponding to specific bit patterns
+extern const gmx_simd_real_t rSimd_Bits1;       //!< Pattern F0 repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits2;       //!< Pattern CC repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits3;       //!< Pattern C0 repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits4;       //!< Pattern 0C repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits5;       //!< Pattern FC repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits6;       //!< Pattern 3C repeated to fill single/double.
+#endif                                          // GMX_SIMD_HAVE_REAL
+#ifdef GMX_SIMD_HAVE_INT32
+extern const gmx_simd_int32_t iSimd_1_2_3;      //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_4_5_6;      //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_7_8_9;      //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_5_7_9;      //!< iSimd_1_2_3 + iSimd_4_5_6.
+extern const gmx_simd_int32_t iSimd_1M_2M_3M;   //!< Term1 for 32bit add/sub.
+extern const gmx_simd_int32_t iSimd_4M_5M_6M;   //!< Term2 for 32bit add/sub.
+extern const gmx_simd_int32_t iSimd_5M_7M_9M;   //!< iSimd_1M_2M_3M + iSimd_4M_5M_6M.
+extern const gmx_simd_int32_t iSimd_0xF0F0F0F0; //!< Bitpattern to test integer logical operations.
+extern const gmx_simd_int32_t iSimd_0xCCCCCCCC; //!< Bitpattern to test integer logical operations.
+#endif                                          // GMX_SIMD_HAVE_INT32
+
+
+/*! \brief Test fixture for SIMD tests.
+ *
+ * This is a very simple test fixture that basically just takes the common
+ * SIMD/SIMD4 functionality from SimdBaseTest and creates wrapper routines
+ * specific for normal SIMD functionality.
+ */
+class SimdTest : public SimdBaseTest
+{
+    public:
+#ifdef GMX_SIMD_HAVE_REAL
+        /*! \brief Compare two real SIMD variables for approximate equality.
+         *
+         * This is an internal implementation routine. You should always use
+         * GMX_EXPECT_SIMD_REAL_NEAR() instead.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the SIMD test variable is within the class tolerances of the corresponding
+         * reference element.
+         */
+            ::testing::AssertionResult
+        compareSimdRealUlp(const char * refExpr, const char * tstExpr,
+                           const gmx_simd_real_t ref, const gmx_simd_real_t tst);
+
+        /*! \brief Compare two real SIMD variables for exact equality.
+         *
+         * This is an internal implementation routine. You should always use
+         * GMX_EXPECT_SIMD_REAL_EQ() instead.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the SIMD test variable is exactly equal to the corresponding
+         * reference element.
+         */
+        ::testing::AssertionResult
+        compareSimdRealEq(const char * refExpr, const char * tstExpr,
+                          const gmx_simd_real_t ref, const gmx_simd_real_t tst);
+
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32
+        /*! \brief Compare two 32-bit integer SIMD variables.
+         *
+         * This is an internal implementation routine. You should always use
+         * GMX_EXPECT_SIMD_INT_EQ() instead.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro, while the SIMD
+         * arguments hold the reference and test values to be compared.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the SIMD variable tst is identical to the corresponding reference element.
+         */
+            ::testing::AssertionResult
+        compareSimdInt32(const char * refExpr, const char *  tstExpr,
+                         const gmx_simd_int32_t ref, const gmx_simd_int32_t tst);
+#endif
+};
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Convert SIMD real to std::vector<real>.
+ *
+ * The returned vector will have the same length as the SIMD width.
+ */
+std::vector<real> simdReal2Vector(const gmx_simd_real_t simd);
+
+/*! \brief Return floating-point SIMD value from std::vector<real>.
+ *
+ * If the vector is longer than SIMD width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD register.
+ */
+gmx_simd_real_t   vector2SimdReal(const std::vector<real> &v);
+
+/*! \brief Set SIMD register contents from three real values.
+ *
+ * Our reason for using three values is that 3 is not a factor in any known
+ * SIMD width, so this way there will not be any simple repeated patterns e.g.
+ * between the low/high 64/128/256 bits in the SIMD register, which could hide bugs.
+ */
+gmx_simd_real_t   setSimdRealFrom3R(real r0, real r1, real r2);
+
+/*! \brief Set SIMD register contents from single real value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd_set1_r(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd_real_t   setSimdRealFrom1R(real value);
+
+/*! \brief Test if a SIMD real is bitwise identical to reference SIMD value. */
+#define GMX_EXPECT_SIMD_REAL_EQ(ref, tst)   EXPECT_PRED_FORMAT2(compareSimdRealEq, ref, tst)
+
+/*! \brief Test if a SIMD real is within tolerance of reference SIMD value. */
+#define GMX_EXPECT_SIMD_REAL_NEAR(ref, tst) EXPECT_PRED_FORMAT2(compareSimdRealUlp, ref, tst)
+
+#endif  // GMX_SIMD_HAVE_REAL
+
+#ifdef GMX_SIMD_HAVE_INT32
+/*! \brief Convert SIMD integer to std::vector<int>.
+ *
+ * The returned vector will have the same length as the SIMD width.
+ */
+std::vector<int>   simdInt2Vector(const gmx_simd_int32_t simd);
+
+/*! \brief Return 32-bit integer SIMD value from std::vector<int>.
+ *
+ * If the vector is longer than SIMD width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD register.
+ */
+gmx_simd_int32_t   vector2SimdInt(const std::vector<int> &v);
+
+/*! \brief Set SIMD register contents from three int values.
+ *
+ * Our reason for using three values is that 3 is not a factor in any known
+ * SIMD width, so this way there will not be any simple repeated patterns e.g.
+ * between the low/high 64/128/256 bits in the SIMD register, which could hide bugs.
+ */
+gmx_simd_int32_t   setSimdIntFrom3I(int i0, int i1, int i2);
+
+/*! \brief Set SIMD register contents from single integer value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd_set1_i(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd_int32_t   setSimdIntFrom1I(int value);
+
+/*! \brief Macro that checks SIMD integer expression against SIMD or reference int.
+ *
+ * If the reference argument is a scalar integer it will be expanded into
+ * the width of the SIMD register and tested against all elements.
+ */
+#define GMX_EXPECT_SIMD_INT_EQ(ref, tst)    EXPECT_PRED_FORMAT2(compareSimdInt32, ref, tst)
+
+#endif  // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+
+#endif // GMX_SIMD_TESTS_SIMD_H
diff --git a/src/gromacs/simd/tests/simd4.cpp b/src/gromacs/simd/tests/simd4.cpp

new file mode 100644 (file)

index 0000000..d6dd7ac
--- /dev/null
+++ b/src/gromacs/simd/tests/simd4.cpp
@@ -0,0 +1,153 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+const gmx_simd4_real_t rSimd4_1_2_3    = setSimd4RealFrom3R(1, 2, 3);
+const gmx_simd4_real_t rSimd4_4_5_6    = setSimd4RealFrom3R(4, 5, 6);
+const gmx_simd4_real_t rSimd4_7_8_9    = setSimd4RealFrom3R(7, 8, 9);
+const gmx_simd4_real_t rSimd4_5_7_9    = setSimd4RealFrom3R(5, 7, 9);
+const gmx_simd4_real_t rSimd4_m1_m2_m3 = setSimd4RealFrom3R(-1, -2, -3);
+const gmx_simd4_real_t rSimd4_3_1_4    = setSimd4RealFrom3R(3, 1, 4);
+const gmx_simd4_real_t rSimd4_m3_m1_m4 = setSimd4RealFrom3R(-3, -1, -4);
+const gmx_simd4_real_t rSimd4_2p25     = setSimd4RealFrom1R(2.25);
+const gmx_simd4_real_t rSimd4_3p75     = setSimd4RealFrom1R(3.75);
+const gmx_simd4_real_t rSimd4_m2p25    = setSimd4RealFrom1R(-2.25);
+const gmx_simd4_real_t rSimd4_m3p75    = setSimd4RealFrom1R(-3.75);
+const gmx_simd4_real_t rSimd4_Exp      = setSimd4RealFrom3R( 1.4055235171027452623914516e+18,
+                                                             5.3057102734253445623914516e-13,
+                                                             -2.1057102745623934534514516e+16);
+#    if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+const gmx_simd4_real_t rSimd4_ExpDouble = setSimd4RealFrom3R( 6.287393598732017379054414e+176,
+                                                              8.794495252903116023030553e-140,
+                                                              -3.637060701570496477655022e+202);
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd4_real_t rSimd4_Bits1    = setSimd4RealFrom1R(-1.07730874267432137e+236);
+const gmx_simd4_real_t rSimd4_Bits2    = setSimd4RealFrom1R(-9.25596313493178307e+061);
+const gmx_simd4_real_t rSimd4_Bits3    = setSimd4RealFrom1R(-8.57750588235293981e+003);
+const gmx_simd4_real_t rSimd4_Bits4    = setSimd4RealFrom1R( 1.22416778341839096e-250);
+const gmx_simd4_real_t rSimd4_Bits5    = setSimd4RealFrom1R(-1.15711777004554095e+294);
+const gmx_simd4_real_t rSimd4_Bits6    = setSimd4RealFrom1R( 1.53063836115600621e-018);
+#    else
+const gmx_simd4_real_t rSimd4_Bits1    = setSimd4RealFrom1R(-5.9654142337e+29);
+const gmx_simd4_real_t rSimd4_Bits2    = setSimd4RealFrom1R(-1.0737417600e+08);
+const gmx_simd4_real_t rSimd4_Bits3    = setSimd4RealFrom1R(-6.0235290527e+00);
+const gmx_simd4_real_t rSimd4_Bits4    = setSimd4RealFrom1R( 1.0788832913e-31);
+const gmx_simd4_real_t rSimd4_Bits5    = setSimd4RealFrom1R(-1.0508719529e+37);
+const gmx_simd4_real_t rSimd4_Bits6    = setSimd4RealFrom1R( 1.1488970369e-02);
+#    endif
+
+::std::vector<real>
+simd4Real2Vector(const gmx_simd4_real_t simd4)
+{
+    real                mem[GMX_SIMD4_WIDTH*2];
+    real *              p = gmx_simd4_align_r(mem);
+
+    gmx_simd4_store_r(p, simd4);
+    std::vector<real>   v(p, p+GMX_SIMD4_WIDTH);
+
+    return v;
+}
+
+gmx_simd4_real_t
+vector2Simd4Real(const std::vector<real> &v)
+{
+    real                mem[GMX_SIMD4_WIDTH*2];
+    real *              p = gmx_simd4_align_r(mem);
+
+    for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        p[i] = v[i % v.size()];  // repeat vector contents to fill simd width; v must be non-empty (modulo by size)
+    }
+    return gmx_simd4_load_r(p);
+}
+
+gmx_simd4_real_t
+setSimd4RealFrom3R(real r0, real r1, real r2)
+{
+    std::vector<real> v(3);
+    v[0] = r0;
+    v[1] = r1;
+    v[2] = r2;
+    return vector2Simd4Real(v);
+}
+
+gmx_simd4_real_t
+setSimd4RealFrom1R(real value)
+{
+    std::vector<real> v(GMX_SIMD4_WIDTH);
+    for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+    {
+        v[i] = value;
+    }
+    return vector2Simd4Real(v);
+}
+
+testing::AssertionResult
+Simd4Test::compareSimd4RealUlp(const char *  refExpr,     const char *  tstExpr,
+                               const gmx_simd4_real_t ref, const gmx_simd4_real_t tst)
+{
+    return compareVectorRealUlp(refExpr, tstExpr, simd4Real2Vector(ref), simd4Real2Vector(tst));
+}
+
+testing::AssertionResult
+Simd4Test::compareSimd4RealEq(const char * refExpr, const char * tstExpr,
+                              const gmx_simd4_real_t ref, const gmx_simd4_real_t tst)
+{
+    return compareVectorEq(refExpr, tstExpr, simd4Real2Vector(ref), simd4Real2Vector(tst));
+}
+
+#endif  // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd4.h b/src/gromacs/simd/tests/simd4.h

new file mode 100644 (file)

index 0000000..70b8443
--- /dev/null
+++ b/src/gromacs/simd/tests/simd4.h
@@ -0,0 +1,174 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_TESTS_SIMD4_H
+#define GMX_SIMD_TESTS_SIMD4_H
+
+/*! \internal \file
+ * \brief
+ * Declares fixture for testing of SIMD4 functionality.
+ *
+ * This file specializes the common base test utilities to be used
+ * for SIMD4 variables. For detailed documentation, check out the normal
+ * SIMD test classes and files.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/tests/base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+extern const gmx_simd4_real_t rSimd4_1_2_3;     //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_4_5_6;     //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_7_8_9;     //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_5_7_9;     //!< rSimd4_1_2_3 + rSimd4_4_5_6.
+extern const gmx_simd4_real_t rSimd4_m1_m2_m3;  //!< Generic negative fp values.
+extern const gmx_simd4_real_t rSimd4_3_1_4;     //!< Used to test min/max.
+extern const gmx_simd4_real_t rSimd4_m3_m1_m4;  //!< Negative rSimd4_3_1_4.
+extern const gmx_simd4_real_t rSimd4_2p25;      //!< Value that rounds down.
+extern const gmx_simd4_real_t rSimd4_3p75;      //!< Value that rounds up.
+extern const gmx_simd4_real_t rSimd4_m2p25;     //!< Negative value that rounds up.
+extern const gmx_simd4_real_t rSimd4_m3p75;     //!< Negative value that rounds down.
+//! Three large floating-point values whose exponents are >32.
+extern const gmx_simd4_real_t rSimd4_Exp;
+#    if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double.
+// Only declared when both double SIMD support and a double-precision build are enabled.
+extern const gmx_simd4_real_t rSimd4_ExpDouble;
+#    endif
+extern const gmx_simd4_real_t rSimd4_Bits1; //!< Pattern F0 repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits2; //!< Pattern CC repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits3; //!< Pattern C0 repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits4; //!< Pattern 0C repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits5; //!< Pattern FC repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits6; //!< Pattern 3C repeated to fill single/double.
+
+/*! \brief Test fixture for SIMD4 tests - contains test settings.
+ *
+ * This is a very simple test fixture that basically just takes the common
+ * SIMD/SIMD4 functionality from SimdBaseTest and creates wrapper routines
+ * specific for SIMD4 functionality.
+ */
+class Simd4Test : public SimdBaseTest
+{
+    public:
+        /*! \brief Compare two real SIMD4 variables for approximate equality.
+         *
+         * This is an internal implementation routine. You should always use
+         * GMX_EXPECT_SIMD4_REAL_NEAR() instead.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true if each element
+         * in the SIMD4 test variable is within the class tolerances of the corresponding
+         * reference element.
+         *
+         * \param refExpr  Text of the reference expression, as passed by the macro.
+         * \param tstExpr  Text of the tested expression, as passed by the macro.
+         * \param ref      Reference SIMD4 value.
+         * \param tst      SIMD4 value to test.
+         */
+        ::testing::AssertionResult
+        compareSimd4RealUlp(const char * refExpr, const char * tstExpr,
+                            const gmx_simd4_real_t ref, const gmx_simd4_real_t tst);
+
+        /*! \brief Compare two real SIMD4 variables for exact equality.
+         *
+         * This is an internal implementation routine. You should always use
+         * GMX_EXPECT_SIMD4_REAL_EQ() instead.
+         *
+         * This routine is designed according to the Google test specs, so the char
+         * strings will describe the arguments to the macro.
+         *
+         * The comparison is applied to each element, and it returns true only if
+         * each element in the SIMD4 test variable is equal to the corresponding
+         * reference element. The element comparison itself is delegated to
+         * compareVectorEq(); presumably no tolerance is applied there.
+         *
+         * \param refExpr  Text of the reference expression, as passed by the macro.
+         * \param tstExpr  Text of the tested expression, as passed by the macro.
+         * \param ref      Reference SIMD4 value.
+         * \param tst      SIMD4 value to test.
+         */
+        ::testing::AssertionResult
+        compareSimd4RealEq(const char * refExpr, const char * tstExpr,
+                           const gmx_simd4_real_t ref, const gmx_simd4_real_t tst);
+};
+
+/*! \brief Convert SIMD4 real to std::vector<real>.
+ *
+ * The returned vector will have the same length as the SIMD4 width.
+ *
+ * \param simd4  SIMD4 value to unpack.
+ * \return Vector holding the GMX_SIMD4_WIDTH elements of \p simd4.
+ */
+std::vector<real> simd4Real2Vector(const gmx_simd4_real_t simd4);
+
+/*! \brief Return floating-point SIMD4 value from std::vector<real>.
+ *
+ * If the vector is longer than SIMD4 width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD4 register.
+ *
+ * \param v  Values to place in the register.
+ * \return SIMD4 value loaded from the (repeated or truncated) contents of \p v.
+ */
+gmx_simd4_real_t   vector2Simd4Real(const std::vector<real> &v);
+
+/*! \brief Set SIMD4 register contents from three real values.
+ *
+ * It might seem stupid to use three values when we know that the SIMD4 width
+ * is 4, but it simplifies the test organization when the SIMD and SIMD4 tests
+ * are completely symmetric. The three values are repeated to fill the
+ * register, so the fourth element equals \p r0.
+ */
+gmx_simd4_real_t   setSimd4RealFrom3R(real r0, real r1, real r2);
+
+/*! \brief Set SIMD4 register contents from single real value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd4_set1_r(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd4_real_t   setSimd4RealFrom1R(real value);
+
+/*! \brief Test if a SIMD4 real is bitwise identical to reference SIMD4 value.
+ *
+ * Expands to an unqualified call of Simd4Test::compareSimd4RealEq(), so it
+ * must be used inside a test fixture that is (or derives from) Simd4Test.
+ */
+#define GMX_EXPECT_SIMD4_REAL_EQ(ref, tst)   EXPECT_PRED_FORMAT2(compareSimd4RealEq, ref, tst)
+
+/*! \brief Test if a SIMD4 real is within tolerance of reference SIMD4 value.
+ *
+ * Expands to an unqualified call of Simd4Test::compareSimd4RealUlp(), so it
+ * must be used inside a test fixture that is (or derives from) Simd4Test.
+ */
+#define GMX_EXPECT_SIMD4_REAL_NEAR(ref, tst) EXPECT_PRED_FORMAT2(compareSimd4RealUlp, ref, tst)
+
+#endif  // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+
+#endif // GMX_SIMD_TESTS_SIMD4_H
diff --git a/src/gromacs/simd/tests/simd4_floatingpoint.cpp b/src/gromacs/simd/tests/simd4_floatingpoint.cpp

new file mode 100644 (file)

index 0000000..41aa5aa
--- /dev/null
+++ b/src/gromacs/simd/tests/simd4_floatingpoint.cpp
@@ -0,0 +1,285 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/math/utilities.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+/*! \brief Test fixture for SIMD4 floating-point operations (identical to the SIMD4 \ref Simd4Test) */
+typedef Simd4Test Simd4FloatingpointTest;
+
+// gmx_simd4_setzero_r() must equal a register filled with 0.0 via the load-based helper.
+TEST_F(Simd4FloatingpointTest, gmxSimd4SetZeroR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(0.0), gmx_simd4_setzero_r());
+}
+
+// gmx_simd4_set1_r(1.0) must equal a register filled with 1.0 via the load-based helper.
+TEST_F(Simd4FloatingpointTest, gmxSimd4Set1R)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(1.0), gmx_simd4_set1_r(1.0));
+}
+
+// gmx_simd4_load1_r() reads one scalar through a pointer; every element must equal that scalar.
+TEST_F(Simd4FloatingpointTest, gmxSimd4Load1R)
+{
+    real r = 2.0;
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(r), gmx_simd4_load1_r(&r));
+}
+
+// Element-wise addition, checked for exact equality against the precomputed sum constant.
+TEST_F(Simd4FloatingpointTest, gmxSimd4AddR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_5_7_9, gmx_simd4_add_r(rSimd4_1_2_3, rSimd4_4_5_6)); // 1+4=5, 2+5=7, 3+6=9
+}
+
+// Element-wise subtraction, the inverse of the addition test above it in this file.
+TEST_F(Simd4FloatingpointTest, gmxSimd4SubR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_4_5_6, gmx_simd4_sub_r(rSimd4_5_7_9, rSimd4_1_2_3)); // 5-1=4, 7-2=5, 9-3=6
+}
+
+// Element-wise multiplication: 1*4=4, 2*5=10, 3*6=18.
+TEST_F(Simd4FloatingpointTest, gmxSimd4MulR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(4, 10, 18), gmx_simd4_mul_r(rSimd4_1_2_3, rSimd4_4_5_6));
+}
+
+// Expected values follow a*b+c per element: 1*4+7=11, 2*5+8=18, 3*6+9=27.
+TEST_F(Simd4FloatingpointTest, gmxSimd4FmaddR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(11, 18, 27), gmx_simd4_fmadd_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // 1*4+7, etc.
+}
+
+// Expected values follow a*b-c per element: 1*4-7=-3, 2*5-8=2, 3*6-9=9.
+TEST_F(Simd4FloatingpointTest, gmxSimd4FmsubR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, 2, 9), gmx_simd4_fmsub_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // 1*4-7, etc.
+}
+
+// Expected values follow -a*b+c per element: -1*4+7=3, -2*5+8=-2, -3*6+9=-9.
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnmaddR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, -2, -9), gmx_simd4_fnmadd_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // -1*4+7, etc.
+}
+
+// Expected values follow -a*b-c per element: -1*4-7=-11, -2*5-8=-18, -3*6-9=-27.
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnmsubR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-11, -18, -27), gmx_simd4_fnmsub_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // -1*4-7, etc.
+}
+
+// Absolute value must leave positive input unchanged and flip negative input.
+TEST_F(Simd4FloatingpointTest, gmxSimd4FabsR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_fabs_r(rSimd4_1_2_3));    // fabs(x)=x
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_fabs_r(rSimd4_m1_m2_m3)); // fabs(-x)=x
+}
+
+// Negation is checked in both directions (positive to negative and back).
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnegR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_m1_m2_m3, gmx_simd4_fneg_r(rSimd4_1_2_3));   // fneg(x)=-x
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3,   gmx_simd4_fneg_r(rSimd4_m1_m2_m3)); // fneg(-x)=x
+}
+
+#ifdef GMX_SIMD4_HAVE_LOGICAL
+// Bitwise AND on the documented bit patterns: F0 & CC = C0.
+TEST_F(Simd4FloatingpointTest, gmxSimd4AndR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits3, gmx_simd4_and_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 & Bits2 = Bits3
+}
+
+// Bitwise AND-NOT on the documented bit patterns: (~F0) & CC = 0C.
+TEST_F(Simd4FloatingpointTest, gmxSimd4AndnotR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits4, gmx_simd4_andnot_r(rSimd4_Bits1, rSimd4_Bits2)); // (~Bits1) & Bits2 = Bits4
+}
+
+// Bitwise OR on the documented bit patterns: F0 | CC = FC.
+TEST_F(Simd4FloatingpointTest, gmxSimd4OrR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits5, gmx_simd4_or_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 | Bits2 = Bits5
+}
+
+// Bitwise XOR on the documented bit patterns: F0 ^ CC = 3C.
+TEST_F(Simd4FloatingpointTest, gmxSimd4XorR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits6, gmx_simd4_xor_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 ^ Bits2 = Bits6
+}
+#endif
+
+// Element-wise maximum, checked with both argument orders and for positive and negative inputs.
+TEST_F(Simd4FloatingpointTest, gmxSimd4MaxR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, 2, 4), gmx_simd4_max_r(rSimd4_1_2_3, rSimd4_3_1_4));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, 2, 4), gmx_simd4_max_r(rSimd4_3_1_4, rSimd4_1_2_3));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-1, -1, -3), gmx_simd4_max_r(rSimd4_m1_m2_m3, rSimd4_m3_m1_m4));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-1, -1, -3), gmx_simd4_max_r(rSimd4_m3_m1_m4, rSimd4_m1_m2_m3));
+}
+
+// Element-wise minimum, checked with both argument orders and for positive and negative inputs.
+TEST_F(Simd4FloatingpointTest, gmxSimd4MinR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 1, 3), gmx_simd4_min_r(rSimd4_1_2_3, rSimd4_3_1_4));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 1, 3), gmx_simd4_min_r(rSimd4_3_1_4, rSimd4_1_2_3));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, -2, -4), gmx_simd4_min_r(rSimd4_m1_m2_m3, rSimd4_m3_m1_m4));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, -2, -4), gmx_simd4_min_r(rSimd4_m3_m1_m4, rSimd4_m1_m2_m3));
+}
+
+// Expected results are the nearest integers: 2.25 -> 2, 3.75 -> 4, -2.25 -> -2, -3.75 -> -4.
+// NOTE(review): the inputs are built with gmx_simd4_set1_r() here, whereas the
+// truncation test uses the rSimd4_2p25-style constants - confirm whether that is intentional.
+TEST_F(Simd4FloatingpointTest, gmxSimd4RoundR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(2), gmx_simd4_round_r(gmx_simd4_set1_r(2.25)));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(4), gmx_simd4_round_r(gmx_simd4_set1_r(3.75)));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-2), gmx_simd4_round_r(gmx_simd4_set1_r(-2.25)));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-4), gmx_simd4_round_r(gmx_simd4_set1_r(-3.75)));
+}
+
+// Expected results drop the fractional part (2, 3, -2, -3). The inputs are the
+// shared constants, presumably +-2.25 and +-3.75 going by their names.
+TEST_F(Simd4FloatingpointTest, gmxSimd4TruncR)
+{
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(2), gmx_simd4_trunc_r(rSimd4_2p25));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(3), gmx_simd4_trunc_r(rSimd4_3p75));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-2), gmx_simd4_trunc_r(rSimd4_m2p25));
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-3), gmx_simd4_trunc_r(rSimd4_m3p75));
+}
+
+/* We do extensive 1/sqrt(x) and 1/x accuracy testing in the tests for
+ * the SIMD math functions, so we just make sure the lookup instructions
+ * appear to work for a few values here.
+ */
+// Compares gmx_simd4_rsqrt_r() with scalar 1/sqrt(x) for three inputs, using a
+// ULP tolerance derived from GMX_SIMD_RSQRT_BITS rather than exact equality.
+TEST_F(Simd4FloatingpointTest, gmxSimd4RsqrtR)
+{
+    gmx_simd4_real_t x      = setSimd4RealFrom3R(4.0, M_PI, 1234567890.0);
+    gmx_simd4_real_t ref    = setSimd4RealFrom3R(0.5, 1.0/sqrt(M_PI), 1.0/sqrt(1234567890.0));
+
+    // The allowed Ulp deviation is 2 to the power of the number of mantissa
+    // digits, minus the number of bits provided by the table lookup
+    setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RSQRT_BITS));
+    GMX_EXPECT_SIMD4_REAL_NEAR(ref, gmx_simd4_rsqrt_r(x));
+}
+
+// 5_7_9 and 7_8_9 agree only in the third value (9), so only that element of
+// 1_2_3 is expected to survive the blend; the others become zero.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpEqAndBlendZeroR)
+{
+    gmx_simd4_bool_t eq   = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(0, 0, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, eq));
+}
+
+// Same mask as the cmpeq/blendzero test, but with the opposite selection: the
+// element where the mask is true (third value) is the one expected to be zero.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BlendNotZeroR)
+{
+    gmx_simd4_bool_t eq   = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 0), gmx_simd4_blendnotzero_r(rSimd4_1_2_3, eq));
+}
+
+// 5<=7, 7<=8 and 9<=9 all hold, so the blend is expected to keep every element of 1_2_3.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpLER)
+{
+    gmx_simd4_bool_t le   = gmx_simd4_cmple_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_blendzero_r(rSimd4_1_2_3, le));
+}
+
+// 5<7 and 7<8 hold but 9<9 does not, so the third value is expected to be zeroed.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpLTR)
+{
+    gmx_simd4_bool_t lt   = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 0), gmx_simd4_blendzero_r(rSimd4_1_2_3, lt));
+}
+
+// AND of the "equal" mask (third value only) and the "less-or-equal" mask (all
+// values) is expected to select the third value only.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolAndB)
+{
+    gmx_simd4_bool_t eq   = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    gmx_simd4_bool_t le   = gmx_simd4_cmple_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(0, 0, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, gmx_simd4_and_b(eq, le)));
+}
+
+// OR of the "equal" mask (third value only) and the "less-than" mask (first two
+// values) is expected to select every value.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolOrB)
+{
+    gmx_simd4_bool_t eq   = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    gmx_simd4_bool_t lt   = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, gmx_simd4_or_b(eq, lt)));
+}
+
+// gmx_simd4_anytrue_b() must be non-zero when at least one element of the mask
+// is true and zero when no element is true.
+TEST_F(Simd4FloatingpointTest, gmxSimd4AnytrueB)
+{
+    gmx_simd4_bool_t eq;
+
+    /* this test is a bit tricky since we don't know the simd width.
+     * We cannot check for truth values for "any" element beyond the first,
+     * since that part of the data will not be used if simd width is 1.
+     *
+     * NOTE(review): this reasoning looks inherited from the generic SIMD
+     * tests. simd4.h states that the SIMD4 width is 4, so a match in a later
+     * element could be checked here as well - confirm.
+     */
+    eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, setSimd4RealFrom3R(5, 0, 0));
+    EXPECT_NE(0, gmx_simd4_anytrue_b(eq));
+
+    // No value of 1_2_3 equals the corresponding value of 4_5_6.
+    eq = gmx_simd4_cmpeq_r(rSimd4_1_2_3, rSimd4_4_5_6);
+    EXPECT_EQ(0, gmx_simd4_anytrue_b(eq));
+}
+
+// The "less-than" mask is true for the first two values; the expected result
+// takes 4 and 5 from the second argument there and 3 from the first elsewhere.
+TEST_F(Simd4FloatingpointTest, gmxSimd4BlendvR)
+{
+    gmx_simd4_bool_t lt   = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+    GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(4, 5, 3), gmx_simd4_blendv_r(rSimd4_1_2_3, rSimd4_4_5_6, lt));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4ReduceR)
+{
+    // Unpack the register once and accumulate its elements in index order to
+    // obtain the reference for the horizontal sum.
+    const std::vector<real> lanes = simd4Real2Vector(rSimd4_1_2_3);
+    real                    sum   = 0.0;
+
+    for (int lane = 0; lane != GMX_SIMD4_WIDTH; ++lane)
+    {
+        sum = sum + lanes[lane];
+    }
+
+    EXPECT_EQ(sum, gmx_simd4_reduce_r(rSimd4_1_2_3));
+}
+
+
+// Dot product of the first three elements: 1*3 + 4*8 + 5*2 = 45. The gtest
+// floating-point comparison macro is chosen to match the build precision.
+TEST_F(Simd4FloatingpointTest, gmxSimd4Dotproduct3R)
+{
+    gmx_simd4_real_t v1 = setSimd4RealFrom3R(1, 4, 5);
+    gmx_simd4_real_t v2 = setSimd4RealFrom3R(3, 8, 2);
+#    ifdef GMX_DOUBLE
+    EXPECT_DOUBLE_EQ(45.0, gmx_simd4_dotproduct3_r(v1, v2));
+#    else
+    EXPECT_FLOAT_EQ(45.0, gmx_simd4_dotproduct3_r(v1, v2));
+#    endif
+}
+
+#endif      // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd4_math.cpp b/src/gromacs/simd/tests/simd4_math.cpp

new file mode 100644 (file)

index 0000000..5d08f16
--- /dev/null
+++ b/src/gromacs/simd/tests/simd4_math.cpp
@@ -0,0 +1,214 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <vector>
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+/*! \brief Test fixture for SIMD4 math functions.
+ *
+ * Extends Simd4Test with a routine that compares a SIMD4 function against a
+ * scalar reference function. Use it through GMX_EXPECT_SIMD4_FUNC_NEAR().
+ */
+class Simd4MathTest : public Simd4Test
+{
+    public:
+        //! Compare \p simd4Func against \p refFunc; the expression strings are used in failure messages.
+        ::testing::AssertionResult
+                             compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
+                                                      real refFunc(real x),     gmx_simd4_real_t simd4Func(gmx_simd4_real_t x));
+};
+
+/*! \brief Test approximate equality of SIMD4 vs reference version of a function.
+ *
+ * This macro takes vanilla C and SIMD flavors of a function and tests it with
+ * the number of points, range, and tolerances specified by the test fixture class.
+ */
+#define GMX_EXPECT_SIMD4_FUNC_NEAR(refFunc, tstFunc) \
+    EXPECT_PRED_FORMAT2(compareSimd4MathFunction, refFunc, tstFunc)
+
+
+/*! \brief Implementation routine to compare SIMD4 vs reference functions.
+ *
+ * \param refFuncExpr    Description of reference function expression
+ * \param simd4FuncExpr  Description of SIMD function expression
+ * \param refFunc        Reference math function pointer
+ * \param simd4Func      SIMD math function pointer
+ *
+ * The function will be tested with the range and tolerances specified in
+ * the SimdBaseTest class. You should not never call this function directly,
+ * but use the macro GMX_EXPECT_SIMD4_FUNC_NEAR(refFunc,tstFunc) instead.
+ */
+::testing::AssertionResult
+Simd4MathTest::compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
+                                        real refFunc(real x),     gmx_simd4_real_t simd4Func(gmx_simd4_real_t x))
+{
+    std::vector<real>            vx(GMX_SIMD4_WIDTH);
+    std::vector<real>            vref(GMX_SIMD4_WIDTH);
+    std::vector<real>            vtst(GMX_SIMD4_WIDTH);
+    real                         dx;
+    gmx_int64_t                  ulpDiff, maxUlpDiff;
+    real                         maxUlpDiffPos;
+    real                         refValMaxUlpDiff, simdValMaxUlpDiff;
+    bool                         eq, signOk;
+    int                          i, iter;
+    int                          niter   = s_nPoints/GMX_SIMD4_WIDTH;
+    int                          npoints = niter*GMX_SIMD4_WIDTH;
+#    ifdef GMX_DOUBLE
+    union {
+        double r; gmx_int64_t i;
+    } conv0, conv1;
+#    else
+    union {
+        float  r; gmx_int32_t i;
+    } conv0, conv1;
+#    endif
+
+    maxUlpDiff = 0;
+    dx         = (range_.second-range_.first)/npoints;
+
+    for (iter = 0; iter < niter; iter++)
+    {
+        for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+        {
+            vx[i]   = range_.first+dx*(iter*GMX_SIMD4_WIDTH+i);
+            vref[i] = refFunc(vx[i]);
+        }
+        vtst  = simd4Real2Vector(simd4Func(vector2Simd4Real(vx)));
+
+        for (i = 0, eq = true, signOk = true; i < GMX_SIMD4_WIDTH && eq == true; i++)
+        {
+            eq     = eq && ( fabs(vref[i]-vtst[i]) < absTol_ );
+            signOk = signOk && ( vref[i]*vtst[i] >= 0 );
+        }
+        if (eq == true)
+        {
+            // Go to next point if everything within absolute tolerance
+            continue;
+        }
+        else if (signOk == false)
+        {
+            return ::testing::AssertionFailure()
+                   << "Failing SIMD4 math function comparison due to sign differences." << std::endl
+                   << "Reference function: " << refFuncExpr << std::endl
+                   << "Simd function:      " << simd4FuncExpr << std::endl
+                   << "Test range is ( " << range_.first << " , " << range_.second << " ) " << std::endl
+                   << "First sign difference around x=" << std::setprecision(20) << ::testing::PrintToString(vx) << std::endl
+                   << "Ref values:   " << std::setprecision(20) << ::testing::PrintToString(vref) << std::endl
+                   << "SIMD4 values: " << std::setprecision(20) << ::testing::PrintToString(vtst) << std::endl;
+        }
+        /* We replicate the trivial ulp differences comparison here rather than
+         * calling the lower-level routine for comparing them, since this enables
+         * us to run through the entire test range and report the largest deviation
+         * without lots of extra glue routines.
+         */
+        for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+        {
+            conv0.r = vref[i];
+            conv1.r = vtst[i];
+            ulpDiff = llabs(conv0.i-conv1.i);
+            if (ulpDiff > maxUlpDiff)
+            {
+                maxUlpDiff        = ulpDiff;
+                maxUlpDiffPos     = vx[i];
+                refValMaxUlpDiff  = vref[i];
+                simdValMaxUlpDiff = vtst[i];
+            }
+        }
+    }
+
+    if (maxUlpDiff <= ulpTol_)
+    {
+        return ::testing::AssertionSuccess();
+    }
+    else
+    {
+        return ::testing::AssertionFailure()
+               << "Failing SIMD4 math function ulp comparison between " << refFuncExpr << " and " << simd4FuncExpr << std::endl
+               << "Requested ulp tolerance: " << ulpTol_ << std::endl
+               << "Requested abs tolerance: " << absTol_ << std::endl
+               << "Largest Ulp difference occurs for x=" << std::setprecision(20) << maxUlpDiffPos << std::endl
+               << "Ref  values:  " << std::setprecision(20) << refValMaxUlpDiff << std::endl
+               << "SIMD4 values: " << std::setprecision(20) << simdValMaxUlpDiff << std::endl
+               << "Ulp diff.:   " << std::setprecision(20) << maxUlpDiff << std::endl;
+    }
+}
+
+/*! \} */
+/*! \endcond */
+
+// Actual math function tests below
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \brief Scalar reference for 1/sqrt(x), used as the comparison function in the test below. */
+static real
+ref_invsqrt(real x)
+{
+    // The division is carried out in double (1.0 is a double literal) and
+    // converted to real on return.
+    const double root = sqrt(x);
+
+    return 1.0/root;
+}
+
+// Compares gmx_simd4_invsqrt_r() with the scalar reference over the range
+// 1e-10 to 1e10, using the tolerances currently set on the fixture.
+TEST_F(Simd4MathTest, gmxSimd4InvsqrtR)
+{
+    setRange(1e-10, 1e10);
+    GMX_EXPECT_SIMD4_FUNC_NEAR(ref_invsqrt, gmx_simd4_invsqrt_r);
+}
+
+}      // namespace
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd4_vector_operations.cpp b/src/gromacs/simd/tests/simd4_vector_operations.cpp

new file mode 100644 (file)

index 0000000..7859705
--- /dev/null
+++ b/src/gromacs/simd/tests/simd4_vector_operations.cpp
@@ -0,0 +1,79 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/vector_operations.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+/*! \brief Test fixture for SIMD4 vector operations (identical to the SIMD4 \ref Simd4Test) */
+typedef Simd4Test Simd4VectorOperationsTest;
+
+// Expected values are x*x + y*y + z*z per element: 1+9+16=26, 4+0+1=5, 9+25+64=98.
+TEST_F(Simd4VectorOperationsTest, gmxSimd4CalcRsqR)
+{
+    gmx_simd4_real_t simdX  = setSimd4RealFrom3R(1, 2, 3);
+    gmx_simd4_real_t simdY  = setSimd4RealFrom3R(3, 0, 5);
+    gmx_simd4_real_t simdZ  = setSimd4RealFrom3R(4, 1, 8);
+    gmx_simd4_real_t simdR2 = setSimd4RealFrom3R(26, 5, 98);
+
+    // Allow a deviation of 2 ulp instead of requiring exact equality
+    setUlpTol(2);
+    GMX_EXPECT_SIMD4_REAL_NEAR(simdR2, gmx_simd4_calc_rsq_r(simdX, simdY, simdZ));
+}
+
+#endif      // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd_floatingpoint.cpp b/src/gromacs/simd/tests/simd_floatingpoint.cpp

new file mode 100644 (file)

index 0000000..f0ee108
--- /dev/null
+++ b/src/gromacs/simd/tests/simd_floatingpoint.cpp
@@ -0,0 +1,344 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/math/utilities.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+/*! \brief Test fixture for floating-point tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdFloatingpointTest;
+
+TEST_F(SimdFloatingpointTest, gmxSimdSetZeroR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.0), gmx_simd_setzero_r()); // setzero must equal the reference built from 0.0
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSet1R)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(1.0), gmx_simd_set1_r(1.0)); // set1(1.0) must equal the reference built from 1.0
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdLoad1R)
+{
+    real r = 2.0;
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(r), gmx_simd_load1_r(&r)); // load1 is handed a pointer to the scalar r
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAddR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_5_7_9,
+                            gmx_simd_add_r(rSimd_1_2_3, rSimd_4_5_6)); // element-wise: 1+4=5, 2+5=7, 3+6=9
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSubR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_4_5_6,
+                            gmx_simd_sub_r(rSimd_5_7_9, rSimd_1_2_3)); // element-wise: 5-1=4, 7-2=5, 9-3=6
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdMulR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, 10, 18),
+                            gmx_simd_mul_r(rSimd_1_2_3, rSimd_4_5_6)); // element-wise: 1*4=4, 2*5=10, 3*6=18
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFmaddR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(11, 18, 27),
+                            gmx_simd_fmadd_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // a*b+c: 1*4+7=11, 2*5+8=18, 3*6+9=27
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFmsubR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, 2, 9),
+                            gmx_simd_fmsub_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // a*b-c: 1*4-7=-3, 2*5-8=2, 3*6-9=9
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnmaddR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, -2, -9),
+                            gmx_simd_fnmadd_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // -a*b+c: -1*4+7=3, -2*5+8=-2, -3*6+9=-9
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnmsubR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-11, -18, -27),
+                            gmx_simd_fnmsub_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // -a*b-c: -1*4-7=-11, -2*5-8=-18, -3*6-9=-27
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFabsR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_fabs_r(rSimd_1_2_3));    // fabs(x)=x for positive input
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_fabs_r(rSimd_m1_m2_m3)); // fabs(-x)=x for negative input
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnegR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_m1_m2_m3, gmx_simd_fneg_r(rSimd_1_2_3));    // fneg(x)=-x for positive input
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3,    gmx_simd_fneg_r(rSimd_m1_m2_m3)); // fneg(-x)=x for negative input
+}
+
+#ifdef GMX_SIMD_HAVE_LOGICAL
+TEST_F(SimdFloatingpointTest, gmxSimdAndR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits3, gmx_simd_and_r(rSimd_Bits1, rSimd_Bits2)); // bitwise: Bits1 & Bits2 = Bits3
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAndnotR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits4, gmx_simd_andnot_r(rSimd_Bits1, rSimd_Bits2)); // (~Bits1) & Bits2 = Bits4
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdOrR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits5, gmx_simd_or_r(rSimd_Bits1, rSimd_Bits2)); // Bits1 | Bits2 = Bits5
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdXorR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits6, gmx_simd_xor_r(rSimd_Bits1, rSimd_Bits2)); // Bits1 ^ Bits2 = Bits6
+}
+#endif
+
+TEST_F(SimdFloatingpointTest, gmxSimdMaxR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, 2, 4), gmx_simd_max_r(rSimd_1_2_3, rSimd_3_1_4)); // max of (1,2,3) and (3,1,4)
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, 2, 4), gmx_simd_max_r(rSimd_3_1_4, rSimd_1_2_3)); // same result with swapped arguments
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-1, -1, -3), gmx_simd_max_r(rSimd_m1_m2_m3, rSimd_m3_m1_m4)); // negative inputs
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-1, -1, -3), gmx_simd_max_r(rSimd_m3_m1_m4, rSimd_m1_m2_m3)); // negative inputs, swapped
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdMinR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 1, 3), gmx_simd_min_r(rSimd_1_2_3, rSimd_3_1_4)); // min of (1,2,3) and (3,1,4)
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 1, 3), gmx_simd_min_r(rSimd_3_1_4, rSimd_1_2_3)); // same result with swapped arguments
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, -2, -4), gmx_simd_min_r(rSimd_m1_m2_m3, rSimd_m3_m1_m4)); // negative inputs
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, -2, -4), gmx_simd_min_r(rSimd_m3_m1_m4, rSimd_m1_m2_m3)); // negative inputs, swapped
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdRoundR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2), gmx_simd_round_r(gmx_simd_set1_r(2.25))); // round(2.25)=2
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(4), gmx_simd_round_r(gmx_simd_set1_r(3.75))); // round(3.75)=4
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2), gmx_simd_round_r(gmx_simd_set1_r(-2.25))); // round(-2.25)=-2
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-4), gmx_simd_round_r(gmx_simd_set1_r(-3.75))); // round(-3.75)=-4
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdTruncR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2), gmx_simd_trunc_r(rSimd_2p25)); // trunc(2.25)=2
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(3), gmx_simd_trunc_r(rSimd_3p75)); // trunc(3.75)=3
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2), gmx_simd_trunc_r(rSimd_m2p25)); // trunc(-2.25)=-2
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-3), gmx_simd_trunc_r(rSimd_m3p75)); // trunc(-3.75)=-3, i.e. towards zero
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFractionR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.25), gmx_simd_fraction_r(rSimd_2p25));   // fract(2.25)=0.25
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.75), gmx_simd_fraction_r(rSimd_3p75));   // fract(3.75)=0.75
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-0.25), gmx_simd_fraction_r(rSimd_m2p25)); // fract(-2.25)=-0.25 (sign of the input is kept)
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-0.75), gmx_simd_fraction_r(rSimd_m3p75)); // fract(-3.75)=-0.75 (sign of the input is kept)
+}
+
+// We explicitly test the exponent/mantissa routines with double precision data,
+// since these usually rely on direct manipulation and shift of the SIMD registers,
+// where it is easy to make mistakes with single vs double precision.
+
+TEST_F(SimdFloatingpointTest, gmxSimdGetExponentR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(60.0, -41.0, 54.0), gmx_simd_get_exponent_r(rSimd_Exp));
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE) // extra check only compiled for double-precision builds
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(587.0, -462.0, 672.0), gmx_simd_get_exponent_r(rSimd_ExpDouble));
+#endif
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdGetMantissaR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.219097320577810839026256,
+                                              1.166738027848349235071623,
+                                              1.168904015004464724825084), gmx_simd_get_mantissa_r(rSimd_Exp));
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE) // extra check only compiled for double-precision builds
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.241261238952345623563251,
+                                              1.047294723759123852359232,
+                                              1.856066204750275957395734), gmx_simd_get_mantissa_r(rSimd_ExpDouble));
+#endif
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSetExponentR)
+{
+    gmx_simd_real_t x0 = setSimdRealFrom3R(0.5, 11.5, 99.5);    // values exactly halfway between two integers
+    gmx_simd_real_t x1 = setSimdRealFrom3R(-0.5, -11.5, -99.5); // the same halfway cases with negative sign
+
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(pow(2.0, 60.0), pow(2.0, -41.0), pow(2.0, 54.0)),
+                            gmx_simd_set_exponent_r(setSimdRealFrom3R(60.0, -41.0, 54.0))); // expect 2^x for integer-valued x
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(pow(2.0, 587.0), pow(2.0, -462.0), pow(2.0, 672.0)),
+                            gmx_simd_set_exponent_r(setSimdRealFrom3R(587.0, -462.0, 672.0)));
+#endif
+    /* Rounding mode in gmx_simd_set_exponent_r() must be consistent with gmx_simd_round_r() */
+    GMX_EXPECT_SIMD_REAL_EQ(gmx_simd_set_exponent_r(gmx_simd_round_r(x0)), gmx_simd_set_exponent_r(x0));
+    GMX_EXPECT_SIMD_REAL_EQ(gmx_simd_set_exponent_r(gmx_simd_round_r(x1)), gmx_simd_set_exponent_r(x1));
+}
+
+/*
+ * We do extensive 1/sqrt(x) and 1/x accuracy testing in the math module, so
+ * we just make sure the lookup instructions appear to work here
+ */
+
+TEST_F(SimdFloatingpointTest, gmxSimdRsqrtR)
+{
+    gmx_simd_real_t x      = setSimdRealFrom3R(4.0, M_PI, 1234567890.0);
+    gmx_simd_real_t ref    = setSimdRealFrom3R(0.5, 1.0/sqrt(M_PI), 1.0/sqrt(1234567890.0)); // 1/sqrt(4)=0.5; the rest from scalar sqrt()
+
+    /* Set the allowed ulp error as 2 to the power of the number of bits in
+     * the mantissa that do not have to be correct after the table lookup.
+     */
+    setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RSQRT_BITS));
+
+    GMX_EXPECT_SIMD_REAL_NEAR(ref, gmx_simd_rsqrt_r(x));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdRcpR)
+{
+    gmx_simd_real_t x      = setSimdRealFrom3R(4.0, M_PI, 1234567890.0);
+    gmx_simd_real_t ref    = setSimdRealFrom3R(0.25, 1.0/M_PI, 1.0/1234567890.0); // 1/4=0.25; the rest from scalar division
+
+    /* Set the allowed ulp error as 2 to the power of the number of bits in
+     * the mantissa that do not have to be correct after the table lookup.
+     */
+    setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RCP_BITS));
+
+    GMX_EXPECT_SIMD_REAL_NEAR(ref, gmx_simd_rcp_r(x));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpEqAndBlendZeroR)
+{
+    gmx_simd_bool_t eq   = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9); // (5,7,9)==(7,8,9) -> F,F,T
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(0, 0, 3), gmx_simd_blendzero_r(rSimd_1_2_3, eq)); // false elements become 0
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBlendNotZeroR)
+{
+    gmx_simd_bool_t eq   = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9); // (5,7,9)==(7,8,9) -> F,F,T
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 0), gmx_simd_blendnotzero_r(rSimd_1_2_3, eq)); // true elements become 0
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpLER)
+{
+    gmx_simd_bool_t le   = gmx_simd_cmple_r(rSimd_5_7_9, rSimd_7_8_9); // (5,7,9)<=(7,8,9) -> T,T,T
+    GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_blendzero_r(rSimd_1_2_3, le)); // all true, so nothing is zeroed
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpLTR)
+{
+    gmx_simd_bool_t lt   = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9); // (5,7,9)<(7,8,9) -> T,T,F
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 0), gmx_simd_blendzero_r(rSimd_1_2_3, lt)); // the equal element (9 vs 9) is zeroed
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolAndB)
+{
+    gmx_simd_bool_t eq   = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9); // F,F,T
+    gmx_simd_bool_t le   = gmx_simd_cmple_r(rSimd_5_7_9, rSimd_7_8_9); // T,T,T
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(0, 0, 3), gmx_simd_blendzero_r(rSimd_1_2_3, gmx_simd_and_b(eq, le))); // eq && le -> F,F,T
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolOrB)
+{
+    gmx_simd_bool_t eq   = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9); // F,F,T
+    gmx_simd_bool_t lt   = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9); // T,T,F
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 3), gmx_simd_blendzero_r(rSimd_1_2_3, gmx_simd_or_b(eq, lt))); // eq || lt -> T,T,T
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAnytrueB)
+{
+    gmx_simd_bool_t eq;
+
+    /* This test is a bit tricky since we don't know the SIMD width.
+     * We cannot check for truth values for "any" element beyond the first,
+     * since that part of the data will not be used if the SIMD width is 1.
+     */
+    eq = gmx_simd_cmpeq_r(rSimd_5_7_9, setSimdRealFrom3R(5, 0, 0)); // only the first element (5==5) matches
+    EXPECT_NE(0, gmx_simd_anytrue_b(eq));
+
+    eq = gmx_simd_cmpeq_r(rSimd_1_2_3, rSimd_4_5_6); // no element matches
+    EXPECT_EQ(0, gmx_simd_anytrue_b(eq));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBlendvR)
+{
+    gmx_simd_bool_t lt   = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9); // (5,7,9)<(7,8,9) -> T,T,F
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, 5, 3), gmx_simd_blendv_r(rSimd_1_2_3, rSimd_4_5_6, lt)); // true picks the 2nd argument, false the 1st
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdReduceR)
+{
+    // The horizontal sum of the SIMD variable depends on the width, so
+    // simply store it an extra time and calculate what the sum should be
+    std::vector<real> v   = simdReal2Vector(rSimd_4_5_6);
+    real              sum = 0.0;
+
+    for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++) // scalar reference: add up every element of the register
+    {
+        sum += v[i];
+    }
+
+    EXPECT_EQ(sum, gmx_simd_reduce_r(rSimd_4_5_6)); // exact compare; NOTE(review): relies on small-integer sums being exact
+}
+
+#endif      // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd_integer.cpp b/src/gromacs/simd/tests/simd_integer.cpp

new file mode 100644 (file)

index 0000000..804354c
--- /dev/null
+++ b/src/gromacs/simd/tests/simd_integer.cpp
@@ -0,0 +1,273 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd.h"
+
+/* Some notes on the setup of these tests:
+ *
+ * It might seem strange to mix different instructions for "setting" SIMD
+ * registers, but the difference is that the routines like setSimdIntFrom1I()
+ * only use the load/store operations that we already test separately in
+ * bootstrap_loadstore.cpp. Since these are "known good" if the bootstrap
+ * tests pass, we use them to test the normal SIMD implementation instructions
+ * that all have gmx_simd_ prefixes.
+ */
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_INT32
+
+/*! \brief Test fixture for integer tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdIntegerTest;
+
+TEST_F(SimdIntegerTest, gmxSimdSetZeroI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0), gmx_simd_setzero_i()); // setzero must equal the reference built from 0
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSet1I)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(1), gmx_simd_set1_i(1)); // set1(1) must equal the reference built from 1
+}
+
+#ifdef GMX_SIMD_HAVE_FINT32_ARITHMETICS
+TEST_F(SimdIntegerTest, gmxSimdAddI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(iSimd_5_7_9,   gmx_simd_add_i(iSimd_1_2_3, iSimd_4_5_6)    );    // short add: 1+4=5, 2+5=7, 3+6=9
+    GMX_EXPECT_SIMD_INT_EQ(iSimd_5M_7M_9M, gmx_simd_add_i(iSimd_1M_2M_3M, iSimd_4M_5M_6M)); // 32 bit add with the larger "M" constants
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSubI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(iSimd_1_2_3,   gmx_simd_sub_i(iSimd_5_7_9, iSimd_4_5_6)    );    // short sub: 5-4=1, 7-5=2, 9-6=3
+    GMX_EXPECT_SIMD_INT_EQ(iSimd_1M_2M_3M, gmx_simd_sub_i(iSimd_5M_7M_9M, iSimd_4M_5M_6M)); // 32 bit sub with the larger "M" constants
+}
+
+TEST_F(SimdIntegerTest, gmxSimdMulI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(4, 10, 18), gmx_simd_mul_i(iSimd_1_2_3, iSimd_4_5_6));                       // 1*4=4, 2*5=10, 3*6=18 (short mul)
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(268435456), gmx_simd_mul_i(gmx_simd_set1_i(16384), gmx_simd_set1_i(16384))); // 16384*16384 = 268435456 (long mul)
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_FINT32_LOGICAL
+TEST_F(SimdIntegerTest, gmxSimdSlliI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4194304), gmx_simd_slli_i(gmx_simd_set1_i(2), 21)); // 2 << 21 = 4194304 (2^22)
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSrliI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4), gmx_simd_srli_i(gmx_simd_set1_i(4194304), 20)); // 4194304 >> 20 = 4 (2^22 >> 20 = 2^2)
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAndI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0xC0C0C0C0), gmx_simd_and_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC)); // 0xF0F0F0F0 & 0xCCCCCCCC = 0xC0C0C0C0
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAndnotI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0x0C0C0C0C), gmx_simd_andnot_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC)); // (~0xF0F0F0F0) & 0xCCCCCCCC = 0x0C0C0C0C
+}
+
+TEST_F(SimdIntegerTest, gmxSimdOrI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0xFCFCFCFC), gmx_simd_or_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC)); // 0xF0F0F0F0 | 0xCCCCCCCC = 0xFCFCFCFC
+}
+
+TEST_F(SimdIntegerTest, gmxSimdXorI)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0x3C3C3C3C), gmx_simd_xor_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC)); // 0xF0F0F0F0 ^ 0xCCCCCCCC = 0x3C3C3C3C
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32_EXTRACT
+TEST_F(SimdIntegerTest, gmxSimdExtractI)
+{
+    int              idata[GMX_SIMD_INT32_WIDTH*2];
+    int *            p = gmx_simd_align_i(idata); // NOTE(review): buffer is twice the width, presumably to leave room for alignment
+    gmx_simd_int32_t simd;
+    int              i, extracted_int;
+
+    for (i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+    {
+        p[i] = i+1; // element k holds k+1, so extracting index k must return k+1
+    }
+    simd = gmx_simd_load_i(p);
+
+    /* We cannot do a loop here, since
+     * - C++ gets confused about signed/unsigned if SSE macros are used in EXPECT_EQ()
+     * - Extract macros can only take immediates (not variables) on some archs,
+     *   and some compilers are not smart enough to expand the for loop.
+     *
+     * To solve this we use a few values manually instead of a for-loop.
+     */
+    extracted_int = gmx_simd_extract_i(simd, 0);
+    EXPECT_EQ(1, extracted_int);
+    if (GMX_SIMD_INT32_WIDTH >= 2) // only probe indices that exist for the current width
+    {
+        extracted_int = gmx_simd_extract_i(simd, 1);
+        EXPECT_EQ(2, extracted_int);
+    }
+    if (GMX_SIMD_INT32_WIDTH >= 4)
+    {
+        extracted_int = gmx_simd_extract_i(simd, 3);
+        EXPECT_EQ(4, extracted_int);
+    }
+    if (GMX_SIMD_INT32_WIDTH >= 6)
+    {
+        extracted_int = gmx_simd_extract_i(simd, 5);
+        EXPECT_EQ(6, extracted_int);
+    }
+    if (GMX_SIMD_INT32_WIDTH >= 8)
+    {
+        extracted_int = gmx_simd_extract_i(simd, 7);
+        EXPECT_EQ(8, extracted_int);
+    }
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_REAL
+TEST_F(SimdIntegerTest, gmxSimdCvtR2I)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4), gmx_simd_cvt_r2i(rSimd_3p75)); // rounding conversion: 3.75 -> 4
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(-4), gmx_simd_cvt_r2i(rSimd_m3p75)); // rounding conversion: -3.75 -> -4
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvttR2I)
+{
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(3), gmx_simd_cvtt_r2i(rSimd_3p75)); // truncating conversion: 3.75 -> 3
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(-3), gmx_simd_cvtt_r2i(rSimd_m3p75)); // truncating conversion: -3.75 -> -3
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvtI2R)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2.0), gmx_simd_cvt_i2r(gmx_simd_set1_i(2))); // integer 2 -> real 2.0
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2.0), gmx_simd_cvt_i2r(gmx_simd_set1_i(-2))); // integer -2 -> real -2.0
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_FINT32_ARITHMETICS
+TEST_F(SimdIntegerTest, gmxSimdBoolCmpEqAndBlendZeroI)
+{
+    gmx_simd_ibool_t eq   = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9); // (5,7,9)==(7,8,9) -> F,F,T
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(0, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, eq)); // false elements become 0
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBlendNotZeroI)
+{
+    gmx_simd_ibool_t eq   = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9); // (5,7,9)==(7,8,9) -> F,F,T
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 2, 0), gmx_simd_blendnotzero_i(iSimd_1_2_3, eq)); // true elements become 0
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolCmpLTI)
+{
+    gmx_simd_ibool_t lt   = gmx_simd_cmplt_i(iSimd_5_7_9, iSimd_7_8_9); // (5,7,9)<(7,8,9) -> T,T,F
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 2, 0), gmx_simd_blendzero_i(iSimd_1_2_3, lt)); // the equal element (9 vs 9) is zeroed
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolAndIB)
+{
+    gmx_simd_ibool_t eq1  = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9); // F,F,T
+    gmx_simd_ibool_t eq2  = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_5_7_9); // T,T,T (compared with itself)
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(0, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, gmx_simd_and_ib(eq1, eq2))); // eq1 && eq2 -> F,F,T
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolOrIB)
+{
+    gmx_simd_ibool_t eq1  = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9); // F,F,T
+    gmx_simd_ibool_t eq2  = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0)); // T,F,F
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, gmx_simd_or_ib(eq1, eq2))); // eq1 || eq2 -> T,F,T
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAnytrueIB)
+{
+    gmx_simd_ibool_t eq;
+
+    /* See the comment in the gmxSimdAnytrueB test in simd_floatingpoint.cpp. We should
+     * only check the first element here, since the SIMD width could be 1 as a special case.
+     */
+    eq = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0)); // only the first element (5==5) matches
+    EXPECT_NE(0, gmx_simd_anytrue_ib(eq));
+
+    eq = gmx_simd_cmpeq_i(iSimd_1_2_3, iSimd_4_5_6); // no element matches
+    EXPECT_EQ(0, gmx_simd_anytrue_ib(eq));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBlendvI)
+{
+    gmx_simd_ibool_t lt   = gmx_simd_cmplt_i(iSimd_5_7_9, iSimd_7_8_9); // (5,7,9)<(7,8,9) -> T,T,F
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(4, 5, 3), gmx_simd_blendv_i(iSimd_1_2_3, iSimd_4_5_6, lt)); // true picks the 2nd argument, false the 1st
+}
+#endif
+
+#if (defined GMX_SIMD_HAVE_REAL) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS)
+TEST_F(SimdIntegerTest, gmxSimdCvtB2IB)
+{
+    gmx_simd_bool_t  eq   = gmx_simd_cmpeq_r(rSimd_5_7_9, setSimdRealFrom3R(5, 0, 0));  // eq should be T,F,F
+    gmx_simd_ibool_t eqi  = gmx_simd_cvt_b2ib(eq); // real-comparison mask converted to an integer mask
+    GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 0, 0), gmx_simd_blendzero_i(iSimd_1_2_3, eqi)); // converted mask must still be T,F,F
+
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvtIB2B)
+{
+    gmx_simd_ibool_t eqi  = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0));  // eqi should be T,F,F
+    gmx_simd_bool_t  eq   = gmx_simd_cvt_ib2b(eqi); // integer-comparison mask converted to a real mask
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.0, 0, 0), gmx_simd_blendzero_r(rSimd_1_2_3, eq)); // converted mask must still be T,F,F
+}
+#endif
+
+#endif      // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd_math.cpp b/src/gromacs/simd/tests/simd_math.cpp

new file mode 100644 (file)

index 0000000..9d4d63e
--- /dev/null
+++ b/src/gromacs/simd/tests/simd_math.cpp
@@ -0,0 +1,493 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <vector>
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+class SimdMathTest : public SimdTest // fixture for the SIMD math tests; adds a reference-vs-SIMD scan helper
+{
+    public:
+        ::testing::AssertionResult
+                            compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
+                                                    real refFunc(real x),     gmx_simd_real_t simdFunc(gmx_simd_real_t x)); // used through GMX_EXPECT_SIMD_FUNC_NEAR
+};
+
+/*! \brief Test approximate equality of SIMD vs reference version of a function.
+ *
+ * Takes vanilla C and SIMD flavors of a function and forwards them to the fixture member
+ * compareSimdMathFunction(), so it can only be used inside SimdMathTest test bodies.
+ */
+#define GMX_EXPECT_SIMD_FUNC_NEAR(refFunc, tstFunc) \
+    EXPECT_PRED_FORMAT2(compareSimdMathFunction, refFunc, tstFunc)
+
+/*! \brief Implementation routine to compare SIMD vs reference functions.
+ *
+ * \param refFuncExpr   Description of reference function expression
+ * \param simdFuncExpr  Description of SIMD function expression
+ * \param refFunc       Reference math function pointer
+ * \param simdFunc      SIMD math function pointer
+ *
+ * The function will be tested with the range and tolerances specified in
+ * the SimdBaseTest class. You should not never call this function directly,
+ * but use the macro GMX_EXPECT_SIMD_FUNC_NEAR(refFunc,tstFunc) instead.
+ */
+::testing::AssertionResult
+SimdMathTest::compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
+                                      real refFunc(real x),     gmx_simd_real_t simdFunc(gmx_simd_real_t x))
+{
+    std::vector<real>            vx(GMX_SIMD_REAL_WIDTH);
+    std::vector<real>            vref(GMX_SIMD_REAL_WIDTH);
+    std::vector<real>            vtst(GMX_SIMD_REAL_WIDTH);
+    real                         dx, absDiff;
+    gmx_int64_t                  ulpDiff, maxUlpDiff;
+    real                         maxUlpDiffPos;
+    real                         refValMaxUlpDiff, simdValMaxUlpDiff;
+    bool                         absOk, signOk;
+    int                          i, iter;
+    int                          niter   = s_nPoints/GMX_SIMD_REAL_WIDTH;
+    int                          npoints = niter*GMX_SIMD_REAL_WIDTH;
+#    ifdef GMX_DOUBLE
+    union {
+        double r; gmx_int64_t i;
+    } conv0, conv1;
+#    else
+    union {
+        float  r; gmx_int32_t i;
+    } conv0, conv1;
+#    endif
+
+    maxUlpDiff = 0;
+    dx         = (range_.second-range_.first)/npoints;
+
+    for (iter = 0; iter < niter; iter++)
+    {
+        for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+        {
+            vx[i]   = range_.first+dx*(iter*GMX_SIMD_REAL_WIDTH+i);
+            vref[i] = refFunc(vx[i]);
+        }
+        vtst  = simdReal2Vector(simdFunc(vector2SimdReal(vx)));
+
+        for (i = 0, signOk = true, absOk = true; i < GMX_SIMD_REAL_WIDTH; i++)
+        {
+            absDiff = fabs(vref[i]-vtst[i]);
+            absOk   = absOk  && ( absDiff < absTol_ );
+            signOk  = signOk && ( vref[i]*vtst[i] >= 0 );
+
+            if (absDiff >= absTol_)
+            {
+                /* We replicate the trivial ulp differences comparison here rather than
+                 * calling the lower-level routine for comparing them, since this enables
+                 * us to run through the entire test range and report the largest deviation
+                 * without lots of extra glue routines.
+                 */
+                conv0.r           = vref[i];
+                conv1.r           = vtst[i];
+                ulpDiff           = llabs(conv0.i-conv1.i);
+                if (ulpDiff > maxUlpDiff)
+                {
+                    maxUlpDiff        = ulpDiff;
+                    maxUlpDiffPos     = vx[i];
+                    refValMaxUlpDiff  = vref[i];
+                    simdValMaxUlpDiff = vtst[i];
+                }
+            }
+        }
+        if ( (absOk == false) && (signOk == false) )
+        {
+            return ::testing::AssertionFailure()
+                   << "Failing SIMD math function comparison due to sign differences." << std::endl
+                   << "Reference function: " << refFuncExpr << std::endl
+                   << "Simd function:      " << simdFuncExpr << std::endl
+                   << "Test range is ( " << range_.first << " , " << range_.second << " ) " << std::endl
+                   << "First sign difference around x=" << std::setprecision(20) << ::testing::PrintToString(vx) << std::endl
+                   << "Ref values:  " << std::setprecision(20) << ::testing::PrintToString(vref) << std::endl
+                   << "SIMD values: " << std::setprecision(20) << ::testing::PrintToString(vtst) << std::endl;
+        }
+    }
+
+    if (maxUlpDiff <= ulpTol_)
+    {
+        return ::testing::AssertionSuccess();
+    }
+    else
+    {
+        return ::testing::AssertionFailure()
+               << "Failing SIMD math function ulp comparison between " << refFuncExpr << " and " << simdFuncExpr << std::endl
+               << "Requested ulp tolerance: " << ulpTol_ << std::endl
+               << "Requested abs tolerance: " << absTol_ << std::endl
+               << "Largest Ulp difference occurs for x=" << std::setprecision(20) << maxUlpDiffPos << std::endl
+               << "Ref  values: " << std::setprecision(20) << refValMaxUlpDiff << std::endl
+               << "SIMD values: " << std::setprecision(20) << simdValMaxUlpDiff << std::endl
+               << "Ulp diff.:   " << std::setprecision(20) << maxUlpDiff << std::endl;
+    }
+}
+
+/*! \} */
+/*! \endcond */
+
+
+// Actual math function tests below
+
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+TEST_F(SimdMathTest, gmxSimdXorSignR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-4, 5, 6), gmx_simd_xor_sign_r(setSimdRealFrom3R(4, 5, 6), setSimdRealFrom3R(-5, 2, 0))); // sign flips only where the 2nd argument is negative
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, -5, -6), gmx_simd_xor_sign_r(setSimdRealFrom3R(-4, -5, -6), setSimdRealFrom3R(-5, 2, 0))); // same sign pattern applied to negative input
+}
+
+/*! \brief Function wrapper to evaluate reference 1/sqrt(x), with argument/return in default Gromacs precision */
+static real
+ref_invsqrt(real x)
+{
+    return 1.0/sqrt(x); // 1.0 is a double literal, so the division is done in double before conversion to real
+}
+
+TEST_F(SimdMathTest, gmxSimdInvsqrtR)
+{
+    setRange(1e-10, 1e10);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, gmx_simd_invsqrt_r);
+}
+
+/*! \brief Wrapper exposing only the first output of \ref gmx_simd_invsqrt_pair_r; x is passed for both inputs */
+gmx_simd_real_t
+tst_invsqrt_pair0(gmx_simd_real_t x)
+{
+    gmx_simd_real_t first, second;
+    gmx_simd_invsqrt_pair_r(x, x, &first, &second);
+    return first;
+}
+
+/*! \brief Wrapper exposing only the second output of \ref gmx_simd_invsqrt_pair_r; x is passed for both inputs */
+gmx_simd_real_t
+tst_invsqrt_pair1(gmx_simd_real_t x)
+{
+    gmx_simd_real_t first, second;
+    gmx_simd_invsqrt_pair_r(x, x, &first, &second);
+    return second;
+}
+
+TEST_F(SimdMathTest, gmxSimdInvsqrtPairR)
+{
+    setRange(1e-10, 1e10); // same range as the gmxSimdInvsqrtR test
+    // The accuracy conversions lose a bit of extra accuracy compared to
+    // doing the iterations in all-double.
+    setUlpTol(4*ulpTol_); // four times whatever ulp tolerance is currently set
+
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, tst_invsqrt_pair0); // first output of the pair routine
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, tst_invsqrt_pair1); // second output of the pair routine
+}
+
+TEST_F(SimdMathTest, gmxSimdSqrtR)
+{
+    // Three fixed values only: just make sure sqrt(0)=0 works and isn't evaluated as 0*1/sqrt(0)=NaN
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom3R(0, 2, 3), gmx_simd_sqrt_r(setSimdRealFrom3R(0, 4, 9))); // expected roots of 0, 4 and 9
+}
+
+/*! \brief Function wrapper to evaluate reference 1/x, with argument/return in type real */
+real ref_inv(real x)
+{
+    return 1.0/x;
+}
+
+TEST_F(SimdMathTest, gmxSimdInvR)
+{
+    // Negative arguments first; neither range below includes zero
+    setRange(-1e10, -1e-10);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_inv, gmx_simd_inv_r);
+    setRange(1e-10, 1e10); // then the mirrored positive range
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_inv, gmx_simd_inv_r);
+}
+
+/*! \brief Function wrapper for log(x), with argument/return in default Gromacs precision */
+real ref_log(real x)
+{
+    return log(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdLogR)
+{
+    setRange(1e-30, 1e30); // positive arguments only
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_log, gmx_simd_log_r); // scalar reference first, SIMD routine second
+}
+
+// MSVC does not support exp2(), so we have no reference to test against and the test is compiled out there
+#ifndef _MSC_VER
+/*! \brief Function wrapper for exp2(x), with argument/return in default Gromacs precision */
+real ref_exp2(real x)
+{
+    return exp2(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdExp2R)
+{
+    setRange(-100, 100); // exponents on both sides of zero
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_exp2, gmx_simd_exp2_r);
+}
+#endif // !_MSC_VER
+
+/*! \brief Function wrapper for exp(x), with argument/return in default Gromacs precision */
+real ref_exp(real x)
+{
+    return exp(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdExpR)
+{
+    setRange(-75, 75); // arguments on both sides of zero
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_exp, gmx_simd_exp_r); // scalar reference first, SIMD routine second
+}
+
+/*! \brief Function wrapper for erf(x), with argument/return in default Gromacs precision.
+ *
+ * \note The single-precision gmx_erff() in gmxlib is slightly lower precision
+ * than the SIMD flavor, so we use double for reference.
+ */
+real ref_erf(real x)
+{
+    return gmx_erfd(x); // double routine (see note above); the result is converted to real on return
+}
+
+TEST_F(SimdMathTest, gmxSimdErfR)
+{
+    setRange(-9, 9);
+    setAbsTol(GMX_REAL_MIN); // absolute tolerance, set besides the ulp tolerance
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_erf, gmx_simd_erf_r);
+}
+
+/*! \brief Function wrapper for erfc(x), with argument/return in default Gromacs precision.
+ *
+ * \note The single-precision gmx_erfcf() in gmxlib is slightly lower precision
+ * than the SIMD flavor, so we use double for reference.
+ */
+real ref_erfc(real x)
+{
+    return gmx_erfcd(x); // double routine (see note above); the result is converted to real on return
+}
+
+TEST_F(SimdMathTest, gmxSimdErfcR)
+{
+    setRange(-9, 9);
+    setAbsTol(GMX_REAL_MIN); // absolute tolerance, set besides the ulp tolerance
+    // Our erfc algorithm has 4 ulp accuracy, so relax the current ulp tolerance by a factor of four
+    setUlpTol(4*ulpTol_);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_erfc, gmx_simd_erfc_r);
+}
+
+/*! \brief Function wrapper for sin(x), with argument/return in default Gromacs precision */
+real ref_sin(real x)
+{
+    return sin(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdSinR)
+{
+    setRange(-8*M_PI, 8*M_PI); // first pass: moderate arguments, current ulp tolerance
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_sin, gmx_simd_sin_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_); // second pass: wide range with the ulp tolerance doubled
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_sin, gmx_simd_sin_r);
+}
+
+/*! \brief Function wrapper for cos(x), with argument/return in default Gromacs precision */
+real ref_cos(real x)
+{
+    return cos(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdCosR)
+{
+    setRange(-8*M_PI, 8*M_PI); // first pass: moderate arguments, current ulp tolerance
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_cos, gmx_simd_cos_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_); // second pass: wide range with the ulp tolerance doubled
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_cos, gmx_simd_cos_r);
+}
+
+/*! \brief Function wrapper for tan(x), with argument/return in default Gromacs precision */
+real ref_tan(real x)
+{
+    return tan(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdTanR)
+{
+    // Tan(x) is a little sensitive due to the division in the algorithm.
+    // Rather than using lots of extra FP operations, we accept the algorithm
+    // presently only achieves a ~3 ulp error and use the medium tolerance.
+    setRange(-8*M_PI, 8*M_PI); // NOTE(review): no setUlpTol() precedes this check - confirm the fixture default is the "medium" tolerance
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_tan, gmx_simd_tan_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_); // second pass: wide range with the ulp tolerance doubled
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_tan, gmx_simd_tan_r);
+}
+
+/*! \brief Function wrapper for asin(x), with argument/return in default Gromacs precision */
+real ref_asin(real x)
+{
+    return asin(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAsinR)
+{
+    // Our present asin(x) algorithm achieves 2-3 ulp accuracy
+    setRange(-1, 1); // no tolerance is set here, so the fixture's current ulp tolerance applies
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_asin, gmx_simd_asin_r);
+}
+
+/*! \brief Function wrapper for acos(x), with argument/return in default Gromacs precision */
+real ref_acos(real x)
+{
+    return acos(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAcosR)
+{
+    // Our present acos(x) algorithm achieves 2-3 ulp accuracy
+    setRange(-1, 1); // no tolerance is set here, so the fixture's current ulp tolerance applies
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_acos, gmx_simd_acos_r);
+}
+
+/*! \brief Function wrapper for atan(x), with argument/return in default Gromacs precision */
+real ref_atan(real x)
+{
+    return atan(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAtanR)
+{
+    // Our present atan(x) algorithm achieves 1 ulp accuracy
+    setRange(-10000, 10000); // arguments on both sides of zero
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_atan, gmx_simd_atan_r);
+}
+
+TEST_F(SimdMathTest, gmxSimdAtan2R)
+{
+    // test each quadrant; y and x are the same vector up to sign, so (assuming rSimd_1_2_3/rSimd_m1_m2_m3 hold +-(1,2,3)) one scalar reference fits all elements
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, 1.0)), gmx_simd_atan2_r(rSimd_1_2_3, rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, 1.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, -1.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, rSimd_m1_m2_m3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, -1.0)), gmx_simd_atan2_r(rSimd_1_2_3, rSimd_m1_m2_m3));
+    // cases important for calculating angles
+    // values on coordinate axes (one of the two arguments is all zeros)
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, 1.0)), gmx_simd_atan2_r(gmx_simd_setzero_r(), rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, 0.0)), gmx_simd_atan2_r(rSimd_1_2_3, gmx_simd_setzero_r()));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, -1.0)), gmx_simd_atan2_r(gmx_simd_setzero_r(), rSimd_m1_m2_m3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, 0.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, gmx_simd_setzero_r()));
+    // degenerate value (origin) should return 0.0; the reference is whatever the scalar atan2(0.0, 0.0) returns
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, 0.0)), gmx_simd_atan2_r(setSimdRealFrom3R(0.0, 0.0, 0.0), gmx_simd_setzero_r()));
+}
+
+/*! \brief Evaluate reference version of PME force correction, 2*exp(-x)/(sqrt(pi)*x) - erf(sqrt(x))/(x*sqrt(x)). */
+real ref_pmecorrF(real x)
+{
+    real y = sqrt(x); // sqrt(x) is stored in type real before it is used below
+    return 2*exp(-x)/(sqrt(M_PI)*x) - gmx_erfd(y)/(x*y);
+}
+
+// The PME corrections will be added to ~1/r2, so absolute tolerance of EPS is fine.
+TEST_F(SimdMathTest, gmxSimdPmecorrForceR)
+{
+    // Pme correction only needs to be ~1e-6 accuracy single, 1e-10 double
+#ifdef GMX_DOUBLE
+    setUlpTol((gmx_int64_t)(1e-10/GMX_REAL_EPS)); // target accuracy divided by GMX_REAL_EPS, truncated to an integer ulp count
+#else
+    setUlpTol((gmx_int64_t)(1e-6/GMX_REAL_EPS)); // target accuracy divided by GMX_REAL_EPS, truncated to an integer ulp count
+#endif
+
+    setRange(0.15, 4); // positive arguments only, so the divisions by x and sqrt(x) in the reference stay finite
+    setAbsTol(GMX_REAL_EPS);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_pmecorrF, gmx_simd_pmecorrF_r);
+}
+
+/*! \brief Evaluate reference version of PME potential correction, erf(sqrt(x))/sqrt(x). */
+real ref_pmecorrV(real x)
+{
+    real y = sqrt(x); // sqrt(x) is stored in type real before it is used below
+    return gmx_erfd(y)/y;
+}
+
+// The PME corrections will be added to ~1/r, so absolute tolerance of EPS is fine.
+TEST_F(SimdMathTest, gmxSimdPmecorrPotentialR)
+{
+    // Pme correction only needs to be ~1e-6 accuracy single, 1e-10 double
+#ifdef GMX_DOUBLE
+    setUlpTol((gmx_int64_t)(1e-10/GMX_REAL_EPS)); // target accuracy divided by GMX_REAL_EPS, truncated to an integer ulp count
+#else
+    setUlpTol((gmx_int64_t)(1e-6/GMX_REAL_EPS)); // target accuracy divided by GMX_REAL_EPS, truncated to an integer ulp count
+#endif
+    setRange(0.15, 4); // positive arguments only, so the division by sqrt(x) in the reference stays finite
+    setAbsTol(GMX_REAL_EPS);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_pmecorrV, gmx_simd_pmecorrV_r);
+}
+
+}      // namespace
+
+#endif // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/tests/simd_vector_operations.cpp b/src/gromacs/simd/tests/simd_vector_operations.cpp

new file mode 100644 (file)

index 0000000..60652c7
--- /dev/null
+++ b/src/gromacs/simd/tests/simd_vector_operations.cpp
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/vector_operations.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+/*! \internal \brief Test fixture for vector operations tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdVectorOperationsTest;
+
+TEST_F(SimdVectorOperationsTest, gmxSimdCalcRsqR)
+{
+    gmx_simd_real_t simdX  = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t simdY  = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t simdZ  = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t simdR2 = setSimdRealFrom3R(26, 5, 98); // x*x+y*y+z*z per element: 1+9+16, 4+0+1, 9+25+64
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(simdR2, gmx_simd_calc_rsq_r(simdX, simdY, simdZ));
+}
+
+TEST_F(SimdVectorOperationsTest, gmxSimdIprodR)
+{
+    gmx_simd_real_t aX    = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t aY    = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t aZ    = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t bX    = setSimdRealFrom3R(8, 3, 6);
+    gmx_simd_real_t bY    = setSimdRealFrom3R(2, 3, 1);
+    gmx_simd_real_t bZ    = setSimdRealFrom3R(5, 7, 9);
+    gmx_simd_real_t iprod = setSimdRealFrom3R(34, 13, 95); // aX*bX+aY*bY+aZ*bZ per element: 8+6+20, 6+0+7, 18+5+72
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(iprod, gmx_simd_iprod_r(aX, aY, aZ, bX, bY, bZ));
+}
+
+TEST_F(SimdVectorOperationsTest, gmxSimdNorm2R)
+{
+    gmx_simd_real_t simdX     = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t simdY     = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t simdZ     = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t simdNorm2 = setSimdRealFrom3R(26, 5, 98); // x*x+y*y+z*z per element: 1+9+16, 4+0+1, 9+25+64
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(simdNorm2, gmx_simd_norm2_r(simdX, simdY, simdZ));
+}
+
+TEST_F(SimdVectorOperationsTest, gmxSimdCprodR)
+{
+    gmx_simd_real_t aX    = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t aY    = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t aZ    = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t bX    = setSimdRealFrom3R(8, 3, 6);
+    gmx_simd_real_t bY    = setSimdRealFrom3R(2, 3, 1);
+    gmx_simd_real_t bZ    = setSimdRealFrom3R(5, 7, 9);
+    gmx_simd_real_t refcX = setSimdRealFrom3R(7, -3, 37);    // aY*bZ-aZ*bY per element: 15-8, 0-3, 45-8
+    gmx_simd_real_t refcY = setSimdRealFrom3R(27, -11, 21);  // aZ*bX-aX*bZ per element: 32-5, 3-14, 48-27
+    gmx_simd_real_t refcZ = setSimdRealFrom3R(-22, 6, -27);  // aX*bY-aY*bX per element: 2-24, 6-0, 3-30
+    gmx_simd_real_t cX, cY, cZ;
+    gmx_simd_cprod_r(aX, aY, aZ, bX, bY, bZ, &cX, &cY, &cZ);
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcX, cX);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcY, cY);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcZ, cZ);
+}
+
+#endif      // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+}      // namespace
+}      // namespace
+}      // namespace
diff --git a/src/gromacs/simd/vector_operations.h b/src/gromacs/simd/vector_operations.h

index 1fb9a142e1f83ea3ae895d0279377ce8ba54f789..f13957286fc95fc3a5e0420eda2a94ce8a647430 100644 (file)
--- a/src/gromacs/simd/vector_operations.h
+++ b/src/gromacs/simd/vector_operations.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -33,80 +33,300 @@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code.
- * To support a new architecture, adding macros here should be (nearly)
- * all that is needed.
- */
-
-/* This file contains vector operation functions using SIMD intrinsics.
- * gromacs/simd/macros.h should be included before including this file.
+/*! \libinternal \file
+ *
+ * \brief SIMD operations corresponding to Gromacs rvec datatypes.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
   */
  
  #ifndef GMX_SIMD_VECTOR_OPERATIONS_H
  #define GMX_SIMD_VECTOR_OPERATIONS_H
  
-#ifndef GMX_SIMD_MACROS_H
-#error "gromacs/simd/macros.h was not included before including gromacs/simd/vector_operations.h"
-#endif
+#include "gromacs/simd/simd.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#if (defined GMX_SIMD_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief Inner (scalar) product of multiple pairs of float vectors, one pair per SIMD element.
+ *
+ * Normal code should call the real-precision \ref gmx_simd_iprod_r instead.
+ *
+ * \param ax X components of the first vectors
+ * \param ay Y components of the first vectors
+ * \param az Z components of the first vectors
+ * \param bx X components of the second vectors
+ * \param by Y components of the second vectors
+ * \param bz Z components of the second vectors
+ *
+ * \return SIMD variable where element i is ax[i]*bx[i]+ay[i]*by[i]+az[i]*bz[i].
+ *
+ * \note The SIMD aspect is that many scalar products are evaluated in one call.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
+                 gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz)
+{
+    /* Start from the x product, then add the y and z products to it */
+    gmx_simd_float_t dot = gmx_simd_mul_f(ax, bx);
+
+    dot = gmx_simd_fmadd_f(ay, by, dot);
+    dot = gmx_simd_fmadd_f(az, bz, dot);
+
+    return dot;
+}
+
+/*! \brief Squared norm of multiple float vectors, one vector per SIMD element.
+ *
+ * Normal code should call the real-precision \ref gmx_simd_norm2_r instead.
+ *
+ * \param ax X components of the vectors
+ * \param ay Y components of the vectors
+ * \param az Z components of the vectors
+ *
+ * \return SIMD variable where element i is ax[i]*ax[i]+ay[i]*ay[i]+az[i]*az[i].
+ *
+ * \note This equals the scalar product of each vector with itself, but the
+ * compiler might be able to optimize it better when the vectors are identical.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
+{
+    /* Start from the x square, then add the y and z squares to it */
+    gmx_simd_float_t sumSq = gmx_simd_mul_f(ax, ax);
+
+    sumSq = gmx_simd_fmadd_f(ay, ay, sumSq);
+    sumSq = gmx_simd_fmadd_f(az, az, sumSq);
+
+    return sumSq;
+}
  
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd_norm2_f.
+ */
+#define gmx_simd_calc_rsq_f gmx_simd_norm2_f
  
-/* x^2 + y^2 + z^2 */
-static gmx_inline gmx_simd_real_t
-gmx_simd_calc_rsq_r(gmx_simd_real_t x, gmx_simd_real_t y, gmx_simd_real_t z)
+/*! \brief SIMD float cross-product of multiple vectors.
+ *
+ * For normal usage you should always call the real-precision \ref gmx_simd_cprod_r.
+ *
+ * \param ax X components of first vectors
+ * \param ay Y components of first vectors
+ * \param az Z components of first vectors
+ * \param bx X components of second vectors
+ * \param by Y components of second vectors
+ * \param bz Z components of second vectors
+ * \param[out] cx X components of cross product vectors
+ * \param[out] cy Y components of cross product vectors
+ * \param[out] cz Z components of cross product vectors
+ *
+ * \returns void
+ *
+ * This calculates C = A x B, where the cross denotes the cross product.
+ * The arguments x/y/z denote the different components, and each element
+ * corresponds to a separate vector.
+ */
+static gmx_inline void
+gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
+                 gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz,
+                 gmx_simd_float_t *cx, gmx_simd_float_t *cy, gmx_simd_float_t *cz)
  {
-    return gmx_simd_fmadd_r(z, z, gmx_simd_fmadd_r(y, y, gmx_simd_mul_r(x, x)));
+    *cx = gmx_simd_mul_f(ay, bz);
+    *cx = gmx_simd_fnmadd_f(az, by, *cx);
+
+    *cy = gmx_simd_mul_f(az, bx);
+    *cy = gmx_simd_fnmadd_f(ax, bz, *cy);
+
+    *cz = gmx_simd_mul_f(ax, by);
+    *cz = gmx_simd_fnmadd_f(ay, bx, *cz);
  }
+#endif /* GMX_SIMD_HAVE_FLOAT */
  
-/* inner-product of multiple vectors */
-static gmx_inline gmx_simd_real_t
-gmx_simd_iprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
-                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz)
+#if (defined GMX_SIMD_HAVE_DOUBLE) || (defined DOXYGEN)
+/*! \brief SIMD double inner product of multiple double vectors.
+ *
+ * \copydetails gmx_simd_iprod_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
+                 gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz)
  {
-    gmx_simd_real_t ret;
+    gmx_simd_double_t ret;
  
-    ret = gmx_simd_mul_r(ax, bx);
-    ret = gmx_simd_fmadd_r(ay, by, ret);
-    ret = gmx_simd_fmadd_r(az, bz, ret);
+    ret = gmx_simd_mul_d(ax, bx);
+    ret = gmx_simd_fmadd_d(ay, by, ret);
+    ret = gmx_simd_fmadd_d(az, bz, ret);
  
      return ret;
  }
  
-/* norm squared of multiple vectors */
-static gmx_inline gmx_simd_real_t
-gmx_simd_norm2_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az)
+/*! \brief SIMD double norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az)
  {
-    gmx_simd_real_t ret;
+    gmx_simd_double_t ret;
  
-    ret = gmx_simd_mul_r(ax, ax);
-    ret = gmx_simd_fmadd_r(ay, ay, ret);
-    ret = gmx_simd_fmadd_r(az, az, ret);
+    ret = gmx_simd_mul_d(ax, ax);
+    ret = gmx_simd_fmadd_d(ay, ay, ret);
+    ret = gmx_simd_fmadd_d(az, az, ret);
  
      return ret;
  }
  
-/* cross-product of multiple vectors */
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd_norm2_d.
+ */
+#define gmx_simd_calc_rsq_d gmx_simd_norm2_d
+
+/*! \brief SIMD double cross-product of multiple vectors.
+ *
+ * \copydetails gmx_simd_cprod_f
+ */
  static gmx_inline void
-gmx_simd_cprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
-                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz,
-                 gmx_simd_real_t *cx, gmx_simd_real_t *cy, gmx_simd_real_t *cz)
+gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
+                 gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz,
+                 gmx_simd_double_t *cx, gmx_simd_double_t *cy, gmx_simd_double_t *cz)
+{
+    *cx = gmx_simd_mul_d(ay, bz);
+    *cx = gmx_simd_fnmadd_d(az, by, *cx);
+
+    *cy = gmx_simd_mul_d(az, bx);
+    *cy = gmx_simd_fnmadd_d(ax, bz, *cy);
+
+    *cz = gmx_simd_mul_d(ax, by);
+    *cz = gmx_simd_fnmadd_d(ay, bx, *cz);
+}
+#endif /* GMX_SIMD_HAVE_DOUBLE */
+
+
+#if (defined GMX_SIMD4_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief SIMD4 float norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t az)
  {
-    *cx = gmx_simd_mul_r(ay, bz);
-    *cx = gmx_simd_fnmadd_r(az, by, *cx);
+    gmx_simd4_float_t ret;
  
-    *cy = gmx_simd_mul_r(az, bx);
-    *cy = gmx_simd_fnmadd_r(ax, bz, *cy);
+    ret = gmx_simd4_mul_f(ax, ax);
+    ret = gmx_simd4_fmadd_f(ay, ay, ret);
+    ret = gmx_simd4_fmadd_f(az, az, ret);
  
-    *cz = gmx_simd_mul_r(ax, by);
-    *cz = gmx_simd_fnmadd_r(ay, bx, *cz);
+    return ret;
  }
  
-/* a + b + c + d (not really a vector operation, but where else put this?) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_sum4_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c, gmx_simd_real_t d)
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd4_norm2_f
+ */
+#define gmx_simd4_calc_rsq_f gmx_simd4_norm2_f
+
+#endif /* GMX_SIMD4_HAVE_FLOAT */
+
+#if (defined GMX_SIMD4_HAVE_DOUBLE)  || (defined DOXYGEN)
+/*! \brief SIMD4 double norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_norm2_d(gmx_simd4_double_t ax, gmx_simd4_double_t ay, gmx_simd4_double_t az)
  {
-    return gmx_simd_add_r(gmx_simd_add_r(a, b), gmx_simd_add_r(c, d));
+    gmx_simd4_double_t ret;
+
+    ret = gmx_simd4_mul_d(ax, ax);
+    ret = gmx_simd4_fmadd_d(ay, ay, ret);
+    ret = gmx_simd4_fmadd_d(az, az, ret);
+
+    return ret;
  }
  
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd4_norm2_d.
+ */
+#define gmx_simd4_calc_rsq_d gmx_simd4_norm2_d
+
+#endif /* GMX_SIMD4_HAVE_DOUBLE */
+
+
+#ifdef GMX_DOUBLE
+/* Documented for the single branch below */
+#    define gmx_simd_iprod_r      gmx_simd_iprod_d
+#    define gmx_simd_norm2_r      gmx_simd_norm2_d
+#    define gmx_simd_calc_rsq_r   gmx_simd_calc_rsq_d
+#    define gmx_simd_cprod_r      gmx_simd_cprod_d
+#    define gmx_simd4_norm2_r     gmx_simd4_norm2_d
+#    define gmx_simd4_calc_rsq_r  gmx_simd4_calc_rsq_d
+#else /* GMX_DOUBLE */
+
+/*! \brief SIMD real inner product of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_iprod_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_iprod_f.
+ *
+ * \copydetails gmx_simd_iprod_f
+ */
+#    define gmx_simd_iprod_r      gmx_simd_iprod_f
+
+/*! \brief SIMD real norm squared of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_norm2_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_norm2_f.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+#    define gmx_simd_norm2_r      gmx_simd_norm2_f
+
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * This will call \ref gmx_simd_calc_rsq_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_calc_rsq_f.
+ *
+ * \copydetails gmx_simd_calc_rsq_f
+ */
+#    define gmx_simd_calc_rsq_r   gmx_simd_calc_rsq_f
+
+/*! \brief SIMD real cross-product of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_cprod_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_cprod_f.
+ *
+ * \copydetails gmx_simd_cprod_f
+ */
+#    define gmx_simd_cprod_r      gmx_simd_cprod_f
+
+/*! \brief SIMD4 real norm squared of multiple vectors.
+ *
+ * This will call \ref gmx_simd4_norm2_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd4_norm2_f.
+ *
+ * \copydetails gmx_simd4_norm2_f
+ */
+#    define gmx_simd4_norm2_r     gmx_simd4_norm2_f
+
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * This will call \ref gmx_simd4_calc_rsq_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd4_calc_rsq_f.
+ *
+ * \copydetails gmx_simd4_calc_rsq_f
+ */
+#    define gmx_simd4_calc_rsq_r  gmx_simd4_calc_rsq_f
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
  
-#endif
+#endif /* GMX_SIMD_VECTOR_OPERATIONS_H */
diff --git a/src/gromacs/utility/gmxomp.h b/src/gromacs/utility/gmxomp.h

index 4b4ec6fd59c35bcb79d820a9aa618e5dce6a118a..c698b5168ac11fce1c9caddf58af10568f577caa 100644 (file)
--- a/src/gromacs/utility/gmxomp.h
+++ b/src/gromacs/utility/gmxomp.h
@@ -53,8 +53,14 @@
  #include "config.h"
  #endif
  
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include <xmmintrin.h>
+/* Ugly hack because the openmp implementation below hacks into the SIMD
+ * settings to decide when to use _mm_pause(). This should eventually be
+ * changed into proper detection of the intrinsics uses, not SIMD.
+ */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+    (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+    (defined GMX_SIMD_X86_AVX2_256)
+#    include <xmmintrin.h>
  #endif
  
  #include "types/commrec.h"
@@ -112,8 +118,14 @@ void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr,
   */
  static gmx_inline void gmx_pause()
  {
+    /* Ugly hack: the SIMD settings are used here to decide whether to
+     * call _mm_pause(). This should eventually be replaced by proper
+     * detection of the intrinsic itself, not of the SIMD level.
+     */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+    (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+    (defined GMX_SIMD_X86_AVX2_256)
      /* Replace with tbb::internal::atomic_backoff when/if we use TBB */
-#if defined GMX_SIMD_X86_SSE2_OR_HIGHER
      _mm_pause();
  #elif defined __MIC__
      _mm_delay_32(32);
author	Erik Lindahl <erik@kth.se>
	Wed, 22 Jan 2014 17:30:10 +0000 (18:30 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 26 Feb 2014 10:52:57 +0000 (11:52 +0100)
CMakeLists.txt		patch \| blob \| history
cmake/gmxTestSimd.cmake		patch \| blob \| history
doxygen/Doxyfile-common.cmakein		patch \| blob \| history
doxygen/Doxyfile-lib.cmakein		patch \| blob \| history
doxygen/Doxyfile-user.cmakein		patch \| blob \| history
doxygen/directories.cpp		patch \| blob \| history
doxygen/mainpage.md		patch \| blob \| history
doxygen/simd.md	[new file with mode: 0644]	patch \| blob
src/config.h.cmakein		patch \| blob \| history
src/gromacs/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/bondfree.c		patch \| blob \| history
src/gromacs/gmxlib/gmx_cpuid.c		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_double/kernelutil_x86_avx_128_fma_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_double/kernelutil_x86_avx_256_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_double/kernelutil_x86_sse4_1_double.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h		patch \| blob \| history
src/gromacs/gmxlib/nonbonded/nonbonded.c		patch \| blob \| history
src/gromacs/gmxpreprocess/calc_verletbuf.c		patch \| blob \| history
src/gromacs/legacyheaders/gmx_cpuid.h		patch \| blob \| history
src/gromacs/legacyheaders/types/nb_verlet.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_internal.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_simd.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/pme.c		patch \| blob \| history
src/gromacs/mdlib/pme_simd4.h		patch \| blob \| history
src/gromacs/mdlib/tpi.c		patch \| blob \| history
src/gromacs/simd/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/four_wide_macros.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/four_wide_macros_ref.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/general_x86_avx_128_fma.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/general_x86_avx_256.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/general_x86_mic.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/general_x86_sse2.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/general_x86_sse4_1.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_intel_mic/impl_intel_mic.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_reference/impl_reference.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/macros.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/macros_ref.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/math_double.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/math_single.h	[deleted file]	patch \| blob \| history
src/gromacs/simd/math_x86_avx_128_fma_double.h		patch \| blob \| history
src/gromacs/simd/math_x86_avx_128_fma_single.h		patch \| blob \| history
src/gromacs/simd/math_x86_avx_256_double.h		patch \| blob \| history
src/gromacs/simd/math_x86_avx_256_single.h		patch \| blob \| history
src/gromacs/simd/math_x86_sse2_double.h		patch \| blob \| history
src/gromacs/simd/math_x86_sse2_single.h		patch \| blob \| history
src/gromacs/simd/math_x86_sse4_1_double.h		patch \| blob \| history
src/gromacs/simd/math_x86_sse4_1_single.h		patch \| blob \| history
src/gromacs/simd/simd.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/simd_math.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/CMakeLists.txt	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/base.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/base.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/bootstrap_loadstore.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd4.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd4.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd4_floatingpoint.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd4_math.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd4_vector_operations.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd_floatingpoint.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd_integer.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd_math.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/tests/simd_vector_operations.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/vector_operations.h		patch \| blob \| history
src/gromacs/utility/gmxomp.h		patch \| blob \| history