# SIMD.
include(gmxDetectTargetArchitecture)
gmx_detect_target_architecture()
+
+if(GMX_CPU_ACCELERATION)
+ # Stay compatible with old Jenkins command line options for specific SIMD acceleration
+ set(GMX_SIMD "${GMX_CPU_ACCELERATION}" CACHE STRING "SIMD instruction set level and compiler optimization" FORCE)
+endif(GMX_CPU_ACCELERATION)
+
include(gmxDetectSimd)
gmx_detect_simd(GMX_SUGGESTED_SIMD)
-if("${GMX_SUGGESTED_SIMD}" STREQUAL "AVX2_256")
- message(STATUS "Changing acceleration from AVX2 to AVX (until AVX2 patches commited).")
- set(GMX_SUGGESTED_SIMD "AVX_256")
-endif()
gmx_option_multichoice(
GMX_SIMD
None
none gaussian mopac gamess orca)
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_TYPE "Reference kernel type (4xn or 2xnn)" STRING "4xn" "GMX_SIMD STREQUAL REFERENCE")
-gmx_dependent_cache_variable(GMX_NBNXN_REF_KERNEL_WIDTH "Reference kernel width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_SIMD_REF_FLOAT_WIDTH "Reference SIMD single precision width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
+gmx_dependent_cache_variable(GMX_SIMD_REF_DOUBLE_WIDTH "Reference SIMD double precision width" STRING "2" "GMX_SIMD STREQUAL REFERENCE")
+
+# This should be moved to a separate NBNXN cmake module when that code is cleaned up and modularized
+
+if("${GMX_SIMD}" STREQUAL "REFERENCE")
+ if(GMX_DOUBLE)
+ set(KERNEL_WIDTH ${GMX_SIMD_REF_DOUBLE_WIDTH})
+ else()
+ set(KERNEL_WIDTH ${GMX_SIMD_REF_FLOAT_WIDTH})
+ endif()
+endif()
option(GMX_BROKEN_CALLOC "Work around broken calloc()" OFF)
mark_as_advanced(GMX_BROKEN_CALLOC)
if(HAVE_LIBM)
list(APPEND GMX_EXTRA_LIBRARIES m)
endif(HAVE_LIBM)
-if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene")
- check_library_exists(mass_simd atan2f4 "" HAVE_MASS_SIMD)
- if(HAVE_MASS_SIMD)
- list(APPEND GMX_EXTRA_LIBRARIES mass_simd)
- else()
- message(FATAL_ERROR "Could not link to the SIMD version of the IBM MASS library. Please adjust your CMAKE_PREFIX_PATH to contain it")
- endif()
-endif()
-
option(GMX_NACL "Configure for Native Client builds" OFF)
if (GMX_NACL)
endif()
set(GMX_SIMD_X86_SSE2 1)
- set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
-
set(SIMD_STATUS_MESSAGE "Enabling SSE2 SIMD instructions")
elseif(${GMX_SIMD} STREQUAL "SSE4.1")
endif()
set(GMX_SIMD_X86_SSE4_1 1)
- set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
set(SIMD_STATUS_MESSAGE "Enabling SSE4.1 SIMD instructions")
elseif(${GMX_SIMD} STREQUAL "AVX_128_FMA")
gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
set(GMX_SIMD_X86_AVX_128_FMA 1)
- set(GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
-
set(SIMD_STATUS_MESSAGE "Enabling 128-bit AVX SIMD Gromacs SIMD (with fused-multiply add)")
elseif(${GMX_SIMD} STREQUAL "AVX_256")
gmx_test_avx_gcc_maskload_bug(GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG "${SIMD_C_FLAGS}")
set(GMX_SIMD_X86_AVX_256 1)
- set(GMX_SIMD_X86_AVX_256_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
-
set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX SIMD instructions")
elseif(${GMX_SIMD} STREQUAL "AVX2_256")
- # Comment out this line for AVX2 development
- message(FATAL_ERROR "AVX2_256 is disabled until the implementation has been commited.")
-
gmx_use_clang_as_with_gnu_compilers_on_osx()
gmx_find_cflag_for_source(CFLAGS_AVX2 "C compiler AVX2 flag"
# No need to test for Maskload bug - it was fixed before gcc added AVX2 support
set(GMX_SIMD_X86_AVX2_256 1)
- set(GMX_SIMD_X86_AVX2_256_OR_HIGHER 1)
- set(GMX_SIMD_X86_AVX_256_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE4_1_OR_HIGHER 1)
- set(GMX_SIMD_X86_SSE2_OR_HIGHER 1)
-
set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions")
elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
elseif(${GMX_SIMD} STREQUAL "REFERENCE")
- add_definitions(-DGMX_SIMD_REFERENCE)
- if(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "4xn")
- if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "2" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "4" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8")
- add_definitions(-DGMX_NBNXN_SIMD_4XN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH})
- else()
- message(FATAL_ERROR "Unsupported width for 4xn reference kernels")
- endif()
- elseif(${GMX_NBNXN_REF_KERNEL_TYPE} STREQUAL "2xnn")
- if(${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "8" OR ${GMX_NBNXN_REF_KERNEL_WIDTH} STREQUAL "16")
- add_definitions(-DGMX_NBNXN_SIMD_2XNN -DGMX_SIMD_REF_WIDTH=${GMX_NBNXN_REF_KERNEL_WIDTH})
- else()
- message(FATAL_ERROR "Unsupported width for 2xn reference kernels")
- endif()
- else()
- message(FATAL_ERROR "Unsupported kernel type")
+ # NB: This file handles settings for the SIMD module, so in the interest
+ # of proper modularization, please do NOT put any verlet kernel settings in this file.
+
+ if(GMX_SIMD_REF_FLOAT_WIDTH)
+ add_definitions(-DGMX_SIMD_REF_FLOAT_WIDTH=${GMX_SIMD_REF_FLOAT_WIDTH})
endif()
+ if(GMX_SIMD_REF_DOUBLE_WIDTH)
+ add_definitions(-DGMX_SIMD_REF_DOUBLE_WIDTH=${GMX_SIMD_REF_DOUBLE_WIDTH})
+ endif()
+
+ set(GMX_SIMD_REFERENCE 1)
+ set(SIMD_STATUS_MESSAGE "Enabling reference (emulated) SIMD instructions.")
else()
gmx_invalid_option_value(GMX_SIMD)
ENABLED_SECTIONS = @DOXYGEN_SECTIONS@
+MACRO_EXPANSION = YES
+EXPAND_ONLY_PREDEF = YES
# Extract documentation also for code in headers within #ifdef __cplusplus
PREDEFINED = __cplusplus
# This is for thread_mpi to #ifdef some code out that should not be documented.
PREDEFINED += DOXYGEN
+# This makes 'static gmx_inline' functions appear better in the documentation.
+PREDEFINED += gmx_inline=inline
# This is for parser.cpp to make it produce code that Doxygen understands
# and that does not have unnecessary function declarations.
PREDEFINED += __STDC__ YYMALLOC=malloc YYFREE=free
ENABLED_SECTIONS += libapi
INTERNAL_DOCS = NO
+# This includes 'static inline' functions from headers in the documentation.
+EXTRACT_STATIC = YES
HIDE_UNDOC_CLASSES = YES
WARN_LOGFILE = doxygen-lib.log
HTML_OUTPUT = html-lib
EXCLUDE += @CMAKE_SOURCE_DIR@/doxygen/doxygen.md
EXCLUDE += @CMAKE_SOURCE_DIR@/doxygen/unittesting.md
EXCLUDE += @CMAKE_SOURCE_DIR@/doxygen/wrapperbinary.md
+EXCLUDE += @CMAKE_SOURCE_DIR@/doxygen/simd.md
INTERNAL_DOCS = NO
HIDE_UNDOC_CLASSES = YES
\ingroup module_selection
*/
+/*! \libinternal
+\dir src/gromacs/simd
+\brief \ref module_simd
+
+\ingroup module_simd
+ */
+/*! \libinternal
+\dir src/gromacs/simd/tests
+\brief Unit tests for \ref module_simd
+
+\ingroup module_simd
+ */
+
/*!
\dir src/gromacs/trajectoryanalysis
\brief \ref module_trajectoryanalysis
Provides an overview of unit testing in \Gromacs.
- \subpage page_wrapperbinary <br/>
Provides an overview of how the `gmx` wrapper binary is implemented.
+ - \subpage page_simd <br/>
+ Documentation about the new SIMD module that makes it possible to write
+ highly accelerated CPU code that is still portable.
- \subpage thread_mpi <br/>
This code is used internally for threading support, and also provides a
(partial) MPI implementation that allows compiling a "thread-MPI" version of
--- /dev/null
+Single-instruction Multiple-data (SIMD) coding {#page_simd}
+==============================================
+
+Coding with SIMD instructions
+=============================
+
+One important way for \Gromacs to achieve high performance is
+to use modern hardware capabilities where a single assembly
+instruction operates on multiple data units, essentially short
+fixed-length vectors (usually 2, 4, 8, or 16 elements). This provides
+a very efficient way for the CPU to increase floating-point
+performance, but it is much less versatile than general purpose
+registers. For this reason it is difficult for the compiler to
+generate efficient SIMD code, so the user has to organize the
+data in a way where it is possible to access as vectors, and
+these vectors often need to be aligned on cache boundaries.
+
+We have supported a number of different SIMD instruction sets in
+the group kernels for ages, and it is now also present in the
+verlet kernels and a few other places. However, with the increased
+usage and several architectures with different capabilities we now
+use a vendor-agnostic \Gromacs SIMD module, as documented in
+\ref module_simd.
+
+Design of the \Gromacs SIMD module
+==================================
+
+The macros in `src/gromacs/simd` are intended to be used for writing
+architecture-independent SIMD intrinsics code. Rather than making assumptions
+based on architecture, we have introduced a limited number of
+predefined preprocessor macros that describe the capabilities of the
+current implementation - these are the ones you need to check when
+writing SIMD code. As you will see, the functionality exposed by
+this module is typically a small subset of general SIMD implementations,
+and in particular we do not even try to expose advanced shuffling or
+permute operations, simply because we haven't been able to describe those
+in a generic way that can be implemented efficiently regardless of the
+hardware. However, the advantage of this approach is that it is straightforward
+to extend with support for new SIMD instruction sets in the future,
+and that will instantly speed up old code too.
+
+Unfortunately there is no standard for SIMD architectures. The available
+features vary a lot, but we still need to use quite a few of them to
+get the best performance possible. This means some features will only
+be available on certain platforms, and it is critical that we do NOT make
+too many assumptions about the storage formats, their size or SIMD width.
+Just to give a few examples:
+
+- On x86, double precision (64-bit) floating-point values always convert
+ to 32-bit integers, while many other platforms use 64-bit, and some cannot
+ use 32-bit integers at all. This means we cannot use a mask (boolean)
+ derived from integer operations to select double-precision floating-point
+ values, and it could get very complex for higher-level code if all these
+ decisions were exposed. Instead, we want to keep integers 32-bit since
+ all algorithms anyway need to work in single precision (w. 32-bit ints).
+- IBM QPX uses 4-wide SIMD both for single and double precision. Integer
+ support is highly limited, and the storage format means QPX does not
+ use x86-style all-ones masks (which have different widths in single/double)
+ but it uses the sign bit to denote the _false_ value. In particular, this
+ means we cannot use the bit contents for any fancy mask operations.
+- AVX1 only supports 4-wide 128-bit integer SIMD arithmetics, but the integer
+ _conversions_ can still be done 8-wide which corresponds to the single
+ precision floating-point width. Similarly, with AVX1 conversions between
+ double-precision and integers use the 32-bit 4-wide 128bit registers where
+ we can also do integer arithmetics. AVX2 adds proper arithmetics for
+ 8-wide integers. We would severely limit performance if we had to say
+ that integer support was not present, so instead we stick to 32-bit ints
+ but limit the operations we expose (and do shuffling internally).
+- For SSE2 through SSE4.1, double precision is 2-wide, but when we convert
+ to integers they will be put in the first two elements of a 4-wide integer
+ type. This means we cannot assume that floating-point SIMD registers and
+ corresponding integer registers (after conversion) have the same width.
+- The 2-wide SIMD instructions on BlueGene/L and BlueGene/P cannot do any
+ floating-point logical operations (and/andnot/or/xor) whatsoever, which
+ can be a pain when implementing approximations for math functions.
+- Since boolean values can have different width for float/double and the
+ integers corresponding to float/double, we need to use separate boolean
+ types for all these values and convert between them if we e.g. want to use
+ result of an integer compare to select floating-point values.
+
+While this might sound complicated, it is actually far easier than writing
+separate SIMD code for 10 architectures in both single & double. The point
+is not that you need to remember the limitations above, but it is critical
+that you *never assume anything about the SIMD implementation*. We
+typically implement SIMD support for a new architecture in days with this
+new module, and the extensions required for verlet kernels
+are also very straightforward (group kernels can be more complex, but those
+are gradually on their way out). For the higher-level
+code, the only important thing is to never _assume_ anything about the SIMD
+architecture. Our general strategy in \Gromacs is to split the SIMD coding
+in three levels:
+
+<dl>
+<dt>Base level generic SIMD</dt>
+<dd>
+The base level SIMD module (which we get by including `gromacs/simd/simd.h`)
+provides the API to define and manipulate SIMD datatypes. This will be enough
+for lots of cases, and it is a huge advantage that there is roughly
+parity between different architectures.
+</dd>
+<dt>Larger architecture-specific SIMD functions</dt>
+<dd>
+For some parts of the code this is not enough. In particular, both the
+group and Verlet kernels do insane amounts of floating-point operations,
+and since we spend 85-90% of the time in these kernels it is critical that
+we can optimize them as much as possible. Here, our strategy is first to
+define larger high-level functions that e.g. take a number of distances
+and loads the table interactions for this interaction. This way we can
+move this architecture-specific implementation to the SIMD module, and
+both achieve a reasonably clean kernel but still optimize a lot.
+</dd>
+<dt>Architecture-specific kernels (directories/files)</dt>
+<dd>
+When it is absolutely impossible to use a shared implementation we might
+have to code SIMD (just as GPU code). When this happens, we should create
+a subdirectory, or an otherwise clearly named file with a suffix for the
+SIMD architecture, to clarify to the user that the SIMD file has a
+direct non-SIMD correspondence. Since this code can be very hard to read,
+it is important to be explicit and use lots of comments - this is not the
+type of code where you should use smart optimization with hundreds of
+preprocessor directives. Keep it simple so other developers can help you
+support it. The question is not whether you can get a function 20% faster,
+but whether it justifies the added complexity of the code.
+</dd>
+</dl>
+
+File organization
+=================
+
+The SIMD module uses a couple of different files:
+
+<dl>
+<dt>`gromacs/simd/simd.h`</dt>
+<dd>
+This is the top-level wrapper that you should always include first.
+It will check the settings made at configuration time and include a
+suitable low-level implementation (that can be either single, double,
+or both). It also contains the routines for memory alignment, and
+based on the current Gromacs precision it will set aliases to 'real'
+SIMD datatypes (see further down) so the implementations do not have
+to care about Gromacs-specific details. However, note that you might
+not get all SIMD support you hoped for: If you compiled Gromacs in
+double precision but the hardware only supports single-precision SIMD
+there will not be any SIMD routines for default Gromacs 'real' precision.
+There are \#defines you can use to check this, as described further down.
+</dd>
+<dt>`gromacs/simd/impl_reference.h`</dt>
+<dd>
+This is an example of a low-level implementation. You should never, ever,
+work directly with these in higher-level code. The reference implementation
+contains the documentation for all SIMD wrappers, though.
+</dd>
+<dt>`gromacs/simd/simd_math.h`</dt>
+<dd>
+SIMD math functions. All functions in this file have to be designed
+so they work no matter whether the hardware supports integer SIMD, logical
+operations on integer or floating-point SIMD, or arithmetic operations
+on integers. However, a few routines check for defines and use faster
+algorithms if these features are present.
+</dd>
+<dt>`gromacs/simd/vector_operations.h`</dt>
+<dd>
+This file contains a few rvec-related SIMD functions, e.g. to
+calculate scalar products, norms, or cross products. They obviously
+cannot operate on scalar Gromacs rvec types, but use separate SIMD
+variables for X,Y, and Z vector components.
+</dd>
+</dl>
+
+
+SIMD datatypes
+==============
+
+The SIMD module handles the challenges mentioned in the introduction
+by introducing a number of datatypes;
+many of these might map to the same underlying SIMD types, but we need separate
+types because some architectures use different registers e.g. for boolean
+types.
+
+Floating-point data
+-------------------
+
+<dl>
+<dt>`#gmx_simd_real_t`</dt>
+<dd>
+This is the SIMD-version of \Gromacs' real type,
+which is set based on the CMake configuration and internally aliased
+to one of the next two types.
+Operations on these variables have the suffix `_r`, e.g. `gmx_simd_add_r()`.
+</dd>
+<dt>`#gmx_simd_float_t`</dt>
+<dd>
+This is always single-precision data, but it
+might not be supported on all architectures. Suffix `_f` is used for
+explicit single-precision routines, e.g. `gmx_simd_mul_f()`.
+</dd>
+<dt>`gmx_simd_double_t`</dt>
+<dd>
+This is always double precision when available,
+and in rare cases you might want to use a specific precision.
+Suffix `_d` is used for explicit double-precision routines,
+e.g. `gmx_simd_mul_d()`
+</dd>
+</dl>
+
+Integers corresponding to floating-point values
+-----------------------------------------------
+
+For these types, 'correspond' means that it is the integer type we
+get when we convert data e.g. from single (or double) precision
+floating-point SIMD variables. Those need to be different, since many
+common implementations only use half as many elements for double as
+for single SIMD variables, and then we only get half the number of
+integers too.
+
+<dl>
+<dt>`#gmx_simd_int32_t`</dt>
+<dd>
+This is used for integers when converting to/from Gromacs default "real" type.
+The corresponding routines have suffix `_i`, e.g. `gmx_simd_add_i()`.
+</dd>
+<dt>`gmx_simd_fint32_t`</dt>
+<dd>
+Integers obtained when converting from single precision, or intended to be
+converted to single precision floating-point. These are normal integers
+(not a special conversion type), but since some SIMD architectures such as
+SSE or AVX use different registers for integer SIMD variables having the
+same width as float and double, respectively, we need to separate these
+two types of integers. The actual operations you perform on them are normal
+ones such as addition or multiplication. The routines
+operating on these variables have suffix `_fi`, like `gmx_simd_add_fi()`.
+This will also be the widest integer data type if you want to do pure
+integer SIMD operations, but that will not be supported on all platforms.
+</dd>
+<dt>`gmx_simd_dint32_t`</dt>
+<dd>
+Integers used when converting to/from double. See the preceding item
+for a detailed explanation. On many architectures,
+including all x86 ones, this will be a narrower type than `gmx_simd_fint32_t`.
+The corresponding routines have suffix `_di`, like `gmx_simd_add_di()`.
+</dd>
+</dl>
+
+Note that all integer load/stores operations defined here load/store 32-bit
+integers, even when the internal register storage might be 64-bit, and we
+set the "width" of the SIMD implementation based on how many float/double/
+integers we load/store - even if the internal width could be larger.
+
+Boolean values
+--------------
+
+We need a separate boolean datatype for masks and comparison results, since
+we cannot assume they are identical either to integers, floats or double -
+some implementations use specific predicate registers for booleans.
+
+<dl>
+<dt>`#gmx_simd_bool_t`</dt>
+<dd>
+Results from boolean operations involving reals, and the booleans we use
+to select between real values. The corresponding routines have suffix `_b`,
+like `gmx_simd_or_b()`.
+</dd>
+<dt>`gmx_simd_fbool_t`</dt>
+<dd>
+Booleans specifically for single precision. Corresponding function suffix
+is `_fb`, like `gmx_simd_or_fb()`.
+</dd>
+<dt>`gmx_simd_dbool_t`</dt>
+<dd>
+Operations specifically on double. Operations have suffix `_db`: `gmx_simd_or_db()`
+</dd>
+<dt>`#gmx_simd_ibool_t`</dt>
+<dd>
+Boolean operations on integers corresponding to real (see floating-point
+descriptions above). Operations on these booleans use suffix `_ib`,
+like `gmx_simd_or_ib()`.
+</dd>
+<dt>`gmx_simd_fibool_t`</dt>
+<dd>
+Booleans for integers corresponding to float. Operation suffix is `_fib`,
+like `gmx_simd_or_fib()`.
+</dd>
+<dt>`gmx_simd_dibool_t`</dt>
+<dd>
+Booleans for integers corresponding to double. Operation suffix is `_dib`,
+like `gmx_simd_or_dib()`.
+</dd>
+</dl>
+
+The subset you should use in practice
+-------------------------------------
+
+If this seems daunting, in practice you should only need to use these types
+when you start coding:
+
+<dl>
+<dt>`#gmx_simd_real_t`</dt>
+<dd>
+Floating-point data.
+</dd>
+<dt>`#gmx_simd_bool_t`</dt>
+<dd>
+Booleans.
+</dd>
+<dt>`#gmx_simd_int32_t`</dt>
+<dd>
+Integer data. Might not be supported, so you must check
+the preprocessor macros described below.
+</dd>
+</dl>
+
+Operations on these types will be defined to either float/double (or
+corresponding integers) based on the current Gromacs precision, so the
+documentation is occasionally more detailed for the lower-level actual
+implementation functions.
+
+SIMD4 Macros
+------------
+
+The above should be sufficient for code that works with the full SIMD width.
+Unfortunately reality is not that simple. Some algorithms like lattice
+summation need quartets of elements, so even when the SIMD width is >4 we
+need width-4 SIMD if it is supported. These datatypes and operations use the
+prefix `gmx_simd4_`, and availability is indicated by `GMX_SIMD4_HAVE_FLOAT`
+and `GMX_SIMD4_HAVE_DOUBLE`. For now we only support a small subset of SIMD
+operations for SIMD4, but that is trivial to extend if we need to.
+
+Predefined SIMD preprocessor macros
+===================================
+
+Functionality-wise, we have a small core set of features that we
+require to be present on all platforms, while more advanced features can be
+used in the code when defines like e.g. `GMX_SIMD_HAVE_LOADU` are set.
+
+This is a summary of the currently available preprocessor defines that
+you should use to check for support when using the corresponding features.
+We first list the float/double/int defines set by the _implementation_; in
+most cases you do not want to check directly for float/double defines, but
+you should instead use the derived "real" defines set in this file - we list
+those at the end below.
+
+Preprocessor predefined macro defines set by the low-level implementation.
+These are only set if they work for all datatypes; `GMX_SIMD_HAVE_LOADU`
+thus means we can load both float, double, and integers from unaligned memory,
+and that the unaligned loads are available for SIMD4 too.
+
+<dl>
+<dt>`GMX_SIMD_HAVE_FLOAT`</dt>
+<dd>
+Single-precision instructions available.
+</dd>
+<dt>`GMX_SIMD_HAVE_DOUBLE`</dt>
+<dd>
+Double-precision instructions available.
+</dd>
+<dt>`GMX_SIMD_HAVE_HARDWARE`</dt>
+<dd>
+Set when we are NOT emulating SIMD.
+</dd>
+<dt>`GMX_SIMD_HAVE_LOADU`</dt>
+<dd>
+Load from unaligned memory available.
+</dd>
+<dt>`GMX_SIMD_HAVE_STOREU`</dt>
+<dd>
+Store to unaligned memory available.
+</dd>
+<dt>`GMX_SIMD_HAVE_LOGICAL`</dt>
+<dd>
+Support for and/andnot/or/xor on floating-point variables.
+</dd>
+<dt>`GMX_SIMD_HAVE_FMA`</dt>
+<dd>
+Floating-point fused multiply-add.
+Note: We provide emulated FMA instructions if you do not have FMA
+support, but in that case you might be able to code it more efficiently without FMA.
+</dd>
+<dt>`GMX_SIMD_HAVE_FRACTION`</dt>
+<dd>
+Instruction to get decimal fraction. Same as FMA: This denotes
+hardware support, otherwise instruction will be emulated.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32`</dt>
+<dd>
+Integer conversions to/from float available.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_EXTRACT`</dt>
+<dd>
+Support for extracting integer SIMD elements from `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_LOGICAL`</dt>
+<dd>
+Bitwise shifts on `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_FINT32_ARITHMETICS`</dt>
+<dd>
+Arithmetic ops for `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32`</dt>
+<dd>
+Integer conversions to/from double available.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_EXTRACT`</dt>
+<dd>
+Support for extracting integer SIMD elements from `gmx_simd_dint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_LOGICAL`</dt>
+<dd>
+Bitwise shifts on `gmx_simd_dint32_t`.
+</dd>
+<dt>`GMX_SIMD_HAVE_DINT32_ARITHMETICS`</dt>
+<dd>
+Arithmetic ops for `gmx_simd_dint32_t`.
+</dd>
+</dl>
+
+There are also two macros specific to SIMD4: `GMX_SIMD4_HAVE_FLOAT` is set
+if we can use SIMD4 in single precision, and `GMX_SIMD4_HAVE_DOUBLE`
+similarly denotes support for a double-precision SIMD4 implementation. For
+generic properties (e.g. whether SIMD4 FMA is supported), you should check
+the normal SIMD macros above.
+
+Implementation properties
+-------------------------
+
+Higher-level code can use these macros to find information about the implementation,
+for instance what the SIMD width is:
+
+<dl>
+<dt>`GMX_SIMD_FLOAT_WIDTH`</dt>
+<dd>
+Number of elements in `gmx_simd_float_t`, and practical width of `gmx_simd_fint32_t`.
+</dd>
+<dt>`GMX_SIMD_DOUBLE_WIDTH`</dt>
+<dd>
+Number of elements in `gmx_simd_double_t`, and practical width of `gmx_simd_dint32_t`</dd>
+<dt>`GMX_SIMD_RSQRT_BITS`</dt>
+<dd>
+Accuracy (bits) of 1/sqrt(x) lookup step.
+</dd>
+<dt>`GMX_SIMD_RCP_BITS`</dt>
+<dd>
+Accuracy (bits) of 1/x lookup step.
+</dd>
+</dl>
+
+After including the low-level architecture-specific implementation, this
+header sets the following derived defines based on the current precision;
+these are the ones you should check for unless you absolutely want to dig
+deep into the explicit single/double precision implementations:
+
+<dl>
+<dt>`GMX_SIMD_HAVE_REAL`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FLOAT` or `GMX_SIMD_HAVE_DOUBLE`
+</dd>
+<dt>`GMX_SIMD4_HAVE_REAL`</dt>
+<dd>
+Set either to `GMX_SIMD4_HAVE_FLOAT` or `GMX_SIMD4_HAVE_DOUBLE`
+</dd>
+<dt>`GMX_SIMD_REAL_WIDTH`</dt>
+<dd>
+Set either to `GMX_SIMD_FLOAT_WIDTH` or `GMX_SIMD_DOUBLE_WIDTH`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32` or `GMX_SIMD_HAVE_DINT32`
+</dd>
+<dt>`GMX_SIMD_INT32_WIDTH`</dt>
+<dd>
+Set either to `GMX_SIMD_FINT32_WIDTH` or `GMX_SIMD_DINT32_WIDTH`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_EXTRACT`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_EXTRACT` or `GMX_SIMD_HAVE_DINT32_EXTRACT`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_LOGICAL`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_LOGICAL` or `GMX_SIMD_HAVE_DINT32_LOGICAL`
+</dd>
+<dt>`GMX_SIMD_HAVE_INT32_ARITHMETICS`</dt>
+<dd>
+Set either to `GMX_SIMD_HAVE_FINT32_ARITHMETICS` or `GMX_SIMD_HAVE_DINT32_ARITHMETICS`
+</dd>
+</dl>
+
+For convenience we also define `GMX_SIMD4_WIDTH` to 4. This will never vary,
+but using it helps you make it clear that a loop or array refers to the
+SIMD4 width rather than some other '4'.
+
+While all these defines are available to specify the features of the
+hardware, we would strongly recommend that you do NOT sprinkle your code
+with defines - if nothing else it will be a debug nightmare. Instead you can
+write a slower generic SIMD function that works everywhere, and then override
+this with faster architecture-specific versions for some implementations. The
+recommended way to do that is to add a define around the generic function
+that skips it if the name is already defined. The actual implementations in
+the lowest-level files are typically defined to an architecture-specific name
+(such as `gmx_simd_sincos_d_sse2`) so we can override it (e.g. in SSE4) by
+simply undefining and setting a new definition. Still, this is an
+implementation detail you won't have to worry about until you start writing
+support for a new SIMD architecture.
+
+
+
/* Target platform is BlueGene/Q */
#cmakedefine GMX_TARGET_BGQ
-/* SSE2 instructions available */
-#cmakedefine GMX_SIMD_X86_SSE2_OR_HIGHER
-
-/* SSE4.1 instructions available */
-#cmakedefine GMX_SIMD_X86_SSE4_1_OR_HIGHER
-
-/* AVX 128-bit FMA instructions available (AMD side of the AVX world) */
-#cmakedefine GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-
-/* AVX 256-bit instructions available (Intel side of the AVX world) */
-#cmakedefine GMX_SIMD_X86_AVX_256_OR_HIGHER
-
/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
#cmakedefine GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
/* AVX 256-bit was selected as SIMD instructions */
#cmakedefine GMX_SIMD_X86_AVX_256
+/* AVX2 256-bit SIMD instruction set level was selected */
+#cmakedefine GMX_SIMD_X86_AVX2_256
+
/* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
#cmakedefine GMX_SIMD_IBM_QPX
/* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
#cmakedefine GMX_SIMD_SPARC64_HPC_ACE
+/* Reference SIMD implementation for testing */
+#cmakedefine GMX_SIMD_REFERENCE
+
/* String for SIMD instruction choice (for writing to log files and stdout) */
#define GMX_SIMD_STRING "@GMX_SIMD@"
add_subdirectory(swap)
add_subdirectory(essentialdynamics)
add_subdirectory(pulling)
+add_subdirectory(simd)
if (NOT GMX_BUILD_MDRUN_ONLY)
add_subdirectory(legacyheaders)
add_subdirectory(gmxana)
#include "force.h"
#include "nonbonded.h"
-/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_TRIGONOMETRIC
-#define SIMD_BONDEDS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
#include "gromacs/simd/vector_operations.h"
-#endif
/* Find a better place for this? */
const int cmap_coeff_matrix[] = {
}
}
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
/* SIMD PBC data structure, containing 1/boxdiag and the box vectors */
typedef struct {
*dx = gmx_simd_fnmadd_r(sh, pbc->bxx, *dx);
}
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
/*
* Morse potential bond by Frank Everdij
return vtot;
}
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
/* As angles, but using SIMD to calculate many dihedrals at once.
* This routines does not calculate energies and shift forces.
}
}
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
real linear_angles(int nbonds,
const t_iatom forceatoms[], const t_iparams forceparams[],
}
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
/* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
* also calculates the pre-factor required for the dihedral force update.
*nrkj_n2_S = gmx_simd_mul_r(nrkj_S, gmx_simd_inv_r(iprn_S));
/* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
- *phi_S = gmx_cpsgn_nonneg_pr(ipr_S, *phi_S);
-
+ *phi_S = gmx_simd_xor_sign_r(*phi_S, ipr_S);
p_S = gmx_simd_iprod_r(rijx_S, rijy_S, rijz_S,
rkjx_S, rkjy_S, rkjz_S);
p_S = gmx_simd_mul_r(p_S, nrkj_2_S);
gmx_simd_store_r(q, q_S);
}
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
void do_dih_fup(int i, int j, int k, int l, real ddphi,
}
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
/* As pdihs_noner above, but using SIMD to calculate many dihedrals at once */
static void
}
}
-#endif /* SIMD_BONDEDS */
+#endif /* GMX_SIMD_HAVE_REAL */
real idihs(int nbonds,
pbc, g, lambda[efptFTYPE], &(dvdl[efptFTYPE]),
md, fcd, global_atom_index);
}
-#ifdef SIMD_BONDEDS
+#ifdef GMX_SIMD_HAVE_REAL
else if (ftype == F_ANGLES &&
!bCalcEnerVir && fr->efep == efepNO)
{
!bCalcEnerVir && fr->efep == efepNO)
{
/* No energies, shift forces, dvdl */
-#ifndef SIMD_BONDEDS
- pdihs_noener
-#else
+#ifdef GMX_SIMD_HAVE_REAL
pdihs_noener_simd
+#else
+ pdihs_noener
#endif
(nbn, idef->il[ftype].iatoms+nb0,
idef->iparams,
{
"CannotDetect",
"None",
+ "Reference",
"SSE2",
"SSE4.1",
"AVX_128_FMA",
-/* What type of SIMD was compiled in, if any?
- * This is set from Cmake. Note that the SSE2 and SSE4_1 macros are set for
- * AVX too, so it is important that they appear last in the list.
- */
-#ifdef GMX_SIMD_X86_AVX_256
+/* What type of SIMD was compiled in, if any? */
+#ifdef GMX_SIMD_X86_AVX2_256
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX2_256;
+#elif defined GMX_SIMD_X86_AVX_256
static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_256;
#elif defined GMX_SIMD_X86_AVX_128_FMA
static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA;
static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
#elif defined GMX_SIMD_IBM_QPX
static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX;
+#elif defined GMX_SIMD_REFERENCE
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_REFERENCE;
#else
static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_NONE;
#endif
file(GLOB NONBONDED_AVX_128_FMA_SINGLE_SOURCES nb_kernel_avx_128_fma_single/*.c)
endif()
-if("${GMX_SIMD}" STREQUAL "AVX_256" AND NOT GMX_DOUBLE)
+if((("${GMX_SIMD}" STREQUAL "AVX_256") OR ("${GMX_SIMD}" STREQUAL "AVX2_256")) AND NOT GMX_DOUBLE)
file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c)
endif()
file(GLOB NONBONDED_AVX_128_FMA_DOUBLE_SOURCES nb_kernel_avx_128_fma_double/*.c)
endif()
-if("${GMX_SIMD}" STREQUAL "AVX_256" AND GMX_DOUBLE)
+if((("${GMX_SIMD}" STREQUAL "AVX_256") OR ("${GMX_SIMD}" STREQUAL "AVX2_256")) AND GMX_DOUBLE)
file(GLOB NONBONDED_AVX_256_DOUBLE_SOURCES nb_kernel_avx_256_double/*.c)
endif()
-if("${GMX_SIMD}" STREQUAL "Sparc64_HPC_ACE" AND GMX_DOUBLE)
+if("${GMX_SIMD}" STREQUAL "SPARC64_HPC_ACE" AND GMX_DOUBLE)
file(GLOB NONBONDED_SPARC64_HPC_ACE_DOUBLE_SOURCES nb_kernel_sparc64_hpc_ace_double/*.c)
endif()
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef _kernelutil_x86_avx_128_fma_double_h_
#define _kernelutil_x86_avx_128_fma_double_h_
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+#include <math.h>
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#else
+# include <x86intrin.h>
+#endif
+#define gmx_mm_castsi128_pd _mm_castsi128_pd
+#define gmx_mm_extract_epi32 _mm_extract_epi32
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
+ __m128d __gmx_t1 = row0; \
+ row0 = _mm_unpacklo_pd(row0, row1); \
+ row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
static int
gmx_mm_any_lt(__m128d a, __m128d b)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
+#include <immintrin.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#else
+# include <x86intrin.h>
+#endif
+
+#define gmx_mm_castsi128_ps _mm_castsi128_ps
+#define gmx_mm_extract_epi32 _mm_extract_epi32
-#include "gromacs/simd/general_x86_avx_128_fma.h"
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
+# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
+#else
+# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), (mask))
+# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), (mask), (x))
+# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), (mask))
+# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
+#endif
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef _kernelutil_x86_avx_256_double_h_
#define _kernelutil_x86_avx_256_double_h_
+#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
+
+#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
+#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
+#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
+#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
+ { \
+ __m256d _t0, _t1, _t2, _t3; \
+ _t0 = _mm256_unpacklo_pd((row0), (row1)); \
+ _t1 = _mm256_unpackhi_pd((row0), (row1)); \
+ _t2 = _mm256_unpacklo_pd((row2), (row3)); \
+ _t3 = _mm256_unpackhi_pd((row2), (row3)); \
+ row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20); \
+ row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20); \
+ row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31); \
+ row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31); \
+ }
+
+#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
-#include "gromacs/simd/general_x86_avx_256.h"
+static __m256d
+gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
+{
+ return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
+}
+static __m256d
+gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
+{
+ return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
+}
+
+static __m256d
+gmx_mm256_set_m128d(__m128d hi, __m128d lo)
+{
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
+}
+
+static gmx_inline __m256
+gmx_mm256_set_m128(__m128 hi, __m128 lo)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
+}
static int
gmx_mm256_any_lt(__m256d a, __m256d b)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef _kernelutil_x86_avx_256_single_h_
#define _kernelutil_x86_avx_256_single_h_
-#include "gromacs/simd/general_x86_avx_256.h"
+#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
+
+static gmx_inline __m256
+gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
+{
+ return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
+}
+
+static gmx_inline __m256
+gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
+{
+ return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
+}
+
+static gmx_inline __m256
+gmx_mm256_set_m128(__m128 hi, __m128 lo)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
+}
+
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
+# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
+#else
+# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), (mask))
+# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), (mask), (x))
+# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), (mask))
+# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
+#endif
/* Transpose lower/upper half of 256-bit registers separately */
#define GMX_MM256_HALFTRANSPOSE4_PS(ymm0, ymm1, ymm2, ymm3) { \
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
-#include "gromacs/simd/general_x86_sse2.h"
#include <stdio.h>
/* Normal sum of four ymm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
+#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
+ __m128d __gmx_t1 = row0; \
+ row0 = _mm_unpacklo_pd(row0, row1); \
+ row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
static int
gmx_mm_any_lt(__m128d a, __m128d b)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
-#include "gromacs/simd/general_x86_sse2.h"
+#define gmx_mm_castsi128_ps _mm_castsi128_ps
+
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
/* Normal sum of four xmm registers */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
-#include "gromacs/simd/general_x86_sse4_1.h"
-
#include <stdio.h>
+#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
+#define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
+
+#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
+ __m128d __gmx_t1 = row0; \
+ row0 = _mm_unpacklo_pd(row0, row1); \
+ row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
+}
/* Normal sum of four ymm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <math.h>
-#include "gromacs/simd/general_x86_sse4_1.h"
-
#undef gmx_restrict
#define gmx_restrict
+#define gmx_mm_castsi128_ps _mm_castsi128_ps
+#define gmx_mm_extract_epi32 _mm_extract_epi32
+
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
#include "nb_generic_cg.h"
#include "nb_generic_adress.h"
-/* Different default (c) and accelerated interaction-specific kernels */
+/* Different default (c) and SIMD instructions interaction-specific kernels */
#include "nb_kernel_c/nb_kernel_c.h"
#if (defined GMX_SIMD_X86_SSE2) && !(defined GMX_DOUBLE)
#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
# include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
#endif
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
# include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
#endif
#if (defined GMX_SIMD_X86_SSE2 && defined GMX_DOUBLE)
#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
# include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
#endif
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
# include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
#endif
#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
nb_kernel_list_add_kernels(kernellist_avx_128_fma_single, kernellist_avx_128_fma_single_size);
#endif
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
nb_kernel_list_add_kernels(kernellist_avx_256_single, kernellist_avx_256_single_size);
#endif
/* Double precision */
#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
nb_kernel_list_add_kernels(kernellist_avx_128_fma_double, kernellist_avx_128_fma_double_size);
#endif
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
nb_kernel_list_add_kernels(kernellist_avx_256_double, kernellist_avx_256_double_size);
#endif
#if (defined GMX_SIMD_SPARC64_HPC_ACE && defined GMX_DOUBLE)
arch_and_padding[] =
{
/* Single precision */
-#if (defined GMX_SIMD_X86_AVX_256) && !(defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER) && !(defined GMX_DOUBLE)
{ "avx_256_single", 8 },
#endif
#if (defined GMX_SIMD_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
{ "sse2_single", 4 },
#endif
/* Double precision */
-#if (defined GMX_SIMD_X86_AVX_256 && defined GMX_DOUBLE)
+#if (defined GMX_SIMD_X86_AVX_256_OR_HIGHER && defined GMX_DOUBLE)
{ "avx_256_double", 4 },
#endif
#if (defined GMX_SIMD_X86_AVX_128_FMA && defined GMX_DOUBLE)
#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
#define GMX_USE_HALF_WIDTH_SIMD_HERE
#endif
-#include "gromacs/simd/macros.h"
+#include "gromacs/simd/simd.h"
#endif
{
GMX_CPUID_SIMD_CANNOTDETECT, /* Should only be used if something fails */
GMX_CPUID_SIMD_NONE,
+ GMX_CPUID_SIMD_REFERENCE,
GMX_CPUID_SIMD_X86_SSE2,
GMX_CPUID_SIMD_X86_SSE4_1,
GMX_CPUID_SIMD_X86_AVX_128_FMA,
extern "C" {
#endif
-#ifdef GMX_SIMD_REFERENCE
-#define GMX_NBNXN_SIMD
-#endif
-
-#if (defined GMX_SIMD_X86_SSE2_OR_HIGHER) || (defined GMX_SIMD_IBM_QPX)
-/* Use SIMD accelerated nbnxn search and kernels */
-#define GMX_NBNXN_SIMD
-
-/* Uncomment the next line to use, slower, 128-bit SIMD with AVX-256 */
-/* #define GMX_NBNXN_HALF_WIDTH_SIMD */
-
-/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense with:
- * 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
- * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
- */
-#define GMX_NBNXN_SIMD_4XN
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !(defined GMX_DOUBLE || defined GMX_NBNXN_HALF_WIDTH_SIMD)
-#define GMX_NBNXN_SIMD_2XNN
-#endif
-
-#endif
-
-#ifdef __MIC__
-#define GMX_NBNXN_SIMD
-#define GMX_NBNXN_SIMD_2XNN
-#endif
-
/*! Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
typedef enum
#include "qmmm.h"
#include "copyrite.h"
#include "mtop_util.h"
+#include "nbnxn_simd.h"
#include "nbnxn_search.h"
#include "nbnxn_atomdata.h"
#include "nbnxn_consts.h"
*kernel_type = nbnxnk4xN_SIMD_4xN;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
- /* We expect the 2xNN kernels to be faster in most cases */
*kernel_type = nbnxnk4xN_SIMD_2xNN;
#endif
-#if defined GMX_NBNXN_SIMD_4XN && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
- if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+#if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
+ * We need to choose whether we want 2x(N+N) or 4xN kernels.
+ * Currently this is based on the SIMD acceleration choice,
+ * but it might be better to decide this at runtime based on CPU.
+ *
+ * 4xN calculates more (zero) interactions, but has less pair-search
+ * work and much better kernel instruction scheduling.
+ *
+ * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
+ * which doesn't have FMA, both the analytical and tabulated Ewald
+ * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
+ * 2x(4+4) because it results in significantly fewer pairs.
+ * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT. As we currently don't detect the actual
+ * use of HT, use 4x8 to avoid a potential performance hit.
+ * On Intel Haswell 4x8 is always faster.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+
+#ifndef GMX_SIMD_HAVE_FMA
+ if (EEL_PME(ir->coulombtype) || EEL_EWALD(ir->coulombtype) ||
+ EVDW_PME(ir->vdwtype))
{
- /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
- * 10% with HT, 50% without HT, but extra zeros interactions
- * can compensate. As we currently don't detect the actual use
- * of HT, switch to 4x8 to avoid a potential performance hit.
+ /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
+ * There are enough instructions to make 2x(4+4) efficient.
*/
- *kernel_type = nbnxnk4xN_SIMD_4xN;
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
}
#endif
+#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+
+
if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
#ifdef GMX_NBNXN_SIMD_4XN
}
/* Analytical Ewald exclusion correction is only an option in
- * the SIMD kernel. On BlueGene/Q, this is faster regardless
- * of precision. In single precision, this is faster on
- * Bulldozer, and slightly faster on Sandy Bridge.
+ * the SIMD kernel.
+ * Since table lookups don't parallelize with SIMD, analytical
+ * will probably always be faster for a SIMD width of 8 or more.
+ * With FMA analytical is sometimes faster for a width of 4 as well.
+ * On BlueGene/Q, this is faster regardless of precision.
+ * In single precision, this is faster on Bulldozer.
*/
-#if ((defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__) && !defined GMX_DOUBLE) || (defined GMX_SIMD_IBM_QPX)
+#if GMX_SIMD_REAL_WIDTH >= 8 || \
+ (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
+ defined GMX_SIMD_IBM_QPX
*ewald_excl = ewaldexclAnalytical;
#endif
if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
case nbnxnk4xN_SIMD_4xN:
case nbnxnk4xN_SIMD_2xNN:
#ifdef GMX_NBNXN_SIMD
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
- /* We have x86 SSE2 compatible SIMD */
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
- returnvalue = "AVX-128-FMA";
-#else
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __AVX__
- /* x86 SIMD intrinsics can be converted to SSE or AVX depending
- * on compiler flags. As we use nearly identical intrinsics,
- * compiling for AVX without an AVX macros effectively results
- * in AVX kernels.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_NBNXN_HALF_WIDTH_SIMD
- returnvalue = "AVX-256";
-#else
- returnvalue = "AVX-128";
-#endif
-#else
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
- returnvalue = "SSE4.1";
+#if defined GMX_SIMD_X86_SSE2
+ returnvalue = "SSE2";
+#elif defined GMX_SIMD_X86_SSE4_1
+ returnvalue = "SSE4.1";
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+ returnvalue = "AVX_128_FMA";
+#elif defined GMX_SIMD_X86_AVX_256
+ returnvalue = "AVX_256";
+#elif defined GMX_SIMD_X86_AVX2_256
+ returnvalue = "AVX2_256";
#else
- returnvalue = "SSE2";
-#endif
-#endif
+ returnvalue = "SIMD";
#endif
-#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */
- /* not GMX_SIMD_X86_SSE2_OR_HIGHER, but other SIMD */
- returnvalue = "SIMD";
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
#else /* GMX_NBNXN_SIMD */
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
#define _nbnxn_internal_h
#include "typedefs.h"
+#include "nbnxn_simd.h"
#include "domdec.h"
#include "gromacs/timing/cyclecounter.h"
-#ifdef GMX_NBNXN_SIMD
-/* The include below sets the SIMD instruction type (precision+width)
- * for all nbnxn SIMD search and non-bonded kernel code.
- */
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
-#include "gromacs/simd/macros.h"
-#endif
-
-/* Bounding box calculations are (currently) always in single precision.
+/* Bounding box calculations are (currently) always in single precision, so
+ * we only need to check for single precision support here.
* This uses less (cache-)memory and SIMD is faster, at least on x86.
*/
-#define GMX_SIMD4_SINGLE
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
-/* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#ifdef GMX_SIMD4_HAVE_FLOAT
#define NBNXN_SEARCH_BB_SIMD4
#endif
#else /* GMX_SIMD_REFERENCE */
-#if defined GMX_SIMD_X86_SSE2_OR_HIGHER && !defined __MIC__
+#if defined GMX_TARGET_X86 && !defined __MIC__
/* Include x86 SSE2 compatible SIMD functions */
/* Set the stride for the lookup of the two LJ parameters from their
#endif
/* Align a stack-based thread-local working array. Table loads on
- * full-width AVX_256 use the array, but other implementations do
- * not. */
+ * 256-bit AVX use the array, but other implementations do not.
+ */
static gmx_inline int *
-prepare_table_load_buffer(const int gmx_unused *array)
+prepare_table_load_buffer(int gmx_unused *array)
{
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+#if GMX_SIMD_REAL_WIDTH >= 8 || (defined GMX_DOUBLE && GMX_SIMD_REAL_WIDTH >= 4)
return gmx_simd_align_i(array);
#else
return NULL;
#endif
}
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
-
-/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
-#if GMX_SIMD_REAL_WIDTH == 8
-#define TAB_FDV0
-#endif
#ifdef GMX_DOUBLE
+#if GMX_SIMD_REAL_WIDTH == 2
+#include "nbnxn_kernel_simd_utils_x86_128d.h"
+#else
#include "nbnxn_kernel_simd_utils_x86_256d.h"
-#else /* GMX_DOUBLE */
-#include "nbnxn_kernel_simd_utils_x86_256s.h"
-#endif /* GMX_DOUBLE */
-
-#else /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
-
-/* We use the FDV0 table layout when we can use aligned table loads */
-#if GMX_SIMD_REAL_WIDTH == 4
-#define TAB_FDV0
#endif
-
-#ifdef GMX_DOUBLE
-#include "nbnxn_kernel_simd_utils_x86_128d.h"
-#else /* GMX_DOUBLE */
+#else /* GMX_DOUBLE */
+/* In single precision aligned FDV0 table loads are optimal */
+#define TAB_FDV0
+#if GMX_SIMD_REAL_WIDTH == 4
#include "nbnxn_kernel_simd_utils_x86_128s.h"
+#else
+#include "nbnxn_kernel_simd_utils_x86_256s.h"
+#endif
#endif /* GMX_DOUBLE */
-#endif /* defined GMX_SIMD_X86_AVX_256_OR_HIGHER && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
-
-#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */
+#else /* GMX_TARGET_X86 && !__MIC__ */
#if GMX_SIMD_REAL_WIDTH > 4
/* For width>4 we use unaligned loads. And thus we can use the minimal stride */
#include "nbnxn_kernel_simd_utils_x86_mic.h"
#endif
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#endif /* GMX_SIMD_REFERENCE */
-
-#if GMX_SIMD_REAL_WIDTH == 4
-#define gmx_mm_pr4 gmx_simd_real_t
-#define gmx_load_pr4 gmx_simd_load_r
-#define gmx_store_pr4 gmx_simd_store_r
-#define gmx_add_pr4 gmx_simd_add_r
-#endif
+#endif /* GMX_TARGET_X86 && !__MIC__ */
-#ifndef HAVE_GMX_SUM_SIMD /* should be defined for arch with hardware reduce */
-static gmx_inline real
-gmx_sum_simd2(gmx_simd_real_t x, real* b)
-{
- gmx_simd_store_r(b, x);
- return b[0]+b[1];
-}
-
-#if GMX_SIMD_REAL_WIDTH >= 4
-static gmx_inline real
-gmx_sum_simd4(gmx_mm_pr4 x, real* b)
-{
- gmx_store_pr4(b, x);
- return b[0]+b[1]+b[2]+b[3];
-}
-#endif
+#endif /* GMX_SIMD_REFERENCE */
-#if GMX_SIMD_REAL_WIDTH == 2
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
- gmx_simd_store_r(b, x);
- return b[0]+b[1];
-}
-#elif GMX_SIMD_REAL_WIDTH == 4
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
- gmx_simd_store_r(b, x);
- return b[0]+b[1]+b[2]+b[3];
-}
-#elif GMX_SIMD_REAL_WIDTH == 8
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
- gmx_simd_store_r(b, x);
- return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7];
-}
-#elif GMX_SIMD_REAL_WIDTH == 16
-/* This is getting ridiculous, SIMD horizontal adds would help,
- * but this is not performance critical (only used to reduce energies)
+/* If the simd width is 4, but simd4 instructions are not defined,
+ * reuse the simd real type and the four instructions we need.
*/
-static gmx_inline real gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
- gmx_simd_store_r(b, x);
- return b[0]+b[1]+b[2]+b[3]+b[4]+b[5]+b[6]+b[7]+b[8]+b[9]+b[10]+b[11]+b[12]+b[13]+b[14]+b[15];
-}
-#else
-#error "unsupported kernel configuration"
+#if GMX_SIMD_REAL_WIDTH == 4 && \
+ !((!defined GMX_DOUBLE && defined GMX_SIMD4_HAVE_FLOAT) || \
+ (defined GMX_DOUBLE && defined GMX_SIMD4_HAVE_DOUBLE))
+#define gmx_simd4_real_t gmx_simd_real_t
+#define gmx_simd4_load_r gmx_simd_load_r
+#define gmx_simd4_store_r gmx_simd_store_r
+#define gmx_simd4_add_r gmx_simd_add_r
+#define gmx_simd4_reduce_r gmx_simd_reduce_r
#endif
-#endif //HAVE_GMX_SUM_SIMD
#ifdef UNROLLJ
/* Add energy register to possibly multiple terms in the energy array */
typedef gmx_simd_real_t gmx_exclfilter;
static const int filter_stride = 1;
-/* The 4xn kernel operates on 4-wide i-force registers */
-typedef gmx_simd_real_t gmx_mm_pr4;
-
/* This files contains all functions/macros for the SIMD kernels
* which have explicit dependencies on the j-cluster size and/or SIMD-width.
* The functionality which depends on the j-cluster size is:
/* Align a stack-based thread-local working array. Table loads on QPX
* use the array, but most other implementations do not. */
static gmx_inline int *
-prepare_table_load_buffer(const int *array)
+prepare_table_load_buffer(int *array)
{
return gmx_simd_align_i(array);
}
return gmx_simd_add_r(sum01, sim23);
}
-#ifdef GMX_DOUBLE
-/* In double precision on x86 it can be faster to first calculate
- * single precision square roots for two double precision registers at
- * once and then use double precision Newton-Raphson iteration to
- * reach full double precision. For QPX, we just wrap the usual
- * reciprocal square roots.
- */
-static gmx_inline void
-gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
- gmx_simd_real_t *out0, gmx_simd_real_t *out1)
-{
- *out0 = gmx_simd_invsqrt_r(in0);
- *out1 = gmx_simd_invsqrt_r(in1);
-}
-#endif
-
static gmx_inline void
load_lj_pair_params(const real *nbfp, const int *type, int aj,
gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
#ifndef _nbnxn_kernel_simd_utils_ref_h_
#define _nbnxn_kernel_simd_utils_ref_h_
-typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter;
+#
+#include "gromacs/simd/simd_math.h"
+
+typedef gmx_simd_int32_t gmx_simd_ref_exclfilter;
typedef gmx_simd_ref_exclfilter gmx_exclfilter;
static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
/* float/double SIMD register type */
typedef struct {
real r[4];
-} gmx_mm_pr4;
+} gmx_simd4_real_t;
-static gmx_inline gmx_mm_pr4
-gmx_load_pr4(const real *r)
+static gmx_inline gmx_simd4_real_t
+gmx_simd4_load_r(const real *r)
{
- gmx_mm_pr4 a;
- int i;
+ gmx_simd4_real_t a;
+ int i;
for (i = 0; i < 4; i++)
{
}
static gmx_inline void
-gmx_store_pr4(real *dest, gmx_mm_pr4 src)
+gmx_simd4_store_r(real *dest, gmx_simd4_real_t src)
{
- gmx_mm_pr4 a;
- int i;
+ gmx_simd4_real_t a;
+ int i;
for (i = 0; i < 4; i++)
{
}
}
-static gmx_inline gmx_mm_pr4
-gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
+static gmx_inline gmx_simd4_real_t
+gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
{
- gmx_mm_pr4 c;
- int i;
+ gmx_simd4_real_t c;
+ int i;
for (i = 0; i < 4; i++)
{
return c;
}
+
+static gmx_inline real
+gmx_simd4_reduce_r(gmx_simd4_real_t a)
+{
+ return a.r[0] + a.r[1] + a.r[2] + a.r[3];
+}
+
#endif
/* Load one real at b and one real at b+1 into halves of a, respectively */
static gmx_inline void
-gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
+gmx_load1p1_pr(gmx_simd_real_t *a, const real *b)
{
int i;
/* Load reals at half-width aligned pointer b into two halves of a */
static gmx_inline void
-gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
+gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
{
int i;
/* Sum over 4 half SIMD registers */
static gmx_inline gmx_mm_hpr
-gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+gmx_sum4_hpr(gmx_simd_real_t a, gmx_simd_real_t b)
{
gmx_mm_hpr c;
int i;
#ifdef GMX_NBNXN_SIMD_2XNN
/* Sum the elements of halfs of each input register and store sums in out */
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
+static gmx_inline gmx_simd4_real_t
+gmx_mm_transpose_sum4h_pr(gmx_simd_real_t a, gmx_simd_real_t b)
{
- gmx_mm_pr4 sum;
- int i;
+ gmx_simd4_real_t sum;
+ int i;
sum.r[0] = 0;
sum.r[1] = 0;
#endif
static gmx_inline void
-gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
{
int i;
}
}
static gmx_inline void
-gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
{
int i;
#ifndef TAB_FDV0
static gmx_inline void
-load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S,
+load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S,
int gmx_unused *ti,
- gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+ gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
{
int i;
for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
- ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
+ ctab0_S->r[i] = tab_coul_F[ti_S.i[i]];
+ ctab1_S->r[i] = tab_coul_F[ti_S.i[i]+1];
}
*ctab1_S = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
static gmx_inline void
load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
- gmx_simd_ref_epi32 ti_S, int *ti,
- gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
- gmx_simd_ref_pr *ctabv_S)
+ gmx_simd_int32_t ti_S, int *ti,
+ gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
+ gmx_simd_real_t *ctabv_S)
{
int i;
for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
+ ctabv_S->r[i] = tab_coul_V[ti_S.i[i]];
}
}
#endif
#ifdef TAB_FDV0
static gmx_inline void
-load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
- gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
+load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
+ gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
{
int i;
for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
- ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
+ ctab0_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4];
+ ctab1_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+1];
}
}
static gmx_inline void
load_table_f_v(const real *tab_coul_FDV0,
- gmx_simd_ref_epi32 ti_S, int *ti,
- gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
- gmx_simd_ref_pr *ctabv_S)
+ gmx_simd_int32_t ti_S, int *ti,
+ gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
+ gmx_simd_real_t *ctabv_S)
{
int i;
for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
+ ctabv_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+2];
}
}
#endif
* Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
*/
#if GMX_SIMD_REAL_WIDTH == 2
-static gmx_inline gmx_simd_ref_pr
-gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
+static gmx_inline gmx_simd_real_t
+gmx_mm_transpose_sum2_pr(gmx_simd_real_t in0, gmx_simd_real_t in1)
{
- gmx_simd_ref_pr sum;
+ gmx_simd_real_t sum;
sum.r[0] = in0.r[0] + in0.r[1];
sum.r[1] = in1.r[0] + in1.r[1];
#if GMX_SIMD_REAL_WIDTH >= 4
#if GMX_SIMD_REAL_WIDTH == 4
-static gmx_inline gmx_simd_ref_pr
+static gmx_inline gmx_simd_real_t
#else
-static gmx_inline gmx_mm_pr4
+static gmx_inline gmx_simd4_real_t
#endif
-gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
- gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
+gmx_mm_transpose_sum4_pr(gmx_simd_real_t in0, gmx_simd_real_t in1,
+ gmx_simd_real_t in2, gmx_simd_real_t in3)
{
#if GMX_SIMD_REAL_WIDTH == 4
- gmx_simd_ref_pr sum;
+ gmx_simd_real_t sum;
#else
- gmx_mm_pr4 sum;
+ gmx_simd4_real_t sum;
#endif
- int i;
+ int i;
sum.r[0] = 0;
sum.r[1] = 0;
* For this reference code we just use a plain-C sqrt.
*/
static gmx_inline void
-gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
- gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
+gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
+ gmx_simd_real_t *out0, gmx_simd_real_t *out1)
{
*out0 = gmx_simd_invsqrt_r(in0);
*out1 = gmx_simd_invsqrt_r(in1);
static gmx_inline void
load_lj_pair_params(const real *nbfp, const int *type, int aj,
- gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+ gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
{
int i;
static gmx_inline void
load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
const int *type, int aj,
- gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
+ gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
{
int i;
gmx_simd_ref_exclfilter a;
int i;
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- a.r[i] = src;
+ a.i[i] = src;
}
return a;
gmx_simd_ref_exclfilter a;
int i;
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- a.r[i] = src[i];
+ a.i[i] = src[i];
}
return a;
* If the same bit is set in both input masks, return TRUE, else
* FALSE. This function is only called with a single bit set in b.
*/
-static gmx_inline gmx_simd_ref_pb
+static gmx_inline gmx_simd_bool_t
gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
{
- gmx_simd_ref_pb c;
+ gmx_simd_bool_t c;
int i;
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
{
- c.r[i] = ((a.r[i] & b.r[i]) != 0);
+ c.b[i] = ((a.i[i] & b.i[i]) != 0);
}
return c;
* energy group pair energy storage
*/
+#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
+
typedef gmx_simd_int32_t gmx_exclfilter;
-static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
+/* This is set to a constant for now, since the code does not automatically
+ * adapt when the SIMD widths are changed to other values.
+ */
+static const int filter_stride = 2;
/* Transpose 2 double precision registers */
static gmx_inline void
static gmx_inline gmx_exclfilter
gmx_load_exclusion_filter(const unsigned *i)
{
- return _mm_load_si128((__m128i *) i);
+    /* For now this has to be an explicit load of the float-width integer type since we use stride==2 */
+ return gmx_simd_load_fi(i);
}
static gmx_inline gmx_simd_bool_t
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
- return gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
}
#endif /* _nbnxn_kernel_simd_utils_x86_s128d_h_ */
/* Table has 4 entries, left-shift index by 2 */
ti_S = _mm_slli_epi32(ti_S, 2);
/* Without SSE4.1 the extract macro needs an immediate: unroll */
- idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ idx[0] = gmx_simd_extract_i(ti_S, 0);
ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
- idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ idx[1] = gmx_simd_extract_i(ti_S, 1);
ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
- idx[2] = gmx_mm_extract_epi32(ti_S, 2);
+ idx[2] = gmx_simd_extract_i(ti_S, 2);
ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
- idx[3] = gmx_mm_extract_epi32(ti_S, 3);
+ idx[3] = gmx_simd_extract_i(ti_S, 3);
ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
/* Shuffle the force table entries to a convenient order */
/* Table has 4 entries, left-shift index by 2 */
ti_S = _mm_slli_epi32(ti_S, 2);
/* Without SSE4.1 the extract macro needs an immediate: unroll */
- idx[0] = gmx_mm_extract_epi32(ti_S, 0);
+ idx[0] = gmx_simd_extract_i(ti_S, 0);
ctab_S[0] = _mm_load_ps(tab_coul_FDV0+idx[0]);
- idx[1] = gmx_mm_extract_epi32(ti_S, 1);
+ idx[1] = gmx_simd_extract_i(ti_S, 1);
ctab_S[1] = _mm_load_ps(tab_coul_FDV0+idx[1]);
- idx[2] = gmx_mm_extract_epi32(ti_S, 2);
+ idx[2] = gmx_simd_extract_i(ti_S, 2);
ctab_S[2] = _mm_load_ps(tab_coul_FDV0+idx[2]);
- idx[3] = gmx_mm_extract_epi32(ti_S, 3);
+ idx[3] = gmx_simd_extract_i(ti_S, 3);
ctab_S[3] = _mm_load_ps(tab_coul_FDV0+idx[3]);
/* Shuffle the force table entries to a convenient order */
static gmx_inline gmx_exclfilter
gmx_load_exclusion_filter(const unsigned *i)
{
- return _mm_load_si128((__m128i *) i);
+ return gmx_simd_load_i(i);
}
static gmx_inline gmx_simd_bool_t
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
- return gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
}
#endif /* _nbnxn_kernel_simd_utils_x86_s128s_h_ */
* energy group pair energy storage
*/
-typedef gmx_simd_real_t gmx_exclfilter;
-static const int filter_stride = 1;
-
-/* The 4xn kernel operates on 4-wide i-force registers */
-#define gmx_mm_pr4 __m128
-#define gmx_load_pr4 _mm_load_ps
-#define gmx_store_pr4 _mm_store_ps
-#define gmx_add_pr4 _mm_add_ps
-
#ifdef GMX_NBNXN_SIMD_2XNN
/* Half-width operations are required for the 2xnn kernels */
#define gmx_set1_hpr(a, b) *(a) = _mm_set1_ps(b)
/* Load one real at b and one real at b+1 into halves of a, respectively */
#define gmx_load1p1_pr(a, b) *(a) = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load1_ps(b)), _mm_load1_ps(b+1), 0x1)
-/* Load reals at half-width aligned pointer b into two halves of a */
-#define gmx_loaddh_pr(a, b) *(a) = gmx_mm256_load4_ps(b)
/* To half-width SIMD register b into half width aligned memory a */
#define gmx_store_hpr(a, b) _mm_store_ps(a, b)
#define gmx_add_hpr _mm_add_ps
#define gmx_sub_hpr _mm_sub_ps
+
/* Sum over 4 half SIMD registers */
-#define gmx_sum4_hpr gmx_mm256_sum4h_m128
+static __m128 gmx_sum4_hpr(__m256 x, __m256 y)
+{
+ __m256 sum;
+
+ sum = _mm256_add_ps(x, y);
+ return _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 0x1));
+}
+
+/* Load reals at half-width aligned pointer b into two halves of a */
+static gmx_inline void
+gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
+{
+ __m128 tmp;
+ tmp = _mm_load_ps(b);
+ *a = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 0x1);
+}
static gmx_inline void
gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
*ctabv_S = gmx_2_mm_to_m256(ctabvt_S[0], ctabvt_S[1]);
}
+#ifdef GMX_SIMD_HAVE_FINT32_LOGICAL
+
+typedef gmx_simd_int32_t gmx_exclfilter;
+static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
+
+static gmx_inline gmx_exclfilter
+gmx_load1_exclfilter(int e)
+{
+ return _mm256_set1_epi32(e);
+}
+
+static gmx_inline gmx_exclfilter
+gmx_load_exclusion_filter(const unsigned *i)
+{
+ return gmx_simd_load_i(i);
+}
+
+static gmx_inline gmx_simd_bool_t
+gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
+{
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_andnot_si256(m0, m1), _mm256_setzero_si256()));
+}
+
+#else /* GMX_SIMD_HAVE_FINT32_LOGICAL */
+
+/* No integer support, use a real to store the exclusion bits */
+typedef gmx_simd_real_t gmx_exclfilter;
+static const int filter_stride = 1;
+
static gmx_inline gmx_exclfilter
gmx_load1_exclfilter(int e)
{
return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
}
+#endif /* GMX_SIMD_HAVE_FINT32_LOGICAL */
+
#endif /* _nbnxn_kernel_simd_utils_x86_s256s_h_ */
#define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
#define mask_hih _mm512_int2mask(0xFF00)
-/* float/double SIMD register type */
-typedef __m512 gmx_mm_pr4;
-
-static gmx_inline gmx_mm_pr4
-gmx_load_pr4(const real *r)
-{
- return _mm512_loadunpacklo_ps(_mm512_undefined_ps(), r);
-}
-
-static gmx_inline void
-gmx_store_pr4(real *dest, gmx_mm_pr4 src)
-{
- _mm512_mask_packstorelo_ps(dest, _mm512_int2mask(0xF), src);
-}
-
-static gmx_inline gmx_mm_pr4
-gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
-{
- return _mm512_add_ps(a, b);
-}
-
/* Half-width SIMD real type */
typedef __m512 gmx_mm_hpr; /* high half is ignored */
/* Load one real at b and one real at b+1 into halves of a, respectively */
static gmx_inline void
-gmx_load1p1_pr(gmx_mm_ps *a, const real *b)
+gmx_load1p1_pr(gmx_simd_float_t *a, const real *b)
{
*a = _mm512_mask_extload_ps(_mm512_extload_ps(b, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE), mask_hih,
/* Load reals at half-width aligned pointer b into two halves of a */
static gmx_inline void
-gmx_loaddh_pr(gmx_mm_ps *a, const real *b)
+gmx_loaddh_pr(gmx_simd_float_t *a, const real *b)
{
*a = _mm512_permute4f128_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), b), PERM_LOW2HIGH);
}
/* Sum over 4 half SIMD registers */
static gmx_inline gmx_mm_hpr
-gmx_sum4_hpr(gmx_mm_ps a, gmx_mm_ps b)
+gmx_sum4_hpr(gmx_simd_float_t a, gmx_simd_float_t b)
{
a = _mm512_add_ps(a, b);
b = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
}
/* Sum the elements of halfs of each input register and store sums in out */
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4h_pr(gmx_mm_ps a, gmx_mm_ps b)
+static gmx_inline __m512
+gmx_mm_transpose_sum4h_pr(gmx_simd_float_t a, gmx_simd_float_t b)
{
return _mm512_setr4_ps(_mm512_mask_reduce_add_ps(mask_loh, a),
_mm512_mask_reduce_add_ps(mask_hih, a),
}
static gmx_inline void
-gmx_pr_to_2hpr(gmx_mm_ps a, gmx_mm_hpr *b, gmx_mm_hpr *c)
+gmx_pr_to_2hpr(gmx_simd_float_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
{
*b = a;
*c = _mm512_permute4f128_ps(a, PERM_HIGH2LOW);
}
static gmx_inline void
-gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_float_t *c)
{
*c = _mm512_mask_permute4f128_ps(a, mask_hih, b, PERM_LOW2HIGH);
}
/* recombine the 2 high half into c */
static gmx_inline void
-gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_mm_ps *c)
+gmx_2hpr_high_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_float_t *c)
{
*c = _mm512_mask_permute4f128_ps(b, mask_loh, a, PERM_HIGH2LOW);
}
*/
static gmx_inline void
load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S, int *ti,
- gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S)
+ gmx_simd_float_t *ctab0_S, gmx_simd_float_t *ctab1_S)
{
__m512i idx;
__m512i ti1 = _mm512_add_epi32(ti_S, _mm512_set1_epi32(1)); /* incr by 1 for tab1 */
gmx_2hpr_to_pr(tmp1, tmp2, ctab0_S);
gmx_2hpr_high_to_pr(tmp1, tmp2, ctab1_S);
+
*ctab1_S = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
}
static gmx_inline void
load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
gmx_simd_int32_t ti_S, int *ti,
- gmx_mm_ps *ctab0_S, gmx_mm_ps *ctab1_S,
- gmx_mm_ps *ctabv_S)
+ gmx_simd_float_t *ctab0_S, gmx_simd_float_t *ctab1_S,
+ gmx_simd_float_t *ctabv_S)
{
load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
*ctabv_S = _mm512_i32gather_ps(ti_S, tab_coul_V, sizeof(float));
}
-static gmx_inline gmx_mm_pr4
-gmx_mm_transpose_sum4_pr(gmx_mm_ps in0, gmx_mm_ps in1,
- gmx_mm_ps in2, gmx_mm_ps in3)
+static gmx_inline __m512
+gmx_mm_transpose_sum4_pr(gmx_simd_float_t in0, gmx_simd_float_t in1,
+ gmx_simd_float_t in2, gmx_simd_float_t in3)
{
return _mm512_setr4_ps(_mm512_reduce_add_ps(in0),
_mm512_reduce_add_ps(in1),
static gmx_inline void
load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
const int *type, int aj,
- gmx_mm_ps *c6_S, gmx_mm_ps *c12_S)
+ gmx_simd_float_t *c6_S, gmx_simd_float_t *c12_S)
{
__m512i idx0, idx1, idx;
gmx_2hpr_high_to_pr(tmp1, tmp2, c12_S);
}
-#define HAVE_GMX_SUM_SIMD
-static gmx_inline real
-gmx_sum_simd(gmx_simd_real_t x, real* b)
-{
- return _mm512_reduce_add_ps(x);
-}
-static gmx_inline real
-gmx_sum_simd4(gmx_simd_real_t x, real* b)
-{
- return _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x);
-}
-
/* Code for handling loading exclusions and converting them into
interactions. */
#define gmx_load1_exclfilter _mm512_set1_epi32
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+
#ifdef GMX_NBNXN_SIMD_2XNN
/* Include the full-width SIMD macros */
-
-#include "gromacs/simd/macros.h"
#include "gromacs/simd/vector_operations.h"
#if !(GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16)
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+
#ifdef __cplusplus
extern "C" {
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#include "gromacs/simd/macros.h"
-#include "gromacs/simd/four_wide_macros.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
#include "gromacs/simd/vector_operations.h"
#include "../../nbnxn_consts.h"
#ifdef CALC_COUL_EWALD
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
- rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+ rsq_S0 = gmx_simd_add_r(rsq_S0, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S0));
+ rsq_S2 = gmx_simd_add_r(rsq_S2, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S2));
#endif
/* Calculate 1/r */
rinv_S0 = gmx_simd_blendzero_r(rinv_S0, wco_S0);
rinv_S2 = gmx_simd_blendzero_r(rinv_S2, wco_S2);
#else
+ /* This needs to be modified: It makes assumptions about the internal storage
+ * of the SIMD representation, in particular that the blendv instruction always
+ * selects based on the sign bit. If the performance is really critical, it
+ * should be turned into a function that is platform-specific.
+ */
/* We only need to mask for the cut-off: blendv is faster */
rinv_S0 = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0));
rinv_S2 = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2));
/* Truncate scaled r to an int */
ti_S0 = gmx_simd_cvtt_r2i(rs_S0);
ti_S2 = gmx_simd_cvtt_r2i(rs_S2);
-#ifdef GMX_SIMD_HAVE_FLOOR
- rf_S0 = gmx_simd_floor_r(rs_S0);
- rf_S2 = gmx_simd_floor_r(rs_S2);
+#ifdef GMX_SIMD_HAVE_TRUNC
+ rf_S0 = gmx_simd_trunc_r(rs_S0);
+ rf_S2 = gmx_simd_trunc_r(rs_S2);
#else
rf_S0 = gmx_simd_cvt_i2r(ti_S0);
rf_S2 = gmx_simd_cvt_i2r(ti_S2);
gmx_simd_real_t fix_S0, fiy_S0, fiz_S0;
gmx_simd_real_t fix_S2, fiy_S2, fiz_S2;
/* We use an i-force SIMD register width of 4 */
- /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
+ /* The simd4 stuff might be defined in nbnxn_kernel_simd_utils.h */
gmx_simd4_real_t fix_S, fiy_S, fiz_S;
gmx_simd_real_t diagonal_jmi_S;
gmx_simd_real_t rcvdw2_S;
#endif
-#ifdef CALC_ENERGIES
- /* cppcheck-suppress unassignedVariable */
- real tmpsum_array[2*GMX_SIMD_REAL_WIDTH], *tmpsum;
-#endif
-#ifdef CALC_SHIFTFORCES
- /* cppcheck-suppress unassignedVariable */
- real shf_array[2*GMX_SIMD_REAL_WIDTH], *shf;
-#endif
-
int ninner;
#ifdef COUNT_PAIRS
shiftvec = shift_vec[0];
x = nbat->x;
-#ifdef CALC_ENERGIES
- tmpsum = gmx_simd_align_r(tmpsum_array);
-#endif
-#ifdef CALC_SHIFTFORCES
- shf = gmx_simd_align_r(shf_array);
-#endif
-
#ifdef FIX_LJ_C
pvdw_c6 = gmx_simd_align_r(pvdw_array);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
/* Add accumulated i-forces to the force array */
fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
- gmx_simd4_store_r(f+scix, gmx_add_pr4(fix_S, gmx_simd4_load_r(f+scix)));
+ gmx_simd4_store_r(f+scix, gmx_simd4_add_r(fix_S, gmx_simd4_load_r(f+scix)));
fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
- gmx_simd4_store_r(f+sciy, gmx_add_pr4(fiy_S, gmx_simd4_load_r(f+sciy)));
+ gmx_simd4_store_r(f+sciy, gmx_simd4_add_r(fiy_S, gmx_simd4_load_r(f+sciy)));
fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
- gmx_simd4_store_r(f+sciz, gmx_add_pr4(fiz_S, gmx_simd4_load_r(f+sciz)));
+ gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));
#ifdef CALC_SHIFTFORCES
- fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
- fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
- fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
+ fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
+ fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
+ fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
#endif
#ifdef CALC_ENERGIES
if (do_coul)
{
- *Vc += gmx_sum_simd(vctot_S, tmpsum);
+ *Vc += gmx_simd_reduce_r(vctot_S);
}
-
- *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
+ *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
#endif
/* Outer loop uses 6 flops/iteration */
#include "typedefs.h"
-#ifdef GMX_NBNXN_SIMD_4XN
+#include "gromacs/mdlib/nbnxn_simd.h"
-#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
-#define GMX_USE_HALF_WIDTH_SIMD_HERE
-#endif
+#ifdef GMX_NBNXN_SIMD_4XN
-#include "gromacs/simd/macros.h"
#include "gromacs/simd/vector_operations.h"
#if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)
#error "unsupported SIMD width"
#include "typedefs.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+
#ifdef __cplusplus
extern "C" {
#endif
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#include "gromacs/simd/macros.h"
-#include "gromacs/simd/four_wide_macros.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
#include "gromacs/simd/vector_operations.h"
#include "../../nbnxn_consts.h"
#ifdef CALC_COUL_EWALD
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
- rsq_S1 = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
- rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
- rsq_S3 = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
+ rsq_S0 = gmx_simd_add_r(rsq_S0, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S0));
+ rsq_S1 = gmx_simd_add_r(rsq_S1, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S1));
+ rsq_S2 = gmx_simd_add_r(rsq_S2, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S2));
+ rsq_S3 = gmx_simd_add_r(rsq_S3, gmx_simd_blendv_r(avoid_sing_S, gmx_simd_setzero_r(), interact_S3));
#endif
/* Calculate 1/r */
rinv_S2 = gmx_simd_invsqrt_r(rsq_S2);
rinv_S3 = gmx_simd_invsqrt_r(rsq_S3);
#else
- gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
- gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
+ gmx_simd_invsqrt_pair_r(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
+ gmx_simd_invsqrt_pair_r(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
#endif
#ifdef CALC_COULOMB
ti_S1 = gmx_simd_cvtt_r2i(rs_S1);
ti_S2 = gmx_simd_cvtt_r2i(rs_S2);
ti_S3 = gmx_simd_cvtt_r2i(rs_S3);
-#ifdef GMX_SIMD_HAVE_FLOOR
- /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
- rf_S0 = gmx_simd_floor_r(rs_S0);
- rf_S1 = gmx_simd_floor_r(rs_S1);
- rf_S2 = gmx_simd_floor_r(rs_S2);
- rf_S3 = gmx_simd_floor_r(rs_S3);
+#ifdef GMX_SIMD_HAVE_TRUNC
+ /* SSE4.1 trunc is faster than gmx_cvtepi32_ps int->float cast */
+ rf_S0 = gmx_simd_trunc_r(rs_S0);
+ rf_S1 = gmx_simd_trunc_r(rs_S1);
+ rf_S2 = gmx_simd_trunc_r(rs_S2);
+ rf_S3 = gmx_simd_trunc_r(rs_S3);
#else
rf_S0 = gmx_simd_cvt_i2r(ti_S0);
rf_S1 = gmx_simd_cvt_i2r(ti_S1);
{
const nbnxn_ci_t *nbln;
const nbnxn_cj_t *l_cj;
- const int *type;
- const real *q;
+ const int * type;
+ const real * q;
const real *shiftvec;
const real *x;
const real *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
#ifdef CALC_COUL_TAB
/* Coulomb table variables */
gmx_simd_real_t invtsp_S;
- const real *tab_coul_F;
+ const real * tab_coul_F;
#ifndef TAB_FDV0
- const real *tab_coul_V;
+ const real * tab_coul_V;
#endif
/* Thread-local working buffers for force and potential lookups */
int ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
gmx_simd_real_t rcvdw2_S;
#endif
-#ifdef CALC_ENERGIES
- /* cppcheck-suppress unassignedVariable */
- real tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum;
-#endif
-#ifdef CALC_SHIFTFORCES
- /* cppcheck-suppress unassignedVariable */
- real shf_array[GMX_SIMD_REAL_WIDTH*2], *shf;
-#endif
-
int ninner;
#ifdef COUNT_PAIRS
shiftvec = shift_vec[0];
x = nbat->x;
-#ifdef CALC_ENERGIES
- tmpsum = gmx_simd_align_r(tmpsum_array);
-#endif
-#ifdef CALC_SHIFTFORCES
- shf = gmx_simd_align_r(shf_array);
-#endif
-
#ifdef FIX_LJ_C
pvdw_c6 = gmx_simd_align_real(pvdw_array);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));
#ifdef CALC_SHIFTFORCES
- fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
- fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
- fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
+ fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
+ fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
+ fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
#endif
#else
fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));
#ifdef CALC_SHIFTFORCES
- fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf);
- fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf);
- fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf);
+ fshift[ish3+0] += gmx_simd_reduce_r(gmx_simd_add_r(fix0_S, fix2_S));
+ fshift[ish3+1] += gmx_simd_reduce_r(gmx_simd_add_r(fiy0_S, fiy2_S));
+ fshift[ish3+2] += gmx_simd_reduce_r(gmx_simd_add_r(fiz0_S, fiz2_S));
#endif
#endif
#ifdef CALC_ENERGIES
if (do_coul)
{
- *Vc += gmx_sum_simd(vctot_S, tmpsum);
+ *Vc += gmx_simd_reduce_r(vctot_S);
}
- *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
+ *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
#endif
/* Outer loop uses 6 flops/iteration */
#include "gromacs/fileio/gmxfio.h"
#ifdef NBNXN_SEARCH_BB_SIMD4
-/* We use 4-wide SIMD for bounding box calculations */
+/* Always use 4-wide SIMD for bounding box calculations */
-#ifndef GMX_DOUBLE
+# ifndef GMX_DOUBLE
/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
-#define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
-#endif
+# define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
+# endif
-#if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+# if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
/* Store bounding boxes with x, y and z coordinates in packs of 4 */
-#define NBNXN_PBB_SIMD4
-#endif
+# define NBNXN_PBB_SIMD4
+# endif
/* The packed bounding box coordinate stride is always set to 4.
* With AVX we could use 8, but that turns out not to be faster.
*/
-#define STRIDE_PBB 4
-#define STRIDE_PBB_2LOG 2
+# define STRIDE_PBB 4
+# define STRIDE_PBB_2LOG 2
#endif /* NBNXN_SEARCH_BB_SIMD4 */
#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
#else
-#error "unsupported GMX_NBNXN_SIMD_WIDTH"
+#error "unsupported GMX_SIMD_REAL_WIDTH"
#endif
#endif
#endif
* so we don't need to treat special cases in the rest of the code.
*/
#ifdef NBNXN_SEARCH_BB_SIMD4
- gmx_simd4_store_r(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
- gmx_simd4_store_r(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));
+ gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
+ gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
#else
bbj[1] = bbj[0];
#endif
}
#ifdef NBNXN_SEARCH_BB_SIMD4
- gmx_simd4_store_r(&bb->lower[0],
- gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
- gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
- gmx_simd4_store_r(&bb->upper[0],
- gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
- gmx_simd4_load_bb_pr(&bbj[1].upper[0])));
+ gmx_simd4_store_f(&bb->lower[0],
+ gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
+ gmx_simd4_load_f(&bbj[1].lower[0])));
+ gmx_simd4_store_f(&bb->upper[0],
+ gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
+ gmx_simd4_load_f(&bbj[1].upper[0])));
#else
{
int i;
/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
- gmx_simd4_real_t bb_0_S, bb_1_S;
- gmx_simd4_real_t x_S;
+ gmx_simd4_float_t bb_0_S, bb_1_S;
+ gmx_simd4_float_t x_S;
- int i;
+ int i;
- bb_0_S = gmx_simd4_load_bb_pr(x);
+ bb_0_S = gmx_simd4_load_f(x);
bb_1_S = bb_0_S;
for (i = 1; i < na; i++)
{
- x_S = gmx_simd4_load_bb_pr(x+i*NNBSBB_C);
- bb_0_S = gmx_simd4_min_r(bb_0_S, x_S);
- bb_1_S = gmx_simd4_max_r(bb_1_S, x_S);
+ x_S = gmx_simd4_load_f(x+i*NNBSBB_C);
+ bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
+ bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
}
- gmx_simd4_store_r(&bb->lower[0], bb_0_S);
- gmx_simd4_store_r(&bb->upper[0], bb_1_S);
+ gmx_simd4_store_f(&bb->lower[0], bb_0_S);
+ gmx_simd4_store_f(&bb->upper[0], bb_1_S);
}
/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
for (c2 = sc2; c2 < sc2+nc2; c2++)
{
#ifdef NBNXN_SEARCH_BB_SIMD4
- gmx_simd4_real_t min_S, max_S;
-
- min_S = gmx_simd4_min_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
- gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
- max_S = gmx_simd4_max_r(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
- gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
- gmx_simd4_store_r(&grid->bbj[c2].lower[0], min_S);
- gmx_simd4_store_r(&grid->bbj[c2].upper[0], max_S);
+ gmx_simd4_float_t min_S, max_S;
+
+ min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
+ gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
+ max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
+ gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
+ gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
+ gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
#else
for (j = 0; j < NNBSBB_C; j++)
{
static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
int csj, const nbnxn_bb_t *bb_j_all)
{
- gmx_simd4_real_t bb_i_S0, bb_i_S1;
- gmx_simd4_real_t bb_j_S0, bb_j_S1;
- gmx_simd4_real_t dl_S;
- gmx_simd4_real_t dh_S;
- gmx_simd4_real_t dm_S;
- gmx_simd4_real_t dm0_S;
+ gmx_simd4_float_t bb_i_S0, bb_i_S1;
+ gmx_simd4_float_t bb_j_S0, bb_j_S1;
+ gmx_simd4_float_t dl_S;
+ gmx_simd4_float_t dh_S;
+ gmx_simd4_float_t dm_S;
+ gmx_simd4_float_t dm0_S;
- bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]);
- bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]);
- bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]);
- bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]);
+ bb_i_S0 = gmx_simd4_load_f(&bb_i_ci[si].lower[0]);
+ bb_i_S1 = gmx_simd4_load_f(&bb_i_ci[si].upper[0]);
+ bb_j_S0 = gmx_simd4_load_f(&bb_j_all[csj].lower[0]);
+ bb_j_S1 = gmx_simd4_load_f(&bb_j_all[csj].upper[0]);
- dl_S = gmx_simd4_sub_r(bb_i_S0, bb_j_S1);
- dh_S = gmx_simd4_sub_r(bb_j_S0, bb_i_S1);
+ dl_S = gmx_simd4_sub_f(bb_i_S0, bb_j_S1);
+ dh_S = gmx_simd4_sub_f(bb_j_S0, bb_i_S1);
- dm_S = gmx_simd4_max_r(dl_S, dh_S);
- dm0_S = gmx_simd4_max_r(dm_S, gmx_simd4_setzero_r());
+ dm_S = gmx_simd4_max_f(dl_S, dh_S);
+ dm0_S = gmx_simd4_max_f(dm_S, gmx_simd4_setzero_f());
- return gmx_simd4_dotproduct3_r(dm0_S, dm0_S);
+ return gmx_simd4_dotproduct3_f(dm0_S, dm0_S);
}
/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
#define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
{ \
- int shi; \
+ int shi; \
\
- gmx_simd4_real_t dx_0, dy_0, dz_0; \
- gmx_simd4_real_t dx_1, dy_1, dz_1; \
+ gmx_simd4_float_t dx_0, dy_0, dz_0; \
+ gmx_simd4_float_t dx_1, dy_1, dz_1; \
\
- gmx_simd4_real_t mx, my, mz; \
- gmx_simd4_real_t m0x, m0y, m0z; \
+ gmx_simd4_float_t mx, my, mz; \
+ gmx_simd4_float_t m0x, m0y, m0z; \
\
- gmx_simd4_real_t d2x, d2y, d2z; \
- gmx_simd4_real_t d2s, d2t; \
+ gmx_simd4_float_t d2x, d2y, d2z; \
+ gmx_simd4_float_t d2s, d2t; \
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = gmx_simd4_load_bb_pr(bb_i+shi+0*STRIDE_PBB); \
- yi_l = gmx_simd4_load_bb_pr(bb_i+shi+1*STRIDE_PBB); \
- zi_l = gmx_simd4_load_bb_pr(bb_i+shi+2*STRIDE_PBB); \
- xi_h = gmx_simd4_load_bb_pr(bb_i+shi+3*STRIDE_PBB); \
- yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB); \
- zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB); \
+ xi_l = gmx_simd4_load_f(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = gmx_simd4_load_f(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = gmx_simd4_load_f(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = gmx_simd4_load_f(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = gmx_simd4_load_f(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = gmx_simd4_load_f(bb_i+shi+5*STRIDE_PBB); \
\
- dx_0 = gmx_simd4_sub_r(xi_l, xj_h); \
- dy_0 = gmx_simd4_sub_r(yi_l, yj_h); \
- dz_0 = gmx_simd4_sub_r(zi_l, zj_h); \
+ dx_0 = gmx_simd4_sub_f(xi_l, xj_h); \
+ dy_0 = gmx_simd4_sub_f(yi_l, yj_h); \
+ dz_0 = gmx_simd4_sub_f(zi_l, zj_h); \
\
- dx_1 = gmx_simd4_sub_r(xj_l, xi_h); \
- dy_1 = gmx_simd4_sub_r(yj_l, yi_h); \
- dz_1 = gmx_simd4_sub_r(zj_l, zi_h); \
+ dx_1 = gmx_simd4_sub_f(xj_l, xi_h); \
+ dy_1 = gmx_simd4_sub_f(yj_l, yi_h); \
+ dz_1 = gmx_simd4_sub_f(zj_l, zi_h); \
\
- mx = gmx_simd4_max_r(dx_0, dx_1); \
- my = gmx_simd4_max_r(dy_0, dy_1); \
- mz = gmx_simd4_max_r(dz_0, dz_1); \
+ mx = gmx_simd4_max_f(dx_0, dx_1); \
+ my = gmx_simd4_max_f(dy_0, dy_1); \
+ mz = gmx_simd4_max_f(dz_0, dz_1); \
\
- m0x = gmx_simd4_max_r(mx, zero); \
- m0y = gmx_simd4_max_r(my, zero); \
- m0z = gmx_simd4_max_r(mz, zero); \
+ m0x = gmx_simd4_max_f(mx, zero); \
+ m0y = gmx_simd4_max_f(my, zero); \
+ m0z = gmx_simd4_max_f(mz, zero); \
\
- d2x = gmx_simd4_mul_r(m0x, m0x); \
- d2y = gmx_simd4_mul_r(m0y, m0y); \
- d2z = gmx_simd4_mul_r(m0z, m0z); \
+ d2x = gmx_simd4_mul_f(m0x, m0x); \
+ d2y = gmx_simd4_mul_f(m0y, m0y); \
+ d2z = gmx_simd4_mul_f(m0z, m0z); \
\
- d2s = gmx_simd4_add_r(d2x, d2y); \
- d2t = gmx_simd4_add_r(d2s, d2z); \
+ d2s = gmx_simd4_add_f(d2x, d2y); \
+ d2t = gmx_simd4_add_f(d2s, d2z); \
\
- gmx_simd4_store_r(d2+si, d2t); \
+ gmx_simd4_store_f(d2+si, d2t); \
}
/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
int nsi, const float *bb_i,
float *d2)
{
- gmx_simd4_real_t xj_l, yj_l, zj_l;
- gmx_simd4_real_t xj_h, yj_h, zj_h;
- gmx_simd4_real_t xi_l, yi_l, zi_l;
- gmx_simd4_real_t xi_h, yi_h, zi_h;
+ gmx_simd4_float_t xj_l, yj_l, zj_l;
+ gmx_simd4_float_t xj_h, yj_h, zj_h;
+ gmx_simd4_float_t xi_l, yi_l, zi_l;
+ gmx_simd4_float_t xi_h, yi_h, zi_h;
- gmx_simd4_real_t zero;
+ gmx_simd4_float_t zero;
- zero = gmx_simd4_setzero_r();
+ zero = gmx_simd4_setzero_f();
- xj_l = gmx_simd4_set1_r(bb_j[0*STRIDE_PBB]);
- yj_l = gmx_simd4_set1_r(bb_j[1*STRIDE_PBB]);
- zj_l = gmx_simd4_set1_r(bb_j[2*STRIDE_PBB]);
- xj_h = gmx_simd4_set1_r(bb_j[3*STRIDE_PBB]);
- yj_h = gmx_simd4_set1_r(bb_j[4*STRIDE_PBB]);
- zj_h = gmx_simd4_set1_r(bb_j[5*STRIDE_PBB]);
+ xj_l = gmx_simd4_set1_f(bb_j[0*STRIDE_PBB]);
+ yj_l = gmx_simd4_set1_f(bb_j[1*STRIDE_PBB]);
+ zj_l = gmx_simd4_set1_f(bb_j[2*STRIDE_PBB]);
+ xj_h = gmx_simd4_set1_f(bb_j[3*STRIDE_PBB]);
+ yj_h = gmx_simd4_set1_f(bb_j[4*STRIDE_PBB]);
+ zj_h = gmx_simd4_set1_f(bb_j[5*STRIDE_PBB]);
/* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
* But as we know the number of iterations is 1 or 2, we unroll manually.
}
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
-/* When we make seperate single/double precision SIMD vector operation
- * include files, this function should be moved there (also using FMA).
- */
-static inline gmx_simd4_real_t
-gmx_simd4_calc_rsq_r(gmx_simd4_real_t x, gmx_simd4_real_t y, gmx_simd4_real_t z)
-{
- return gmx_simd4_add_r( gmx_simd4_add_r( gmx_simd4_mul_r(x, x), gmx_simd4_mul_r(y, y) ), gmx_simd4_mul_r(z, z) );
-}
/* 4-wide SIMD function which determines if any atom pair between two cells,
* both with 8 atoms, is within distance sqrt(rl2).
rc2_S = gmx_simd4_set1_r(rl2);
dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
- ix_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
- iy_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
- iz_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
- ix_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
- iy_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
- iz_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);
+ ix_S0 = gmx_simd4_load_r(x_i+(si*dim_stride+0)*STRIDE_PBB);
+ iy_S0 = gmx_simd4_load_r(x_i+(si*dim_stride+1)*STRIDE_PBB);
+ iz_S0 = gmx_simd4_load_r(x_i+(si*dim_stride+2)*STRIDE_PBB);
+ ix_S1 = gmx_simd4_load_r(x_i+(si*dim_stride+3)*STRIDE_PBB);
+ iy_S1 = gmx_simd4_load_r(x_i+(si*dim_stride+4)*STRIDE_PBB);
+ iz_S1 = gmx_simd4_load_r(x_i+(si*dim_stride+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef _nbnxn_simd_h
+#define _nbnxn_simd_h
+
+/* This header decides, based on the active SIMD backend, whether the
+ * SIMD-accelerated nbnxn search and kernels are enabled (GMX_NBNXN_SIMD)
+ * and which kernel layouts (4xN and/or 2x(N+N)) get compiled.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+
+/* Include SIMD, below we select kernels based on the SIMD width */
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+
+
+/* The plain-C reference SIMD backend also supports the nbnxn SIMD kernels */
+#ifdef GMX_SIMD_REFERENCE
+#define GMX_NBNXN_SIMD
+#endif
+
+/* As we modularize the verlet kernels, we should remove stuff like this
+ * that checks internal SIMD implementation details.
+ */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+ (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+ (defined GMX_SIMD_X86_AVX2_256) || (defined GMX_SIMD_IBM_QPX)
+/* Use SIMD accelerated nbnxn search and kernels */
+#define GMX_NBNXN_SIMD
+#endif
+
+/* MIC for double is implemented in the SIMD module but so far missing in
+ mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_mic.h */
+#if defined __MIC__ && !defined GMX_DOUBLE
+#define GMX_NBNXN_SIMD
+#endif
+
+#ifdef GMX_NBNXN_SIMD
+/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ * 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, works with Intel MIC in single precision
+ */
+#if GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8
+#define GMX_NBNXN_SIMD_4XN
+#endif
+#if GMX_SIMD_REAL_WIDTH == 8 || GMX_SIMD_REAL_WIDTH == 16
+#define GMX_NBNXN_SIMD_2XNN
+#endif
+
+/* Safety net: GMX_NBNXN_SIMD is set above, so at least one of the two
+ * kernel layouts must have been selected for the current SIMD width.
+ */
+#if !(defined GMX_NBNXN_SIMD_4XN || defined GMX_NBNXN_SIMD_2XNN)
+#error "No SIMD kernel type defined"
+#endif
+
+#endif /* GMX_NBNXN_SIMD */
+
+#endif /* _nbnxn_simd_h */
#include "gromacs/utility/gmxomp.h"
/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
-#define PME_SIMD_SOLVE
+# define PME_SIMD_SOLVE
#endif
#define PME_GRID_QA 0 /* Gridindex for A-state for Q */
/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
/* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#if (defined GMX_SIMD4_HAVE_REAL)
/* Do PME spread and gather with 4-wide SIMD.
* NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
*/
-#define PME_SIMD4_SPREAD_GATHER
+# define PME_SIMD4_SPREAD_GATHER
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
/* With PME-order=4 on x86, unaligned load+store is slightly faster
* than doubling all SIMD operations when using aligned load+store.
*/
-#define PME_SIMD4_UNALIGNED
-#endif
+# define PME_SIMD4_UNALIGNED
+# endif
#endif
#define DFT_TOL 1e-7
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
-#define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
+# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
#else
/* We can use any alignment, apart from 0, so we use 4 reals */
-#define SIMD4_ALIGNMENT (4*sizeof(real))
+# define SIMD4_ALIGNMENT (4*sizeof(real))
#endif
/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
int offx, offy, offz;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
#endif
pnx = pmegrid->s[XX];
}
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_EXP
+#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
}
#endif
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_ERFC
+#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
{
pme_spline_work_t *work;
#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
- real thz_buffer[12], *thz_aligned;
- real dthz_buffer[12], *dthz_aligned;
+ real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+ real dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
- thz_aligned = gmx_simd4_align_real(thz_buffer);
- dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+ thz_aligned = gmx_simd4_align_r(thz_buffer);
+ dthz_aligned = gmx_simd4_align_r(dthz_buffer);
#endif
work = pme->spline_work;
pme_spline_work_t *work;
#ifdef PME_SIMD4_SPREAD_GATHER
- real tmp[12], *tmp_aligned;
+ real tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
gmx_simd4_real_t zero_S;
gmx_simd4_real_t real_mask_S0, real_mask_S1;
- int of, i;
+ int of, i;
snew_aligned(work, 1, SIMD4_ALIGNMENT);
- tmp_aligned = gmx_simd4_align_real(tmp);
+ tmp_aligned = gmx_simd4_align_r(tmp);
zero_S = gmx_simd4_setzero_r();
* as we only operate on order of the 8 grid entries that are
* load into 2 SIMD registers.
*/
- for (of = 0; of < 8-(order-1); of++)
+ for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
{
- for (i = 0; i < 8; i++)
+ for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
{
tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
}
real_mask_S0 = gmx_simd4_load_r(tmp_aligned);
- real_mask_S1 = gmx_simd4_load_r(tmp_aligned+4);
+ real_mask_S1 = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
}
ty_S4 = gmx_simd4_set1_r(thy[4]);
#endif
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#ifdef PME_SIMD4_UNALIGNED
tz_S0 = gmx_simd4_loadu_r(thz-offset);
tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
#else
fy_S = gmx_simd4_setzero_r();
fz_S = gmx_simd4_setzero_r();
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#ifdef PME_SIMD4_UNALIGNED
tz_S0 = gmx_simd4_loadu_r(thz-offset);
tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
dz_S0 = gmx_simd4_loadu_r(dthz-offset);
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include "gromacs/simd/general_x86_sse2.h"
-#endif
-
-
static void global_max(t_commrec *cr, int *n)
{
int *sum, i;
gmx_fatal(FARGS, "Unknown integrator %s", ei_names[inputrec->eI]);
}
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
- /* Make sure we don't detect SSE overflow generated before this point */
- gmx_mm_check_and_reset_overflow();
+#ifdef GMX_SIMD
+ /* Make sure we don't detect SIMD overflow generated before this point */
+ gmx_simd_check_and_reset_overflow();
#endif
while (bNotLastFrame)
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Register this module's unit tests only when testing is enabled
+# (BUILD_TESTING is the standard switch defined by CTest).
+if (BUILD_TESTING)
+    add_subdirectory(tests)
+endif ()
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code with a SIMD width of 4.
- * To support a new architecture, adding macros here should be all
- * that is needed.
- *
- * Note that this file is intended only for SIMD operations that require
- * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
- * support, more functionality and higher performance, but the SIMD width is
- * not necessarily equal to 4.
- */
-
-#ifdef GMX_SIMD_FOUR_WIDE_MACROS_H
-#error "four_wide_macros.h included twice"
-#else
-#define GMX_SIMD_FOUR_WIDE_MACROS_H
-
-
-/* The SIMD width here is always 4, since that is the whole point */
-#define GMX_SIMD4_WIDTH 4
-
-
-#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
-/* Precision set before inclusion, honour that request */
-#else
-/* Match precision to the Gromacs real precision */
-#ifdef GMX_DOUBLE
-#define GMX_SIMD4_DOUBLE
-#else
-#define GMX_SIMD4_SINGLE
-#endif
-#endif
-
-#ifdef GMX_SIMD4_DOUBLE
-typedef double gmx_simd4_real;
-#endif
-#ifdef GMX_SIMD4_SINGLE
-typedef float gmx_simd4_real;
-#endif
-
-/* Uncomment the next line, without other SIMD active, for testing plain-C */
-/* #define GMX_SIMD4_REFERENCE */
-#ifdef GMX_SIMD4_REFERENCE
-/* Plain C SIMD reference implementation, also serves as documentation */
-#define GMX_HAVE_SIMD4_MACROS
-
-/* Include plain-C reference implementation, also serves as documentation */
-#include "four_wide_macros_ref.h"
-
-/* float/double SIMD register type */
-#define gmx_simd4_real_t gmx_simd4_ref_pr
-
-/* boolean SIMD register type */
-#define gmx_simd4_bool_t gmx_simd4_ref_pb
-
-#define gmx_simd4_load_r gmx_simd4_ref_load_pr
-#define gmx_simd4_load_bb_pr gmx_simd4_ref_load_pr
-#define gmx_simd4_set1_r gmx_simd4_ref_set1_pr
-#define gmx_simd4_setzero_r gmx_simd4_ref_setzero_pr
-#define gmx_simd4_store_r gmx_simd4_ref_store_pr
-
-/* Unaligned load+store are not required,
- * but they can speed up the PME spread+gather operations.
- */
-#define GMX_SIMD4_HAVE_UNALIGNED
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r gmx_simd4_ref_load_pr
-#define gmx_simd4_storeu_r gmx_simd4_ref_store_pr
-#endif
-
-#define gmx_simd4_add_r gmx_simd4_ref_add_pr
-#define gmx_simd4_sub_r gmx_simd4_ref_sub_pr
-#define gmx_simd4_mul_r gmx_simd4_ref_mul_pr
-/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_simd4_fmadd_r gmx_simd4_ref_madd_pr
-#define gmx_simd4_fnmadd_r gmx_simd4_ref_nmsub_pr
-
-#define gmx_simd4_dotproduct3_r gmx_simd4_ref_dotproduct3
-
-#define gmx_simd4_min_r gmx_simd4_ref_min_pr
-#define gmx_simd4_max_r gmx_simd4_ref_max_pr
-
-#define gmx_simd4_blendzero_r gmx_simd4_ref_blendzero_pr
-
-/* Comparison */
-#define gmx_simd4_cmplt_r gmx_simd4_ref_cmplt_pr
-
-/* Logical operations on SIMD booleans */
-#define gmx_simd4_and_b gmx_simd4_ref_and_pb
-#define gmx_simd4_or_b gmx_simd4_ref_or_pb
-
-/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_simd4_anytrue_b gmx_simd4_ref_anytrue_pb
-
-#endif /* GMX_SIMD4_REFERENCE */
-
-
-/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
- * to instructions for) different SIMD width and float precision.
- *
- * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
- * The _pr suffix is replaced by _ps or _pd (for single or double precision).
- * Compiler settings will decide if 128-bit intrinsics will
- * be translated into SSE or AVX instructions.
- */
-
-
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-/* This is for general x86 SIMD instruction sets that also support SSE2 */
-
-#ifdef GMX_SIMD4_SINGLE
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-
-#ifdef GMX_SIMD4_DOUBLE
-/* Note that here we will use 256-bit SIMD with GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER.
- * This is inconsistent naming wise, but should give the best performance.
- */
-#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-#endif
-
-#ifdef GMX_HAVE_SIMD4_MACROS
-
-#if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#else
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#include <smmintrin.h>
-#else
-/* We only have SSE2 */
-#include <emmintrin.h>
-#endif
-#endif
-
-#ifdef GMX_SIMD4_SINGLE
-
-#define gmx_simd4_real_t __m128
-
-#define gmx_simd4_bool_t __m128
-
-#define gmx_simd4_load_r _mm_load_ps
-#define gmx_simd4_load_bb_pr _mm_load_ps
-#define gmx_simd4_set1_r _mm_set1_ps
-#define gmx_simd4_setzero_r _mm_setzero_ps
-#define gmx_simd4_store_r _mm_store_ps
-
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define GMX_SIMD4_HAVE_UNALIGNED
-#endif
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r _mm_loadu_ps
-#define gmx_simd4_storeu_r _mm_storeu_ps
-#endif
-
-#define gmx_simd4_add_r _mm_add_ps
-#define gmx_simd4_sub_r _mm_sub_ps
-#define gmx_simd4_mul_r _mm_mul_ps
-
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define gmx_simd4_fmadd_r(a, b, c) _mm_macc_ps(a, b, c)
-#define gmx_simd4_fnmadd_r(a, b, c) _mm_nmacc_ps(a, b, c)
-#else
-#define gmx_simd4_fmadd_r(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_simd4_fnmadd_r(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
-#endif
-
-static inline float gmx_simd4_dotproduct3_r(__m128 a, __m128 b)
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-{
- float dp;
-
- /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
- _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));
-
- return dp;
-}
-#else
-{
- float dp_array[7], *dp;
-
- /* Generate an aligned pointer */
- dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));
-
- _mm_store_ps(dp, _mm_mul_ps(a, b));
-
- return dp[0] + dp[1] + dp[2];
-}
-#endif
-
-#define gmx_simd4_min_r _mm_min_ps
-#define gmx_simd4_max_r _mm_max_ps
-
-#define gmx_simd4_blendzero_r _mm_and_ps
-
-#define gmx_simd4_cmplt_r _mm_cmplt_ps
-#define gmx_simd4_and_b _mm_and_ps
-#define gmx_simd4_or_b _mm_or_ps
-
-#define gmx_simd4_anytrue_b _mm_movemask_ps
-
-#endif /* GMX_SIMD4_SINGLE */
-
-
-#ifdef GMX_SIMD4_DOUBLE
-
-#define gmx_simd4_real_t __m256d
-
-#define gmx_simd4_bool_t __m256d
-
-#define gmx_simd4_load_r _mm256_load_pd
-#define gmx_simd4_load_bb_pr _mm256_load_pd
-#define gmx_simd4_set1_r _mm256_set1_pd
-#define gmx_simd4_setzero_r _mm256_setzero_pd
-#define gmx_simd4_store_r _mm256_store_pd
-
-#define GMX_SIMD4_HAVE_UNALIGNED
-#define gmx_simd4_loadu_r _mm256_loadu_pd
-#define gmx_simd4_storeu_r _mm256_storeu_pd
-
-#define gmx_simd4_add_r _mm256_add_pd
-#define gmx_simd4_sub_r _mm256_sub_pd
-#define gmx_simd4_mul_r _mm256_mul_pd
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define gmx_simd4_fmadd_r(a, b, c) _mm256_macc_pd(a, b, c)
-#define gmx_simd4_fnmadd_r(a, b, c) _mm256_nmacc_pd(a, b, c)
-#else
-#define gmx_simd4_fmadd_r(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd4_fnmadd_r(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
-#endif
-#define gmx_simd4_min_r _mm256_min_pd
-#define gmx_simd4_max_r _mm256_max_pd
-
-#define gmx_simd4_blendzero_r _mm256_and_pd
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd4_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
-#define gmx_simd4_and_b _mm256_and_pd
-#define gmx_simd4_or_b _mm256_or_pd
-
-#define gmx_simd4_anytrue_b _mm256_movemask_pd
-
-#endif /* GMX_SIMD4_DOUBLE */
-
-
-#endif /* GMX_HAVE_SIMD4_MACROS */
-
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-
-#ifdef GMX_SIMD_IBM_QPX
-/* i.e. BlueGene/Q */
-
-/* This hack works on the compilers that can reach this code. A real
- solution with broader scope will be proposed in master branch. */
-#define gmx_always_inline __attribute__((always_inline))
-
-#ifdef GMX_SIMD4_SINGLE
-#define GMX_HAVE_SIMD4_MACROS
-#endif
-
-typedef vector4double gmx_simd4_real_t;
-typedef vector4double gmx_simd4_bool_t;
-
-/* The declarations of vec_ld* use non-const pointers, and IBM
- can't/won't fix this any time soon. So GROMACS has to cast away the
- const-ness of its pointers before loads. Four-wide SIMD loads
- sometimes occur from variables of type real, and sometimes from
- variables of type float (even at double precison), so the correct
- cast cannot be done easily. The correct cast is necessary because
- the resulting type determines the alignment assumption of vec_ld*,
- which is different for float and double. So the loads of
- always-float variables have to be done with a function that does
- the correct cast. Since functions cannot be overloaded by type in
- C, they have to have different names. Thus we have
- gmx_simd4_load_r and gmx_simd4_load_bb_pr.
- */
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_r(const real *a)
-{
-#ifdef NDEBUG
- return vec_ld(0, (real *) a);
-#else
- return vec_lda(0, (real *) a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
-{
-#ifdef NDEBUG
- return vec_ld(0, (float *) a);
-#else
- return vec_lda(0, (float *) a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_set1_r(const real a)
-{
- return vec_splats(a);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_setzero_r()
-{
- return vec_splats(0.0);
-}
-
-/* TODO this will not yet work, because the function might be passed a
- pointer to a float when running in double precision.
- */
-static gmx_inline void gmx_always_inline gmx_simd4_store_r(real *a, gmx_simd4_real_t b)
-{
-#ifdef NDEBUG
- vec_st(b, 0, a);
-#else
- vec_sta(b, 0, a);
-#endif
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_add(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_sub_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_sub(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_mul_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_mul(a, b);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
-{
- return vec_madd(a, b, c);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fnmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
-{
- return vec_nmsub(a, b, c);
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_min_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- /* Implemented the same way as max, but with the subtraction
- operands swapped. */
- return vec_sel(b, a, vec_sub(b, a));
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_max_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_sel(b, a, vec_sub(a, b));
-}
-
-static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_blendzero_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_sel(gmx_simd_setzero_r(), a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_cmplt_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- return vec_cmplt(a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_and_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
-{
- return vec_and(a, b);
-}
-
-static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_or_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
-{
- return vec_or(a, b);
-}
-
-static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
-{
- /* The dot product is done solely on the QPX AXU (which is the
- only available FPU). This is awkward, because pretty much no
- "horizontal" SIMD-vector operations exist, unlike x86 where
- SSE4.1 added various kinds of horizontal operations. So we have
- to make do with shifting vector elements and operating on the
- results. This makes for lots of data dependency, but the main
- alternative of storing to memory and reloading is not going to
- help, either. OpenMP over 2 or 4 hardware threads per core will
- hide much of the latency from the data dependency. The
- vec_extract() lets the compiler correctly use a floating-point
- comparison on the zeroth vector element, which avoids needing
- memory at all.
- */
-
- gmx_simd4_real_t dp_shifted_left_0 = vec_mul(a, b);
- gmx_simd4_real_t dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
- gmx_simd4_real_t dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
- gmx_simd4_real_t dp = vec_add(dp_shifted_left_2,
- vec_add(dp_shifted_left_0, dp_shifted_left_1));
-
- /* See comment in nbnxn_make_pairlist_part() about how this should
- be able to return a double on PowerPC. */
- return (float) vec_extract(dp, 0);
-}
-
-static gmx_inline int gmx_always_inline gmx_simd4_anytrue_b(gmx_simd4_bool_t a)
-{
- return gmx_simd_anytrue_b(a);
-}
-
-#undef gmx_always_inline
-
-#endif /* GMX_SIMD_IBM_QPX */
-
-#ifdef GMX_HAVE_SIMD4_MACROS
-/* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
- * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
- * to how many you want to use, to avoid indexing outside the aligned region.
- */
-
-static gmx_inline gmx_simd4_real *
-gmx_simd4_align_real(const gmx_simd4_real *x)
-{
- return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
-}
-#endif
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_FOUR_WIDE_MACROS_REF_H
-#define GMX_SIMD_FOUR_WIDE_MACROS_REF_H
-
-/* This file contains a reference plain-C implementation of 4-wide SIMD.
- * This code is only useful for testing and documentation.
- * Either float or double precision is supported through gmx_simd4_real,
- * which is set in gmx_simd4_macros.h
- */
-
-
-#include <math.h>
-
-/* float/double SIMD register type */
-typedef struct {
- gmx_simd4_real r[GMX_SIMD4_WIDTH];
-} gmx_simd4_ref_pr;
-
-/* boolean SIMD register type */
-typedef struct {
- char r[GMX_SIMD4_WIDTH];
-} gmx_simd4_ref_pb;
-
-
-/* Load GMX_SIMD4_WIDTH reals for memory starting at r */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_load_pr(const gmx_simd4_real *r)
-{
- gmx_simd4_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- a.r[i] = r[i];
- }
-
- return a;
-}
-
-/* Set all SIMD register elements to r */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_set1_pr(gmx_simd4_real r)
-{
- gmx_simd4_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- a.r[i] = r;
- }
-
- return a;
-}
-
-/* Set all SIMD register elements to 0 */
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_setzero_pr()
-{
- gmx_simd4_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- a.r[i] = 0.0;
- }
-
- return a;
-}
-
-static gmx_inline void
-gmx_simd4_ref_store_pr(gmx_simd4_real *dest, gmx_simd4_ref_pr src)
-{
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- dest[i] = src.r[i];
- }
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_add_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = a.r[i] + b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_sub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = a.r[i] - b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_mul_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = a.r[i]*b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_madd_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
-{
- gmx_simd4_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- d.r[i] = a.r[i]*b.r[i] + c.r[i];
- }
-
- return d;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_nmsub_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b, gmx_simd4_ref_pr c)
-{
- gmx_simd4_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- d.r[i] = -a.r[i]*b.r[i] + c.r[i];
- }
-
- return d;
-}
-
-static gmx_inline gmx_simd4_real
-gmx_simd4_ref_dotproduct3(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_real dp;
- int i;
-
- dp = 0.0;
- for (i = 0; i < 3; i++)
- {
- dp += a.r[i]*b.r[i];
- }
-
- return dp;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_min_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_max_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd4_ref_pr
-gmx_simd4_ref_blendzero_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pb b)
-{
- gmx_simd4_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (b.r[i] ? a.r[i] : 0.0);
- }
-
- return c;
-}
-
-/* Comparison */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_cmplt_pr(gmx_simd4_ref_pr a, gmx_simd4_ref_pr b)
-{
- gmx_simd4_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (a.r[i] < b.r[i]);
- }
-
- return c;
-}
-
-/* Logical AND on SIMD booleans */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_and_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
-{
- gmx_simd4_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (a.r[i] && b.r[i]);
- }
-
- return c;
-}
-
-/* Logical OR on SIMD booleans */
-static gmx_inline gmx_simd4_ref_pb
-gmx_simd4_ref_or_pb(gmx_simd4_ref_pb a, gmx_simd4_ref_pb b)
-{
- gmx_simd4_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- c.r[i] = (a.r[i] || b.r[i]);
- }
-
- return c;
-}
-
-/* gmx_simd_anytrue_b(x) returns if any of the boolean is x is True */
-static gmx_inline int
-gmx_simd4_ref_anytrue_pb(gmx_simd4_ref_pb a)
-{
- int anytrue;
- int i;
-
- anytrue = 0;
- for (i = 0; i < GMX_SIMD4_WIDTH; i++)
- {
- if (a.r[i])
- {
- anytrue = 1;
- }
- }
-
- return anytrue;
-}
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_AVX_128_FMA_H
-#define GMX_SIMD_GENERAL_AVX_128_FMA_H
-
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define _GMX_MM_BLEND(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-
-#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
-
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
- __m128d __gmx_t1 = row0; \
- row0 = _mm_unpacklo_pd(row0, row1); \
- row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-# define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-# define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-# define gmx_mm_castps_ps128(a) (a)
-# define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-# define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-# define gmx_mm_castsi128_ps(a) ((__m128)(a))
-# define gmx_mm_castps_si128(a) ((__m128i)(a))
-# define gmx_mm_castps_ps128(a) ((__m128)(a))
-# define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-# define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128 gmx_mm_castsi128_ps(__m128i a)
-{
- return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
- return *(__m128i *) &a;
-}
-static __m128 gmx_mm_castps_ps128(__m128 a)
-{
- return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
- return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
- return *(__m128i *) &a;
-}
-#endif
-
-#if GMX_EMULATE_AMD_FMA
-/* Wrapper routines so we can do test builds on non-FMA or non-AMD hardware */
-static __m128
-_mm_macc_ps(__m128 a, __m128 b, __m128 c)
-{
- return _mm_add_ps(c, _mm_mul_ps(a, b));
-}
-
-static __m128
-_mm_nmacc_ps(__m128 a, __m128 b, __m128 c)
-{
- return _mm_sub_ps(c, _mm_mul_ps(a, b));
-}
-
-static __m128
-_mm_msub_ps(__m128 a, __m128 b, __m128 c)
-{
- return _mm_sub_ps(_mm_mul_ps(a, b), c);
-}
-
-static __m128d
-_mm_macc_pd(__m128d a, __m128d b, __m128d c)
-{
- return _mm_add_pd(c, _mm_mul_pd(a, b));
-}
-
-static __m128d
-_mm_nmacc_pd(__m128d a, __m128d b, __m128d c)
-{
- return _mm_sub_pd(c, _mm_mul_pd(a, b));
-}
-
-static __m128d
-_mm_msub_pd(__m128d a, __m128d b, __m128d c)
-{
- return _mm_sub_pd(_mm_mul_pd(a, b), c);
-}
-#endif /* AMD FMA emulation support */
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
- int i[4];
-
- _mm_storeu_si128((__m128i *)i, xmmi);
- printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
- int MXCSR;
- int sse_overflow;
-
- MXCSR = _mm_getcsr();
- /* The overflow flag is bit 3 in the register */
- if (MXCSR & 0x0008)
- {
- sse_overflow = 1;
- /* Set the overflow flag to zero */
- MXCSR = MXCSR & 0xFFF7;
- _mm_setcsr(MXCSR);
- }
- else
- {
- sse_overflow = 0;
- }
-
- return sse_overflow;
-}
-
-/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
-# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
-# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
-# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
-# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
-#else
-# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), (mask))
-# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), (mask), (x))
-# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), (mask))
-# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
-#endif
-
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_AVX_256_H
-#define GMX_SIMD_GENERAL_AVX_256_H
-
-
-#include <immintrin.h>
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define _GMX_MM_BLEND256D(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-#define _GMX_MM_PERMUTE(fp3, fp2, fp1, fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-#define _GMX_MM_PERMUTE256D(fp3, fp2, fp1, fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
-#define _GMX_MM_PERMUTE128D(fp1, fp0) (((fp1) << 1) | ((fp0)))
-
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
- __m128d __gmx_t1 = row0; \
- row0 = _mm_unpacklo_pd(row0, row1); \
- row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-#define GMX_MM256_FULLTRANSPOSE4_PD(row0, row1, row2, row3) \
- { \
- __m256d _t0, _t1, _t2, _t3; \
- _t0 = _mm256_unpacklo_pd((row0), (row1)); \
- _t1 = _mm256_unpackhi_pd((row0), (row1)); \
- _t2 = _mm256_unpacklo_pd((row2), (row3)); \
- _t3 = _mm256_unpackhi_pd((row2), (row3)); \
- row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20); \
- row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20); \
- row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31); \
- row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31); \
- }
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-# define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-# define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-# define gmx_mm_castps_ps128(a) (a)
-# define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-# define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-# define gmx_mm_castsi128_ps(a) ((__m128)(a))
-# define gmx_mm_castps_si128(a) ((__m128i)(a))
-# define gmx_mm_castps_ps128(a) ((__m128)(a))
-# define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-# define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128 gmx_mm_castsi128_ps(__m128i a)
-{
- return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
- return *(__m128i *) &a;
-}
-static __m128 gmx_mm_castps_ps128(__m128 a)
-{
- return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
- return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
- return *(__m128i *) &a;
-}
-#endif
-
-static gmx_inline __m256
-gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
-{
- return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
-}
-
-static gmx_inline __m256
-gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
-{
- return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
-}
-
-static gmx_inline __m256
-gmx_mm256_set_m128(__m128 hi, __m128 lo)
-{
- return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
-}
-
-
-static gmx_inline __m256
-gmx_mm256_load4_ps(float const * p)
-{
- __m128 a;
-
- a = _mm_load_ps(p);
- return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 0x1);
-}
-
-
-static __m256d
-gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
-{
- return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
-}
-
-static __m256d
-gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
-{
- return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
-}
-
-static __m256d
-gmx_mm256_set_m128d(__m128d hi, __m128d lo)
-{
- return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
-}
-
-
-static __m128 gmx_mm256_sum4h_m128(__m256 x, __m256 y)
-{
- __m256 sum;
-
- sum = _mm256_add_ps(x, y);
- return _mm_add_ps(_mm256_castps256_ps128(sum), _mm256_extractf128_ps(sum, 0x1));
-}
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
- int i[4];
-
- _mm_storeu_si128((__m128i *)i, xmmi);
- printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-static void
-gmx_mm256_printymm_ps(const char *s, __m256 ymm)
-{
- float f[8];
-
- _mm256_storeu_ps(f, ymm);
- printf("%s: %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f\n", s, f[0], f[1], f[2], f[3], f[4], f[5], f[6], f[7]);
-}
-
-static void
-gmx_mm256_printymmsum_ps(const char *s, __m256 ymm)
-{
- float f[8];
-
- _mm256_storeu_ps(f, ymm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]+f[4]+f[5]+f[6]+f[7]);
-}
-
-
-static void
-gmx_mm256_printymm_pd(const char *s, __m256d ymm)
-{
- double f[4];
-
- _mm256_storeu_pd(f, ymm);
- printf("%s: %16.12f %16.12f %16.12f %16.12f\n", s, f[0], f[1], f[2], f[3]);
-}
-
-static void
-gmx_mm256_printymmsum_pd(const char *s, __m256d ymm)
-{
- double f[4];
-
- _mm256_storeu_pd(f, ymm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-
-static void
-gmx_mm256_printymm_epi32(const char *s, __m256i ymmi)
-{
- int i[8];
-
- _mm256_storeu_si256((__m256i *)i, ymmi);
- printf("%10s: %2d %2d %2d %2d %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
- int MXCSR;
- int sse_overflow;
-
- MXCSR = _mm_getcsr();
- /* The overflow flag is bit 3 in the register */
- if (MXCSR & 0x0008)
- {
- sse_overflow = 1;
- /* Set the overflow flag to zero */
- MXCSR = MXCSR & 0xFFF7;
- _mm_setcsr(MXCSR);
- }
- else
- {
- sse_overflow = 0;
- }
-
- return sse_overflow;
-}
-
-/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
-#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
-# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), _mm_castsi128_ps(mask))
-# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(mask), (x))
-# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), _mm256_castsi256_ps(mask))
-# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), _mm256_castsi256_ps(mask), (x))
-#else
-# define gmx_mm_maskload_ps(mem, mask) _mm_maskload_ps((mem), (mask))
-# define gmx_mm_maskstore_ps(mem, mask, x) _mm_maskstore_ps((mem), (mask), (x))
-# define gmx_mm256_maskload_ps(mem, mask) _mm256_maskload_ps((mem), (mask))
-# define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))
-#endif
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef _general_x86_mic_h_
-#define _general_x86_mic_h_
-
-/* This file contains the SIMD implmenetation for Intel MIC
- */
-
-#include <math.h>
-#include <immintrin.h>
-
-#ifdef GMX_DOUBLE
-#error "Double precision isn't supported on Intel Phi yet"
-#endif
-
-typedef __m512 gmx_mm_ps;
-typedef __m512 gmx_simd_real_t;
-/* boolean SIMD register type */
-typedef __mmask16 gmx_simd_bool_t;
-typedef __m512i gmx_simd_int32_t;
-
-#define GMX_HAVE_SIMD_MACROS
-#define GMX_SIMD_REAL_WIDTH 16
-#define GMX_SIMD_INT32_WIDTH 16
-
-#define gmx_simd_load_r _mm512_load_ps
-
-/* Set all SIMD register elements to *r */
-static gmx_inline gmx_mm_ps
-gmx_simd_load1_r(const real *r)
-{
- return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
-}
-
-#define gmx_simd_set1_r _mm512_set1_ps
-/* Set all SIMD register elements to 0 */
-#define gmx_simd_setzero_r _mm512_setzero_ps
-#define gmx_simd_store_r _mm512_store_ps
-
-#define gmx_simd_add_r _mm512_add_ps
-#define gmx_simd_sub_r _mm512_sub_ps
-#define gmx_simd_mul_r _mm512_mul_ps
-
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r _mm512_fmadd_ps
-#define gmx_simd_fnmadd_r _mm512_fnmadd_ps
-
-#define gmx_simd_max_r _mm512_max_ps
-
-static gmx_inline gmx_mm_ps
-gmx_simd_blendzero_r(gmx_mm_ps a, gmx_simd_bool_t b)
-{
- return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a);
-}
-
-#define gmx_simd_round_r _mm512_rint_ps
-
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm512_floor_ps
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-static gmx_inline gmx_mm_ps
-gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b)
-{
- __m512 zero = _mm512_setzero_ps();
- __m512 neg1 = _mm512_set1_ps(-1);
- /* TODO (only bond): Bitwise operations on floating points can be done after casting to int.
- That allows us to do it the same way as AVX which might be faster. */
- return _mm512_mask_mul_ps(b, _mm512_cmplt_ps_mask(a, zero), b, neg1);
-}
-
-/* Very specific operation required in the non-bonded kernels */
-static gmx_inline gmx_mm_ps
-gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_mm_ps b, gmx_mm_ps c)
-{
- return _mm512_mask_add_ps(b, _mm512_knot(a), b, c);
-}
-
-/* Comparison */
-#define gmx_simd_cmplt_r _mm512_cmplt_ps_mask
-
-/* Logical AND on SIMD booleans. */
-#define gmx_simd_and_b _mm512_kand
-
-/* Logical OR on SIMD booleans. */
-#define gmx_simd_or_b _mm512_kor
-
-/* Returns a single int (0/1) which tells if any of the booleans is True
- It returns the full mask (not 1 for True). But given that any non-zero is True this is OK. */
-#define gmx_simd_anytrue_b _mm512_mask2int
-
-/* Conversions only used for PME table lookup */
-static gmx_inline gmx_simd_int32_t
-gmx_simd_cvtt_r2i(gmx_mm_ps a)
-{
- return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE);
-};
-
-/* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-#define gmx_simd_rsqrt_r _mm512_rsqrt23_ps
-#define gmx_simd_rcp_r _mm512_rcp23_ps
-
-#define GMX_SIMD_HAVE_EXP
-#define gmx_simd_exp_r _mm512_exp_ps
-
-#define GMX_SIMD_HAVE_ERFC
-#define gmx_simd_erfc_r _mm512_erfc_ps
-
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_simd_sqrt_r _mm512_sqrt_ps
-
-static gmx_inline int
-gmx_simd_sincos_r(gmx_mm_ps a,
- gmx_mm_ps *s, gmx_mm_ps *c)
-{
- /* TODO (only bond): optimize that both are calculated together.
- Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */
- *s = _mm512_sin_ps(a);
- *c = _mm512_cos_ps(a);
- return 0;
-}
-
-#define gmx_simd_acos_r _mm512_acos_ps
-#define gmx_simd_atan2_r _mm512_atan2_ps
-
-#endif /* _general_x86_mic_h_ */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_SSE2_H
-#define GMX_SIMD_GENERAL_SSE2_H
-
-#include <emmintrin.h>
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-
-
-/* Create some basic definitions that are not 100% SSE2 standard and thus not
- * available on all compilers. These should be fairly self-evident by comparing
- * with an arbitrary emmintrin.h.
- */
-
-#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
- __m128d __gmx_t1 = row0; \
- row0 = _mm_unpacklo_pd(row0, row1); \
- row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-# define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-# define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-# define gmx_mm_castps_ps128(a) (a)
-# define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-# define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-# define gmx_mm_castsi128_ps(a) ((__m128)(a))
-# define gmx_mm_castps_si128(a) ((__m128i)(a))
-# define gmx_mm_castps_ps128(a) ((__m128)(a))
-# define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-# define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128 gmx_mm_castsi128_ps(__m128i a)
-{
- return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
- return *(__m128i *) &a;
-}
-static __m128 gmx_mm_castps_ps128(__m128 a)
-{
- return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
- return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
- return *(__m128i *) &a;
-}
-#endif
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
- int i[4];
-
- _mm_storeu_si128((__m128i *)i, xmmi);
- printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
- int MXCSR;
- int sse_overflow;
-
- MXCSR = _mm_getcsr();
- /* The overflow flag is bit 3 in the register */
- if (MXCSR & 0x0008)
- {
- sse_overflow = 1;
- /* Set the overflow flag to zero */
- MXCSR = MXCSR & 0xFFF7;
- _mm_setcsr(MXCSR);
- }
- else
- {
- sse_overflow = 0;
- }
-
- return sse_overflow;
-}
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_GENERAL_SSE4_1_H
-#define GMX_SIMD_GENERAL_SSE4_1_H
-
-#include <smmintrin.h>
-
-#include <stdio.h>
-
-#include "types/simple.h"
-
-/* Create some basic definitions that are not 100% SSE2 standard and thus not
- * available on all compilers. These should be fairly self-evident by comparing
- * with an arbitrary emmintrin.h.
- */
-
-
-#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
-
-#define GMX_MM_TRANSPOSE2_PD(row0, row1) { \
- __m128d __gmx_t1 = row0; \
- row0 = _mm_unpacklo_pd(row0, row1); \
- row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
-}
-
-#define _GMX_MM_BLEND(b3, b2, b1, b0) (((b3) << 3) | ((b2) << 2) | ((b1) << 1) | ((b0)))
-
-#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
-# define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-# define gmx_mm_castps_si128(a) _mm_castps_si128(a)
-# define gmx_mm_castps_ps128(a) (a)
-# define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
-# define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
-#elif defined(__GNUC__)
-# define gmx_mm_castsi128_ps(a) ((__m128)(a))
-# define gmx_mm_castps_si128(a) ((__m128i)(a))
-# define gmx_mm_castps_ps128(a) ((__m128)(a))
-# define gmx_mm_castsi128_pd(a) ((__m128d)(a))
-# define gmx_mm_castpd_si128(a) ((__m128i)(a))
-#else
-static __m128 gmx_mm_castsi128_ps(__m128i a)
-{
- return *(__m128 *) &a;
-}
-static __m128i gmx_mm_castps_si128(__m128 a)
-{
- return *(__m128i *) &a;
-}
-static __m128 gmx_mm_castps_ps128(__m128 a)
-{
- return *(__m128 *) &a;
-}
-static __m128d gmx_mm_castsi128_pd(__m128i a)
-{
- return *(__m128d *) &a;
-}
-static __m128i gmx_mm_castpd_si128(__m128d a)
-{
- return *(__m128i *) &a;
-}
-#endif
-
-
-static void
-gmx_simd_real_tintxmm_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s: %15.10e %15.10e %15.10e %15.10e\n", s, f[0], f[1], f[2], f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmmsum_ps(const char *s, __m128 xmm)
-{
- float f[4];
-
- _mm_storeu_ps(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]+f[2]+f[3]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s: %30.20e %30.20e\n", s, f[0], f[1]);
-}
-
-static void
-gmx_simd_real_tintxmmsum_pd(const char *s, __m128d xmm)
-{
- double f[2];
-
- _mm_storeu_pd(f, xmm);
- printf("%s (sum): %15.10g\n", s, f[0]+f[1]);
-}
-
-
-static void
-gmx_simd_real_tintxmm_epi32(const char *s, __m128i xmmi)
-{
- int i[4];
-
- _mm_storeu_si128((__m128i *)i, xmmi);
- printf("%10s: %2d %2d %2d %2d\n", s, i[0], i[1], i[2], i[3]);
-}
-
-
-
-static int gmx_mm_check_and_reset_overflow(void)
-{
- int MXCSR;
- int sse_overflow;
-
- MXCSR = _mm_getcsr();
- /* The overflow flag is bit 3 in the register */
- if (MXCSR & 0x0008)
- {
- sse_overflow = 1;
- /* Set the overflow flag to zero */
- MXCSR = MXCSR & 0xFFF7;
- _mm_setcsr(MXCSR);
- }
- else
- {
- sse_overflow = 0;
- }
-
- return sse_overflow;
-}
-
-
-#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPLEMENTATION_IBM_QPX_H
+#define GMX_SIMD_IMPLEMENTATION_IBM_QPX_H
+
+#include <math.h>
+#ifdef __clang__
+#include <qpxmath.h>
+#endif
+
+/* IBM QPX SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+/* Capability definitions for IBM QPX */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#undef GMX_SIMD_HAVE_LOADU   /* QPX has no unaligned load (was mistakenly a second STOREU undef) */
+#undef GMX_SIMD_HAVE_STOREU  /* QPX has no unaligned store */
+#undef GMX_SIMD_HAVE_LOGICAL
+#define GMX_SIMD_HAVE_FMA
+#undef GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#undef GMX_SIMD_HAVE_FINT32_EXTRACT
+#undef GMX_SIMD_HAVE_FINT32_LOGICAL
+#undef GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#undef GMX_SIMD_HAVE_DINT32_EXTRACT
+#undef GMX_SIMD_HAVE_DINT32_LOGICAL
+#undef GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details: QPX is 4-wide in both single and double precision */
+#define GMX_SIMD_FLOAT_WIDTH 4
+#define GMX_SIMD_DOUBLE_WIDTH 4
+#define GMX_SIMD_FINT32_WIDTH 4
+#define GMX_SIMD_DINT32_WIDTH 4
+#define GMX_SIMD_RSQRT_BITS 14
+#define GMX_SIMD_RCP_BITS 14
+
+/****************************************************
+ * SINGLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_float_t vector4double
+#ifdef NDEBUG
+# define gmx_simd_load_f(m) vec_ld(0, (float *)(m))
+# define gmx_simd_store_f(m, a) vec_st(a, 0, (float *)(m))
+#else
+# define gmx_simd_load_f(m) vec_lda(0, (float *)(m))
+# define gmx_simd_store_f(m, a) vec_sta(a, 0, (float *)(m))
+#endif
+# define gmx_simd_load1_f(m) vec_lds(0, (float *)(m))
+#define gmx_simd_set1_f(x) vec_splats(x)
+/* No support for unaligned load/store on QPX; only aligned access is wrapped here */
+#define gmx_simd_setzero_f gmx_simd_setzero_ibm_qpx
+#define gmx_simd_add_f(a, b) vec_add(a, b)
+#define gmx_simd_sub_f(a, b) vec_sub(a, b)
+#define gmx_simd_mul_f(a, b) vec_mul(a, b)
+#define gmx_simd_fmadd_f(a, b, c) vec_madd(a, b, c)
+#define gmx_simd_fmsub_f(a, b, c) vec_msub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_f(a, b, c) vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_f(a, b, c) vec_nmadd(a, b, c)
+/* gmx_simd_and_f not supported - no bitwise logical ops */
+/* gmx_simd_andnot_f not supported - no bitwise logical ops */
+/* gmx_simd_or_f not supported - no bitwise logical ops */
+/* gmx_simd_xor_f not supported - no bitwise logical ops */
+#define gmx_simd_rsqrt_f(a) vec_rsqrte(a)
+#define gmx_simd_rcp_f(a) vec_re(a)
+#define gmx_simd_fabs_f(a) vec_abs(a)
+#define gmx_simd_fneg_f gmx_simd_fneg_ibm_qpx
+#define gmx_simd_max_f(a, b) vec_sel(b, a, vec_sub(a, b)) /* select a where a-b is non-negative */
+#define gmx_simd_min_f(a, b) vec_sel(b, a, vec_sub(b, a)) /* select a where b-a is non-negative */
+/* Note: It is critical to use vec_cfid(vec_ctid(a)) for the implementation
+ * of gmx_simd_round_f(), since vec_round() does not adhere to the FP control
+ * word rounding scheme. We rely on float-to-float and float-to-integer
+ * rounding being the same for half-way values in a few algorithms.
+ */
+#define gmx_simd_round_f(a) vec_cfid(vec_ctid(a))
+#define gmx_simd_trunc_f(a) vec_trunc(a)
+#define gmx_simd_fraction_f(x) vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_f(a) gmx_simd_get_exponent_ibm_qpx(a)
+#define gmx_simd_get_mantissa_f(a) gmx_simd_get_mantissa_ibm_qpx(a)
+#define gmx_simd_set_exponent_f(a) gmx_simd_set_exponent_ibm_qpx(a)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t vector4double
+#ifdef NDEBUG
+# define gmx_simd_load_fi(m) vec_ldia(0, (int *)(m))
+#else
+# define gmx_simd_load_fi(m) vec_ldiaa(0, (int *)(m))
+#endif
+#define gmx_simd_set1_fi(i) gmx_simd_set1_int_ibm_qpx(i)
+#define gmx_simd_store_fi(m, x) vec_st(x, 0, (int *)(m))
+#define gmx_simd_setzero_fi gmx_simd_setzero_ibm_qpx
+#define gmx_simd_cvt_f2i(a) vec_ctiw(a)
+#define gmx_simd_cvtt_f2i(a) vec_ctiwz(a)
+#define gmx_simd_cvt_i2f(a) vec_cfid(a)
+/* Integer simd extract not available */
+/* Integer logical ops on gmx_simd_fint32_t not supported */
+/* Integer arithmetic ops on gmx_simd_fint32_t not supported */
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t vector4double
+#define gmx_simd_cmpeq_f(a, b) vec_cmpeq(a, b)
+#define gmx_simd_cmplt_f(a, b) vec_cmplt((a), (b))
+#define gmx_simd_cmple_f(a, b) gmx_simd_or_fb(vec_cmpeq(a, b), vec_cmplt(a, b)) /* a<=b == (a==b)||(a<b) */
+#define gmx_simd_and_fb(a, b) vec_and(a, b)
+#define gmx_simd_or_fb(a, b) vec_or(a, b)
+#define gmx_simd_anytrue_fb(a) gmx_simd_anytrue_bool_ibm_qpx(a)
+#define gmx_simd_blendzero_f(a, sel) vec_sel(vec_splats(0.0), a, sel)
+#define gmx_simd_blendnotzero_f(a, sel) vec_sel(a, vec_splats(0.0), sel)
+#define gmx_simd_blendv_f(a, b, sel) vec_sel(a, b, sel)
+#define gmx_simd_reduce_f(a) gmx_simd_reduce_ibm_qpx(a)
+
+
+/* Boolean & comparison operations on gmx_simd_fint32_t not supported */
+/* Conversions between different booleans not supported */
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_fneg_ibm_qpx(vector4double a)
+{
+    return vec_neg(a); /* helper: QPX only offers negation as the vec_neg intrinsic */
+}
+/****************************************************
+ * DOUBLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_double_t vector4double
+#ifdef NDEBUG
+# define gmx_simd_load_d(m) vec_ld(0, (double *)(m))
+# define gmx_simd_store_d(m, a) vec_st(a, 0, (double *)(m))
+#else
+# define gmx_simd_load_d(m) vec_lda(0, (double *)(m))
+# define gmx_simd_store_d(m, a) vec_sta(a, 0, (double *)(m))
+#endif
+# define gmx_simd_load1_d(m) vec_lds(0, (double *)(m))
+#define gmx_simd_set1_d(x) vec_splats(x)
+/* No support for unaligned load/store on QPX; only aligned access is wrapped here */
+#define gmx_simd_setzero_d gmx_simd_setzero_ibm_qpx
+#define gmx_simd_add_d(a, b) vec_add(a, b)
+#define gmx_simd_sub_d(a, b) vec_sub(a, b)
+#define gmx_simd_mul_d(a, b) vec_mul(a, b)
+#define gmx_simd_fmadd_d(a, b, c) vec_madd(a, b, c)
+#define gmx_simd_fmsub_d(a, b, c) vec_msub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_d(a, b, c) vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_d(a, b, c) vec_nmadd(a, b, c)
+/* gmx_simd_and_d not supported - no bitwise logical ops */
+/* gmx_simd_andnot_d not supported - no bitwise logical ops */
+/* gmx_simd_or_d not supported - no bitwise logical ops */
+/* gmx_simd_xor_d not supported - no bitwise logical ops */
+#define gmx_simd_rsqrt_d(a) vec_rsqrte(a)
+#define gmx_simd_rcp_d(a) vec_re(a)
+#define gmx_simd_fabs_d(a) vec_abs(a)
+#define gmx_simd_fneg_d gmx_simd_fneg_ibm_qpx
+#define gmx_simd_max_d(a, b) vec_sel(b, a, vec_sub(a, b)) /* select a where a-b is non-negative */
+#define gmx_simd_min_d(a, b) vec_sel(b, a, vec_sub(b, a)) /* select a where b-a is non-negative */
+/* Note: It is critical to use vec_cfid(vec_ctid(a)) for the implementation
+ * of gmx_simd_round_d(), since vec_round() does not adhere to the FP control
+ * word rounding scheme. We rely on float-to-float and float-to-integer
+ * rounding being the same for half-way values in a few algorithms.
+ */
+#define gmx_simd_round_d(a) vec_cfid(vec_ctid(a))
+#define gmx_simd_trunc_d(a) vec_trunc(a)
+#define gmx_simd_fraction_d(x) vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_d(a) gmx_simd_get_exponent_ibm_qpx(a)
+#define gmx_simd_get_mantissa_d(a) gmx_simd_get_mantissa_ibm_qpx(a)
+#define gmx_simd_set_exponent_d(a) gmx_simd_set_exponent_ibm_qpx(a)
+/* integer datatype corresponding to double: gmx_simd_dint32_t */
+#define gmx_simd_dint32_t vector4double
+#ifdef NDEBUG
+# define gmx_simd_load_di(m) vec_ldia(0, (int *)(m))
+#else
+# define gmx_simd_load_di(m) vec_ldiaa(0, (int *)(m))
+#endif
+#define gmx_simd_set1_di(i) gmx_simd_set1_int_ibm_qpx(i)
+#define gmx_simd_store_di(m, x) vec_st(x, 0, (int *)(m))
+#define gmx_simd_setzero_di gmx_simd_setzero_ibm_qpx
+#define gmx_simd_cvt_d2i(a) vec_ctiw(a)
+#define gmx_simd_cvtt_d2i(a) vec_ctiwz(a)
+#define gmx_simd_cvt_i2d(a) vec_cfid(a)
+/* Integer simd extract not available */
+/* Integer logical ops on gmx_simd_dint32_t not supported */
+/* Integer arithmetic ops on gmx_simd_dint32_t not supported */
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t vector4double
+#define gmx_simd_cmpeq_d(a, b) vec_cmpeq(a, b)
+#define gmx_simd_cmplt_d(a, b) vec_cmplt((a), (b))
+#define gmx_simd_cmple_d(a, b) gmx_simd_or_db(vec_cmpeq(a, b), vec_cmplt(a, b)) /* use the _db OR (was _fb; same vec_or, fixed for consistency) */
+#define gmx_simd_and_db(a, b) vec_and(a, b)
+#define gmx_simd_or_db(a, b) vec_or(a, b)
+#define gmx_simd_anytrue_db(a) gmx_simd_anytrue_bool_ibm_qpx(a)
+#define gmx_simd_blendzero_d(a, sel) vec_sel(vec_splats(0.0), a, sel)
+#define gmx_simd_blendnotzero_d(a, sel) vec_sel(a, vec_splats(0.0), sel)
+#define gmx_simd_blendv_d(a, b, sel) vec_sel(a, b, sel)
+#define gmx_simd_reduce_d(a) gmx_simd_reduce_ibm_qpx(a)
+
+/* Boolean & comparison operations on gmx_simd_dint32_t not supported */
+/* Conversions between different booleans not supported */
+
+
+/****************************************************
+ * IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+static __attribute__((always_inline)) vector4double
+gmx_simd_setzero_ibm_qpx(void)
+{
+    return vec_splats(0.0);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_get_exponent_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t expmask = 0x7ff0000000000000LL; /* IEEE-754 double exponent field (bits 62..52) */
+    const gmx_int64_t expbase = 1023;                 /* IEEE-754 double exponent bias */
+    gmx_int64_t idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory */
+    vec_st(x, 0, idata);
+    /* Perform integer arithmetics in general registers. */
+    idata[0] = ((idata[0] & expmask) >> 52) - expbase;
+    idata[1] = ((idata[1] & expmask) >> 52) - expbase;
+    idata[2] = ((idata[2] & expmask) >> 52) - expbase;
+    idata[3] = ((idata[3] & expmask) >> 52) - expbase;
+    /* Reload from memory and convert the integer exponents to doubles */
+    return vec_cfid(vec_ld(0, idata));
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_get_mantissa_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t exp_and_sign_mask = 0xfff0000000000000LL; /* sign + exponent bits */
+    const gmx_int64_t ione = 0x3ff0000000000000LL;              /* exponent pattern of 1.0, i.e. biased exponent 1023 */
+    gmx_int64_t idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory */
+    vec_st(x, 0, idata);
+    /* Clear sign/exponent and force the exponent of 1.0, yielding a mantissa in [1,2) */
+    idata[0] = (idata[0] & (~exp_and_sign_mask)) | ione;
+    idata[1] = (idata[1] & (~exp_and_sign_mask)) | ione;
+    idata[2] = (idata[2] & (~exp_and_sign_mask)) | ione;
+    idata[3] = (idata[3] & (~exp_and_sign_mask)) | ione;
+    /* Reload from memory */
+    return vec_ld(0, idata);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_set_exponent_ibm_qpx(vector4double x)
+{
+    const gmx_int64_t expbase = 1023; /* IEEE-754 double exponent bias */
+    gmx_int64_t idata[4] __attribute__((aligned(32)));
+
+    /* Store to memory for shifts. It is REALLY critical that we use the same
+     * rounding mode as for gmx_simd_round_r() here. In particular, for QPX
+     * this means we implement gmx_simd_round_r(a) as vec_cfid(vec_ctid(a)),
+     * since vec_round() uses a different rounding scheme.
+     */
+    vec_st(vec_ctid(x), 0, idata);
+    /* Bias the exponent and shift it into the exponent field of the double */
+    idata[0] = (idata[0] + expbase) << 52;
+    idata[1] = (idata[1] + expbase) << 52;
+    idata[2] = (idata[2] + expbase) << 52;
+    idata[3] = (idata[3] + expbase) << 52;
+    /* Reload from memory */
+    return vec_ld(0, idata);
+}
+
+static __attribute__((always_inline)) double
+gmx_simd_reduce_ibm_qpx(vector4double x)
+{
+    vector4double y = vec_sldw(x, x, 2); /* rotate by two words: pairs (0,2) and (1,3) line up */
+    vector4double z;
+
+    y = vec_add(y, x);        /* y0 = x0+x2, y1 = x1+x3 */
+    z = vec_sldw(y, y, 1);    /* rotate by one word to line up the partial sums */
+    y = vec_add(y, z);        /* element 0 now holds the full sum */
+    return vec_extract(y, 0);
+}
+
+static __attribute__((always_inline)) vector4double
+gmx_simd_set1_int_ibm_qpx(int i)
+{
+    int idata[4] __attribute__((aligned(32)));
+
+    idata[0] = i;
+
+    /* Load element 0 from memory and broadcast it (remaining elements are unused) */
+    return vec_splat(vec_ldia(0, idata), 0);
+}
+
+/* This works in both single and double */
+static __attribute__((always_inline)) int
+gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
+{
+    vector4double b = vec_sldw(a, a, 2);
+
+    a = vec_or(a, b);      /* OR lanes pairwise */
+    b = vec_sldw(a, a, 1);
+    a = vec_or(a, b);      /* lane 0 now ORs all four lanes */
+    return (vec_extract(a, 0) > 0); /* QPX booleans: positive means true — element 0 decides */
+}
+
+/* QPX is already 4-wide both in single and double, so just reuse for SIMD4 */
+
+/* SINGLE */
+#define gmx_simd4_float_t gmx_simd_float_t
+#define gmx_simd4_load_f gmx_simd_load_f
+#define gmx_simd4_load1_f gmx_simd_load1_f
+#define gmx_simd4_set1_f gmx_simd_set1_f
+#define gmx_simd4_store_f gmx_simd_store_f
+#define gmx_simd4_loadu_f gmx_simd_loadu_f /* unaligned/logical ops are not available on QPX; these mappings are only valid if the corresponding capability is set */
+#define gmx_simd4_storeu_f gmx_simd_storeu_f
+#define gmx_simd4_setzero_f gmx_simd_setzero_f
+#define gmx_simd4_add_f gmx_simd_add_f
+#define gmx_simd4_sub_f gmx_simd_sub_f
+#define gmx_simd4_mul_f gmx_simd_mul_f
+#define gmx_simd4_fmadd_f gmx_simd_fmadd_f
+#define gmx_simd4_fmsub_f gmx_simd_fmsub_f
+#define gmx_simd4_fnmadd_f gmx_simd_fnmadd_f
+#define gmx_simd4_fnmsub_f gmx_simd_fnmsub_f
+#define gmx_simd4_and_f gmx_simd_and_f
+#define gmx_simd4_andnot_f gmx_simd_andnot_f
+#define gmx_simd4_or_f gmx_simd_or_f
+#define gmx_simd4_xor_f gmx_simd_xor_f
+#define gmx_simd4_rsqrt_f gmx_simd_rsqrt_f
+#define gmx_simd4_rcp_f gmx_simd_rcp_f
+#define gmx_simd4_fabs_f gmx_simd_fabs_f
+#define gmx_simd4_fneg_f gmx_simd_fneg_f
+#define gmx_simd4_max_f gmx_simd_max_f
+#define gmx_simd4_min_f gmx_simd_min_f
+#define gmx_simd4_round_f gmx_simd_round_f
+#define gmx_simd4_trunc_f gmx_simd_trunc_f
+#define gmx_simd4_fraction_f gmx_simd_fraction_f
+#define gmx_simd4_get_exponent_f gmx_simd_get_exponent_f
+#define gmx_simd4_get_mantissa_f gmx_simd_get_mantissa_f
+#define gmx_simd4_set_exponent_f gmx_simd_set_exponent_f
+#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_ibm_qpx
+#define gmx_simd4_fint32_t gmx_simd_fint32_t
+#define gmx_simd4_load_fi gmx_simd_load_fi
+#define gmx_simd4_load1_fi gmx_simd_load1_fi
+#define gmx_simd4_set1_fi gmx_simd_set1_fi
+#define gmx_simd4_store_fi gmx_simd_store_fi
+#define gmx_simd4_loadu_fi gmx_simd_loadu_fi
+#define gmx_simd4_storeu_fi gmx_simd_storeu_fi
+#define gmx_simd4_setzero_fi gmx_simd_setzero_fi
+#define gmx_simd4_cvt_f2i gmx_simd_cvt_f2i
+#define gmx_simd4_cvtt_f2i gmx_simd_cvtt_f2i
+#define gmx_simd4_cvt_i2f gmx_simd_cvt_i2f
+#define gmx_simd4_fbool_t gmx_simd_fbool_t
+#define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f
+#define gmx_simd4_cmplt_f gmx_simd_cmplt_f
+#define gmx_simd4_cmple_f gmx_simd_cmple_f
+#define gmx_simd4_and_fb gmx_simd_and_fb
+#define gmx_simd4_or_fb gmx_simd_or_fb
+#define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb
+#define gmx_simd4_blendzero_f gmx_simd_blendzero_f
+#define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f
+#define gmx_simd4_blendv_f gmx_simd_blendv_f
+#define gmx_simd4_reduce_f gmx_simd_reduce_f
+/* DOUBLE */
+#define gmx_simd4_double_t gmx_simd_double_t
+#define gmx_simd4_load_d gmx_simd_load_d
+#define gmx_simd4_load1_d gmx_simd_load1_d
+#define gmx_simd4_set1_d gmx_simd_set1_d
+#define gmx_simd4_store_d gmx_simd_store_d
+#define gmx_simd4_loadu_d gmx_simd_loadu_d
+#define gmx_simd4_storeu_d gmx_simd_storeu_d
+#define gmx_simd4_setzero_d gmx_simd_setzero_d
+#define gmx_simd4_add_d gmx_simd_add_d
+#define gmx_simd4_sub_d gmx_simd_sub_d
+#define gmx_simd4_mul_d gmx_simd_mul_d
+#define gmx_simd4_fmadd_d gmx_simd_fmadd_d
+#define gmx_simd4_fmsub_d gmx_simd_fmsub_d
+#define gmx_simd4_fnmadd_d gmx_simd_fnmadd_d
+#define gmx_simd4_fnmsub_d gmx_simd_fnmsub_d
+#define gmx_simd4_and_d gmx_simd_and_d
+#define gmx_simd4_andnot_d gmx_simd_andnot_d
+#define gmx_simd4_or_d gmx_simd_or_d
+#define gmx_simd4_xor_d gmx_simd_xor_d
+#define gmx_simd4_rsqrt_d gmx_simd_rsqrt_d
+#define gmx_simd4_rcp_d gmx_simd_rcp_d
+#define gmx_simd4_fabs_d gmx_simd_fabs_d
+#define gmx_simd4_fneg_d gmx_simd_fneg_d
+#define gmx_simd4_max_d gmx_simd_max_d
+#define gmx_simd4_min_d gmx_simd_min_d
+#define gmx_simd4_round_d gmx_simd_round_d
+#define gmx_simd4_trunc_d gmx_simd_trunc_d
+#define gmx_simd4_fraction_d gmx_simd_fraction_d
+#define gmx_simd4_get_exponent_d gmx_simd_get_exponent_d
+#define gmx_simd4_get_mantissa_d gmx_simd_get_mantissa_d
+#define gmx_simd4_set_exponent_d gmx_simd_set_exponent_d
+#define gmx_simd4_dotproduct3_d gmx_simd4_dotproduct3_d_ibm_qpx
+#define gmx_simd4_dint32_t gmx_simd_dint32_t
+#define gmx_simd4_load_di gmx_simd_load_di
+#define gmx_simd4_load1_di gmx_simd_load1_di
+#define gmx_simd4_set1_di gmx_simd_set1_di
+#define gmx_simd4_store_di gmx_simd_store_di
+#define gmx_simd4_loadu_di gmx_simd_loadu_di
+#define gmx_simd4_storeu_di gmx_simd_storeu_di
+#define gmx_simd4_setzero_di gmx_simd_setzero_di
+#define gmx_simd4_cvt_d2i gmx_simd_cvt_d2i
+#define gmx_simd4_cvtt_d2i gmx_simd_cvtt_d2i
+#define gmx_simd4_cvt_i2d gmx_simd_cvt_i2d /* fixed copy-paste: was a duplicate _i2f mapping; the double conversion was missing */
+#define gmx_simd4_dbool_t gmx_simd_dbool_t
+#define gmx_simd4_cmpeq_d gmx_simd_cmpeq_d
+#define gmx_simd4_cmplt_d gmx_simd_cmplt_d
+#define gmx_simd4_cmple_d gmx_simd_cmple_d
+#define gmx_simd4_and_db gmx_simd_and_db
+#define gmx_simd4_or_db gmx_simd_or_db
+#define gmx_simd4_anytrue_db gmx_simd_anytrue_db
+#define gmx_simd4_blendzero_d gmx_simd_blendzero_d
+#define gmx_simd4_blendnotzero_d gmx_simd_blendnotzero_d
+#define gmx_simd4_blendv_d gmx_simd_blendv_d
+#define gmx_simd4_reduce_d gmx_simd_reduce_d
+
+static __attribute__((always_inline)) double
+gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
+{
+    vector4double dp_sh0 = vec_mul(a, b);               /* a0*b0 a1*b1 a2*b2 a3*b3 */
+    vector4double dp_sh1 = vec_sldw(dp_sh0, dp_sh0, 1); /* rotated one word: element 0 = a1*b1 */
+    vector4double dp_sh2 = vec_sldw(dp_sh0, dp_sh0, 2); /* rotated two words: element 0 = a2*b2 */
+    vector4double dp = vec_add(dp_sh2, vec_add(dp_sh0, dp_sh1));
+
+    return vec_extract(dp, 0); /* element 0 = sum of products 0..2; element 3 is excluded */
+}
+
+static __attribute__((always_inline)) float
+gmx_simd4_dotproduct3_f_ibm_qpx(vector4double a, vector4double b)
+{
+    return (float)gmx_simd4_dotproduct3_d_ibm_qpx(a, b); /* QPX computes in double; narrow the result */
+}
+
+/* Function to check whether SIMD operations have resulted in overflow.
+ * For now, this is unfortunately a dummy for this architecture.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+    return 0;
+}
+
+#endif /* GMX_SIMD_IMPLEMENTATION_IBM_QPX_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_INTEL_MIC_H
+#define GMX_SIMD_IMPL_INTEL_MIC_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* Intel Xeon Phi, or
+ * the-artist-formerly-known-as-Knight's-corner, or
+ * the-artist-formerly-formerly-known-as-MIC, or
+ * the artist formerly-formerly-formerly-known-as-Larrabee
+ * 512-bit SIMD instruction wrappers.
+ */
+
+/* Capability definitions for Xeon Phi SIMD */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE /* renamed from GMX_SIMD_HAVE_SIMD_HARDWARE to match the capability name used by the other implementation headers (cf. IBM QPX) */
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#define GMX_SIMD_HAVE_FMA
+#undef GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details: 512-bit vectors, 16 floats / 8 doubles per register */
+#define GMX_SIMD_FLOAT_WIDTH 16
+#define GMX_SIMD_DOUBLE_WIDTH 8
+#define GMX_SIMD_FINT32_WIDTH 16
+#define GMX_SIMD_DINT32_WIDTH 8
+#define GMX_SIMD_RSQRT_BITS 23
+#define GMX_SIMD_RCP_BITS 23
+
+/****************************************************
+ * SINGLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_float_t __m512
+#define gmx_simd_load_f _mm512_load_ps
+#define gmx_simd_load1_f(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
+#define gmx_simd_set1_f _mm512_set1_ps
+#define gmx_simd_store_f _mm512_store_ps
+#define gmx_simd_loadu_f gmx_simd_loadu_f_mic
+#define gmx_simd_storeu_f gmx_simd_storeu_f_mic
+#define gmx_simd_setzero_f _mm512_setzero_ps
+#define gmx_simd_add_f _mm512_add_ps
+#define gmx_simd_sub_f _mm512_sub_ps
+#define gmx_simd_mul_f _mm512_mul_ps
+#define gmx_simd_fmadd_f _mm512_fmadd_ps
+#define gmx_simd_fmsub_f _mm512_fmsub_ps
+#define gmx_simd_fnmadd_f _mm512_fnmadd_ps
+#define gmx_simd_fnmsub_f _mm512_fnmsub_ps
+#define gmx_simd_and_f(a, b) _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b))) /* no FP bitwise ops on KNC; go via the integer domain */
+#define gmx_simd_andnot_f(a, b) _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_or_f(a, b) _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_xor_f(a, b) _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd_rsqrt_f _mm512_rsqrt23_ps
+#define gmx_simd_rcp_f _mm512_rcp23_ps
+#define gmx_simd_fabs_f(x) gmx_simd_andnot_f(_mm512_set1_ps(-0.0), x) /* clear the sign bit */
+#define gmx_simd_fneg_f(x) _mm512_addn_ps(x, _mm512_setzero_ps()) /* addn = negated add, so -(x+0) = -x */
+#define gmx_simd_max_f _mm512_gmax_ps
+#define gmx_simd_min_f _mm512_gmin_ps
+#define gmx_simd_round_f(x) _mm512_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_trunc_f(x) _mm512_round_ps(x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_fraction_f(x) _mm512_sub_ps(x, gmx_simd_trunc_f(x))
+#define gmx_simd_get_exponent_f(x) _mm512_getexp_ps(x)
+#define gmx_simd_get_mantissa_f(x) _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
+#define gmx_simd_set_exponent_f(x) gmx_simd_set_exponent_f_mic(x)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t __m512i
+#define gmx_simd_load_fi _mm512_load_epi32
+#define gmx_simd_set1_fi _mm512_set1_epi32
+#define gmx_simd_store_fi _mm512_store_epi32
+#define gmx_simd_loadu_fi gmx_simd_loadu_fi_mic
+#define gmx_simd_storeu_fi gmx_simd_storeu_fi_mic
+#define gmx_simd_extract_fi gmx_simd_extract_fi_mic
+#define gmx_simd_setzero_fi _mm512_setzero_epi32
+#define gmx_simd_cvt_f2i(a) _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_cvtt_f2i(a) _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_cvt_i2f(a) _mm512_cvtfxpnt_round_adjustepi32_ps(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+/* Integer logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi _mm512_slli_epi32
+#define gmx_simd_srli_fi _mm512_srli_epi32
+#define gmx_simd_and_fi _mm512_and_epi32
+#define gmx_simd_andnot_fi _mm512_andnot_epi32
+#define gmx_simd_or_fi _mm512_or_epi32
+#define gmx_simd_xor_fi _mm512_xor_epi32
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi _mm512_add_epi32
+#define gmx_simd_sub_fi _mm512_sub_epi32
+#define gmx_simd_mul_fi _mm512_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t __mmask16
+#define gmx_simd_cmpeq_f(a, b) _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_f(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
+#define gmx_simd_cmple_f(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
+#define gmx_simd_and_fb _mm512_kand
+#define gmx_simd_andnot_fb(a, b) _mm512_knot(_mm512_kor(a, b)) /* NOTE(review): ~(a|b) is NOR, not and-not (~a & b) — confirm intended semantics; _mm512_kandn may be what is meant */
+#define gmx_simd_or_fb _mm512_kor
+#define gmx_simd_anytrue_fb _mm512_mask2int
+#define gmx_simd_blendzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
+#define gmx_simd_blendnotzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_f(a, b, sel) _mm512_mask_blend_ps(sel, a, b)
+#define gmx_simd_reduce_f(a) _mm512_reduce_add_ps(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t __mmask16
+#define gmx_simd_cmpeq_fi(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
+#define gmx_simd_cmplt_fi(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
+#define gmx_simd_and_fib _mm512_kand
+#define gmx_simd_or_fib _mm512_kor
+#define gmx_simd_anytrue_fib _mm512_mask2int
+#define gmx_simd_blendzero_fi(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
+#define gmx_simd_blendnotzero_fi(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_fi(a, b, sel) _mm512_mask_blend_epi32(sel, a, b)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib(x) (x)
+#define gmx_simd_cvt_fib2fb(x) (x)
+
+/* MIC provides full single-precision accuracy for some useful functions: */
+/* 1/sqrt(x) and 1/x work fine in simd_math.h, and won't use extra iterations */
+
+#define gmx_simd_exp2_f gmx_simd_exp2_f_mic
+#define gmx_simd_exp_f gmx_simd_exp_f_mic
+#define gmx_simd_log_f gmx_simd_log_f_mic
+
+/****************************************************
+ * DOUBLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_double_t __m512d
+#define gmx_simd_load_d _mm512_load_pd
+#define gmx_simd_load1_d(m) _mm512_extload_pd(m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
+#define gmx_simd_set1_d _mm512_set1_pd
+#define gmx_simd_store_d _mm512_store_pd
+#define gmx_simd_loadu_d gmx_simd_loadu_d_mic
+#define gmx_simd_storeu_d gmx_simd_storeu_d_mic
+#define gmx_simd_setzero_d _mm512_setzero_pd
+#define gmx_simd_add_d _mm512_add_pd
+#define gmx_simd_sub_d _mm512_sub_pd
+#define gmx_simd_mul_d _mm512_mul_pd
+#define gmx_simd_fmadd_d _mm512_fmadd_pd
+#define gmx_simd_fmsub_d _mm512_fmsub_pd
+#define gmx_simd_fnmadd_d _mm512_fnmadd_pd
+#define gmx_simd_fnmsub_d _mm512_fnmsub_pd
+#define gmx_simd_and_d(a, b) _mm512_castsi512_pd(_mm512_and_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b))) /* no FP bitwise ops on KNC; go via the integer domain */
+#define gmx_simd_andnot_d(a, b) _mm512_castsi512_pd(_mm512_andnot_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_or_d(a, b) _mm512_castsi512_pd(_mm512_or_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_xor_d(a, b) _mm512_castsi512_pd(_mm512_xor_epi32(_mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd_rsqrt_d(x) _mm512_cvtpslo_pd(_mm512_rsqrt23_ps(_mm512_cvtpd_pslo(x))) /* single-precision estimate, refined later in simd_math.h */
+#define gmx_simd_rcp_d(x) _mm512_cvtpslo_pd(_mm512_rcp23_ps(_mm512_cvtpd_pslo(x)))
+#define gmx_simd_fabs_d(x) gmx_simd_andnot_d(_mm512_set1_pd(-0.0), x) /* clear the sign bit */
+#define gmx_simd_fneg_d(x) _mm512_addn_pd(x, _mm512_setzero_pd()) /* addn = negated add, so -(x+0) = -x */
+#define gmx_simd_max_d _mm512_gmax_pd
+#define gmx_simd_min_d _mm512_gmin_pd
+#define gmx_simd_round_d(a) _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd_trunc_d(a) _mm512_roundfxpnt_adjust_pd(a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+#define gmx_simd_fraction_d(x) _mm512_sub_pd(x, gmx_simd_trunc_d(x))
+#define gmx_simd_get_exponent_d(x) _mm512_getexp_pd(x)
+#define gmx_simd_get_mantissa_d(x) _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)
+#define gmx_simd_set_exponent_d(x) gmx_simd_set_exponent_d_mic(x)
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+ Doesn't use mask other than where required. No side effect expected for operating on the (unused) upper 8.
+ */
+#define gmx_simd_dint32_t __m512i
+#define gmx_simd_load_di(m) _mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m) /* NOTE(review): mask_loh (low-8-lane mask) must be defined elsewhere in this header — not visible in this chunk */
+#define gmx_simd_set1_di _mm512_set1_epi32
+#define gmx_simd_store_di(m, a) _mm512_mask_packstorelo_epi32(m, mask_loh, a)
+#define gmx_simd_loadu_di gmx_simd_loadu_di_mic
+#define gmx_simd_storeu_di gmx_simd_storeu_di_mic
+#define gmx_simd_extract_di gmx_simd_extract_di_mic
+#define gmx_simd_setzero_di _mm512_setzero_epi32
+#define gmx_simd_cvt_d2i(a) _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT)
+#define gmx_simd_cvtt_d2i(a) _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_ZERO)
+#define gmx_simd_cvt_i2d _mm512_cvtepi32lo_pd
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di _mm512_slli_epi32
+#define gmx_simd_srli_di _mm512_srli_epi32
+#define gmx_simd_and_di _mm512_and_epi32
+#define gmx_simd_andnot_di _mm512_andnot_epi32
+#define gmx_simd_or_di _mm512_or_epi32
+#define gmx_simd_xor_di _mm512_xor_epi32
+/* Integer arithmetic ops on gmx_simd_dint32_t */
+#define gmx_simd_add_di _mm512_add_epi32
+#define gmx_simd_sub_di _mm512_sub_epi32
+#define gmx_simd_mul_di _mm512_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t __mmask8
+#define gmx_simd_cmpeq_d(a, b) _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_d(a, b) _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
+#define gmx_simd_cmple_d(a, b) _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
+#define gmx_simd_and_db _mm512_kand
+#define gmx_simd_or_db _mm512_kor
+#define gmx_simd_anytrue_db(x) _mm512_mask2int(x)
+#define gmx_simd_blendzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
+#define gmx_simd_blendnotzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_d(a, b, sel) _mm512_mask_blend_pd(sel, a, b)
+#define gmx_simd_reduce_d(a) _mm512_reduce_add_pd(a)
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t __mmask16
+#define gmx_simd_cmpeq_di(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ)
+#define gmx_simd_cmplt_di(a, b) _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT)
+#define gmx_simd_and_dib _mm512_kand
+#define gmx_simd_or_dib _mm512_kor
+#define gmx_simd_anytrue_dib(x) (_mm512_mask2int(x)&0xFF) /* only the low 8 lanes carry double data */
+#define gmx_simd_blendzero_di(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), sel, a)
+#define gmx_simd_blendnotzero_di(a, sel) _mm512_mask_mov_epi32(_mm512_setzero_epi32(), _mm512_knot(sel), a)
+#define gmx_simd_blendv_di(a, b, sel) _mm512_mask_blend_epi32(sel, a, b)
+/* Conversions between booleans. Double & dint stuff is stored in low bits */
+#define gmx_simd_cvt_db2dib(x) (x)
+#define gmx_simd_cvt_dib2db(x) (x)
+
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd gmx_simd_cvt_f2dd_mic
+#define gmx_simd_cvt_dd2f gmx_simd_cvt_dd2f_mic
+
+/****************************************************
+ * SINGLE PRECISION SIMD4 IMPLEMENTATION *
+ ****************************************************/
+/* Load and store are guaranteed to only access the 4 floats. All arithmetic operations
+ only operate on the 4 elements (to avoid floating exceptions). But other operations
+ are not guaranteed to not modify the other 12 elements. E.g. setzero or blendzero
+ set the upper 12 to zero. */
+#define gmx_simd4_float_t __m512
+#define gmx_simd4_mask _mm512_int2mask(0xF)
+#define gmx_simd4_load_f(m) _mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m)
+#define gmx_simd4_load1_f(m) _mm512_mask_extload_ps(_mm512_undefined_ps(), gmx_simd4_mask, m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE)
+#define gmx_simd4_set1_f _mm512_set1_ps
+#define gmx_simd4_store_f(m, a) _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, a)
+#define gmx_simd4_loadu_f gmx_simd4_loadu_f_mic
+#define gmx_simd4_storeu_f gmx_simd4_storeu_f_mic
+#define gmx_simd4_setzero_f _mm512_setzero_ps
+#define gmx_simd4_add_f(a, b) _mm512_mask_add_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_sub_f(a, b) _mm512_mask_sub_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_mul_f(a, b) _mm512_mask_mul_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_fmadd_f(a, b, c) _mm512_mask_fmadd_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fmsub_f(a, b, c) _mm512_mask_fmsub_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmadd_f(a, b, c) _mm512_mask_fnmadd_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmsub_f(a, b, c) _mm512_mask_fnmsub_ps(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_and_f(a, b) _mm512_castsi512_ps(_mm512_mask_and_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_andnot_f(a, b) _mm512_castsi512_ps(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_or_f(a, b) _mm512_castsi512_ps(_mm512_mask_or_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_xor_f(a, b) _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), gmx_simd4_mask, _mm512_castps_si512(a), _mm512_castps_si512(b)))
+#define gmx_simd4_rsqrt_f(a) _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, a)
+#define gmx_simd4_fabs_f(x) gmx_simd4_andnot_f(_mm512_set1_ps(-0.0), x)
+#define gmx_simd4_fneg_f(x) _mm512_mask_addn_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _mm512_setzero_ps())
+#define gmx_simd4_max_f(a, b) _mm512_mask_gmax_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_min_f(a, b) _mm512_mask_gmin_ps(_mm512_undefined_ps(), gmx_simd4_mask, a, b)
+#define gmx_simd4_round_f(x) _mm512_mask_round_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd4_trunc_f(x) _mm512_mask_round_ps(_mm512_undefined_ps(), gmx_simd4_mask, x, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+/* dotproduct3 only reads the first 3 elements (mask 0x7) of each input. */
+#define gmx_simd4_dotproduct3_f(a, b) _mm512_mask_reduce_add_ps(_mm512_int2mask(7), _mm512_mask_mul_ps(_mm512_undefined_ps(), _mm512_int2mask(7), a, b))
+#define gmx_simd4_fbool_t __mmask16
+#define gmx_simd4_cmpeq_f(a, b) _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_f(a, b) _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_LT_OS)
+#define gmx_simd4_cmple_f(a, b) _mm512_mask_cmp_ps_mask(gmx_simd4_mask, a, b, _CMP_LE_OS)
+#define gmx_simd4_and_fb _mm512_kand
+#define gmx_simd4_or_fb _mm512_kor
+#define gmx_simd4_anytrue_fb(x) (_mm512_mask2int(x)&0xF)
+#define gmx_simd4_blendzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), sel, a)
+#define gmx_simd4_blendnotzero_f(a, sel) _mm512_mask_mov_ps(_mm512_setzero_ps(), _mm512_knot(sel), a)
+#define gmx_simd4_blendv_f(a, b, sel) _mm512_mask_blend_ps(sel, a, b)
+#define gmx_simd4_reduce_f(x) _mm512_mask_reduce_add_ps(_mm512_int2mask(0xF), x)
+
+/****************************************************
+ * DOUBLE PRECISION SIMD4 IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd4_double_t __m512d
+#define gmx_simd4_mask _mm512_int2mask(0xF)
+#define gmx_simd4_load_d(m) _mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m)
+#define gmx_simd4_load1_d(m) _mm512_mask_extload_pd(_mm512_undefined_pd(), gmx_simd4_mask, m, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE)
+#define gmx_simd4_set1_d _mm512_set1_pd
+#define gmx_simd4_store_d(m, a) _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, a)
+#define gmx_simd4_loadu_d gmx_simd4_loadu_d_mic
+#define gmx_simd4_storeu_d gmx_simd4_storeu_d_mic
+#define gmx_simd4_setzero_d _mm512_setzero_pd
+#define gmx_simd4_add_d(a, b) _mm512_mask_add_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_sub_d(a, b) _mm512_mask_sub_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_mul_d(a, b) _mm512_mask_mul_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_fmadd_d(a, b, c) _mm512_mask_fmadd_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fmsub_d(a, b, c) _mm512_mask_fmsub_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmadd_d(a, b, c) _mm512_mask_fnmadd_pd(a, gmx_simd4_mask, b, c)
+#define gmx_simd4_fnmsub_d(a, b, c) _mm512_mask_fnmsub_pd(a, gmx_simd4_mask, b, c)
+/* Logical ops work on the epi32 view; mask_loh (low 8 int32 lanes) covers the 4 doubles. */
+#define gmx_simd4_and_d(a, b) _mm512_castsi512_pd(_mm512_mask_and_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_andnot_d(a, b) _mm512_castsi512_pd(_mm512_mask_andnot_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_or_d(a, b) _mm512_castsi512_pd(_mm512_mask_or_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_xor_d(a, b) _mm512_castsi512_pd(_mm512_mask_xor_epi32(_mm512_undefined_epi32(), mask_loh, _mm512_castpd_si512(a), _mm512_castpd_si512(b)))
+#define gmx_simd4_rsqrt_d(a) _mm512_mask_cvtpslo_pd(_mm512_undefined_pd(), gmx_simd4_mask, _mm512_mask_rsqrt23_ps(_mm512_undefined_ps(), gmx_simd4_mask, _mm512_mask_cvtpd_pslo(_mm512_undefined_ps(), gmx_simd4_mask, x)))
+#define gmx_simd4_fabs_d(x) gmx_simd4_andnot_d(_mm512_set1_pd(-0.0), x)
+#define gmx_simd4_fneg_d(x) _mm512_mask_addn_pd(_mm512_undefined_pd(), gmx_simd4_mask, x, _mm512_setzero_pd())
+#define gmx_simd4_max_d(a, b) _mm512_mask_gmax_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_min_d(a, b) _mm512_mask_gmin_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, b)
+#define gmx_simd4_round_d(a) _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE)
+#define gmx_simd4_trunc_d(a) _mm512_mask_roundfxpnt_adjust_pd(_mm512_undefined_pd(), gmx_simd4_mask, a, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE)
+/* dotproduct3 only reads the first 3 elements (mask 0x7) of each input. */
+#define gmx_simd4_dotproduct3_d(a, b) _mm512_mask_reduce_add_pd(_mm512_int2mask(7), _mm512_mask_mul_pd(_mm512_undefined_pd(), _mm512_int2mask(7), a, b))
+#define gmx_simd4_dbool_t __mmask16
+#define gmx_simd4_cmpeq_d(a, b) _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_d(a, b) _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_LT_OS)
+#define gmx_simd4_cmple_d(a, b) _mm512_mask_cmp_pd_mask(gmx_simd4_mask, a, b, _CMP_LE_OS)
+#define gmx_simd4_and_db _mm512_kand
+#define gmx_simd4_or_db _mm512_kor
+#define gmx_simd4_anytrue_db(x) (_mm512_mask2int(x)&0xF)
+#define gmx_simd4_blendzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), sel, a)
+#define gmx_simd4_blendnotzero_d(a, sel) _mm512_mask_mov_pd(_mm512_setzero_pd(), _mm512_knot(sel), a)
+#define gmx_simd4_blendv_d(a, b, sel) _mm512_mask_blend_pd(sel, a, b)
+#define gmx_simd4_reduce_d(x) _mm512_mask_reduce_add_pd(_mm512_int2mask(0xF), x)
+/* Permutation patterns and half-register lane masks used by the helpers below. */
+#define PERM_LOW2HIGH _MM_PERM_BABA
+#define PERM_HIGH2LOW _MM_PERM_DCDC
+
+#define mask_loh _mm512_int2mask(0x00FF) /* would be better a constant - but can't initialize with a function call. */
+#define mask_hih _mm512_int2mask(0xFF00)
+
+/* load store float */
+/* Unaligned accesses use the loadunpacklo/hi (resp. packstorelo/hi) instruction
+ * pairs, which together cover an unaligned 64-byte region that may span two
+ * cache lines. */
+static gmx_inline __m512
+gmx_simd_loadu_f_mic(const float * m)
+{
+ return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
+}
+
+static gmx_inline void
+gmx_simd_storeu_f_mic(float * m, __m512 s)
+{
+ _mm512_packstorelo_ps(m, s);
+ _mm512_packstorehi_ps(m+16, s);
+}
+
+/* load store fint32 */
+static gmx_inline __m512i
+gmx_simd_loadu_fi_mic(const gmx_int32_t * m)
+{
+ return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
+}
+
+static gmx_inline void
+gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
+{
+ _mm512_packstorelo_epi32(m, s);
+ _mm512_packstorehi_epi32(m+16, s);
+}
+
+/* load store double */
+static gmx_inline __m512d
+gmx_simd_loadu_d_mic(const double * m)
+{
+ return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
+}
+
+static gmx_inline void
+gmx_simd_storeu_d_mic(double * m, __m512d s)
+{
+ _mm512_packstorelo_pd(m, s);
+ _mm512_packstorehi_pd(m+8, s);
+}
+
+/* load store dint32 */
+/* Only the low 8 int32 lanes (mask_loh) are transferred, matching the double SIMD width. */
+static gmx_inline __m512i
+gmx_simd_loadu_di_mic(const gmx_int32_t * m)
+{
+ return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
+}
+
+static gmx_inline void
+gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
+{
+ _mm512_mask_packstorelo_epi32(m, mask_loh, s);
+ _mm512_mask_packstorehi_epi32(m+16, mask_loh, s);
+}
+
+/* load store simd4 */
+/* Only 4 elements (gmx_simd4_mask) are read/written; memory beyond the 4
+ * values is never touched. */
+static gmx_inline __m512
+gmx_simd4_loadu_f_mic(const float * m)
+{
+ return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m), gmx_simd4_mask, m+16);
+}
+
+static gmx_inline void
+gmx_simd4_storeu_f_mic(float * m, __m512 s)
+{
+ _mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
+ _mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
+}
+
+static gmx_inline __m512d
+gmx_simd4_loadu_d_mic(const double * m)
+{
+ return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m), gmx_simd4_mask, m+8);
+}
+
+static gmx_inline void
+gmx_simd4_storeu_d_mic(double * m, __m512d s)
+{
+ _mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
+ _mm512_mask_packstorehi_pd(m+8, gmx_simd4_mask, s);
+}
+
+/* extract */
+/* Extract the integer at position index from a SIMD integer variable by doing a
+ * masked packed store of the single selected lane.
+ * Fix: the selector must be a __mmask16 built with _mm512_int2mask(); the
+ * original called _mm512_mask2int(), which converts in the wrong direction
+ * (mask -> int) and passes a plain int where a mask is required. */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
+{
+ int r;
+ _mm512_mask_packstorelo_epi32(&r, _mm512_int2mask(1<<index), a);
+ return r;
+}
+
+/* Extract the integer at position index from a double-width SIMD integer variable.
+ * Fix: build the lane selector with _mm512_int2mask() (int -> mask); the original
+ * used _mm512_mask2int(), which converts the wrong way and has the wrong type. */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
+{
+ int r;
+ _mm512_mask_packstorelo_epi32(&r, _mm512_int2mask(1<<index), a);
+ return r;
+}
+
+/* This is likely faster than the built in scale operation (lat 8, t-put 3)
+ * since we only work on the integer part and use shifts. TODO: check. given that scale also only does integer
+ */
+static gmx_inline __m512
+gmx_simd_set_exponent_f_mic(__m512 a)
+{
+ __m512i iexp = gmx_simd_cvt_f2i(a);
+
+ /* (exponent + 127) << 23 assembles an IEEE-754 single with mantissa 0, i.e. 2^a. */
+ const __m512i expbias = _mm512_set1_epi32(127);
+ iexp = _mm512_slli_epi32(_mm512_add_epi32(iexp, expbias), 23);
+ return _mm512_castsi512_ps(iexp);
+
+ /* scale alternative:
+ return _mm512_scale_ps(_mm512_set1_ps(1), iexp);
+ */
+}
+
+static gmx_inline __m512d
+gmx_simd_set_exponent_d_mic(__m512d a)
+{
+ const __m512i expbias = _mm512_set1_epi32(1023);
+ __m512i iexp = _mm512_cvtfxpnt_roundpd_epi32lo(a, _MM_FROUND_TO_NEAREST_INT);
+ /* Duplicate each 32-bit lane, then write (e+1023)<<20 into the high dword of
+ * each 64-bit element only (mask 0xAAAA), zeroing everything else. */
+ iexp = _mm512_permutevar_epi32(_mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), iexp);
+ iexp = _mm512_mask_slli_epi32(_mm512_setzero_epi32(), _mm512_int2mask(0xAAAA), _mm512_add_epi32(iexp, expbias), 20);
+ return _mm512_castsi512_pd(iexp);
+}
+
+/* Convert 16 floats to two vectors of 8 doubles (low half -> d0, high half -> d1). */
+static gmx_inline void
+gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
+{
+ __m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), _MM_PERM_CDCD);
+
+ *d0 = _mm512_cvtpslo_pd(f);
+ *d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
+}
+
+/* Convert two vectors of 8 doubles back to 16 floats (d0 -> low half, d1 -> high half). */
+static gmx_inline __m512
+gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
+{
+ __m512 f0 = _mm512_cvtpd_pslo(d0);
+ __m512 f1 = _mm512_cvtpd_pslo(d1);
+ return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
+}
+
+static gmx_inline __m512
+gmx_simd_exp2_f_mic(__m512 x)
+{
+ return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
+}
+
+static gmx_inline __m512
+gmx_simd_exp_f_mic(__m512 x)
+{
+ /* only 59ulp accuracy so we need to do an extra iteration.
+ Using: http://yacas.sourceforge.net/Algochapter5.html 5.4 Method 3 */
+ /* The mask m excludes zero results, for which log2 would be undefined. */
+ __m512 r = gmx_simd_exp2_f(_mm512_mul_ps(x, _mm512_set1_ps(1.44269504088896341)));
+ __mmask16 m = _mm512_cmpneq_ps_mask(r, _mm512_setzero_ps());
+ __m512 t = _mm512_mask_fnmadd_ps(_mm512_mask_log2ae23_ps(_mm512_undefined_ps(), m, r), m, _mm512_set1_ps(0.693147180559945286226764), x);
+ return _mm512_mask_fmadd_ps(r, m, t, r);
+}
+
+/* Natural log via the hardware base-2 log: ln(x) = ln(2) * log2(x). */
+static gmx_inline __m512
+gmx_simd_log_f_mic(__m512 x)
+{
+ return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
+}
+
+/* Function to check whether SIMD operations have resulted in overflow.
+ * Returns 1 and clears the flag if an overflow occurred since the last call,
+ * 0 otherwise.
+ * NOTE(review): the read-modify-write of MXCSR is not atomic; assumed to be
+ * called from a single thread per core - confirm at call sites. */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+ int MXCSR;
+ int sse_overflow;
+ /* The overflow flag is bit 3 in the register */
+ const unsigned int flag = 0x8;
+
+ MXCSR = _mm_getcsr();
+ if (MXCSR & flag)
+ {
+ sse_overflow = 1;
+ /* Set the overflow flag to zero */
+ MXCSR = MXCSR & ~flag;
+ _mm_setcsr(MXCSR);
+ }
+ else
+ {
+ sse_overflow = 0;
+ }
+ return sse_overflow;
+}
+
+#endif /* GMX_SIMD_IMPL_INTEL_MIC_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_REFERENCE_H
+#define GMX_SIMD_IMPL_REFERENCE_H
+
+/*! \libinternal \file
+ *
+ * \brief Reference SIMD implementation, including SIMD documentation.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \ingroup module_simd
+ */
+
+
+#include <math.h>
+
+#include "gmx_fatal.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name SIMD implementation capability definitions
+ * \{
+ */
+
+/*! \brief
+ * Defined when SIMD float support is present.
+ *
+ * You should only use this to specifically check for single precision SIMD
+ * support, even when the rest of Gromacs uses double precision.
+ * \sa GMX_SIMD_HAVE_REAL, GMX_SIMD_HAVE_DOUBLE
+ */
+#define GMX_SIMD_HAVE_FLOAT
+
+/*! \brief Defined if SIMD double support is present. */
+#define GMX_SIMD_HAVE_DOUBLE
+
+/*! \brief Defined if SIMD is implemented with real hardware instructions. */
+#define GMX_SIMD_HAVE_HARDWARE /* For Doxygen */
+#undef GMX_SIMD_HAVE_HARDWARE /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD implementation supports unaligned loads. */
+#define GMX_SIMD_HAVE_LOADU
+
+/*! \brief Defined if the SIMD implementation supports unaligned stores. */
+#define GMX_SIMD_HAVE_STOREU
+
+/*! \brief Defined if SIMD implementation has logical operations on floating-point data. */
+#define GMX_SIMD_HAVE_LOGICAL
+
+/*! \brief Defined if SIMD fused multiply-add uses hardware instructions */
+#define GMX_SIMD_HAVE_FMA /* For Doxygen */
+#undef GMX_SIMD_HAVE_FMA /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD fraction has a direct hardware instruction. */
+#define GMX_SIMD_HAVE_FRACTION /* For Doxygen */
+#undef GMX_SIMD_HAVE_FRACTION /* Reference implementation setting */
+
+/*! \brief Defined if the SIMD implementation has \ref gmx_simd_fint32_t. */
+#define GMX_SIMD_HAVE_FINT32
+
+/*! \brief Support for extracting integers from \ref gmx_simd_fint32_t. */
+#define GMX_SIMD_HAVE_FINT32_EXTRACT
+
+/*! \brief Defined if SIMD logical operations are supported for \ref gmx_simd_fint32_t */
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+
+/*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_fint32_t */
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+
+/*! \brief Defined if the SIMD implementation has \ref gmx_simd_dint32_t.
+ *
+ * \note The Gromacs SIMD module works entirely with 32 bit integers, both
+ * in single and double precision, since some platforms do not support 64 bit
+ * SIMD integers at all. In particular, this means it is up to each
+ * implementation to get this working even if the architectures internal
+ * representation uses 64 bit integers when converting to/from double SIMD
+ * variables. For now we will try HARD to use conversions, packing or shuffling
+ * so the integer datatype has the same width as the floating-point type, i.e.
+ * if you use double precision SIMD with a width of 8, we want the integers
+ * we work with to also use a SIMD width of 8 to make it easy to load/store
+ * indices from arrays. This refers entirely to the function calls
+ * and how many integers we load/store in one call; the actual SIMD registers
+ * might be wider for integers internally (e.g. on x86 gmx_simd_dint32_t will
+ * only fill half the register), but this is none of the user's business.
+ * While this works for all current architectures, and we think it will work
+ * for future ones, we might have to alter this decision in the future. To
+ * avoid rewriting every single instance that refers to the SIMD width we still
+ * provide separate defines for the width of SIMD integer variables that you
+ * should use.
+ */
+#define GMX_SIMD_HAVE_DINT32
+
+/*! \brief Support for extracting integer from \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_EXTRACT
+
+/*! \brief Defined if logical operations are supported for \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+
+/*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_dint32_t */
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+
+/*! \brief Defined if the implementation provides \ref gmx_simd4_float_t. */
+#define GMX_SIMD4_HAVE_FLOAT
+
+/*! \brief Defined if the implementation provides \ref gmx_simd4_double_t. */
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* The reference widths can be overridden from the build system (see
+ * GMX_SIMD_REF_FLOAT_WIDTH / GMX_SIMD_REF_DOUBLE_WIDTH cache variables). */
+#ifdef GMX_SIMD_REF_FLOAT_WIDTH
+# define GMX_SIMD_FLOAT_WIDTH GMX_SIMD_REF_FLOAT_WIDTH
+#else
+/*! \brief Width of the \ref gmx_simd_float_t datatype. */
+# define GMX_SIMD_FLOAT_WIDTH 4
+#endif
+
+#ifdef GMX_SIMD_REF_DOUBLE_WIDTH
+# define GMX_SIMD_DOUBLE_WIDTH GMX_SIMD_REF_DOUBLE_WIDTH
+#else
+/*! \brief Width of the \ref gmx_simd_double_t datatype. */
+# define GMX_SIMD_DOUBLE_WIDTH 4
+#endif
+
+/*! \brief Width of the \ref gmx_simd_fint32_t datatype. */
+#define GMX_SIMD_FINT32_WIDTH GMX_SIMD_FLOAT_WIDTH
+
+/*! \brief Width of the \ref gmx_simd_dint32_t datatype. */
+#define GMX_SIMD_DINT32_WIDTH GMX_SIMD_DOUBLE_WIDTH
+
+/*! \brief Accuracy of SIMD 1/sqrt(x) lookup. Used to determine number of iterations. */
+#define GMX_SIMD_RSQRT_BITS 23
+
+/*! \brief Accuracy of SIMD 1/x lookup. Used to determine number of iterations. */
+#define GMX_SIMD_RCP_BITS 23
+
+/*! \}
+ *
+ * \name SIMD implementation data types
+ * \{
+ */
+/* The reference implementation emulates SIMD with plain C arrays of the
+ * configured width wrapped in structs. */
+/*! \brief Float SIMD variable. Supported with GMX_SIMD_HAVE_FLOAT.
+ */
+typedef struct
+{
+ float r[GMX_SIMD_FLOAT_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_float_t;
+
+/*! \brief Floating-point SIMD variable type in double precision.
+ *
+ * Supported with GMX_SIMD_HAVE_DOUBLE.
+ */
+typedef struct
+{
+ double r[GMX_SIMD_DOUBLE_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_double_t;
+
+/*! \brief Integer SIMD variable type to use for conversions to/from float.
+ *
+ * This is also the widest integer SIMD type.
+ */
+typedef struct
+{
+ gmx_int32_t i[GMX_SIMD_FINT32_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_fint32_t;
+
+/*! \brief Integer SIMD variable type to use for conversions to/from double.
+ *
+ * Available with GMX_SIMD_HAVE_DINT32.
+ */
+typedef struct
+{
+ gmx_int32_t i[GMX_SIMD_DINT32_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_dint32_t;
+
+/*! \brief Boolean type for float SIMD data.
+ *
+ * You should likely use gmx_simd_bool_t
+ * (for gmx_simd_real_t) instead, unless you really know what you are doing.
+ */
+typedef struct
+{
+ gmx_int32_t b[GMX_SIMD_FLOAT_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_fbool_t;
+
+/*! \brief Boolean type for double precision SIMD data.
+ *
+ * Use the generic gmx_simd_bool_t
+ * (for gmx_simd_real_t) instead, unless you really know what you are doing.
+ */
+typedef struct
+{
+ gmx_int32_t b[GMX_SIMD_DOUBLE_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_dbool_t;
+
+/*! \brief Boolean type for integer datatypes corresponding to float SIMD. */
+typedef struct
+{
+ gmx_int32_t b[GMX_SIMD_FINT32_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_fibool_t;
+
+/*! \brief Boolean type for integer datatypes corresponding to double SIMD.
+ *
+ * You should likely use gmx_simd_ibool_t (for gmx_simd_int32_t) instead,
+ * unless you really know what you are doing.
+ */
+typedef struct
+{
+ gmx_int32_t b[GMX_SIMD_DINT32_WIDTH]; /*!< Implementation dependent. Don't touch. */
+}
+gmx_simd_dibool_t;
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for single precision floating point
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_FLOAT_WIDTH numbers from aligned memory.
+ *
+ * \param m Pointer to memory aligned to the SIMD width.
+ * \return SIMD variable with data loaded.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_load_f(const float *m)
+{
+ gmx_simd_float_t a;
+ int i;
+
+ /* Plain element-wise copy; the reference implementation has no real alignment requirement. */
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ a.r[i] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD variable elements to float pointed to by m (unaligned).
+ *
+ * \param m Pointer to single value in memory.
+ * \return SIMD variable with all elements set to *m.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_load1_f(const float *m)
+{
+ gmx_simd_float_t a;
+ int i;
+ float f = *m; /* read once, then broadcast */
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ a.r[i] = f;
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD float variable elements to the value r.
+ *
+ * \param r floating-point constant
+ * \return SIMD variable with all elements set to r.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set1_f(float r)
+{
+ gmx_simd_float_t v;
+ int k = 0;
+
+ /* Broadcast the scalar into every lane of the emulated register. */
+ while (k < GMX_SIMD_FLOAT_WIDTH)
+ {
+ v.r[k] = r;
+ k++;
+ }
+ return v;
+}
+
+/*! \brief Set all SIMD float variable elements to 0.0f.
+ *
+ * \return The value 0.0 in all elements of a SIMD variable.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_setzero_f()
+{
+ gmx_simd_float_t a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ a.r[i] = 0.0;
+ }
+ return a;
+}
+
+/*! \brief Store the contents of the SIMD float variable pr to aligned memory m.
+ *
+ * \param[out] m Pointer to memory, aligned to SIMD width.
+ * \param a SIMD variable to store
+ */
+static gmx_inline void
+gmx_simd_store_f(float *m, gmx_simd_float_t a)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ m[i] = a.r[i];
+ }
+}
+
+/*! \brief Load SIMD float from unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \param m Pointer to memory, no alignment requirement.
+ * \return SIMD variable with data loaded.
+ */
+/* The reference implementation never requires alignment, so the aligned
+ * versions are reused directly. */
+#define gmx_simd_loadu_f gmx_simd_load_f
+
+/*! \brief Store SIMD float to unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \param[out] m Pointer to memory, no alignment requirement.
+ * \param a SIMD variable to store.
+ */
+#define gmx_simd_storeu_f gmx_simd_store_f
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for double precision floating point
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_DOUBLE_WIDTH numbers from aligned memory.
+ *
+ * \copydetails gmx_simd_load_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_load_d(const double *m)
+{
+ gmx_simd_double_t a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ a.r[i] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD variable elements to double pointed to by m (unaligned).
+ *
+ * \copydetails gmx_simd_load1_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_load1_d(const double *m)
+{
+ gmx_simd_double_t a;
+ int i;
+ double d = *m; /* read once, then broadcast */
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ a.r[i] = d;
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD double variable elements to the value r.
+ *
+ * \copydetails gmx_simd_set1_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set1_d(double r)
+{
+ gmx_simd_double_t v;
+ int k = 0;
+
+ /* Broadcast the scalar into every lane of the emulated register. */
+ while (k < GMX_SIMD_DOUBLE_WIDTH)
+ {
+ v.r[k] = r;
+ k++;
+ }
+ return v;
+}
+
+/*! \brief Set all SIMD double variable elements to 0.0.
+ *
+ * \copydetails gmx_simd_setzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_setzero_d()
+{
+ gmx_simd_double_t a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ a.r[i] = 0.0;
+ }
+ return a;
+}
+
+/*! \brief Store the contents of the SIMD double variable pr to aligned memory m.
+ *
+ * \copydetails gmx_simd_store_f
+ */
+static gmx_inline void
+gmx_simd_store_d(double *m, gmx_simd_double_t a)
+{
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ m[i] = a.r[i];
+ }
+}
+
+/*! \brief Load SIMD double from unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \copydetails gmx_simd_loadu_f
+ */
+/* No alignment requirement in the reference implementation. */
+#define gmx_simd_loadu_d gmx_simd_load_d
+
+/*! \brief Store SIMD double to unaligned memory.
+ *
+ * Available with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \copydetails gmx_simd_storeu_f
+ */
+#define gmx_simd_storeu_d gmx_simd_store_d
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for integers (corresponding to float)
+ * \{
+ */
+
+/*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_load_i.
+ *
+ * \param m Pointer to memory, aligned to integer SIMD width.
+ * \return SIMD integer variable.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_load_fi(const gmx_int32_t * m)
+{
+ gmx_simd_fint32_t a;
+ int i;
+ for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+ {
+ a.i[i] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_set1_i.
+ *
+ * \param b integer value to set variable to.
+ * \return SIMD variable with all elements set to b.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_set1_fi(gmx_int32_t b)
+{
+ gmx_simd_fint32_t a;
+ int i;
+ /* broadcast b into all lanes */
+ for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+ {
+ a.i[i] = b;
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_setzero_i.
+ *
+ * \return SIMD integer variable with all bits set to zero.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_setzero_fi()
+{
+ gmx_simd_fint32_t a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+ {
+ a.i[i] = 0;
+ }
+ return a;
+}
+
+/*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_store_i.
+ *
+ * \param m Memory aligned to integer SIMD width.
+ * \param a SIMD variable to store.
+ *
+ * NOTE(review): unlike the float/double stores this returns the stored value,
+ * and the parameter is int* rather than gmx_int32_t* - both kept for
+ * backward compatibility with existing callers; confirm before changing.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_store_fi(int * m, gmx_simd_fint32_t a)
+{
+ int i;
+ for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+ {
+ m[i] = a.i[i];
+ }
+ return a;
+}
+
+/*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_loadu_i.
+ *
+ * Supported with \ref GMX_SIMD_HAVE_LOADU.
+ *
+ * \param m Pointer to memory, no alignment requirements.
+ * \return SIMD integer variable.
+ */
+/* No alignment requirement in the reference implementation. */
+#define gmx_simd_loadu_fi gmx_simd_load_fi
+
+/*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_storeu_i.
+ *
+ * Supported with \ref GMX_SIMD_HAVE_STOREU.
+ *
+ * \param m Memory pointer, no alignment requirements.
+ * \param a SIMD variable to store.
+ */
+#define gmx_simd_storeu_fi gmx_simd_store_fi
+
+/*! \brief Extract element with index i from \ref gmx_simd_fint32_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_extract_i.
+ *
+ * Available with \ref GMX_SIMD_HAVE_FINT32_EXTRACT.
+ *
+ * \param a SIMD variable
+ * \param index Position to extract integer from
+ * \return Single integer from position index in SIMD variable.
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_fi(gmx_simd_fint32_t a, int index)
+{
+ return a.i[index];
+}
+
+/*! \}
+ *
+ * \name SIMD implementation load/store operations for integers (corresponding to double)
+ * \{
+ */
+
+/*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_load_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_load_di(const gmx_int32_t * m)
+{
+ gmx_simd_dint32_t a;
+ int i;
+ for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+ {
+ a.i[i] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_set1_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_set1_di(gmx_int32_t b)
+{
+ gmx_simd_dint32_t a;
+ int i;
+ /* broadcast b into all lanes */
+ for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+ {
+ a.i[i] = b;
+ }
+ return a;
+}
+
+/*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_setzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_setzero_di()
+{
+ gmx_simd_dint32_t a;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+ {
+ a.i[i] = 0;
+ }
+ return a;
+}
+
+/*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_store_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_store_di(gmx_int32_t * m, gmx_simd_dint32_t a)
+{
+ int i;
+ for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+ {
+ m[i] = a.i[i];
+ }
+ return a;
+}
+
+/*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_loadu_fi
+ */
+/* No alignment requirement in the reference implementation. */
+#define gmx_simd_loadu_di gmx_simd_load_di
+
+/*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
+ *
+ * \copydetails gmx_simd_storeu_fi
+ */
+#define gmx_simd_storeu_di gmx_simd_store_di
+
+/*! \brief Extract element with index i from \ref gmx_simd_dint32_t.
+ *
+ * \copydetails gmx_simd_extract_fi
+ */
+static gmx_inline gmx_int32_t
+gmx_simd_extract_di(gmx_simd_dint32_t a, int index)
+{
+ return a.i[index];
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point bitwise logical operations
+ * \{
+ */
+
+/*! \brief Bitwise and for two SIMD float variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 & data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_and_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t c;
+ int i;
+#ifdef __cplusplus
+ /* NOTE(review): the reinterpret_cast-through-reference below type-puns
+ * float<->int, which formally violates strict aliasing in C++; the C
+ * union path is the well-defined idiom. Confirm supported compilers
+ * tolerate this — TODO verify.
+ */
+ gmx_int32_t val1, val2, res;
+#else
+ /* Union used for type punning; well-defined in C99 for this purpose. */
+ union
+ {
+ float r;
+ gmx_int32_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<int &>(a.r[i]);
+ val2 = reinterpret_cast<int &>(b.r[i]);
+ res = val1 & val2;
+ c.r[i] = reinterpret_cast<float &>(res);
+#else
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i & conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise andnot for SIMD float. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_andnot_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return (~data1) & data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_andnot_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int32_t val1, val2, res;
+#else
+ union
+ {
+ float r;
+ gmx_int32_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<int &>(a.r[i]);
+ val2 = reinterpret_cast<int &>(b.r[i]);
+ res = (~val1) & val2;
+ c.r[i] = reinterpret_cast<float &>(res);
+#else
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = (~conv1.i) & conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise or for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 | data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_or_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int32_t val1, val2, res;
+#else
+ union
+ {
+ float r;
+ gmx_int32_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<int &>(a.r[i]);
+ val2 = reinterpret_cast<int &>(b.r[i]);
+ res = val1 | val2;
+ c.r[i] = reinterpret_cast<float &>(res);
+#else
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i | conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise xor for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * You should typically call the real-precision \ref gmx_simd_xor_r.
+ *
+ * \param a data1
+ * \param b data2
+ * \return data1 ^ data2
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_xor_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int32_t val1, val2, res;
+#else
+ union
+ {
+ float r;
+ gmx_int32_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<int &>(a.r[i]);
+ val2 = reinterpret_cast<int &>(b.r[i]);
+ res = val1 ^ val2;
+ c.r[i] = reinterpret_cast<float &>(res);
+#else
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i ^ conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point arithmetics
+ * \{
+ */
+/*! \brief Add two float SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_add_r.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a+b
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_add_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t sum;
+ int k;
+
+ /* Element-wise addition across all SIMD lanes. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ sum.r[k] = a.r[k] + b.r[k];
+ }
+ return sum;
+}
+
+/*! \brief Subtract two SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_sub_r.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a-b
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sub_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t diff;
+ int k;
+
+ /* Element-wise subtraction across all SIMD lanes. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ diff.r[k] = a.r[k] - b.r[k];
+ }
+ return diff;
+}
+
+/*! \brief Multiply two SIMD variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_mul_r.
+ *
+ * \param a factor1
+ * \param b factor2
+ * \return a*b.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_mul_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t prod;
+ int k;
+
+ /* Element-wise multiplication across all SIMD lanes. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ prod.r[k] = a.r[k]*b.r[k];
+ }
+ return prod;
+}
+
+/*! \brief Fused-multiply-add. Result is a*b+c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fmadd_r.
+ *
+ * If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return a*b+c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fmadd_f(a, b, c) gmx_simd_add_f(gmx_simd_mul_f(a, b), c)
+
+
+/*! \brief Fused-multiply-subtract. Result is a*b-c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fmsub_r.
+ *
+ * If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return a*b-c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_mul_f(a, b), c)
+
+
+/*! \brief Fused-negated-multiply-add. Result is -a*b+c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fnmadd_r.
+ *
+ * If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return -a*b+c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fnmadd_f(a, b, c) gmx_simd_sub_f(c, gmx_simd_mul_f(a, b))
+
+
+/*! \brief Fused-negated-multiply-sub. Result is -a*b-c.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fnmsub_r.
+ *
+ * If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
+ *
+ * \param a value
+ * \param b value
+ * \param c value
+ * \return -a*b-c
+ *
+ * For some implementations you save an instruction if you assign the result
+ * to c.
+ */
+#define gmx_simd_fnmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_setzero_f(), gmx_simd_fmadd_f(a, b, c))
+
+/*! \brief SIMD 1.0/sqrt(x) lookup.
+ *
+ * You should typically call the real-precision \ref gmx_simd_rsqrt_r.
+ *
+ * This is a low-level instruction that should only be called from routines
+ * implementing the inverse square root in simd_math.h.
+ *
+ * \param x Argument, x>0
+ * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rsqrt_f(gmx_simd_float_t x)
+{
+ gmx_simd_float_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ /* Guard against x<=0 so the lookup never produces inf/NaN. */
+ b.r[i] = (x.r[i] > 0.0f) ? 1.0f/sqrtf(x.r[i]) : 0.0f;
+ }
+ return b;
+}
+
+/*! \brief SIMD 1.0/x lookup.
+ *
+ * You should typically call the real-precision \ref gmx_simd_rcp_r.
+ *
+ * This is a low-level instruction that should only be called from routines
+ * implementing the reciprocal in simd_math.h.
+ *
+ * \param x Argument, x!=0
+ * \return Approximation of 1/x, accuracy is \ref GMX_SIMD_RCP_BITS.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rcp_f(gmx_simd_float_t x)
+{
+ gmx_simd_float_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ /* Guard against x==0 so the lookup never divides by zero. */
+ b.r[i] = (x.r[i] != 0.0f) ? 1.0f/x.r[i] : 0.0f;
+ }
+ return b;
+}
+
+/*! \brief SIMD Floating-point fabs().
+ *
+ * You should typically call the real-precision \ref gmx_simd_fabs_r.
+ *
+ * \param a any floating point values
+ * \return fabs(a) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fabs_f(gmx_simd_float_t a)
+{
+ gmx_simd_float_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.r[i] = fabsf(a.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD floating-point negate.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fneg_r.
+ *
+ * \param a Any floating-point value
+ * \return -a
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fneg_f(gmx_simd_float_t a)
+{
+ gmx_simd_float_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.r[i] = -a.r[i];
+ }
+ return c;
+}
+
+/*! \brief Set each SIMD element to the largest from two variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_max_r.
+ *
+ * \param a Any floating-point value
+ * \param b Any floating-point value
+ * \return max(a,b) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_max_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t result;
+ int k;
+
+ /* Per-lane maximum; a wins on ties, matching a >= b selection. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ if (a.r[k] >= b.r[k])
+ {
+ result.r[k] = a.r[k];
+ }
+ else
+ {
+ result.r[k] = b.r[k];
+ }
+ }
+ return result;
+}
+
+/*! \brief Set each SIMD element to the smallest from two variables.
+ *
+ * You should typically call the real-precision \ref gmx_simd_min_r.
+ *
+ * \param a Any floating-point value
+ * \param b Any floating-point value
+ * \return min(a,b) for each element.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_min_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_float_t result;
+ int k;
+
+ /* Per-lane minimum; a wins on ties, matching a <= b selection. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ if (a.r[k] <= b.r[k])
+ {
+ result.r[k] = a.r[k];
+ }
+ else
+ {
+ result.r[k] = b.r[k];
+ }
+ }
+ return result;
+}
+
+/*! \brief Round to nearest integer value (in floating-point format).
+ *
+ * You should typically call the real-precision \ref gmx_simd_round_r.
+ *
+ * \param a Any floating-point value
+ * \return The nearest integer, represented in floating-point format.
+ *
+ * \note The reference implementation rounds exact half-way cases
+ * away from zero, whereas most SIMD intrinsics will round to nearest even.
+ * This could be fixed by using rint/rintf, but the bigger problem is that
+ * MSVC does not support full C99, and none of the round or rint
+ * functions are defined. It's much easier to approximately implement
+ * round() than rint(), so we do that and hope we never get bitten in
+ * testing. (Thanks, Microsoft.)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_round_f(gmx_simd_float_t a)
+{
+ gmx_simd_float_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+#ifdef _MSC_VER
+ int temp = (a.r[i] >= 0.0f) ? (a.r[i] + 0.5f) : (a.r[i] - 0.5f);
+ b.r[i] = temp;
+#else
+ b.r[i] = roundf(a.r[i]);
+#endif
+ }
+ return b;
+}
+
+/*! \brief Truncate SIMD, i.e. round towards zero - common hardware instruction.
+ *
+ * You should typically call the real-precision \ref gmx_simd_trunc_r.
+ *
+ * \param a Any floating-point value
+ * \return Integer rounded towards zero, represented in floating-point format.
+ *
+ * \note This is truncation towards zero, not floor(). The reason for this
+ * is that truncation is virtually always present as a dedicated hardware
+ * instruction, but floor() frequently isn't.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_trunc_f(gmx_simd_float_t a)
+{
+ gmx_simd_float_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ b.r[i] = truncf(a.r[i]);
+ }
+ return b;
+}
+
+
+/*! \brief Fraction of the SIMD floating point number.
+ *
+ * You should typically call the real-precision \ref gmx_simd_fraction_r.
+ *
+ * \param a Any floating-point value
+ * \return a-trunc(a)
+ *
+ * To maximize compatibility, we use the same definition of fractions as used
+ * e.g. for the AMD64 hardware instructions. This relies on truncation towards
+ * zero for the integer part, and the remaining fraction can thus be either
+ * positive or negative. As an example, -1.42 would return the fraction -0.42.
+ *
+ * Hardware support with \ref GMX_SIMD_HAVE_FRACTION, otherwise emulated.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_fraction_f(gmx_simd_float_t a)
+{
+ return gmx_simd_sub_f(a, gmx_simd_trunc_f(a));
+}
+
+/*! \brief Extract (integer) exponent from single precision SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_get_exponent_r.
+ *
+ * \param a Any floating-point value
+ * \return Exponent value, represented in floating-point format.
+ *
+ * The IEEE754 exponent field is selected, the bias removed, and it is converted to
+ * a normal floating-point SIMD.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f(gmx_simd_float_t a)
+{
+ /* Mask with ones for the exponent field of single precision fp */
+ const gmx_int32_t expmask = 0x7f800000;
+ gmx_simd_float_t b;
+ int i;
+ union
+ {
+ float f;
+ gmx_int32_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ conv.f = a.r[i];
+ /* Keep exponent, shift 23 right (float mantissa), remove bias (127) */
+ b.r[i] = ((conv.i & expmask) >> 23) - 127;
+ }
+ return b;
+}
+
+/*! \brief Get SIMD mantissa.
+ *
+ * You should typically call the real-precision \ref gmx_simd_get_mantissa_r.
+ *
+ * \param a Any floating-point value
+ * \return Mantissa, represented in floating-point format.
+ *
+ * The mantissa field is selected, and a new neutral exponent created.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_mantissa_f(gmx_simd_float_t a)
+{
+ const gmx_int32_t mantmask = 0x007fffff;
+ const gmx_int32_t one = 0x3f800000;
+ gmx_simd_float_t b;
+ int i;
+ union
+ {
+ float f;
+ gmx_int32_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ conv.f = a.r[i];
+ /* remove current exponent, add a biased exponent for 1.0 (i.e., 2^0=1) */
+ conv.i = (conv.i & (mantmask)) | one;
+ b.r[i] = conv.f;
+ }
+ return b;
+}
+
+/*! \brief Set (integer) exponent from single precision floating-point SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_set_exponent_r.
+ *
+ * \param a A floating point value that will not overflow as 2^a.
+ * \return 2^(round(a)).
+ *
+ * The input is \a rounded to the nearest integer, the exponent bias is added
+ * to this integer, and the bits are shifted to the IEEE754 exponent part of the number.
+ *
+ * \note The argument will be \a rounded to nearest integer since that is what
+ * we need for the exponential functions, and this integer x will be set as the
+ * exponent so the new floating-point number will be 2^x.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f(gmx_simd_float_t a)
+{
+ gmx_simd_float_t b;
+ gmx_int32_t iexp;
+ int i;
+ union
+ {
+ float f;
+ gmx_int32_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ /* Critical to use same algorithm as for gmx_simd_round_f() */
+#ifdef _MSC_VER
+ iexp = (a.r[i] >= 0.0f) ? (a.r[i] + 0.5f) : (a.r[i] - 0.5f);
+#else
+ iexp = roundf(a.r[i]);
+#endif
+ /* Add bias (127), and shift 23 bits left (mantissa size) */
+ conv.i = (iexp + 127) << 23;
+ b.r[i] = conv.f;
+ }
+ return b;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation single precision floating-point comparisons, boolean, selection.
+ * \{
+ */
+/*! \brief SIMD a==b for single SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmpeq_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a==b.
+ *
+ * Beware that exact floating-point comparisons are difficult.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmpeq_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_fbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] == b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD a<b for single SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmplt_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a<b.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmplt_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_fbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] < b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD a<=b for single SIMD.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmple_r.
+ *
+ * \param a value1
+ * \param b value2
+ * \return Each element of the boolean will be set to true if a<=b.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cmple_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+ gmx_simd_fbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] <= b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief Logical \a and on single precision SIMD booleans.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_r.
+ *
+ * \param a logical vars 1
+ * \param b logical vars 2
+ * \return For each element, the result boolean is true if a \& b are true.
+ *
+ * \note This is not necessarily a bitwise operation - the storage format
+ * of booleans is implementation-dependent.
+ *
+ * \sa gmx_simd_and_ib
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_and_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
+{
+ gmx_simd_fbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.b[i] = (a.b[i] && b.b[i]);
+ }
+ return c;
+}
+
+/*! \brief Logical \a or on single precision SIMD booleans.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_r.
+ *
+ * \param a logical vars 1
+ * \param b logical vars 2
+ * \return For each element, the result boolean is true if a or b is true.
+ *
+ * Note that this is not necessarily a bitwise operation - the storage format
+ * of booleans is implementation-dependent.
+ *
+ * \sa gmx_simd_or_ib
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_or_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
+{
+ gmx_simd_fbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.b[i] = (a.b[i] || b.b[i]);
+ }
+ return c;
+}
+
+/*! \brief Returns non-zero if any of the booleans in a is True, otherwise 0.
+ *
+ * You should typically call the real-precision \ref gmx_simd_anytrue_b.
+ *
+ * \param a Logical variable.
+ * \return non-zero if any element in a is true, otherwise 0.
+ *
+ * The actual return value for truth will depend on the architecture,
+ * so any non-zero value is considered truth.
+ */
+static gmx_inline int
+gmx_simd_anytrue_fb(gmx_simd_fbool_t a)
+{
+ int anytrue;
+ int i;
+
+ anytrue = 0;
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ anytrue = anytrue || a.b[i];
+ }
+ return anytrue;
+}
+
+/*! \brief Select from single precision SIMD variable where boolean is true.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendzero_r.
+ *
+ * \param a Floating-point variable to select from
+ * \param sel Boolean selector
+ * \return For each element, a is selected for true, 0 for false.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
+{
+ gmx_simd_float_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.r[i] = sel.b[i] ? a.r[i] : 0.0;
+ }
+ return c;
+}
+
+/*! \brief Select from single precision SIMD variable where boolean is false.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendnotzero_r.
+ *
+ * \param a Floating-point variable to select from
+ * \param sel Boolean selector
+ * \return For each element, a is selected for false, 0 for true (sic).
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendnotzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
+{
+ gmx_simd_float_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+ {
+ c.r[i] = sel.b[i] ? 0.0 : a.r[i];
+ }
+ return c;
+}
+
+/*! \brief Vector-blend SIMD selection.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendv_r.
+ *
+ * \param a First source
+ * \param b Second source
+ * \param sel Boolean selector
+ * \return For each element, select b if sel is true, a otherwise.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_blendv_f(gmx_simd_float_t a, gmx_simd_float_t b, gmx_simd_fbool_t sel)
+{
+ gmx_simd_float_t result;
+ int k;
+
+ /* Per-lane select: true lanes take b, false lanes take a. */
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ if (sel.b[k])
+ {
+ result.r[k] = b.r[k];
+ }
+ else
+ {
+ result.r[k] = a.r[k];
+ }
+ }
+ return result;
+}
+
+/*! \brief Return sum of all elements in SIMD float variable.
+ *
+ * You should typically call the real-precision \ref gmx_simd_reduce_r.
+ *
+ * \param a SIMD variable to reduce/sum.
+ * \return The sum of all elements in the argument variable.
+ *
+ */
+static gmx_inline float
+gmx_simd_reduce_f(gmx_simd_float_t a)
+{
+ float total;
+ int k;
+
+ /* Horizontal add of all lanes into a scalar. */
+ total = 0.0;
+ for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+ {
+ total += a.r[k];
+ }
+ return total;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point bitwise logical operations
+ * \{
+ */
+/*! \brief Bitwise and for two SIMD double variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_and_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_and_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int64_t val1, val2, res;
+#else
+ union
+ {
+ double r;
+ gmx_int64_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+ val2 = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+ res = val1 & val2;
+ c.r[i] = reinterpret_cast<double &>(res);
+#else
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i & conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise andnot for SIMD double. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_andnot_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_andnot_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int64_t val1, val2, res;
+#else
+ union
+ {
+ double r;
+ gmx_int64_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+ val2 = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+ res = (~val1) & val2;
+ c.r[i] = reinterpret_cast<double &>(res);
+#else
+ /* Must be (~a) & b to match the C++ path and gmx_simd_andnot_f;
+ * was plain AND before. */
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = (~conv1.i) & conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise or for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_or_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_or_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int64_t val1, val2, res;
+#else
+ union
+ {
+ double r;
+ gmx_int64_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+ val2 = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+ res = val1 | val2;
+ c.r[i] = reinterpret_cast<double &>(res);
+#else
+ /* Must be bitwise OR to match the C++ path and gmx_simd_or_f;
+ * was AND before. */
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i | conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \brief Bitwise xor for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
+ *
+ * \copydetails gmx_simd_xor_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_xor_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+#ifdef __cplusplus
+ gmx_int64_t val1, val2, res;
+#else
+ union
+ {
+ double r;
+ gmx_int64_t i;
+ }
+ conv1, conv2;
+#endif
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+#ifdef __cplusplus
+ val1 = reinterpret_cast<gmx_int64_t &>(a.r[i]);
+ val2 = reinterpret_cast<gmx_int64_t &>(b.r[i]);
+ res = val1 ^ val2;
+ c.r[i] = reinterpret_cast<double &>(res);
+#else
+ /* Must be bitwise XOR to match the C++ path and gmx_simd_xor_f;
+ * was AND before. */
+ conv1.r = a.r[i];
+ conv2.r = b.r[i];
+ conv1.i = conv1.i ^ conv2.i;
+ c.r[i] = conv1.r;
+#endif
+ }
+ return c;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point arithmetics
+ * \{
+ */
+/*! \brief Add two double SIMD variables.
+ *
+ * \copydetails gmx_simd_add_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_add_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] + b.r[i];
+ }
+ return c;
+}
+
+/*! \brief Subtract two double SIMD variables.
+ *
+ * \copydetails gmx_simd_sub_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sub_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = a.r[i] - b.r[i];
+ }
+ return c;
+}
+
+/*! \brief Multiply two SIMD variables.
+ *
+ * \copydetails gmx_simd_mul_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_mul_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = a.r[i]*b.r[i];
+ }
+ return c;
+}
+
+/*! \brief Fused-multiply-add. Result is a*b+c.
+ *
+ * \copydetails gmx_simd_fmadd_f
+ */
+#define gmx_simd_fmadd_d(a, b, c) gmx_simd_add_d(gmx_simd_mul_d(a, b), c)
+
+/*! \brief Fused-multiply-subtract. Result is a*b-c.
+ *
+ * \copydetails gmx_simd_fmsub_f
+ */
+#define gmx_simd_fmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_mul_d(a, b), c)
+
+/*! \brief Fused-negated-multiply-add. Result is -a*b+c.
+ *
+ * \copydetails gmx_simd_fnmadd_f
+ */
+#define gmx_simd_fnmadd_d(a, b, c) gmx_simd_sub_d(c, gmx_simd_mul_d(a, b))
+
+/*! \brief Fused-negated-multiply-sub. Result is -a*b-c.
+ *
+ * \copydetails gmx_simd_fnmsub_f
+ */
+#define gmx_simd_fnmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_setzero_d(), gmx_simd_fmadd_d(a, b, c))
+
+/*! \brief SIMD 1.0/sqrt(x) lookup.
+ *
+ * \copydetails gmx_simd_rsqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rsqrt_d(gmx_simd_double_t x)
+{
+ gmx_simd_double_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ /* Sic - we only need single precision for the reference lookup, since
+ * we have defined GMX_SIMD_RSQRT_BITS to 23.
+ */
+ b.r[i] = (x.r[i] > 0.0) ? 1.0f/sqrtf(x.r[i]) : 0.0;
+ }
+ return b;
+}
+
+/*! \brief 1.0/x lookup.
+ *
+ * \copydetails gmx_simd_rcp_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rcp_d(gmx_simd_double_t x)
+{
+ gmx_simd_double_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ /* Sic - we only need single precision for the reference lookup, since
+ * we have defined GMX_SIMD_RCP_BITS to 23.
+ */
+ b.r[i] = (x.r[i] != 0.0) ? 1.0f/x.r[i] : 0.0;
+ }
+ return b;
+}
+
+/*! \brief SIMD Floating-point fabs().
+ *
+ * \copydetails gmx_simd_fabs_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fabs_d(gmx_simd_double_t a)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = fabs(a.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD floating-point negate.
+ *
+ * \copydetails gmx_simd_fneg_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fneg_d(gmx_simd_double_t a)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = -a.r[i];
+ }
+ return c;
+}
+
+/*! \brief Set each SIMD element to the largest from two variables.
+ *
+ * \copydetails gmx_simd_max_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_max_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief Set each SIMD element to the smallest from two variables.
+ *
+ * \copydetails gmx_simd_min_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_min_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_double_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief Round to nearest integer value (in double floating-point format).
+ *
+ * \copydetails gmx_simd_round_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_round_d(gmx_simd_double_t a)
+{
+ gmx_simd_double_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+#ifdef _MSC_VER
+ int temp = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+ b.r[i] = temp;
+#else
+ b.r[i] = round(a.r[i]);
+#endif
+ }
+ return b;
+}
+
+/*! \brief Truncate SIMD, i.e. round towards zero - common hardware instruction.
+ *
+ * \copydetails gmx_simd_trunc_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_trunc_d(gmx_simd_double_t a)
+{
+ gmx_simd_double_t b;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ b.r[i] = trunc(a.r[i]);
+ }
+ return b;
+}
+
+/*! \brief Fraction of the SIMD floating point number.
+ *
+ * \copydetails gmx_simd_fraction_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_fraction_d(gmx_simd_double_t a)
+{
+ return gmx_simd_sub_d(a, gmx_simd_trunc_d(a));
+}
+
+
+/*! \brief Extract (integer) exponent from double precision SIMD.
+ *
+ * \copydetails gmx_simd_get_exponent_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_exponent_d(gmx_simd_double_t a)
+{
+ /* Mask with ones for the exponent field of double precision fp */
+ const gmx_int64_t expmask = 0x7ff0000000000000LL;
+ gmx_simd_double_t b;
+ int i;
+ union
+ {
+ double d;
+ gmx_int64_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ conv.d = a.r[i];
+ /* Zero everything but exponent field (remove sign),
+ * shift 23 bits right (mantissa size), and remove exponent bias (1023).
+ */
+ b.r[i] = ((conv.i & expmask) >> 52) - 1023;
+ }
+ return b;
+}
+
+/*! \brief Get SIMD double mantissa.
+ *
+ * \copydetails gmx_simd_get_mantissa_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_mantissa_d(gmx_simd_double_t a)
+{
+ const gmx_int64_t mantmask = 0x000fffffffffffffLL;
+ const gmx_int64_t one = 0x3ff0000000000000LL;
+ gmx_simd_double_t b;
+ int i;
+ union
+ {
+ double d;
+ gmx_int64_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ conv.d = a.r[i];
+ conv.i = (conv.i & (mantmask)) | one;
+ b.r[i] = conv.d;
+ }
+ return b;
+}
+
+/*! \brief Set (integer) exponent from double precision floating-point SIMD.
+ *
+ * \copydetails gmx_simd_set_exponent_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set_exponent_d(gmx_simd_double_t a)
+{
+ gmx_simd_double_t b;
+ int i;
+ gmx_int64_t iexp;
+ union
+ {
+ double d;
+ gmx_int64_t i;
+ }
+ conv;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ /* Critical to use same algorithm as for gmx_simd_round_d() */
+#ifdef _MSC_VER
+ iexp = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+ iexp = round(a.r[i]);
+#endif
+ /* Add bias (1023), and shift 52 bits left (mantissa size) */
+ conv.i = (iexp + 1023) << 52;
+ b.r[i] = conv.d;
+ }
+ return b;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation double precision floating-point comparison, boolean, selection.
+ * \{
+ */
+/*! \brief SIMD a==b for double SIMD.
+ *
+ * \copydetails gmx_simd_cmpeq_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmpeq_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_dbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] == b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD a<b for double SIMD.
+ *
+ * \copydetails gmx_simd_cmplt_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmplt_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_dbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] < b.r[i]);
+ }
+ return c;
+}
+
+/*! \brief SIMD a<=b for double SIMD.
+ *
+ * \copydetails gmx_simd_cmple_f
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cmple_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+ gmx_simd_dbool_t c;
+ int i;
+
+ for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+ {
+ c.b[i] = (a.r[i] <= b.r[i]);
+ }
+ return c;
+}
+
+
+/*! \brief Logical \a and on double precision SIMD booleans.
+ *
+ * \copydetails gmx_simd_and_fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_and_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
+{
+    gmx_simd_dbool_t res;
+    int              k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.b[k] = (a.b[k] && b.b[k]);
+    }
+    return res;
+}
+
+/*! \brief Logical \a or on double precision SIMD booleans.
+ *
+ * \copydetails gmx_simd_or_fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_or_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
+{
+    gmx_simd_dbool_t res;
+    int              k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.b[k] = (a.b[k] || b.b[k]);
+    }
+    return res;
+}
+
+
+/*! \brief Returns non-zero if any of the booleans in a is True, otherwise 0.
+ *
+ * \copydetails gmx_simd_anytrue_fb
+ */
+static gmx_inline int
+gmx_simd_anytrue_db(gmx_simd_dbool_t a)
+{
+    int k;
+
+    /* Early exit is equivalent to or-accumulating all elements */
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        if (a.b[k])
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+/*! \brief Select from double SIMD variable where boolean is true.
+ *
+ * \copydetails gmx_simd_blendzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.r[k] = sel.b[k] ? a.r[k] : 0.0;
+    }
+    return res;
+}
+
+/*! \brief Select from double SIMD variable where boolean is false.
+ *
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendnotzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        /* Inverted selection relative to gmx_simd_blendzero_d() */
+        res.r[k] = sel.b[k] ? 0.0 : a.r[k];
+    }
+    return res;
+}
+
+/*! \brief Vector-blend double SIMD selection.
+ *
+ * \copydetails gmx_simd_blendv_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_blendv_d(gmx_simd_double_t a, gmx_simd_double_t b, gmx_simd_dbool_t sel)
+{
+    gmx_simd_double_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.r[k] = sel.b[k] ? b.r[k] : a.r[k];
+    }
+    return res;
+}
+
+/*! \brief Return sum of all elements in SIMD double variable.
+ *
+ * \copydetails gmx_simd_reduce_f
+ *
+ */
+static gmx_inline double
+gmx_simd_reduce_d(gmx_simd_double_t a)
+{
+    double total = 0.0;
+    int    k;
+
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        total += a.r[k];
+    }
+    return total;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) bitwise logical operations
+ * \{
+ */
+
+/*! \brief SIMD integer shift left logical, based on immediate value.
+ *
+ * You should typically call the real-precision \ref gmx_simd_slli_i.
+ *
+ * Logical shift. Each element is shifted (independently) up to 32 positions
+ * left, while zeros are shifted in from the right. Only available if
+ * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
+ * (double) is defined.
+ *
+ * NOTE(review): shifting a 32-bit integer by a full 32 positions is undefined
+ * in ISO C even though the doc says n<=32 - confirm callers use n<32.
+ *
+ * \param a integer data to shift
+ * \param n number of positions to shift left. n<=32.
+ * \return shifted values
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_slli_fi(gmx_simd_fint32_t a, int n)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] << n;
+    }
+    return res;
+}
+
+/*! \brief SIMD integer shift right logical, based on immediate value.
+ *
+ * You should typically call the real-precision \ref gmx_simd_srli_i.
+ *
+ * Logical shift. Each element is shifted (independently) up to 32 positions
+ * right, while zeros are shifted in from the left. Only available if
+ * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
+ * (double) is defined.
+ *
+ * \param a integer data to shift
+ * \param n number of positions to shift right. n<=32.
+ * \return shifted values
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_srli_fi(gmx_simd_fint32_t a, int n)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] >> n;
+    }
+    return res;
+}
+
+/*! \brief Integer SIMD bitwise and.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \note You can \a not use this operation directly to select based on a boolean
+ * SIMD variable, since booleans are separate from integer SIMD. If that
+ * is what you need, have a look at \ref gmx_simd_blendzero_i instead.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a \& b (bitwise and)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_and_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] & b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer SIMD bitwise not-and.
+ *
+ * You should typically call the real-precision \ref gmx_simd_andnot_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * Note that you can NOT use this operation directly to select based on a boolean
+ * SIMD variable, since booleans are separate from integer SIMD. If that
+ * is what you need, have a look at \ref gmx_simd_blendnotzero_i instead.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return (~a) \& b (bitwise andnot)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_andnot_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = (~a.i[k]) & b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer SIMD bitwise or.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a \| b (bitwise or)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_or_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] | b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer SIMD bitwise xor.
+ *
+ * You should typically call the real-precision \ref gmx_simd_xor_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
+ *
+ * \param a first integer SIMD
+ * \param b second integer SIMD
+ * \return a ^ b (bitwise xor)
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_xor_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] ^ b.i[k];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) arithmetics
+ * \{
+ */
+/*! \brief Add SIMD integers.
+ *
+ * You should typically call the real-precision \ref gmx_simd_add_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a+b
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_add_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.i[i] = a.i[i] + b.i[i];
+    }
+    return c;
+}
+
+/*! \brief Subtract SIMD integers.
+ *
+ * You should typically call the real-precision \ref gmx_simd_sub_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a term1
+ * \param b term2
+ * \return a-b
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_sub_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.i[i] = a.i[i] - b.i[i];
+    }
+    return c;
+}
+
+/*! \brief Multiply SIMD integers.
+ *
+ * You should typically call the real-precision \ref gmx_simd_mul_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a factor1
+ * \param b factor2
+ * \return a*b.
+ *
+ * \note Only the low 32 bits are retained, so this can overflow.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_mul_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.i[i] = a.i[i]*b.i[i];
+    }
+    return c;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to float) comparisons, boolean, selection
+ * \{
+ */
+
+/*! \brief Equality comparison of two integers corresponding to float values.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmpeq_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer1
+ * \param b SIMD integer2
+ * \return SIMD integer boolean with true for elements where a==b
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cmpeq_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.i[k] == b.i[k]);
+    }
+    return res;
+}
+
+/*! \brief Less-than comparison of two SIMD integers corresponding to float values.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cmplt_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer1
+ * \param b SIMD integer2
+ * \return SIMD integer boolean with true for elements where a<b
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cmplt_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
+{
+    gmx_simd_fibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.i[k] < b.i[k]);
+    }
+    return res;
+}
+
+/*! \brief Logical AND on gmx_simd_fibool_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_and_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD boolean 1
+ * \param b SIMD boolean 2
+ * \return True for elements where both a and b are true.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_and_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
+{
+    gmx_simd_fibool_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.b[i] = (a.b[i] && b.b[i]);
+    }
+    return c;
+}
+
+/*! \brief Logical OR on gmx_simd_fibool_t.
+ *
+ * You should typically call the real-precision \ref gmx_simd_or_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD boolean 1
+ * \param b SIMD boolean 2
+ * \return True for elements where a or b (or both) is true.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_or_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
+{
+    gmx_simd_fibool_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.b[i] = (a.b[i] || b.b[i]);
+    }
+    return c;
+}
+
+/*! \brief Returns non-zero if any of the booleans in a is True, otherwise 0.
+ *
+ * You should typically call the real-precision \ref gmx_simd_anytrue_ib.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * The actual return value for "any true" will depend on the architecture.
+ * Any non-zero value should be considered truth.
+ *
+ * \param a SIMD boolean
+ * \return Nonzero integer if any of the elements in a is true, otherwise 0.
+ */
+static gmx_inline int
+gmx_simd_anytrue_fib(gmx_simd_fibool_t a)
+{
+    int k;
+
+    /* Early exit is equivalent to or-accumulating all elements */
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        if (a.b[k])
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is true.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendzero_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer to select from
+ * \param sel Boolean selector
+ * \return Elements from a where sel is true, 0 otherwise.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.i[i] = sel.b[i] ? a.i[i] : 0; /* integer zero, not the float literal 0.0 */
+    }
+    return c;
+}
+
+/*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is false.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendnotzero_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a SIMD integer to select from
+ * \param sel Boolean selector
+ * \return Elements from a where sel is false, 0 otherwise (sic).
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendnotzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        c.i[i] = sel.b[i] ? 0 : a.i[i]; /* integer zero, not the float literal 0.0 */
+    }
+    return c;
+}
+
+/*! \brief Vector-blend SIMD selection.
+ *
+ * You should typically call the real-precision \ref gmx_simd_blendv_i.
+ *
+ * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
+ * or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
+ *
+ * \param a First source
+ * \param b Second source
+ * \param sel Boolean selector
+ * \return For each element, select b if sel is true, a otherwise.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_blendv_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b, gmx_simd_fibool_t sel)
+{
+    gmx_simd_fint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_FINT32_WIDTH; k++)
+    {
+        res.i[k] = sel.b[k] ? b.i[k] : a.i[k];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) bitwise logical operations
+ * \{
+ */
+
+/*! \brief SIMD integer shift left, based on immediate value.
+ *
+ * \copydetails gmx_simd_slli_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_slli_di(gmx_simd_dint32_t a, int n)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] << n;
+    }
+    return res;
+}
+
+/*! \brief SIMD integer shift right, based on immediate value.
+ *
+ * \copydetails gmx_simd_srli_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_srli_di(gmx_simd_dint32_t a, int n)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] >> n;
+    }
+    return res;
+}
+
+/*! \brief Integer bitwise and for SIMD variables.
+ *
+ * \copydetails gmx_simd_and_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_and_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] & b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer bitwise not-and for SIMD variables.
+ *
+ * \copydetails gmx_simd_andnot_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_andnot_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = (~a.i[k]) & b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer bitwise or for SIMD variables.
+ *
+ * \copydetails gmx_simd_or_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_or_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] | b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Integer bitwise xor for SIMD variables.
+ *
+ * \copydetails gmx_simd_xor_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_xor_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] ^ b.i[k];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) arithmetics
+ * \{
+ */
+/*! \brief Add SIMD integers, corresponding to double precision.
+ *
+ * \copydetails gmx_simd_add_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_add_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] + b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Subtract SIMD integers, corresponding to double precision.
+ *
+ * \copydetails gmx_simd_sub_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_sub_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k] - b.i[k];
+    }
+    return res;
+}
+
+/*! \brief Multiply SIMD integers, corresponding to double precision.
+ *
+ * \copydetails gmx_simd_mul_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_mul_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = a.i[k]*b.i[k];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation integer (corresponding to double) comparisons, boolean selection
+ * \{
+ */
+
+/*! \brief Equality comparison of two ints corresponding to double SIMD data.
+ *
+ * \copydetails gmx_simd_cmpeq_fi
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cmpeq_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.i[k] == b.i[k]);
+    }
+    return res;
+}
+
+/*! \brief Less-than comparison of two ints corresponding to double SIMD data.
+ *
+ * \copydetails gmx_simd_cmplt_fi
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cmplt_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
+{
+    gmx_simd_dibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.i[k] < b.i[k]);
+    }
+    return res;
+}
+
+/*! \brief Logical AND on gmx_simd_dibool_t.
+ *
+ * \copydetails gmx_simd_and_fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_and_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
+{
+    gmx_simd_dibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.b[k] && b.b[k]);
+    }
+    return res;
+}
+
+/*! \brief Logical OR on gmx_simd_dibool_t.
+ *
+ * \copydetails gmx_simd_or_fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_or_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
+{
+    gmx_simd_dibool_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.b[k] = (a.b[k] || b.b[k]);
+    }
+    return res;
+}
+
+/*! \brief Returns non-zero if any of the double-int SIMD booleans in a is True, otherwise 0.
+ *
+ * \copydetails gmx_simd_anytrue_fib
+ */
+static gmx_inline int
+gmx_simd_anytrue_dib(gmx_simd_dibool_t a)
+{
+    int k;
+
+    /* Early exit is equivalent to or-accumulating all elements */
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        if (a.b[k])
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*! \brief Select from SIMD ints (corresponding to double) where boolean is true.
+ *
+ * \copydetails gmx_simd_blendzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        c.i[i] = sel.b[i] ? a.i[i] : 0; /* integer zero, not the float literal 0.0 */
+    }
+    return c;
+}
+
+/*! \brief Select from SIMD ints (corresponding to double) where boolean is false.
+ *
+ * \copydetails gmx_simd_blendnotzero_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendnotzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t c;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        c.i[i] = sel.b[i] ? 0 : a.i[i]; /* integer zero, not the float literal 0.0 */
+    }
+    return c;
+}
+
+/*! \brief Vector-blend SIMD selection for double-int SIMD.
+ *
+ * \copydetails gmx_simd_blendv_fi
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_blendv_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b, gmx_simd_dibool_t sel)
+{
+    gmx_simd_dint32_t res;
+    int               k;
+
+    for (k = 0; k < GMX_SIMD_DINT32_WIDTH; k++)
+    {
+        res.i[k] = sel.b[k] ? b.i[k] : a.i[k];
+    }
+    return res;
+}
+
+/*! \}
+ *
+ * \name SIMD implementation conversion operations
+ * \{
+ */
+
+/*! \brief Round single precision floating point to integer.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_r2i.
+ *
+ * \param a SIMD floating-point
+ * \return SIMD integer, rounded to nearest integer.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_cvt_f2i(gmx_simd_float_t a)
+{
+    gmx_simd_fint32_t b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* MSVC lacks roundf(); emulate round-half-away-from-zero */
+#ifdef _MSC_VER
+        b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+        b.i[i] = roundf(a.r[i]);
+#endif
+    }
+    return b;
+}
+
+/*! \brief Truncate single precision floating point to integer.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvtt_r2i.
+ *
+ * \param a SIMD floating-point
+ * \return SIMD integer, truncated towards zero.
+ */
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_cvtt_f2i(gmx_simd_float_t a)
+{
+    gmx_simd_fint32_t b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        /* float-to-int assignment truncates towards zero in C */
+        b.i[i] = a.r[i];
+    }
+    return b;
+}
+
+/*! \brief Convert integer to single precision floating-point.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_i2r.
+ *
+ * \param a SIMD integer
+ * \return SIMD floating-point
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_i2f(gmx_simd_fint32_t a)
+{
+    gmx_simd_float_t b;
+    int              i;
+
+    for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
+    {
+        b.r[i] = a.i[i];
+    }
+    return b;
+}
+
+/*! \brief Round double precision floating point to integer.
+ *
+ * \copydetails gmx_simd_cvt_f2i
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_cvt_d2i(gmx_simd_double_t a)
+{
+    gmx_simd_dint32_t b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        /* MSVC lacks round(); emulate round-half-away-from-zero */
+#ifdef _MSC_VER
+        b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
+#else
+        b.i[i] = round(a.r[i]);
+#endif
+    }
+    return b;
+}
+
+/*! \brief Truncate double precision floating point to integer.
+ *
+ * \copydetails gmx_simd_cvtt_f2i
+ */
+static gmx_inline gmx_simd_dint32_t
+gmx_simd_cvtt_d2i(gmx_simd_double_t a)
+{
+    gmx_simd_dint32_t b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        /* double-to-int assignment truncates towards zero in C */
+        b.i[i] = a.r[i];
+    }
+    return b;
+}
+
+/*! \brief Convert integer to double precision floating-point.
+ *
+ * \copydetails gmx_simd_cvt_i2f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cvt_i2d(gmx_simd_dint32_t a)
+{
+    gmx_simd_double_t b;
+    int               i;
+
+    for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
+    {
+        b.r[i] = a.i[i];
+    }
+    return b;
+}
+
+/*! \brief Convert from float boolean to corresponding integer boolean.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_b2ib.
+ *
+ * \param a Boolean corresponding to SIMD floating-point
+ * \return Boolean that can be applied to SIMD integer operations.
+ */
+static gmx_inline gmx_simd_fibool_t
+gmx_simd_cvt_fb2fib(gmx_simd_fbool_t a)
+{
+    gmx_simd_fibool_t res;
+    int               k;
+
+    /* Integer width >= float width */
+    for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+    {
+        res.b[k] = a.b[k];
+    }
+    return res;
+}
+
+/*! \brief Convert from integer boolean (corresponding to float) to float boolean.
+ *
+ * You should typically call the real-precision \ref gmx_simd_cvt_ib2b.
+ *
+ * \param a Boolean corresponding to SIMD integer
+ * \return Boolean that can be applied to SIMD floating-point.
+ */
+static gmx_inline gmx_simd_fbool_t
+gmx_simd_cvt_fib2fb(gmx_simd_fibool_t a)
+{
+    gmx_simd_fbool_t res;
+    int              k;
+
+    /* Integer width >= float width */
+    for (k = 0; k < GMX_SIMD_FLOAT_WIDTH; k++)
+    {
+        res.b[k] = a.b[k];
+    }
+    return res;
+}
+
+/*! \brief Convert from double boolean to corresponding integer boolean.
+ *
+ * \copydetails gmx_simd_cvt_fb2fib
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib(gmx_simd_dbool_t a)
+{
+    gmx_simd_dibool_t res;
+    int               k;
+
+    /* Integer width >= double width */
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.b[k] = a.b[k];
+    }
+    return res;
+}
+
+/*! \brief Convert from integer boolean (corresponding to double) to double boolean.
+ *
+ * \copydetails gmx_simd_cvt_fib2fb
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db(gmx_simd_dibool_t a)
+{
+    gmx_simd_dbool_t res;
+    int              k;
+
+    /* Integer width >= double width */
+    for (k = 0; k < GMX_SIMD_DOUBLE_WIDTH; k++)
+    {
+        res.b[k] = a.b[k];
+    }
+    return res;
+}
+
+/*! \brief Convert SIMD float to double.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
+ * \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * \param f Single-precision SIMD variable
+ * \return Double-precision SIMD variable of the same width
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cvt_f2d(gmx_simd_float_t f)
+{
+    gmx_simd_double_t d;
+#if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
+    int               i;
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        d.r[i] = f.r[i];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_f2d() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
+    /* Avoid compiler warnings. NOTE(review): presumably never reached if
+     * gmx_fatal() aborts - confirm against gmx_fatal's contract. */
+    d.r[0] = f.r[0];
+#endif
+    return d;
+}
+
+/*! \brief Convert SIMD double to float.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
+ * \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * \param d Double-precision SIMD variable
+ * \return Single-precision SIMD variable of the same width
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_d2f(gmx_simd_double_t d)
+{
+    gmx_simd_float_t f;
+#if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
+    int              i;
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        f.r[i] = d.r[i];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_d2f() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
+    /* Avoid compiler warnings. NOTE(review): presumably never reached if
+     * gmx_fatal() aborts - confirm against gmx_fatal's contract. */
+    f.r[0] = d.r[0];
+#endif
+    return f;
+}
+
+/*! \brief Convert SIMD float to double.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
+ * as \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * \param f Single-precision SIMD variable
+ * \param[out] d0 Double-precision SIMD variable, first half of values from f.
+ * \param[out] d1 Double-precision SIMD variable, second half of values from f.
+ */
+static gmx_inline void
+gmx_simd_cvt_f2dd(gmx_simd_float_t f, gmx_simd_double_t *d0, gmx_simd_double_t *d1)
+{
+#if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
+    int i;
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        d0->r[i] = f.r[i];
+        d1->r[i] = f.r[GMX_SIMD_DOUBLE_WIDTH+i];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_f2dd() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
+    /* Avoid compiler warnings about unused arguments. NOTE(review): presumably
+     * never reached if gmx_fatal() aborts - confirm. */
+    d0->r[0] = f.r[0];
+    d1->r[0] = f.r[0];
+#endif
+}
+
+/*! \brief Convert SIMD double to float.
+ *
+ * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
+ * as \ref GMX_SIMD_DOUBLE_WIDTH.
+ *
+ * Float/double conversions are complex since the SIMD width could either
+ * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
+ * need to check for the width in the code, and have different code paths.
+ *
+ * \param d0 Double-precision SIMD variable, first half of values to put in f.
+ * \param d1 Double-precision SIMD variable, second half of values to put in f.
+ * \return Single-precision SIMD variable with all values.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cvt_dd2f(gmx_simd_double_t d0, gmx_simd_double_t d1)
+{
+    gmx_simd_float_t f;
+#if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
+    int              i;
+    for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
+    {
+        f.r[i]                       = d0.r[i];
+        f.r[GMX_SIMD_DOUBLE_WIDTH+i] = d1.r[i];
+    }
+#else
+    gmx_fatal(FARGS, "gmx_simd_cvt_dd2f() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
+    /* Avoid compiler warnings about unused arguments & uninitialized f.
+     * NOTE(review): presumably never reached if gmx_fatal() aborts - confirm. */
+    f.r[0] = d0.r[0] + d1.r[0];
+#endif
+    return f;
+}
+
+/*! \} */
+
+/*! \name SIMD4. Constant width-4 SIMD types and instructions
+ * \{
+ */
+
+#if (GMX_SIMD_FLOAT_WIDTH == 4) || (defined DOXYGEN)
+
+
+/*! \brief SIMD4 float type. Available with \ref GMX_SIMD4_HAVE_FLOAT.
+ *
+ * Unless you specifically want a single-precision type you should check
+ * \ref gmx_simd4_real_t instead.
+ *
+ * While the SIMD4 datatype is identical to the normal SIMD type in the
+ * reference implementation, this will often not be the case for
+ * other architectures.
+ */
+#    define gmx_simd4_float_t   gmx_simd_float_t
+
+/*! \brief Load SIMD4 float from aligned memory.
+ * \copydetails gmx_simd_load_f
+ */
+#    define gmx_simd4_load_f    gmx_simd_load_f
+
+/*! \brief Set all elements of SIMD4 float from single pointer.
+ * \copydetails gmx_simd_load1_f
+ */
+#    define gmx_simd4_load1_f   gmx_simd_load1_f
+
+/*! \brief Set all SIMD4 float elements to the value r.
+ * \copydetails gmx_simd_set1_f
+ */
+#    define gmx_simd4_set1_f    gmx_simd_set1_f
+
+/*! \brief Store the contents of SIMD4 float pr to aligned memory m.
+ * \copydetails gmx_simd_store_f
+ */
+#    define gmx_simd4_store_f   gmx_simd_store_f
+
+/*! \brief Load SIMD4 float from unaligned memory.
+ * \copydetails gmx_simd_loadu_f
+ */
+#    define gmx_simd4_loadu_f   gmx_simd_loadu_f
+
+/*! \brief Store SIMD4 float to unaligned memory.
+ * \copydetails gmx_simd_storeu_f
+ */
+#    define gmx_simd4_storeu_f  gmx_simd_storeu_f
+
+/*! \brief Set all SIMD4 float elements to 0.
+ * \copydetails gmx_simd_setzero_f
+ */
+#    define gmx_simd4_setzero_f gmx_simd_setzero_f
+
+/*! \brief Bitwise and for two SIMD4 float variables.
+ * \copydetails gmx_simd_and_f
+ */
+#    define gmx_simd4_and_f     gmx_simd_and_f
+
+/*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
+ * \copydetails gmx_simd_andnot_f
+ */
+#    define gmx_simd4_andnot_f  gmx_simd_andnot_f
+
+/*! \brief Bitwise or for two SIMD4 float variables.
+ * \copydetails gmx_simd_or_f
+ */
+#    define gmx_simd4_or_f      gmx_simd_or_f
+
+/*! \brief Bitwise xor for two SIMD4 float variables.
+ * \copydetails gmx_simd_xor_f
+ */
+#    define gmx_simd4_xor_f     gmx_simd_xor_f
+
+/*! \brief Add two SIMD4 float variables.
+ * \copydetails gmx_simd_add_f
+ */
+#    define gmx_simd4_add_f     gmx_simd_add_f
+
+/*! \brief Subtract two SIMD4 float variables.
+ * \copydetails gmx_simd_sub_f
+ */
+#    define gmx_simd4_sub_f     gmx_simd_sub_f
+
+/*! \brief Multiply two SIMD4 float variables.
+ * \copydetails gmx_simd_mul_f
+ */
+#    define gmx_simd4_mul_f     gmx_simd_mul_f
+
+/*! \brief Fused-multiply-add for SIMD4 float. Result is a*b+c.
+ * \copydetails gmx_simd_fmadd_f
+ */
+#    define gmx_simd4_fmadd_f   gmx_simd_fmadd_f
+
+/*! \brief Fused-multiply-subtract for SIMD4 float. Result is a*b-c.
+ * \copydetails gmx_simd_fmsub_f
+ */
+#    define gmx_simd4_fmsub_f   gmx_simd_fmsub_f
+
+/*! \brief Fused-negated-multiply-add for SIMD4 float. Result is -a*b+c.
+ * \copydetails gmx_simd_fnmadd_f
+ */
+#    define gmx_simd4_fnmadd_f  gmx_simd_fnmadd_f
+
+/*! \brief Fused-negated-multiply-subtract for SIMD4 float. Result is -a*b-c.
+ * \copydetails gmx_simd_fnmsub_f
+ */
+#    define gmx_simd4_fnmsub_f  gmx_simd_fnmsub_f
+
+/*! \brief Lookup of approximate 1/sqrt(x) for SIMD4 float.
+ * \copydetails gmx_simd_rsqrt_f
+ */
+#    define gmx_simd4_rsqrt_f   gmx_simd_rsqrt_f
+
+/*! \brief Floating-point absolute value for SIMD4 float.
+ * \copydetails gmx_simd_fabs_f
+ */
+#    define gmx_simd4_fabs_f    gmx_simd_fabs_f
+
+/*! \brief Floating-point negate for SIMD4 float.
+ * \copydetails gmx_simd_fneg_f
+ */
+#    define gmx_simd4_fneg_f    gmx_simd_fneg_f
+
+/*! \brief Set each SIMD4 float element to the largest from two variables.
+ * \copydetails gmx_simd_max_f
+ */
+#    define gmx_simd4_max_f     gmx_simd_max_f
+
+/*! \brief Set each SIMD4 float element to the smallest from two variables.
+ * \copydetails gmx_simd_min_f
+ */
+#    define gmx_simd4_min_f     gmx_simd_min_f
+
+/*! \brief Round to nearest integer value for SIMD4 float.
+ * \copydetails gmx_simd_round_f
+ */
+#    define gmx_simd4_round_f   gmx_simd_round_f
+
+/*! \brief Round to largest integral value for SIMD4 float.
+ * \copydetails gmx_simd_trunc_f
+ */
+#    define gmx_simd4_trunc_f   gmx_simd_trunc_f
+
+/*! \brief Return dot product of two single precision SIMD4 variables.
+ *
+ * Only the first three elements of each vector contribute to the result;
+ * the fourth element is ignored.
+ *
+ * \param a vector1
+ * \param b vector2
+ * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    /* Accumulate in the same left-to-right order as a plain sum expression */
+    float sum = a.r[0]*b.r[0];
+    int   i;
+
+    for (i = 1; i < 3; i++)
+    {
+        sum += a.r[i]*b.r[i];
+    }
+    return sum;
+}
+
+/*! \brief SIMD4 variable type to use for logical comparisons on floats.
+ * \copydetails gmx_simd_fbool_t
+ */
+# define gmx_simd4_fbool_t gmx_simd_fbool_t
+
+/*! \brief Equality comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmpeq_f
+ */
+# define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f
+
+/*! \brief Less-than comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmplt_f
+ */
+# define gmx_simd4_cmplt_f gmx_simd_cmplt_f
+
+/*! \brief Less-than-or-equal comparison of two single precision SIMD4.
+ * \copydetails gmx_simd_cmple_f
+ */
+# define gmx_simd4_cmple_f gmx_simd_cmple_f
+
+/*! \brief Logical AND on float SIMD4 booleans.
+ * \copydetails gmx_simd_and_fb
+ */
+# define gmx_simd4_and_fb gmx_simd_and_fb
+
+/*! \brief Logical OR on float SIMD4 booleans.
+ * \copydetails gmx_simd_or_fb
+ */
+# define gmx_simd4_or_fb gmx_simd_or_fb
+
+/*! \brief Returns non-zero if any of the SIMD4 boolean in x is True.
+ * \copydetails gmx_simd_anytrue_fb
+ */
+# define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb
+
+/*! \brief Select from single precision SIMD4 variable where boolean is true.
+ * \copydetails gmx_simd_blendzero_f
+ */
+# define gmx_simd4_blendzero_f gmx_simd_blendzero_f
+
+/*! \brief Select from single precision SIMD4 variable where boolean is false.
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+# define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f
+
+/*! \brief Vector-blend instruction for SIMD4 float.
+ * \copydetails gmx_simd_blendv_f
+ */
+# define gmx_simd4_blendv_f gmx_simd_blendv_f
+
+/*! \brief Return sum of all elements in SIMD4 float.
+ * \copydetails gmx_simd_reduce_f
+ */
+# define gmx_simd4_reduce_f gmx_simd_reduce_f
+
+#else /* GMX_SIMD_FLOAT_WIDTH!=4 */
+# undef GMX_SIMD4_HAVE_FLOAT
+#endif
+
+
+#if (GMX_SIMD_DOUBLE_WIDTH == 4) || (defined DOXYGEN)
+
+/*! \brief SIMD4 double type. Available with \ref GMX_SIMD4_HAVE_DOUBLE.
+ *
+ * Unless you specifically want a double-precision type you should check
+ * \ref gmx_simd4_real_t instead.
+ *
+ * While the SIMD4 datatype is identical to the normal SIMD type in the
+ * reference implementation, this will often not be the case for
+ * other architectures.
+ */
+# define gmx_simd4_double_t gmx_simd_double_t
+
+/*! \brief Double precision SIMD4 load aligned.
+ * \copydetails gmx_simd_load_d
+ */
+# define gmx_simd4_load_d gmx_simd_load_d
+
+/*! \brief Double precision SIMD4 load single value to all elements.
+ * \copydetails gmx_simd_load1_d
+ */
+# define gmx_simd4_load1_d gmx_simd_load1_d
+
+/*! \brief Double precision SIMD4 set all elements from value.
+ * \copydetails gmx_simd_set1_d
+ */
+# define gmx_simd4_set1_d gmx_simd_set1_d
+
+/*! \brief Double precision SIMD4 store to aligned memory.
+ * \copydetails gmx_simd_store_d
+ */
+# define gmx_simd4_store_d gmx_simd_store_d
+
+/*! \brief Load unaligned SIMD4 double.
+ * \copydetails gmx_simd_loadu_d
+ */
+# define gmx_simd4_loadu_d gmx_simd_loadu_d
+
+/*! \brief Store unaligned SIMD4 double.
+ * \copydetails gmx_simd_storeu_d
+ */
+# define gmx_simd4_storeu_d gmx_simd_storeu_d
+
+/*! \brief Set all elements in SIMD4 double to 0.0.
+ * \copydetails gmx_simd_setzero_d
+ */
+# define gmx_simd4_setzero_d gmx_simd_setzero_d
+
+/*! \brief Bitwise and for two SIMD4 double variables.
+ * \copydetails gmx_simd_and_d
+ */
+# define gmx_simd4_and_d gmx_simd_and_d
+
+/*! \brief Bitwise andnot for SIMD4 double. c=(~a) & b.
+ * \copydetails gmx_simd_andnot_d
+ */
+# define gmx_simd4_andnot_d gmx_simd_andnot_d
+
+/*! \brief Bitwise or for SIMD4 double.
+ * \copydetails gmx_simd_or_d
+ */
+# define gmx_simd4_or_d gmx_simd_or_d
+
+/*! \brief Bitwise xor for SIMD4 double.
+ * \copydetails gmx_simd_xor_d
+ */
+# define gmx_simd4_xor_d gmx_simd_xor_d
+
+/*! \brief Add two SIMD4 double values.
+ * \copydetails gmx_simd_add_d
+ */
+# define gmx_simd4_add_d gmx_simd_add_d
+
+/*! \brief Subtract two SIMD4 double values.
+ * \copydetails gmx_simd_sub_d
+ */
+# define gmx_simd4_sub_d gmx_simd_sub_d
+
+/*! \brief Multiply two SIMD4 double values.
+ * \copydetails gmx_simd_mul_d
+ */
+# define gmx_simd4_mul_d gmx_simd_mul_d
+
+/*! \brief Fused-multiply-add for SIMD4 double. Result is a*b+c.
+ * \copydetails gmx_simd_fmadd_d
+ */
+# define gmx_simd4_fmadd_d gmx_simd_fmadd_d
+
+/*! \brief Fused-multiply-subtract for SIMD4 double. Result is a*b-c.
+ * \copydetails gmx_simd_fmsub_d
+ */
+# define gmx_simd4_fmsub_d gmx_simd_fmsub_d
+
+/*! \brief Fused-negated-multiply-add for SIMD4 double. Result is -a*b+c.
+ * \copydetails gmx_simd_fnmadd_d
+ */
+# define gmx_simd4_fnmadd_d gmx_simd_fnmadd_d
+
+/*! \brief Fused-negated-multiply-sub for SIMD4 double. Result is -a*b-c.
+ * \copydetails gmx_simd_fnmsub_d
+ */
+# define gmx_simd4_fnmsub_d gmx_simd_fnmsub_d
+
+/*! \brief SIMD4 double 1.0/sqrt(x) lookup.
+ * \copydetails gmx_simd_rsqrt_d
+ */
+# define gmx_simd4_rsqrt_d gmx_simd_rsqrt_d
+
+/*! \brief SIMD4 double Floating-point fabs().
+ * \copydetails gmx_simd_fabs_d
+ */
+# define gmx_simd4_fabs_d gmx_simd_fabs_d
+
+/*! \brief SIMD4 double floating-point negate.
+ * \copydetails gmx_simd_fneg_d
+ */
+# define gmx_simd4_fneg_d gmx_simd_fneg_d
+
+/*! \brief Set each SIMD4 element to the largest from two variables.
+ * \copydetails gmx_simd_max_d
+ */
+# define gmx_simd4_max_d gmx_simd_max_d
+
+/*! \brief Set each SIMD4 element to the smallest from two variables.
+ * \copydetails gmx_simd_min_d
+ */
+# define gmx_simd4_min_d gmx_simd_min_d
+
+/*! \brief Round SIMD4 double to nearest integer value (in floating-point format).
+ * \copydetails gmx_simd_round_d
+ */
+# define gmx_simd4_round_d gmx_simd_round_d
+
+/*! \brief Truncate SIMD4 double, i.e. round towards zero.
+ * \copydetails gmx_simd_trunc_d
+ */
+# define gmx_simd4_trunc_d gmx_simd_trunc_d
+
+/*! \brief Return dot product of two double precision SIMD4 variables.
+ *
+ * The dot product is calculated between the first three elements in the two
+ * vectors, while the fourth is ignored. The result is returned as a scalar.
+ *
+ * \param a vector1
+ * \param b vector2
+ * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+    return a.r[0]*b.r[0]+a.r[1]*b.r[1]+a.r[2]*b.r[2];
+}
+
+/*! \brief SIMD4 variable type to use for logical comparisons on doubles.
+ * \copydetails gmx_simd_dbool_t
+ */
+# define gmx_simd4_dbool_t gmx_simd_dbool_t
+
+/*! \brief Equality comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmpeq_d
+ */
+# define gmx_simd4_cmpeq_d gmx_simd_cmpeq_d
+
+/*! \brief Less-than comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmplt_d
+ */
+# define gmx_simd4_cmplt_d gmx_simd_cmplt_d
+
+/*! \brief Less-than-or-equal comparison of two double precision SIMD4 values.
+ * \copydetails gmx_simd_cmple_d
+ */
+# define gmx_simd4_cmple_d gmx_simd_cmple_d
+
+/*! \brief Logical AND on double SIMD4 booleans.
+ * \copydetails gmx_simd_and_db
+ */
+# define gmx_simd4_and_db gmx_simd_and_db
+
+/*! \brief Logical OR on double SIMD4 booleans.
+ * \copydetails gmx_simd_or_db
+ */
+# define gmx_simd4_or_db gmx_simd_or_db
+
+/*! \brief Returns non-zero if any of the SIMD4 booleans in x is True.
+ * \copydetails gmx_simd_anytrue_db
+ */
+# define gmx_simd4_anytrue_db gmx_simd_anytrue_db
+
+/*! \brief Select from double precision SIMD4 variable where boolean is true.
+ * \copydetails gmx_simd_blendzero_d
+ */
+# define gmx_simd4_blendzero_d gmx_simd_blendzero_d
+
+/*! \brief Select from double precision SIMD4 variable where boolean is false.
+ * \copydetails gmx_simd_blendnotzero_d
+ */
+# define gmx_simd4_blendnotzero_d gmx_simd_blendnotzero_d
+
+/*! \brief Vector-blend instruction for SIMD4 double.
+ * \copydetails gmx_simd_blendv_d
+ */
+# define gmx_simd4_blendv_d gmx_simd_blendv_d
+
+/*! \brief Return sum of all elements in SIMD4 double.
+ * \copydetails gmx_simd_reduce_d
+ */
+# define gmx_simd4_reduce_d gmx_simd_reduce_d
+
+#else /* GMX_SIMD_DOUBLE_WIDTH!=4 */
+# undef GMX_SIMD4_HAVE_DOUBLE
+#endif
+
+/*! \} */
+
+
+/*! \brief Return 1 if SIMD floating-point ops have overflowed, and reset check.
+ *
+ * This function checks whether SIMD operations have resulted in overflow,
+ * and returns 1 if it occurred, 0 otherwise.
+ * For now, this is unfortunately a dummy for all architectures except x86;
+ * the reference implementation never detects overflow and always returns 0.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+    return 0;
+}
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_IMPL_REFERENCE_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX2_256_H
+#define GMX_SIMD_IMPL_X86_AVX2_256_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* x86 256-bit AVX2 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for details
+ */
+
+/* Inherit parts of AVX2_256 from AVX_256 */
+#include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h"
+/* Increment over AVX_256 capabilities */
+#define GMX_SIMD_X86_AVX2_256_OR_HIGHER
+
+/* Override some capability definitions for things added in AVX2 */
+#define GMX_SIMD_HAVE_FMA
+#define GMX_SIMD_HAVE_FINT32_LOGICAL /* AVX2 adds 256-bit integer shifts */
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX2 adds 256-bit integer +,-,* */
+
+/****************************************************
+ * SINGLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#undef gmx_simd_fmadd_f
+#define gmx_simd_fmadd_f _mm256_fmadd_ps
+#undef gmx_simd_fmsub_f
+#define gmx_simd_fmsub_f _mm256_fmsub_ps
+#undef gmx_simd_fnmadd_f
+#define gmx_simd_fnmadd_f _mm256_fnmadd_ps
+#undef gmx_simd_fnmsub_f
+#define gmx_simd_fnmsub_f _mm256_fnmsub_ps
+#undef gmx_simd_get_exponent_f
+#define gmx_simd_get_exponent_f gmx_simd_get_exponent_f_avx2_256
+#undef gmx_simd_set_exponent_f
+#define gmx_simd_set_exponent_f gmx_simd_set_exponent_f_avx2_256
+/* Previously undefined logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi _mm256_slli_epi32
+#define gmx_simd_srli_fi _mm256_srli_epi32
+#define gmx_simd_and_fi _mm256_and_si256
+#define gmx_simd_andnot_fi _mm256_andnot_si256
+#define gmx_simd_or_fi _mm256_or_si256
+#define gmx_simd_xor_fi _mm256_xor_si256
+/* Previously undefined arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi _mm256_add_epi32
+#define gmx_simd_sub_fi _mm256_sub_epi32
+#define gmx_simd_mul_fi _mm256_mullo_epi32
+/* Previously undefined boolean ops on gmx_simd_fint32_t */
+#define gmx_simd_cmpeq_fi _mm256_cmpeq_epi32
+#define gmx_simd_cmplt_fi(a, b) _mm256_cmpgt_epi32(b, a)
+#define gmx_simd_and_fib _mm256_and_si256
+#define gmx_simd_or_fib _mm256_or_si256
+#define gmx_simd_anytrue_fib _mm256_movemask_epi8
+#define gmx_simd_blendzero_fi _mm256_and_si256
+#define gmx_simd_blendnotzero_fi(a, sel) _mm256_andnot_si256(sel, a)
+#define gmx_simd_blendv_fi _mm256_blendv_epi8
+
+
+/****************************************************
+ * DOUBLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#undef gmx_simd_fmadd_d
+#define gmx_simd_fmadd_d _mm256_fmadd_pd
+#undef gmx_simd_fmsub_d
+#define gmx_simd_fmsub_d _mm256_fmsub_pd
+#undef gmx_simd_fnmadd_d
+#define gmx_simd_fnmadd_d _mm256_fnmadd_pd
+#undef gmx_simd_fnmsub_d
+#define gmx_simd_fnmsub_d _mm256_fnmsub_pd
+#undef gmx_simd_get_exponent_d
+#define gmx_simd_get_exponent_d gmx_simd_get_exponent_d_avx2_256
+#undef gmx_simd_set_exponent_d
+#define gmx_simd_set_exponent_d gmx_simd_set_exponent_d_avx2_256
+#undef gmx_simd_cvt_db2dib
+#define gmx_simd_cvt_db2dib gmx_simd_cvt_db2dib_avx2_256
+#undef gmx_simd_cvt_dib2db
+#define gmx_simd_cvt_dib2db gmx_simd_cvt_dib2db_avx2_256
+
+/****************************************************
+ * SIMD4 SINGLE PRECISION IMPLEMENTATION *
+ ****************************************************/
+#undef gmx_simd4_fmadd_f
+#define gmx_simd4_fmadd_f _mm_fmadd_ps
+#undef gmx_simd4_fmsub_f
+#define gmx_simd4_fmsub_f _mm_fmsub_ps
+#undef gmx_simd4_fnmadd_f
+#define gmx_simd4_fnmadd_f _mm_fnmadd_ps
+#undef gmx_simd4_fnmsub_f
+#define gmx_simd4_fnmsub_f _mm_fnmsub_ps
+
+/* No need to update SIMD4 double, since those instructions
+ * are aliased to the general SIMD double instructions above.
+ */
+
+/*********************************************************
+ * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/*! \brief AVX2 implementation of gmx_simd_get_exponent_f.
+ *
+ * Extracts the unbiased IEEE754 single precision exponent of each element
+ * (mask 0x7F800000 selects the exponent field) and returns it converted
+ * to floating point.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
+{
+    const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
+    const __m256i expbias = _mm256_set1_epi32(127);
+    __m256i iexp;
+
+    iexp = _mm256_castps_si256(_mm256_and_ps(x, expmask));
+    /* Shift the exponent field down to bit 0 and remove the bias of 127 */
+    iexp = _mm256_sub_epi32(_mm256_srli_epi32(iexp, 23), expbias);
+    return _mm256_cvtepi32_ps(iexp);
+}
+
+/*! \brief AVX2 implementation of gmx_simd_set_exponent_f.
+ *
+ * Rounds each element to the nearest integer, adds the single precision
+ * exponent bias (127) and shifts it into the exponent field, producing 2^x.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
+{
+    const __m256i expbias = _mm256_set1_epi32(127);
+    __m256i iexp = _mm256_cvtps_epi32(x);
+
+    /* Biased exponent goes in bits 30-23 of the IEEE754 representation */
+    iexp = _mm256_slli_epi32(_mm256_add_epi32(iexp, expbias), 23);
+    return _mm256_castsi256_ps(iexp);
+}
+
+/*********************************************************
+ * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/*! \brief AVX2 implementation of gmx_simd_get_exponent_d.
+ *
+ * Extracts the unbiased IEEE754 double precision exponent of each element
+ * (mask 0x7FF0000000000000 selects the exponent field) and returns it
+ * converted to double precision floating point.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
+{
+    const __m256d expmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
+    const __m256i expbias = _mm256_set1_epi64x(1023LL);
+    __m256i iexp;
+    __m128i iexp128;
+
+    iexp = _mm256_castpd_si256(_mm256_and_pd(x, expmask));
+    /* Shift exponent field down to bit 0 and remove the bias of 1023 */
+    iexp = _mm256_sub_epi64(_mm256_srli_epi64(iexp, 52), expbias);
+    /* Pack the low 32 bits of each 64-bit result into the low half of each 128-bit lane */
+    iexp = _mm256_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0));
+
+    /* Combine the low halves of both lanes into four 32-bit exponents */
+    iexp128 = _mm256_extractf128_si256(iexp, 1);
+    iexp128 = _mm_unpacklo_epi64(_mm256_castsi256_si128(iexp), iexp128);
+    return _mm256_cvtepi32_pd(iexp128);
+}
+
+/*! \brief AVX2 implementation of gmx_simd_set_exponent_d.
+ *
+ * Rounds each element to the nearest integer, widens it to 64 bits, adds
+ * the double precision exponent bias (1023) and shifts it into the exponent
+ * field, producing 2^x.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
+{
+    const __m256i expbias = _mm256_set1_epi64x(1023LL);
+    __m256i iexp = _mm256_cvtepi32_epi64(_mm256_cvtpd_epi32(x));
+
+    /* Biased exponent goes in bits 62-52 of the IEEE754 representation */
+    iexp = _mm256_slli_epi64(_mm256_add_epi64(iexp, expbias), 52);
+    return _mm256_castsi256_pd(iexp);
+}
+
+/*! \brief AVX2 implementation of gmx_simd_cvt_db2dib.
+ *
+ * Converts a double precision boolean (four 64-bit all-ones/all-zero masks)
+ * to the corresponding double-int boolean (four 32-bit masks).
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
+{
+    __m128i ia = _mm256_castsi256_si128(_mm256_castpd_si256(a));
+    __m128i ib = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
+
+    /* Since every mask is all-ones or all-zero, the saturating pack turns
+     * the two 32-bit halves of each 64-bit mask into two adjacent 16-bit
+     * values, i.e. one 32-bit mask per double element.
+     */
+    ia = _mm_packs_epi32(ia, ib);
+
+    return ia;
+}
+
+/*! \brief AVX2 implementation of gmx_simd_cvt_dib2db.
+ *
+ * Converts a double-int boolean (four 32-bit masks) to the corresponding
+ * double precision boolean (four 64-bit masks) by duplicating each 32-bit
+ * mask into both halves of a 64-bit element.
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db_avx2_256(gmx_simd_dibool_t ia)
+{
+    __m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(ia, ia));
+    __m128d hi = _mm_castsi128_pd(_mm_unpackhi_epi32(ia, ia));
+
+    return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
+}
+
+#endif /* GMX_SIMD_IMPL_X86_AVX2_256_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX_128_FMA_H
+#define GMX_SIMD_IMPL_X86_AVX_128_FMA_H
+
+#include <math.h>
+#include <immintrin.h>
+#include <x86intrin.h>
+
+/* x86 128-bit AVX with FMA SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for details
+ */
+
+/* Inherit parts of AVX_128_FMA from SSE4.1 */
+#include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h"
+/* Increment over SSE4.1 capabilities */
+#define GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
+
+/* Override some capability definitions for things added in AVX over SSE4.1 */
+#define GMX_SIMD_HAVE_FMA
+#define GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD4_HAVE_DOUBLE /* We can use 256-bit operations for this */
+
+/* SINGLE */
+/* The inherited SSE4.1 layer defines these operations under the gmx_simd_*_f
+ * naming convention (see gmx_simd_fraction_f below and the AVX2 header), so
+ * the FMA overrides must use the same _f suffix to take effect. They are also
+ * plain object-like macros: a function-like form that drops its arguments
+ * would expand to a bare intrinsic name and never apply them.
+ */
+#undef gmx_simd_fmadd_f
+#define gmx_simd_fmadd_f _mm_macc_ps
+#undef gmx_simd_fmsub_f
+#define gmx_simd_fmsub_f _mm_msub_ps
+#undef gmx_simd_fnmadd_f
+#define gmx_simd_fnmadd_f _mm_nmacc_ps
+#undef gmx_simd_fnmsub_f
+#define gmx_simd_fnmsub_f _mm_nmsub_ps
+#undef gmx_simd_fraction_f
+#define gmx_simd_fraction_f _mm_frcz_ps
+
+/* DOUBLE */
+#undef gmx_simd_fmadd_d
+#define gmx_simd_fmadd_d _mm_macc_pd
+#undef gmx_simd_fmsub_d
+#define gmx_simd_fmsub_d _mm_msub_pd
+#undef gmx_simd_fnmadd_d
+#define gmx_simd_fnmadd_d _mm_nmacc_pd
+#undef gmx_simd_fnmsub_d
+#define gmx_simd_fnmsub_d _mm_nmsub_pd
+#undef gmx_simd_fraction_d
+#define gmx_simd_fraction_d _mm_frcz_pd
+
+/* Even if the _main_ SIMD implementation for this architecture file corresponds
+ * to 128-bit AVX (since it will be faster), the 256-bit operations will always
+ * be available in AVX, so we can use them for double precision SIMD4!
+ */
+/* SIMD4 Double precision floating point */
+#define gmx_simd4_double_t __m256d
+#define gmx_simd4_load_d _mm256_load_pd
+#define gmx_simd4_load1_d _mm256_broadcast_sd
+#define gmx_simd4_set1_d _mm256_set1_pd
+#define gmx_simd4_store_d _mm256_store_pd
+#define gmx_simd4_loadu_d _mm256_loadu_pd
+#define gmx_simd4_storeu_d _mm256_storeu_pd
+#define gmx_simd4_setzero_d _mm256_setzero_pd
+#define gmx_simd4_add_d _mm256_add_pd
+#define gmx_simd4_sub_d _mm256_sub_pd
+#define gmx_simd4_mul_d _mm256_mul_pd
+#define gmx_simd4_fmadd_d _mm256_macc_pd
+#define gmx_simd4_fmsub_d _mm256_msub_pd
+#define gmx_simd4_fnmadd_d _mm256_nmacc_pd
+#define gmx_simd4_fnmsub_d _mm256_nmsub_pd
+#define gmx_simd4_and_d _mm256_and_pd
+#define gmx_simd4_andnot_d _mm256_andnot_pd
+#define gmx_simd4_or_d _mm256_or_pd
+#define gmx_simd4_xor_d _mm256_xor_pd
+#define gmx_simd4_rsqrt_d(x) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
+#define gmx_simd4_fabs_d(x) _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
+#define gmx_simd4_fneg_d(x) _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
+#define gmx_simd4_max_d _mm256_max_pd
+#define gmx_simd4_min_d _mm256_min_pd
+#define gmx_simd4_round_d(x) _mm256_round_pd(x, _MM_FROUND_NINT)
+#define gmx_simd4_trunc_d(x) _mm256_round_pd(x, _MM_FROUND_TRUNC)
+#define gmx_simd4_dotproduct3_d gmx_simd4_dotproduct3_d_avx_128_fma
+/* SIMD4 booleans corresponding to double */
+#define gmx_simd4_dbool_t __m256d
+#define gmx_simd4_cmpeq_d(a, b) _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
+#define gmx_simd4_cmplt_d(a, b) _mm256_cmp_pd(a, b, _CMP_LT_OQ)
+#define gmx_simd4_cmple_d(a, b) _mm256_cmp_pd(a, b, _CMP_LE_OQ)
+#define gmx_simd4_and_db _mm256_and_pd
+#define gmx_simd4_or_db _mm256_or_pd
+#define gmx_simd4_anytrue_db _mm256_movemask_pd
+#define gmx_simd4_blendzero_d _mm256_and_pd
+#define gmx_simd4_blendnotzero_d(a, sel) _mm256_andnot_pd(sel, a)
+#define gmx_simd4_blendv_d _mm256_blendv_pd
+#define gmx_simd4_reduce_d gmx_simd4_reduce_d_avx_128_fma
+/* SIMD4 float/double conversion */
+#define gmx_simd4_cvt_f2d _mm256_cvtps_pd
+#define gmx_simd4_cvt_d2f _mm256_cvtpd_ps
+
+/*! \brief AVX_128_FMA implementation of gmx_simd4_reduce_d.
+ *
+ * Returns the sum of all four elements of the 256-bit SIMD4 double variable.
+ */
+static gmx_inline double
+gmx_simd4_reduce_d_avx_128_fma(__m256d a)
+{
+    double f;
+    __m128d a0, a1;
+    /* Horizontal add: each lane now holds the sum of its two elements */
+    a = _mm256_hadd_pd(a, a);
+    a0 = _mm256_castpd256_pd128(a);
+    a1 = _mm256_extractf128_pd(a, 0x1);
+    /* Add the partial sums from the low and high 128-bit lanes */
+    a0 = _mm_add_sd(a0, a1);
+    _mm_store_sd(&f, a0);
+    return f;
+}
+
+/*! \brief AVX_128_FMA implementation of gmx_simd4_dotproduct3_d.
+ *
+ * Returns the dot product of the first three elements of the two SIMD4
+ * double variables; the fourth element is ignored.
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d_avx_128_fma(__m256d a, __m256d b)
+{
+    double d;
+    __m128d tmp1, tmp2;
+    /* Elementwise products {p0, p1, p2, p3} */
+    a = _mm256_mul_pd(a, b);
+    tmp1 = _mm256_castpd256_pd128(a);
+    tmp2 = _mm256_extractf128_pd(a, 0x1);
+
+    /* Low element of tmp1 becomes p0+p1, then p0+p1+p2 after adding tmp2;
+     * p3 only reaches the discarded upper element.
+     */
+    tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));
+    tmp1 = _mm_add_pd(tmp1, tmp2);
+    _mm_store_sd(&d, tmp1);
+    return d;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_AVX_128_FMA_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_AVX_256_H
+#define GMX_SIMD_IMPL_X86_AVX_256_H
+
+#include <math.h>
+#include <immintrin.h>
+
+/* It is cleaner to start the AVX implementation from scratch rather than
+ * first inheriting from SSE4.1, which in turn inherits from SSE2. However,
+ * the capabilities still form a superset.
+ */
+#define GMX_SIMD_X86_SSE2_OR_HIGHER
+#define GMX_SIMD_X86_SSE4_1_OR_HIGHER
+#define GMX_SIMD_X86_AVX_256_OR_HIGHER
+
+
+/* x86 256-bit AVX SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Capability definitions for 256-bit AVX - no inheritance from SSE */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_SIMD_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#undef GMX_SIMD_HAVE_FMA
+#undef GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT /* Emulated */
+#undef GMX_SIMD_HAVE_FINT32_LOGICAL /* AVX1 cannot do 256-bit int shifts */
+#undef GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX1 cannot do 256-bit int +,-,* */
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT /* Native, dint uses 128-bit SIMD */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#define GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH 8
+#define GMX_SIMD_DOUBLE_WIDTH 4
+#define GMX_SIMD_FINT32_WIDTH 8
+#define GMX_SIMD_DINT32_WIDTH 4
+#define GMX_SIMD_RSQRT_BITS 11
+#define GMX_SIMD_RCP_BITS 11
+
+/****************************************************
+ * SINGLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_float_t __m256
+#define gmx_simd_load_f _mm256_load_ps
+#define gmx_simd_load1_f _mm256_broadcast_ss
+#define gmx_simd_set1_f _mm256_set1_ps
+#define gmx_simd_store_f _mm256_store_ps
+#define gmx_simd_loadu_f _mm256_loadu_ps
+#define gmx_simd_storeu_f _mm256_storeu_ps
+#define gmx_simd_setzero_f _mm256_setzero_ps
+#define gmx_simd_add_f _mm256_add_ps
+#define gmx_simd_sub_f _mm256_sub_ps
+#define gmx_simd_mul_f _mm256_mul_ps
+#define gmx_simd_fmadd_f(a, b, c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define gmx_simd_fmsub_f(a, b, c) _mm256_sub_ps(_mm256_mul_ps(a, b), c)
+#define gmx_simd_fnmadd_f(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b))
+#define gmx_simd_fnmsub_f(a, b, c) _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
+#define gmx_simd_and_f _mm256_and_ps
+#define gmx_simd_andnot_f _mm256_andnot_ps
+#define gmx_simd_or_f _mm256_or_ps
+#define gmx_simd_xor_f _mm256_xor_ps
+#define gmx_simd_rsqrt_f _mm256_rsqrt_ps
+#define gmx_simd_rcp_f _mm256_rcp_ps
+#define gmx_simd_fabs_f(x) _mm256_andnot_ps(_mm256_set1_ps(-0.0), x)
+#define gmx_simd_fneg_f(x) _mm256_xor_ps(x, _mm256_set1_ps(-0.0))
+#define gmx_simd_max_f _mm256_max_ps
+#define gmx_simd_min_f _mm256_min_ps
+#define gmx_simd_round_f(x) _mm256_round_ps(x, _MM_FROUND_NINT)
+#define gmx_simd_trunc_f(x) _mm256_round_ps(x, _MM_FROUND_TRUNC)
+#define gmx_simd_fraction_f(x) _mm256_sub_ps(x, gmx_simd_trunc_f(x))
+#define gmx_simd_get_exponent_f gmx_simd_get_exponent_f_avx_256
+#define gmx_simd_get_mantissa_f gmx_simd_get_mantissa_f_avx_256
+#define gmx_simd_set_exponent_f gmx_simd_set_exponent_f_avx_256
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t __m256i
+#define gmx_simd_load_fi(m) _mm256_castps_si256(_mm256_load_ps((const float *)m))
+#define gmx_simd_set1_fi _mm256_set1_epi32
+#define gmx_simd_store_fi(m, x) _mm256_store_ps((float *)m, _mm256_castsi256_ps(x))
+#define gmx_simd_loadu_fi(m) _mm256_castps_si256(_mm256_loadu_ps((const float *)m))
+#define gmx_simd_storeu_fi(m, x) _mm256_storeu_ps((float *)m, _mm256_castsi256_ps(x))
+#define gmx_simd_setzero_fi _mm256_setzero_si256
+#define gmx_simd_cvt_f2i _mm256_cvtps_epi32
+#define gmx_simd_cvtt_f2i _mm256_cvttps_epi32
+#define gmx_simd_cvt_i2f _mm256_cvtepi32_ps
+#define gmx_simd_extract_fi(x, i) _mm_extract_epi32(_mm256_extractf128_si256(x, (i)>>2), (i)&0x3)
+/* Integer logical ops on gmx_simd_fint32_t */
+/* gmx_simd_add_fi not supported */
+/* gmx_simd_sub_fi not supported */
+/* gmx_simd_mul_fi not supported */
+/* gmx_simd_slli_fi not supported */
+/* gmx_simd_srli_fi not supported */
+/* gmx_simd_and_fi not supported */
+/* gmx_simd_andnot_fi not supported */
+/* gmx_simd_or_fi not supported */
+/* gmx_simd_xor_fi not supported */
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+/* gmx_simd_add_fi not supported */
+/* gmx_simd_sub_fi not supported */
+/* gmx_simd_mul_fi not supported */
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t __m256
+#define gmx_simd_cmpeq_f(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_f(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
+#define gmx_simd_cmple_f(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
+#define gmx_simd_and_fb _mm256_and_ps
+#define gmx_simd_or_fb _mm256_or_ps
+#define gmx_simd_anytrue_fb _mm256_movemask_ps
+#define gmx_simd_blendzero_f _mm256_and_ps
+#define gmx_simd_blendnotzero_f(a, sel) _mm256_andnot_ps(sel, a)
+#define gmx_simd_blendv_f _mm256_blendv_ps
+#define gmx_simd_reduce_f gmx_simd_reduce_f_avx_256
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t __m256i
+/* gmx_simd_cmpeq_fi not supported */
+/* gmx_simd_cmplt_fi not supported */
+/* gmx_simd_and_fib not supported */
+/* gmx_simd_or_fib not supported */
+/* gmx_simd_anytrue_fib not supported */
+/* gmx_simd_blendzero_fi not supported */
+/* gmx_simd_blendnotzero_fi not supported */
+/* gmx_simd_blendv_fi not supported */
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib _mm256_castps_si256
+#define gmx_simd_cvt_fib2fb _mm256_castsi256_ps
+
+/****************************************************
+ * DOUBLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_double_t __m256d
+#define gmx_simd_load_d _mm256_load_pd
+#define gmx_simd_load1_d _mm256_broadcast_sd
+#define gmx_simd_set1_d _mm256_set1_pd
+#define gmx_simd_store_d _mm256_store_pd
+#define gmx_simd_loadu_d _mm256_loadu_pd
+#define gmx_simd_storeu_d _mm256_storeu_pd
+#define gmx_simd_setzero_d _mm256_setzero_pd
+#define gmx_simd_add_d _mm256_add_pd
+#define gmx_simd_sub_d _mm256_sub_pd
+#define gmx_simd_mul_d _mm256_mul_pd
+/* AVX(1) has no FMA instructions: the fused ops are emulated with a
+ * separate multiply and add/sub, so they round twice. */
+#define gmx_simd_fmadd_d(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)
+#define gmx_simd_fmsub_d(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)
+#define gmx_simd_fnmadd_d(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
+#define gmx_simd_fnmsub_d(a, b, c) _mm256_sub_pd(_mm256_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
+#define gmx_simd_and_d _mm256_and_pd
+#define gmx_simd_andnot_d _mm256_andnot_pd
+#define gmx_simd_or_d _mm256_or_pd
+#define gmx_simd_xor_d _mm256_xor_pd
+/* rsqrt/rcp: low-accuracy approximations, computed by narrowing to single
+ * precision, using the SSE approximation, and widening back to double. */
+#define gmx_simd_rsqrt_d(x) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
+#define gmx_simd_rcp_d(x) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x)))
+#define gmx_simd_fabs_d(x) _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
+#define gmx_simd_fneg_d(x) _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
+#define gmx_simd_max_d _mm256_max_pd
+#define gmx_simd_min_d _mm256_min_pd
+#define gmx_simd_round_d(x) _mm256_round_pd(x, _MM_FROUND_NINT)
+#define gmx_simd_trunc_d(x) _mm256_round_pd(x, _MM_FROUND_TRUNC)
+#define gmx_simd_fraction_d(x) _mm256_sub_pd(x, gmx_simd_trunc_d(x))
+#define gmx_simd_get_exponent_d gmx_simd_get_exponent_d_avx_256
+#define gmx_simd_get_mantissa_d gmx_simd_get_mantissa_d_avx_256
+#define gmx_simd_set_exponent_d gmx_simd_set_exponent_d_avx_256
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+ * (one 32-bit int per double, so one 128-bit register holds all four) */
+#define gmx_simd_dint32_t __m128i
+#define gmx_simd_load_di(m) _mm_load_si128((const __m128i *)m)
+#define gmx_simd_set1_di _mm_set1_epi32
+#define gmx_simd_store_di(m, x) _mm_store_si128((__m128i *)m, x)
+#define gmx_simd_loadu_di(m) _mm_loadu_si128((const __m128i *)m)
+#define gmx_simd_storeu_di(m, x) _mm_storeu_si128((__m128i *)m, x)
+#define gmx_simd_setzero_di _mm_setzero_si128
+#define gmx_simd_cvt_d2i _mm256_cvtpd_epi32
+#define gmx_simd_cvtt_d2i _mm256_cvttpd_epi32
+#define gmx_simd_cvt_i2d _mm256_cvtepi32_pd
+#define gmx_simd_extract_di _mm_extract_epi32
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di _mm_slli_epi32
+#define gmx_simd_srli_di _mm_srli_epi32
+#define gmx_simd_and_di _mm_and_si128
+#define gmx_simd_andnot_di _mm_andnot_si128
+#define gmx_simd_or_di _mm_or_si128
+#define gmx_simd_xor_di _mm_xor_si128
+/* Integer arithmetic ops on integer datatype corresponding to double */
+#define gmx_simd_add_di _mm_add_epi32
+#define gmx_simd_sub_di _mm_sub_epi32
+#define gmx_simd_mul_di _mm_mullo_epi32
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t __m256d
+#define gmx_simd_cmpeq_d(a, b) _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
+#define gmx_simd_cmplt_d(a, b) _mm256_cmp_pd(a, b, _CMP_LT_OQ)
+#define gmx_simd_cmple_d(a, b) _mm256_cmp_pd(a, b, _CMP_LE_OQ)
+#define gmx_simd_and_db _mm256_and_pd
+#define gmx_simd_or_db _mm256_or_pd
+#define gmx_simd_anytrue_db _mm256_movemask_pd
+#define gmx_simd_blendzero_d _mm256_and_pd
+#define gmx_simd_blendnotzero_d(a, sel) _mm256_andnot_pd(sel, a)
+#define gmx_simd_blendv_d _mm256_blendv_pd
+#define gmx_simd_reduce_d gmx_simd_reduce_d_avx_256
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t __m128i
+#define gmx_simd_cmpeq_di _mm_cmpeq_epi32
+#define gmx_simd_cmplt_di _mm_cmplt_epi32
+#define gmx_simd_and_dib _mm_and_si128
+#define gmx_simd_or_dib _mm_or_si128
+#define gmx_simd_anytrue_dib _mm_movemask_epi8
+#define gmx_simd_blendzero_di _mm_and_si128
+#define gmx_simd_blendnotzero_di(a, sel) _mm_andnot_si128(sel, a)
+#define gmx_simd_blendv_di _mm_blendv_epi8
+/* Conversions between different booleans */
+#define gmx_simd_cvt_db2dib gmx_simd_cvt_db2dib_avx_256
+#define gmx_simd_cvt_dib2db gmx_simd_cvt_dib2db_avx_256
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd gmx_simd_cvt_f2dd_avx_256
+#define gmx_simd_cvt_dd2f gmx_simd_cvt_dd2f_avx_256
+/****************************************************
+ * SINGLE PRECISION SIMD4 IMPLEMENTATION *
+ ****************************************************/
+/* SIMD4 is always exactly 4 elements wide, so single precision uses the
+ * 128-bit SSE registers/intrinsics rather than the full 256-bit width. */
+#define gmx_simd4_float_t __m128
+#define gmx_simd4_load_f _mm_load_ps
+#define gmx_simd4_load1_f _mm_broadcast_ss
+#define gmx_simd4_set1_f _mm_set1_ps
+#define gmx_simd4_store_f _mm_store_ps
+#define gmx_simd4_loadu_f _mm_loadu_ps
+#define gmx_simd4_storeu_f _mm_storeu_ps
+#define gmx_simd4_setzero_f _mm_setzero_ps
+#define gmx_simd4_add_f _mm_add_ps
+#define gmx_simd4_sub_f _mm_sub_ps
+#define gmx_simd4_mul_f _mm_mul_ps
+/* No FMA hardware: fused ops are emulated with separate mul + add/sub */
+#define gmx_simd4_fmadd_f(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd4_fmsub_f(a, b, c) _mm_sub_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd4_fnmadd_f(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd4_fnmsub_f(a, b, c) _mm_sub_ps(_mm_setzero_ps(), gmx_simd4_fmadd_f(a, b, c))
+#define gmx_simd4_and_f _mm_and_ps
+#define gmx_simd4_andnot_f _mm_andnot_ps
+#define gmx_simd4_or_f _mm_or_ps
+#define gmx_simd4_xor_f _mm_xor_ps
+#define gmx_simd4_rsqrt_f _mm_rsqrt_ps
+#define gmx_simd4_fabs_f(x) _mm_andnot_ps(_mm_set1_ps(-0.0), x)
+#define gmx_simd4_fneg_f(x) _mm_xor_ps(x, _mm_set1_ps(-0.0))
+#define gmx_simd4_max_f _mm_max_ps
+#define gmx_simd4_min_f _mm_min_ps
+#define gmx_simd4_round_f(x) _mm_round_ps(x, _MM_FROUND_NINT)
+#define gmx_simd4_trunc_f(x) _mm_round_ps(x, _MM_FROUND_TRUNC)
+#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_avx_256
+#define gmx_simd4_fbool_t __m128
+#define gmx_simd4_cmpeq_f _mm_cmpeq_ps
+#define gmx_simd4_cmplt_f _mm_cmplt_ps
+#define gmx_simd4_cmple_f _mm_cmple_ps
+#define gmx_simd4_and_fb _mm_and_ps
+#define gmx_simd4_or_fb _mm_or_ps
+#define gmx_simd4_anytrue_fb _mm_movemask_ps
+#define gmx_simd4_blendzero_f _mm_and_ps
+#define gmx_simd4_blendnotzero_f(a, sel) _mm_andnot_ps(sel, a)
+#define gmx_simd4_blendv_f _mm_blendv_ps
+#define gmx_simd4_reduce_f gmx_simd4_reduce_f_avx_256
+
+/****************************************************
+ * DOUBLE PRECISION SIMD4 IMPLEMENTATION *
+ ****************************************************/
+/* AVX double-precision SIMD is already exactly 4 elements wide (__m256d),
+ * so SIMD4 double simply aliases the full-width double macros above. */
+#define gmx_simd4_double_t gmx_simd_double_t
+#define gmx_simd4_load_d gmx_simd_load_d
+#define gmx_simd4_load1_d gmx_simd_load1_d
+#define gmx_simd4_set1_d gmx_simd_set1_d
+#define gmx_simd4_store_d gmx_simd_store_d
+#define gmx_simd4_loadu_d gmx_simd_loadu_d
+#define gmx_simd4_storeu_d gmx_simd_storeu_d
+#define gmx_simd4_setzero_d gmx_simd_setzero_d
+#define gmx_simd4_add_d gmx_simd_add_d
+#define gmx_simd4_sub_d gmx_simd_sub_d
+#define gmx_simd4_mul_d gmx_simd_mul_d
+#define gmx_simd4_fmadd_d gmx_simd_fmadd_d
+#define gmx_simd4_fmsub_d gmx_simd_fmsub_d
+#define gmx_simd4_fnmadd_d gmx_simd_fnmadd_d
+#define gmx_simd4_fnmsub_d gmx_simd_fnmsub_d
+#define gmx_simd4_and_d gmx_simd_and_d
+#define gmx_simd4_andnot_d gmx_simd_andnot_d
+#define gmx_simd4_or_d gmx_simd_or_d
+#define gmx_simd4_xor_d gmx_simd_xor_d
+#define gmx_simd4_rsqrt_d gmx_simd_rsqrt_d
+#define gmx_simd4_fabs_d gmx_simd_fabs_d
+#define gmx_simd4_fneg_d gmx_simd_fneg_d
+#define gmx_simd4_max_d gmx_simd_max_d
+#define gmx_simd4_min_d gmx_simd_min_d
+#define gmx_simd4_round_d gmx_simd_round_d
+#define gmx_simd4_trunc_d gmx_simd_trunc_d
+/* dotproduct3 has no full-width counterpart; it gets its own helper */
+#define gmx_simd4_dotproduct3_d gmx_simd4_dotproduct3_d_avx_256
+#define gmx_simd4_dbool_t gmx_simd_dbool_t
+#define gmx_simd4_cmpeq_d gmx_simd_cmpeq_d
+#define gmx_simd4_cmplt_d gmx_simd_cmplt_d
+#define gmx_simd4_cmple_d gmx_simd_cmple_d
+#define gmx_simd4_and_db gmx_simd_and_db
+#define gmx_simd4_or_db gmx_simd_or_db
+#define gmx_simd4_anytrue_db gmx_simd_anytrue_db
+#define gmx_simd4_blendzero_d gmx_simd_blendzero_d
+#define gmx_simd4_blendnotzero_d gmx_simd_blendnotzero_d
+#define gmx_simd4_blendv_d gmx_simd_blendv_d
+#define gmx_simd4_reduce_d gmx_simd_reduce_d
+/* SIMD4 float/double conversion */
+#define gmx_simd4_cvt_f2d _mm256_cvtps_pd
+#define gmx_simd4_cvt_d2f _mm256_cvtpd_ps
+
+/*********************************************************
+ * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased IEEE-754 exponent of each element, converted to a
+ * float vector. AVX(1) has no 256-bit integer shift/subtract, so the
+ * integer work is done on the two 128-bit halves and then recombined.
+ */
+static gmx_inline __m256
+gmx_simd_get_exponent_f_avx_256(__m256 x)
+{
+ const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));  /* exponent field bits */
+ const __m128i expbias = _mm_set1_epi32(127);
+ __m256i iexp256;
+ __m128i iexp128a, iexp128b;
+
+ iexp256 = _mm256_castps_si256(_mm256_and_ps(x, expmask));
+ iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
+ iexp128a = _mm256_castsi256_si128(iexp256);
+ iexp128a = _mm_srli_epi32(iexp128a, 23);  /* move the 8 exponent bits down */
+ iexp128b = _mm_srli_epi32(iexp128b, 23);
+ iexp128a = _mm_sub_epi32(iexp128a, expbias);
+ iexp128b = _mm_sub_epi32(iexp128b, expbias);
+ iexp256 = _mm256_castsi128_si256(iexp128a);
+ iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
+ return _mm256_cvtepi32_ps(iexp256);
+}
+
+/* Return the mantissa of each element of x, normalized to [1.0, 2.0):
+ * mask out the sign/exponent bits and OR in the bit pattern of 1.0f.
+ */
+static gmx_inline __m256
+gmx_simd_get_mantissa_f_avx_256(__m256 x)
+{
+ const __m256 mant_bits = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
+
+ return _mm256_or_ps(_mm256_and_ps(x, mant_bits), _mm256_set1_ps(1.0));
+}
+
+/* Build 2^x for each element (x rounded to nearest int) by writing the
+ * biased exponent field. The integer add/shift are done per 128-bit half,
+ * since AVX(1) lacks 256-bit integer arithmetic.
+ */
+static gmx_inline __m256
+gmx_simd_set_exponent_f_avx_256(__m256 x)
+{
+ const __m128i expbias = _mm_set1_epi32(127);
+ __m256i iexp256;
+ __m128i iexp128a, iexp128b;
+
+ iexp256 = _mm256_cvtps_epi32(x);
+ iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
+ iexp128a = _mm256_castsi256_si128(iexp256);
+ iexp128a = _mm_slli_epi32(_mm_add_epi32(iexp128a, expbias), 23);  /* bias, then shift into bits 23-30 */
+ iexp128b = _mm_slli_epi32(_mm_add_epi32(iexp128b, expbias), 23);
+ iexp256 = _mm256_castsi128_si256(iexp128a);
+ iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
+ return _mm256_castsi256_ps(iexp256);
+}
+
+/* Horizontal sum of all 8 float elements; returns the scalar result. */
+static gmx_inline float
+gmx_simd_reduce_f_avx_256(__m256 a)
+{
+ __m128 lo, hi;
+ float res;
+
+ /* Two hadds replicate each 128-bit lane's sum in every slot of that
+ * lane; add the two lanes and extract the low element. */
+ a = _mm256_hadd_ps(a, a);
+ a = _mm256_hadd_ps(a, a);
+ lo = _mm256_castps256_ps128(a);
+ hi = _mm256_extractf128_ps(a, 0x1);
+ lo = _mm_add_ss(lo, hi);
+ _mm_store_ss(&res, lo);
+ return res;
+}
+
+/*********************************************************
+ * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ *********************************************************/
+/* Return the unbiased IEEE-754 exponent of each double element, converted
+ * to a double vector. Shifts are done per 128-bit half; the two shuffles
+ * place the four 32-bit exponents in disjoint lanes (the odd lanes are
+ * zero after the 64-bit shift) so a single OR merges them.
+ */
+static gmx_inline __m256d
+gmx_simd_get_exponent_d_avx_256(__m256d x)
+{
+ const __m256d expmask = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
+ const __m128i expbias = _mm_set1_epi32(1023);
+ __m256i iexp256;
+ __m128i iexp128a, iexp128b;
+
+ iexp256 = _mm256_castpd_si256(_mm256_and_pd(x, expmask));
+ iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
+ iexp128a = _mm256_castsi256_si128(iexp256);
+ iexp128a = _mm_srli_epi64(iexp128a, 52);  /* 11-bit exponent to low bits of each 64-bit lane */
+ iexp128b = _mm_srli_epi64(iexp128b, 52);
+ iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));  /* exponents 0,1 -> lanes 0,1 */
+ iexp128b = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));  /* exponents 2,3 -> lanes 2,3 */
+ iexp128a = _mm_or_si128(iexp128a, iexp128b);
+ iexp128a = _mm_sub_epi32(iexp128a, expbias);
+ return _mm256_cvtepi32_pd(iexp128a);
+}
+
+/* Return the mantissa of each double element of x, normalized to
+ * [1.0, 2.0): mask out sign/exponent and OR in the bit pattern of 1.0.
+ */
+static gmx_inline __m256d
+gmx_simd_get_mantissa_d_avx_256(__m256d x)
+{
+ const __m256d mant_bits = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
+
+ return _mm256_or_pd(_mm256_and_pd(x, mant_bits), _mm256_set1_pd(1.0));
+}
+
+/* Build 2^x for each element (x rounded to nearest integer) by writing the
+ * biased 11-bit exponent field of each double. The shuffles duplicate the
+ * four 32-bit integers into four 64-bit lanes before the 64-bit shift
+ * moves them into the exponent position.
+ */
+static gmx_inline __m256d
+gmx_simd_set_exponent_d_avx_256(__m256d x)
+{
+ const __m128i expbias = _mm_set1_epi32(1023);
+ __m128i iexp128a, iexp128b;
+
+ iexp128a = _mm256_cvtpd_epi32(x);
+ iexp128a = _mm_add_epi32(iexp128a, expbias);
+ iexp128b = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(3, 3, 2, 2));  /* high two values */
+ iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 0, 0));  /* low two values */
+ iexp128b = _mm_slli_epi64(iexp128b, 52);
+ iexp128a = _mm_slli_epi64(iexp128a, 52);
+ return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
+}
+
+/* Horizontal sum of all 4 double elements; returns the scalar result. */
+static gmx_inline double
+gmx_simd_reduce_d_avx_256(__m256d a)
+{
+ __m128d lo, hi;
+ double res;
+
+ a = _mm256_hadd_pd(a, a);  /* each 128-bit lane now holds its pair sum */
+ lo = _mm256_castpd256_pd128(a);
+ hi = _mm256_extractf128_pd(a, 0x1);
+ lo = _mm_add_sd(lo, hi);
+ _mm_store_sd(&res, lo);
+ return res;
+}
+
+/* Convert a double boolean (4 x 64-bit masks in __m256d) to a double-int
+ * boolean (4 x 32-bit masks in __m128i). Each 64-bit mask is collapsed by
+ * picking its low 32 bits; the blend merges the two halves.
+ */
+static gmx_inline gmx_simd_dibool_t
+gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
+{
+ __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
+ __m128i a0 = _mm256_castsi256_si128(_mm256_castpd_si256(a));
+ a0 = _mm_shuffle_epi32(a0, _MM_SHUFFLE(2, 0, 2, 0));
+ a1 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(2, 0, 2, 0));
+ return _mm_blend_epi16(a0, a1, 0xF0);
+}
+
+/* Convert a double-int boolean (4 x 32-bit masks) back to a double boolean
+ * (4 x 64-bit masks) by duplicating every 32-bit mask into a 64-bit lane.
+ */
+static gmx_inline gmx_simd_dbool_t
+gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
+{
+ __m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
+ __m128i a0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));
+ return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
+}
+
+/* Convert 8 floats into two 4-wide double vectors: the low 4 elements go
+ * to *d0, the high 4 elements to *d1.
+ */
+static gmx_inline void
+gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
+{
+ __m128 lo = _mm256_castps256_ps128(f);
+ __m128 hi = _mm256_extractf128_ps(f, 0x1);
+
+ *d0 = _mm256_cvtps_pd(lo);
+ *d1 = _mm256_cvtps_pd(hi);
+}
+
+/* Convert two 4-wide double vectors into one 8-wide float vector
+ * (d0 -> low half, d1 -> high half).
+ */
+static gmx_inline __m256
+gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(d0)),
+ _mm256_cvtpd_ps(d1), 0x1);
+}
+
+/* SIMD4 reduce helper: horizontal sum of the 4 float elements. */
+static gmx_inline float
+gmx_simd4_reduce_f_avx_256(__m128 a)
+{
+ float res;
+
+ a = _mm_hadd_ps(a, a);
+ a = _mm_hadd_ps(a, a);  /* total now replicated in every slot */
+ _mm_store_ss(&res, a);
+ return res;
+}
+
+/* SIMD4 Dotproduct helper function */
+/* Returns the dot product of the first three elements of a and b; element
+ * 3 of both inputs is ignored (only product lanes 0-2 reach lane 0).
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
+{
+ float f;
+ __m128 c;
+ a = _mm_mul_ps(a, b);
+ c = _mm_add_ps(a, _mm_permute_ps(a, _MM_SHUFFLE(0, 3, 2, 1)));  /* lane0: p0+p1 */
+ c = _mm_add_ps(c, _mm_permute_ps(a, _MM_SHUFFLE(1, 0, 3, 2)));  /* lane0: +p2 */
+ _mm_store_ss(&f, c);
+ return f;
+}
+
+/* Returns the dot product of the first three double elements of a and b;
+ * element 3 of both inputs is ignored. Products 0 and 1 are summed in the
+ * low 128-bit half, then product 2 is added from the high half.
+ */
+static gmx_inline double
+gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
+{
+ double d;
+ __m128d tmp1, tmp2;
+ a = _mm256_mul_pd(a, b);
+ tmp1 = _mm256_castpd256_pd128(a);   /* products 0,1 */
+ tmp2 = _mm256_extractf128_pd(a, 0x1);  /* products 2,3 */
+
+ tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));  /* lane0: p0+p1 */
+ tmp1 = _mm_add_pd(tmp1, tmp2);  /* lane0: +p2 (lane1 result unused) */
+ _mm_store_sd(&d, tmp1);
+ return d;
+}
+
+/* Check whether SIMD floating-point operations have overflowed, and clear
+ * the flag. Returns 1 if the MXCSR overflow flag was set, 0 otherwise.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+ int sse_overflow = 0;
+ int csr = _mm_getcsr();
+
+ /* The overflow flag is bit 3 in the MXCSR register */
+ if (csr & 0x0008)
+ {
+ sse_overflow = 1;
+ /* Clear the overflow bit and write the control/status word back */
+ _mm_setcsr(csr & 0xFFF7);
+ }
+ return sse_overflow;
+}
+
+
+#endif /* GMX_SIMD_IMPL_X86_AVX_256_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_SSE2_H
+#define GMX_SIMD_IMPL_X86_SSE2_H
+
+#include <math.h>
+#include <emmintrin.h>
+
+/* Set capabilities that can be inherited */
+#define GMX_SIMD_X86_SSE2_OR_HIGHER
+
+/* x86 SSE2 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Capability definitions for SSE2 */
+#define GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+/* No FMA or fraction instruction in SSE2 (fraction is emulated below) */
+#undef GMX_SIMD_HAVE_FMA
+#undef GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT /* No SSE2 instruction, but use shifts */
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT /* No SSE2 instruction, but use shifts */
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+/* SSE2 double is only 2-wide, so there is no 4-wide double SIMD4 */
+#undef GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH 4
+#define GMX_SIMD_DOUBLE_WIDTH 2
+#define GMX_SIMD_FINT32_WIDTH 4
+#define GMX_SIMD_DINT32_WIDTH 2
+#define GMX_SIMD_RSQRT_BITS 11
+#define GMX_SIMD_RCP_BITS 11
+
+/****************************************************
+ * SINGLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_float_t __m128
+#define gmx_simd_load_f _mm_load_ps
+#define gmx_simd_load1_f _mm_load1_ps
+#define gmx_simd_set1_f _mm_set1_ps
+#define gmx_simd_store_f _mm_store_ps
+#define gmx_simd_loadu_f _mm_loadu_ps
+#define gmx_simd_storeu_f _mm_storeu_ps
+#define gmx_simd_setzero_f _mm_setzero_ps
+#define gmx_simd_add_f _mm_add_ps
+#define gmx_simd_sub_f _mm_sub_ps
+#define gmx_simd_mul_f _mm_mul_ps
+/* SSE2 has no FMA: the fused ops are emulated with separate mul + add/sub */
+#define gmx_simd_fmadd_f(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd_fmsub_f(a, b, c) _mm_sub_ps(_mm_mul_ps(a, b), c)
+#define gmx_simd_fnmadd_f(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
+#define gmx_simd_fnmsub_f(a, b, c) _mm_sub_ps(_mm_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
+#define gmx_simd_and_f _mm_and_ps
+#define gmx_simd_andnot_f _mm_andnot_ps
+#define gmx_simd_or_f _mm_or_ps
+#define gmx_simd_xor_f _mm_xor_ps
+#define gmx_simd_rsqrt_f _mm_rsqrt_ps
+#define gmx_simd_rcp_f _mm_rcp_ps
+#define gmx_simd_fabs_f(x) _mm_andnot_ps(_mm_set1_ps(-0.0), x)
+#define gmx_simd_fneg_f(x) _mm_xor_ps(x, _mm_set1_ps(-0.0))
+#define gmx_simd_max_f _mm_max_ps
+#define gmx_simd_min_f _mm_min_ps
+/* No _mm_round_ps in SSE2: round/trunc go through int32 conversion.
+ * NOTE(review): assumes values fit in int32 range - confirm callers. */
+#define gmx_simd_round_f(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
+#define gmx_simd_trunc_f(x) _mm_cvtepi32_ps(_mm_cvttps_epi32(x))
+#define gmx_simd_fraction_f(x) _mm_sub_ps(x, gmx_simd_trunc_f(x))
+#define gmx_simd_get_exponent_f gmx_simd_get_exponent_f_sse2
+#define gmx_simd_get_mantissa_f gmx_simd_get_mantissa_f_sse2
+#define gmx_simd_set_exponent_f gmx_simd_set_exponent_f_sse2
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t __m128i
+#define gmx_simd_load_fi(m) _mm_load_si128((const __m128i *)m)
+#define gmx_simd_set1_fi _mm_set1_epi32
+#define gmx_simd_store_fi(m, x) _mm_store_si128((__m128i *)m, x)
+#define gmx_simd_loadu_fi(m) _mm_loadu_si128((const __m128i *)m)
+#define gmx_simd_storeu_fi(m, x) _mm_storeu_si128((__m128i *)m, x)
+#define gmx_simd_setzero_fi _mm_setzero_si128
+#define gmx_simd_cvt_f2i _mm_cvtps_epi32
+#define gmx_simd_cvtt_f2i _mm_cvttps_epi32
+#define gmx_simd_cvt_i2f _mm_cvtepi32_ps
+/* extract via byte shift: i must be a compile-time constant */
+#define gmx_simd_extract_fi(x, i) _mm_cvtsi128_si32(_mm_srli_si128((x), sizeof(int) * (i)))
+/* Integer logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi _mm_slli_epi32
+#define gmx_simd_srli_fi _mm_srli_epi32
+#define gmx_simd_and_fi _mm_and_si128
+#define gmx_simd_andnot_fi _mm_andnot_si128
+#define gmx_simd_or_fi _mm_or_si128
+#define gmx_simd_xor_fi _mm_xor_si128
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi _mm_add_epi32
+#define gmx_simd_sub_fi _mm_sub_epi32
+#define gmx_simd_mul_fi gmx_simd_mul_fi_sse2
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t __m128
+#define gmx_simd_cmpeq_f _mm_cmpeq_ps
+#define gmx_simd_cmplt_f _mm_cmplt_ps
+#define gmx_simd_cmple_f _mm_cmple_ps
+#define gmx_simd_and_fb _mm_and_ps
+#define gmx_simd_or_fb _mm_or_ps
+#define gmx_simd_anytrue_fb _mm_movemask_ps
+#define gmx_simd_blendzero_f _mm_and_ps
+#define gmx_simd_blendnotzero_f(a, sel) _mm_andnot_ps(sel, a)
+/* SSE2 lacks a blend instruction: blendv is emulated with and/andnot/or */
+#define gmx_simd_blendv_f(a, b, s) _mm_or_ps(_mm_andnot_ps(s, a), _mm_and_ps(s, b))
+#define gmx_simd_reduce_f(a) gmx_simd_reduce_f_sse2(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t __m128i
+#define gmx_simd_cmpeq_fi _mm_cmpeq_epi32
+#define gmx_simd_cmplt_fi _mm_cmplt_epi32
+#define gmx_simd_and_fib _mm_and_si128
+#define gmx_simd_or_fib _mm_or_si128
+#define gmx_simd_anytrue_fib _mm_movemask_epi8
+#define gmx_simd_blendzero_fi _mm_and_si128
+#define gmx_simd_blendnotzero_fi(a, sel) _mm_andnot_si128(sel, a)
+#define gmx_simd_blendv_fi(a, b, s) _mm_or_si128(_mm_andnot_si128(s, a), _mm_and_si128(s, b))
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib _mm_castps_si128
+#define gmx_simd_cvt_fib2fb _mm_castsi128_ps
+
+/****************************************************
+ * DOUBLE PRECISION SIMD IMPLEMENTATION *
+ ****************************************************/
+#define gmx_simd_double_t __m128d
+#define gmx_simd_load_d _mm_load_pd
+#define gmx_simd_load1_d _mm_load1_pd
+#define gmx_simd_set1_d _mm_set1_pd
+#define gmx_simd_store_d _mm_store_pd
+#define gmx_simd_loadu_d _mm_loadu_pd
+#define gmx_simd_storeu_d _mm_storeu_pd
+#define gmx_simd_setzero_d _mm_setzero_pd
+#define gmx_simd_add_d _mm_add_pd
+#define gmx_simd_sub_d _mm_sub_pd
+#define gmx_simd_mul_d _mm_mul_pd
+/* No FMA in SSE2: fused ops are emulated with separate mul + add/sub */
+#define gmx_simd_fmadd_d(a, b, c) _mm_add_pd(_mm_mul_pd(a, b), c)
+#define gmx_simd_fmsub_d(a, b, c) _mm_sub_pd(_mm_mul_pd(a, b), c)
+#define gmx_simd_fnmadd_d(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b))
+#define gmx_simd_fnmsub_d(a, b, c) _mm_sub_pd(_mm_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
+#define gmx_simd_and_d _mm_and_pd
+#define gmx_simd_andnot_d _mm_andnot_pd
+#define gmx_simd_or_d _mm_or_pd
+#define gmx_simd_xor_d _mm_xor_pd
+/* rsqrt/rcp: single-precision approximations widened back to double */
+#define gmx_simd_rsqrt_d(x) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)))
+/* Don't use FMA for sqrt N-R iterations - this saves 1 instruction without FMA hardware */
+#define gmx_simd_rcp_d(x) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x)))
+#define gmx_simd_fabs_d(x) _mm_andnot_pd(_mm_set1_pd(-0.0), x)
+#define gmx_simd_fneg_d(x) _mm_xor_pd(x, _mm_set1_pd(-0.0))
+#define gmx_simd_max_d _mm_max_pd
+#define gmx_simd_min_d _mm_min_pd
+#define gmx_simd_round_d(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
+#define gmx_simd_trunc_d(x) _mm_cvtepi32_pd(_mm_cvttpd_epi32(x))
+#define gmx_simd_fraction_d(x) _mm_sub_pd(x, gmx_simd_trunc_d(x))
+#define gmx_simd_get_exponent_d gmx_simd_get_exponent_d_sse2
+#define gmx_simd_get_mantissa_d gmx_simd_get_mantissa_d_sse2
+#define gmx_simd_set_exponent_d gmx_simd_set_exponent_d_sse2
+/* integer datatype corresponding to double: gmx_simd_dint32_t
+ * (only 2 values are significant, so 64-bit loads/stores suffice) */
+#define gmx_simd_dint32_t __m128i
+#define gmx_simd_load_di(m) _mm_loadl_epi64((const __m128i *)m)
+#define gmx_simd_set1_di _mm_set1_epi32
+#define gmx_simd_store_di(m, x) _mm_storel_epi64((__m128i *)m, x)
+#define gmx_simd_loadu_di(m) _mm_loadl_epi64((const __m128i *)m)
+#define gmx_simd_storeu_di(m, x) _mm_storel_epi64((__m128i *)m, x)
+#define gmx_simd_setzero_di _mm_setzero_si128
+#define gmx_simd_cvt_d2i _mm_cvtpd_epi32
+#define gmx_simd_cvtt_d2i _mm_cvttpd_epi32
+#define gmx_simd_cvt_i2d _mm_cvtepi32_pd
+/* extract via byte shift: i must be a compile-time constant */
+#define gmx_simd_extract_di(x, i) _mm_cvtsi128_si32(_mm_srli_si128((x), sizeof(int) * (i)))
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di _mm_slli_epi32
+#define gmx_simd_srli_di _mm_srli_epi32
+#define gmx_simd_and_di _mm_and_si128
+#define gmx_simd_andnot_di _mm_andnot_si128
+#define gmx_simd_or_di _mm_or_si128
+#define gmx_simd_xor_di _mm_xor_si128
+/* Integer arithmetic ops on integer datatype corresponding to double */
+#define gmx_simd_add_di _mm_add_epi32
+#define gmx_simd_sub_di _mm_sub_epi32
+#define gmx_simd_mul_di gmx_simd_mul_di_sse2
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t __m128d
+#define gmx_simd_cmpeq_d _mm_cmpeq_pd
+#define gmx_simd_cmplt_d _mm_cmplt_pd
+#define gmx_simd_cmple_d _mm_cmple_pd
+#define gmx_simd_and_db _mm_and_pd
+#define gmx_simd_or_db _mm_or_pd
+#define gmx_simd_anytrue_db _mm_movemask_pd
+#define gmx_simd_blendzero_d _mm_and_pd
+#define gmx_simd_blendnotzero_d(a, sel) _mm_andnot_pd(sel, a)
+#define gmx_simd_blendv_d(a, b, sel) _mm_or_pd(_mm_andnot_pd(sel, a), _mm_and_pd(sel, b))
+#define gmx_simd_reduce_d(a) gmx_simd_reduce_d_sse2(a)
+
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t __m128i
+#define gmx_simd_cmpeq_di _mm_cmpeq_epi32
+#define gmx_simd_cmplt_di _mm_cmplt_epi32
+#define gmx_simd_and_dib _mm_and_si128
+#define gmx_simd_or_dib _mm_or_si128
+/* Replicate the two significant 32-bit lanes so movemask sees them */
+#define gmx_simd_anytrue_dib(x) _mm_movemask_epi8(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 1, 0)))
+#define gmx_simd_blendzero_di _mm_and_si128
+#define gmx_simd_blendnotzero_di(a, sel) _mm_andnot_si128(sel, a)
+#define gmx_simd_blendv_di(a, b, sel) _mm_or_si128(_mm_andnot_si128(sel, a), _mm_and_si128(sel, b))
+/* Pack/duplicate the 64-bit boolean lanes to/from 32-bit lanes */
+#define gmx_simd_cvt_db2dib(x) _mm_shuffle_epi32(_mm_castpd_si128(x), _MM_SHUFFLE(2, 0, 2, 0))
+#define gmx_simd_cvt_dib2db(x) _mm_castsi128_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 0, 0)))
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd(f, d0, d1) { *d0 = _mm_cvtps_pd(f); *d1 = _mm_cvtps_pd(_mm_movehl_ps(f, f)); }
+#define gmx_simd_cvt_dd2f(d0, d1) _mm_movelh_ps(_mm_cvtpd_ps(d0), _mm_cvtpd_ps(d1))
+
+
+/****************************************************
+ * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+/* Return the unbiased IEEE-754 exponent of each element as a float vector:
+ * mask out the exponent field, shift it down, and subtract the bias (127).
+ */
+static gmx_inline __m128
+gmx_simd_get_exponent_f_sse2(__m128 x)
+{
+ const __m128 expmask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
+ const __m128i expbias = _mm_set1_epi32(127);
+ __m128i iexp;
+
+ iexp = _mm_castps_si128(_mm_and_ps(x, expmask));
+ iexp = _mm_sub_epi32(_mm_srli_epi32(iexp, 23), expbias);
+ return _mm_cvtepi32_ps(iexp);
+}
+
+/* Return the mantissa of each element of x, normalized to [1.0, 2.0):
+ * mask out the sign/exponent bits and OR in the bit pattern of 1.0f.
+ */
+static gmx_inline __m128
+gmx_simd_get_mantissa_f_sse2(__m128 x)
+{
+ const __m128 mant_bits = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
+
+ return _mm_or_ps(_mm_and_ps(x, mant_bits), _mm_set1_ps(1.0f));
+}
+
+/* Build 2^x for each element (x rounded to nearest int) by writing the
+ * biased exponent field.
+ */
+static gmx_inline __m128
+gmx_simd_set_exponent_f_sse2(__m128 x)
+{
+ __m128i biased = _mm_add_epi32(_mm_cvtps_epi32(x), _mm_set1_epi32(127));
+
+ /* Shift the biased exponent into bits 23-30 of each element */
+ return _mm_castsi128_ps(_mm_slli_epi32(biased, 23));
+}
+
+/* Elementwise 32-bit integer multiply. SSE2 only provides _mm_mul_epu32,
+ * which multiplies the even lanes to 64-bit results, so the odd lanes are
+ * shifted down, both pairs multiplied, and the low 32-bit halves
+ * interleaved back into place.
+ */
+static gmx_inline __m128i
+gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
+{
+ __m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
+ __m128i b1 = _mm_srli_si128(b, 4); /* - b[3] b[2] b[1] */
+ __m128i c = _mm_mul_epu32(a, b);
+ __m128i c1 = _mm_mul_epu32(a1, b1);
+
+ c = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[2]*b[2] a[0]*b[0] */
+ c1 = _mm_shuffle_epi32(c1, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[3]*b[3] a[1]*b[1] */
+
+ return _mm_unpacklo_epi32(c, c1);
+}
+
+/* Horizontal sum of the 4 float elements of a; returns the scalar sum. */
+static gmx_inline float
+gmx_simd_reduce_f_sse2(__m128 a)
+{
+ __m128 sum;
+ float res;
+
+ /* Fold high pair onto low pair, then lane 1 onto lane 0 */
+ sum = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
+ sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 3, 2, 1)));
+ _mm_store_ss(&res, sum);
+ return res;
+}
+
+/****************************************************
+ * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+/* Return the unbiased IEEE-754 exponent of each double element as a double
+ * vector: mask the 11-bit exponent field, shift it down with a 64-bit
+ * shift, subtract the bias (the empty odd lanes also get biased, but they
+ * are discarded), and compact lanes 0,2 into 0,1 for the conversion.
+ */
+static gmx_inline __m128d
+gmx_simd_get_exponent_d_sse2(__m128d x)
+{
+ /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
+ const __m128d expmask = _mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
+ const __m128i expbias = _mm_set1_epi32(1023);
+ __m128i iexp;
+
+ iexp = _mm_castpd_si128(_mm_and_pd(x, expmask));
+ iexp = _mm_sub_epi32(_mm_srli_epi64(iexp, 52), expbias);
+ iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0) );
+ return _mm_cvtepi32_pd(iexp);
+}
+
+/* Return the mantissa of each double element of x, normalized to
+ * [1.0, 2.0): mask out sign/exponent and OR in the bit pattern of 1.0.
+ */
+static gmx_inline __m128d
+gmx_simd_get_mantissa_d_sse2(__m128d x)
+{
+ /* Avoid _mm_set1_epi64x(): on MSVC it is only available in 64-bit builds */
+ const __m128d mant_bits = _mm_castsi128_pd( _mm_set_epi32(0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF) );
+
+ return _mm_or_pd(_mm_and_pd(x, mant_bits), _mm_set1_pd(1.0));
+}
+
+/* Build 2^x for each element (x rounded to nearest integer) by writing the
+ * biased 11-bit exponent field of each double.
+ */
+static gmx_inline __m128d
+gmx_simd_set_exponent_d_sse2(__m128d x)
+{
+ const __m128i expbias = _mm_set1_epi32(1023);
+ __m128i iexp = _mm_cvtpd_epi32(x);
+
+ /* After conversion integers will be in slot 0,1. Move them to 0,2 so
+ * we can do a 64-bit shift and get them to the dp exponents. */
+ iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0));
+ iexp = _mm_slli_epi64(_mm_add_epi32(iexp, expbias), 52);
+ return _mm_castsi128_pd(iexp);
+}
+
+/* 32-bit integer multiply of the two significant lanes. The lanes are
+ * zero-extended with unpack so _mm_mul_epu32 (even-lane 32x32->64
+ * multiply) can be used; the 64-bit results are then compacted back to
+ * lanes 0,1.
+ */
+static gmx_inline __m128i
+gmx_simd_mul_di_sse2(__m128i a, __m128i b)
+{
+ __m128i c;
+
+ a = _mm_unpacklo_epi32(a, _mm_setzero_si128()); /* 0 a[1] 0 a[0] */
+ b = _mm_unpacklo_epi32(b, _mm_setzero_si128()); /* 0 b[1] 0 b[0] */
+
+ c = _mm_mul_epu32(a, b); /* 0 a[1]*b[1] 0 a[0]*b[0] */
+ return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
+}
+
+/* Horizontal sum of the 2 double elements of a; returns the scalar sum. */
+static gmx_inline double
+gmx_simd_reduce_d_sse2(__m128d a)
+{
+ double res;
+ __m128d sum = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(1, 1)));
+
+ _mm_store_sd(&res, sum);
+ return res;
+}
+
+/* Check whether SIMD floating-point operations have overflowed, and clear
+ * the flag. Returns 1 if the MXCSR overflow flag was set, 0 otherwise.
+ */
+static int
+gmx_simd_check_and_reset_overflow(void)
+{
+ int sse_overflow = 0;
+ int csr = _mm_getcsr();
+
+ /* The overflow flag is bit 3 in the MXCSR register */
+ if (csr & 0x0008)
+ {
+ sse_overflow = 1;
+ /* Clear the overflow bit and write the control/status word back */
+ _mm_setcsr(csr & 0xFFF7);
+ }
+ return sse_overflow;
+}
+
+/* SSE2 is already 4-wide in single, so we just reuse float datatype for SIMD4.
+ * SSE2 cannot do double-precision SIMD4.
+ */
+#define gmx_simd4_float_t gmx_simd_float_t
+#define gmx_simd4_load_f gmx_simd_load_f
+#define gmx_simd4_load1_f gmx_simd_load1_f
+#define gmx_simd4_set1_f gmx_simd_set1_f
+#define gmx_simd4_store_f gmx_simd_store_f
+#define gmx_simd4_loadu_f gmx_simd_loadu_f
+#define gmx_simd4_storeu_f gmx_simd_storeu_f
+#define gmx_simd4_setzero_f gmx_simd_setzero_f
+#define gmx_simd4_add_f gmx_simd_add_f
+#define gmx_simd4_sub_f gmx_simd_sub_f
+#define gmx_simd4_mul_f gmx_simd_mul_f
+#define gmx_simd4_fmadd_f gmx_simd_fmadd_f
+#define gmx_simd4_fmsub_f gmx_simd_fmsub_f
+#define gmx_simd4_fnmadd_f gmx_simd_fnmadd_f
+#define gmx_simd4_fnmsub_f gmx_simd_fnmsub_f
+#define gmx_simd4_and_f gmx_simd_and_f
+#define gmx_simd4_andnot_f gmx_simd_andnot_f
+#define gmx_simd4_or_f gmx_simd_or_f
+#define gmx_simd4_xor_f gmx_simd_xor_f
+#define gmx_simd4_rsqrt_f gmx_simd_rsqrt_f
+#define gmx_simd4_fabs_f gmx_simd_fabs_f
+#define gmx_simd4_fneg_f gmx_simd_fneg_f
+#define gmx_simd4_max_f gmx_simd_max_f
+#define gmx_simd4_min_f gmx_simd_min_f
+#define gmx_simd4_round_f gmx_simd_round_f
+#define gmx_simd4_trunc_f gmx_simd_trunc_f
+/* dotproduct3 is the one SIMD4 op with no full-width counterpart;
+ * it gets its own helper below */
+#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_sse2
+#define gmx_simd4_fbool_t gmx_simd_fbool_t
+#define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f
+#define gmx_simd4_cmplt_f gmx_simd_cmplt_f
+#define gmx_simd4_cmple_f gmx_simd_cmple_f
+#define gmx_simd4_and_fb gmx_simd_and_fb
+#define gmx_simd4_or_fb gmx_simd_or_fb
+#define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb
+#define gmx_simd4_blendzero_f gmx_simd_blendzero_f
+#define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f
+#define gmx_simd4_blendv_f gmx_simd_blendv_f
+#define gmx_simd4_reduce_f gmx_simd_reduce_f
+
+/* SIMD4 Dotproduct helper function */
+/* Returns the dot product of the first three elements of a and b; element
+ * 3 of both inputs is ignored (only product lanes 0-2 reach lane 0).
+ */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
+{
+ float f;
+ __m128 c;
+ a = _mm_mul_ps(a, b);
+ c = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 2, 1))); /* lane0: p0+p1 */
+ c = _mm_add_ps(c, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 3, 2))); /* lane0: +p2 */
+ _mm_store_ss(&f, c);
+ return f;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_SSE2_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_X86_SSE4_1_H
+#define GMX_SIMD_IMPL_X86_SSE4_1_H
+
+#include <math.h>
+#include <smmintrin.h>
+
+
+/* x86 SSE4.1 SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+
+/* Inherit most of the SSE4.1 implementation from the SSE2 one */
+#include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
+/* Increment over SSE2 capabilities */
+#define GMX_SIMD_X86_SSE4_1_OR_HIGHER
+
+
+/* Override capability definitions from SSE2 */
+#define GMX_SIMD4_HAVE_FLOAT_DOTPRODUCT3
+
+/* Almost all the operations we need are already available in SSE2,
+ * but a few of them can be implemented more efficiently with SSE4.1
+ * instructions, so we override those here.
+ */
+#undef gmx_simd_round_f
+#define gmx_simd_round_f(x) _mm_round_ps(x, _MM_FROUND_NINT)
+#undef gmx_simd_trunc_f
+#define gmx_simd_trunc_f(x) _mm_round_ps(x, _MM_FROUND_TRUNC)
+#undef gmx_simd_round_d
+#define gmx_simd_round_d(x) _mm_round_pd(x, _MM_FROUND_NINT)
+#undef gmx_simd_trunc_d
+#define gmx_simd_trunc_d(x) _mm_round_pd(x, _MM_FROUND_TRUNC)
+
+#undef gmx_simd_extract_fi
+#define gmx_simd_extract_fi _mm_extract_epi32
+#undef gmx_simd_mul_fi
+#define gmx_simd_mul_fi _mm_mullo_epi32
+
+#undef gmx_simd_extract_di
+#define gmx_simd_extract_di _mm_extract_epi32
+#undef gmx_simd_mul_di
+#define gmx_simd_mul_di _mm_mullo_epi32
+
+#undef gmx_simd_blendv_f
+#define gmx_simd_blendv_f _mm_blendv_ps
+#undef gmx_simd_blendv_d
+#define gmx_simd_blendv_d _mm_blendv_pd
+
+#undef gmx_simd_reduce_f
+#define gmx_simd_reduce_f(a) gmx_simd_reduce_f_sse4_1(a)
+#undef gmx_simd_reduce_d
+#define gmx_simd_reduce_d(a) gmx_simd_reduce_d_sse4_1(a)
+
+#undef gmx_simd_blendv_fi
+#define gmx_simd_blendv_fi _mm_blendv_epi8
+#undef gmx_simd_blendv_di
+#define gmx_simd_blendv_di _mm_blendv_epi8
+
+#undef gmx_simd4_dotproduct3_f
+#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_sse4_1
+
+/* SIMD reduction function */
+static gmx_inline float
+gmx_simd_reduce_f_sse4_1(__m128 a)
+{
+ float f;
+
+ a = _mm_hadd_ps(a, a);
+ a = _mm_hadd_ps(a, a);
+ _mm_store_ss(&f, a);
+ return f;
+}
+
+/* SIMD4 Dotproduct helper function */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
+{
+ float f;
+ _MM_EXTRACT_FLOAT(f, _mm_dp_ps(a, b, 0x71), 0);
+ return f;
+}
+
+static gmx_inline double
+gmx_simd_reduce_d_sse4_1(__m128d a)
+{
+ double f;
+
+ a = _mm_hadd_pd(a, a);
+ _mm_store_sd(&f, a);
+ return f;
+}
+
+#endif /* GMX_SIMD_IMPL_X86_SSE4_1_H */
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code.
- * To support a new architecture, adding macros here should be (nearly)
- * all that is needed.
- */
-
-#ifdef GMX_SIMD_MACROS_H
-#error "gromacs/simd/macros.h included twice"
-#else
-#define GMX_SIMD_MACROS_H
-
-/* NOTE: SSE2 acceleration does not include floor or blendv */
-
-#ifdef GMX_SIMD_REFERENCE
-/* Plain C SIMD reference implementation, also serves as documentation */
-#define GMX_HAVE_SIMD_MACROS
-
-/* Include plain-C reference implementation, also serves as documentation */
-#include "gromacs/simd/macros_ref.h"
-
-#define GMX_SIMD_REAL_WIDTH GMX_SIMD_REF_WIDTH
-
-/* float/double SIMD register type */
-#define gmx_simd_real_t gmx_simd_ref_pr
-
-/* boolean SIMD register type */
-#define gmx_simd_bool_t gmx_simd_ref_pb
-
-/* integer SIMD register type, only for table indexing and exclusion masks */
-#define gmx_simd_int32_t gmx_simd_ref_epi32
-#define GMX_SIMD_INT32_WIDTH GMX_SIMD_REF_EPI32_WIDTH
-
-/* Load GMX_SIMD_REAL_WIDTH reals for memory starting at r */
-#define gmx_simd_load_r gmx_simd_ref_load_pr
-/* Set all SIMD register elements to *r */
-#define gmx_simd_load1_r gmx_simd_ref_load1_pr
-#define gmx_simd_set1_r gmx_simd_ref_set1_pr
-#define gmx_simd_setzero_r gmx_simd_ref_setzero_pr
-#define gmx_simd_store_r gmx_simd_ref_store_pr
-
-#define gmx_simd_add_r gmx_simd_ref_add_pr
-#define gmx_simd_sub_r gmx_simd_ref_sub_pr
-#define gmx_simd_mul_r gmx_simd_ref_mul_pr
-/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
-#define gmx_simd_fmadd_r gmx_simd_ref_madd_pr
-#define gmx_simd_fnmadd_r gmx_simd_ref_nmsub_pr
-
-#define gmx_simd_max_r gmx_simd_ref_max_pr
-#define gmx_simd_blendzero_r gmx_simd_ref_blendzero_pr
-
-#define gmx_simd_round_r gmx_simd_ref_round_pr
-
-/* Not required, only used to speed up the nbnxn tabulated PME kernels */
-#define GMX_SIMD_HAVE_FLOOR
-#ifdef GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r gmx_simd_ref_floor_pr
-#endif
-
-/* Not required, only used when blendv is faster than comparison */
-#define GMX_SIMD_HAVE_BLENDV
-#ifdef GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r gmx_simd_ref_blendv_pr
-#endif
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-#define gmx_cpsgn_nonneg_pr gmx_simd_ref_cpsgn_nonneg_pr
-
-/* Very specific operation required in the non-bonded kernels */
-#define gmx_masknot_add_pr gmx_simd_ref_masknot_add_pr
-
-/* Comparison */
-#define gmx_simd_cmplt_r gmx_simd_ref_cmplt_pr
-
-/* Logical operations on SIMD booleans */
-#define gmx_simd_and_b gmx_simd_ref_and_pb
-#define gmx_simd_or_b gmx_simd_ref_or_pb
-
-/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
-#define gmx_simd_anytrue_b gmx_simd_ref_anytrue_pb
-
-/* Conversions only used for PME table lookup */
-#define gmx_simd_cvtt_r2i gmx_simd_ref_cvttpr_epi32
-#define gmx_simd_cvt_i2r gmx_simd_ref_cvtepi32_pr
-
-/* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-#define gmx_simd_rsqrt_r gmx_simd_ref_rsqrt_pr
-#define gmx_simd_rcp_r gmx_simd_ref_rcp_pr
-
-/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
-#define GMX_SIMD_HAVE_EXP
-#ifdef GMX_SIMD_HAVE_EXP
-#define gmx_simd_exp_r gmx_simd_ref_exp_pr
-#endif
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
-#define gmx_simd_sqrt_r gmx_simd_ref_sqrt_pr
-#define gmx_simd_sincos_r gmx_simd_ref_sincos_pr
-#define gmx_simd_acos_r gmx_simd_ref_acos_pr
-#define gmx_simd_atan2_r gmx_simd_ref_atan2_pr
-#endif
-
-#endif /* GMX_SIMD_REFERENCE */
-
-
-/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
- * to instructions for) different SIMD width and float precision.
- *
- * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
- * The _pr suffix is replaced by _ps or _pd (for single or double precision).
- * Compiler settings will decide if 128-bit intrinsics will
- * be translated into SSE or AVX instructions.
- */
-
-
-#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
-#if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__
-/* We have half SIMD width support, continue */
-#else
-#error "half SIMD width intrinsics are not supported"
-#endif
-#endif
-
-#if defined GMX_TARGET_X86 && !defined __MIC__
-
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-/* This is for general x86 SIMD instruction sets that also support SSE2 */
-#define GMX_HAVE_SIMD_MACROS
-
-/* Include the highest supported x86 SIMD intrisics + math functions */
-#ifdef GMX_SIMD_X86_AVX_256_OR_HIGHER
-#include "general_x86_avx_256.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_avx_256_double.h"
-#else /* GMX_DOUBLE */
-#include "math_x86_avx_256_single.h"
-#endif /* GMX_DOUBLE */
-#else /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#include "general_x86_avx_128_fma.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_avx_128_fma_double.h"
-#else /* GMX_DOUBLE */
-#include "math_x86_avx_128_fma_single.h"
-#endif /* GMX_DOUBLE */
-#else /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
-#ifdef GMX_SIMD_X86_SSE4_1
-#include "general_x86_sse4_1.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_sse4_1_double.h"
-#else /* GMX_DOUBLE */
-#include "math_x86_sse4_1_single.h"
-#endif /* GMX_DOUBLE */
-#else /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include "general_x86_sse2.h"
-#ifdef GMX_DOUBLE
-#include "math_x86_sse2_double.h"
-#else /* GMX_DOUBLE */
-#include "math_x86_sse2_single.h"
-#endif /* GMX_DOUBLE */
-#else /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#error No x86 acceleration defined
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-#endif /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
-#endif /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
-#endif /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
-
-/* exp and trigonometric functions are included above */
-#define GMX_SIMD_HAVE_EXP
-#define GMX_SIMD_HAVE_ERFC
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-
-#if !defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined GMX_USE_HALF_WIDTH_SIMD_HERE
-
-#ifndef GMX_DOUBLE
-
-#define GMX_SIMD_REAL_WIDTH 4
-
-#define gmx_simd_real_t __m128
-
-#define gmx_simd_bool_t __m128
-
-#define gmx_simd_int32_t __m128i
-#define GMX_SIMD_INT32_WIDTH 4
-
-#define gmx_simd_load_r _mm_load_ps
-#define gmx_simd_load1_r _mm_load1_ps
-#define gmx_simd_set1_r _mm_set1_ps
-#define gmx_simd_setzero_r _mm_setzero_ps
-#define gmx_simd_store_r _mm_store_ps
-
-#define gmx_simd_add_r _mm_add_ps
-#define gmx_simd_sub_r _mm_sub_ps
-#define gmx_simd_mul_r _mm_mul_ps
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r(a, b, c) _mm_macc_ps(a, b, c)
-#define gmx_simd_fnmadd_r(a, b, c) _mm_nmacc_ps(a, b, c)
-#else
-#define gmx_simd_fmadd_r(a, b, c) _mm_add_ps(c, _mm_mul_ps(a, b))
-#define gmx_simd_fnmadd_r(a, b, c) _mm_sub_ps(c, _mm_mul_ps(a, b))
-#endif
-#define gmx_simd_max_r _mm_max_ps
-#define gmx_simd_blendzero_r _mm_and_ps
-
-#define gmx_simd_cmplt_r _mm_cmplt_ps
-#define gmx_simd_and_b _mm_and_ps
-#define gmx_simd_or_b _mm_or_ps
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define gmx_simd_round_r(x) _mm_round_ps(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm_floor_ps
-#else
-#define gmx_simd_round_r(x) _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
-#endif
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r _mm_blendv_ps
-#endif
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- /* The value -0.0 has only the sign-bit set */
- gmx_simd_real_t sign_mask = _mm_set1_ps(-0.0);
- return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return _mm_add_ps(b, _mm_andnot_ps(a, c));
-};
-
-#define gmx_simd_anytrue_b _mm_movemask_ps
-
-#define gmx_simd_cvtt_r2i _mm_cvttps_epi32
-#define gmx_simd_cvt_i2r _mm_cvtepi32_ps
-
-#define gmx_simd_rsqrt_r _mm_rsqrt_ps
-#define gmx_simd_rcp_r _mm_rcp_ps
-
-#define gmx_simd_exp_r gmx_mm_exp_ps
-#define gmx_simd_sqrt_r gmx_mm_sqrt_ps
-#define gmx_simd_sincos_r gmx_mm_sincos_ps
-#define gmx_simd_acos_r gmx_mm_acos_ps
-#define gmx_simd_atan2_r gmx_mm_atan2_ps
-#define gmx_simd_erfc_r gmx_mm_erfc_ps
-
-#else /* ifndef GMX_DOUBLE */
-
-#define GMX_SIMD_REAL_WIDTH 2
-
-#define gmx_simd_real_t __m128d
-
-#define gmx_simd_bool_t __m128d
-
-#define gmx_simd_int32_t __m128i
-#define GMX_SIMD_INT32_WIDTH 4
-
-#define gmx_simd_load_r _mm_load_pd
-#define gmx_simd_load1_r _mm_load1_pd
-#define gmx_simd_set1_r _mm_set1_pd
-#define gmx_simd_setzero_r _mm_setzero_pd
-#define gmx_simd_store_r _mm_store_pd
-
-#define gmx_simd_add_r _mm_add_pd
-#define gmx_simd_sub_r _mm_sub_pd
-#define gmx_simd_mul_r _mm_mul_pd
-#ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
-#define GMX_SIMD_HAVE_FMA
-#define gmx_simd_fmadd_r(a, b, c) _mm_macc_pd(a, b, c)
-#define gmx_simd_fnmadd_r(a, b, c) _mm_nmacc_pd(a, b, c)
-#else
-#define gmx_simd_fmadd_r(a, b, c) _mm_add_pd(c, _mm_mul_pd(a, b))
-#define gmx_simd_fnmadd_r(a, b, c) _mm_sub_pd(c, _mm_mul_pd(a, b))
-#endif
-#define gmx_simd_max_r _mm_max_pd
-#define gmx_simd_blendzero_r _mm_and_pd
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define gmx_simd_round_r(x) _mm_round_pd(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm_floor_pd
-#else
-#define gmx_simd_round_r(x) _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
-/* gmx_simd_floor_r is not used in code for pre-SSE4_1 hardware */
-#endif
-
-#ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r _mm_blendv_pd
-#endif
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- gmx_simd_real_t sign_mask = _mm_set1_pd(-0.0);
- return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return _mm_add_pd(b, _mm_andnot_pd(a, c));
-};
-
-#define gmx_simd_cmplt_r _mm_cmplt_pd
-
-#define gmx_simd_and_b _mm_and_pd
-#define gmx_simd_or_b _mm_or_pd
-
-#define gmx_simd_anytrue_b _mm_movemask_pd
-
-#define gmx_simd_cvtt_r2i _mm_cvttpd_epi32
-#define gmx_simd_cvt_i2r _mm_cvtepi32_pd
-
-#define gmx_simd_rsqrt_r(r) _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
-#define gmx_simd_rcp_r(r) _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
-
-#define gmx_simd_exp_r gmx_mm_exp_pd
-#define gmx_simd_sqrt_r gmx_mm_sqrt_pd
-#define gmx_simd_sincos_r gmx_mm_sincos_pd
-#define gmx_simd_acos_r gmx_mm_acos_pd
-#define gmx_simd_atan2_r gmx_mm_atan2_pd
-#define gmx_simd_erfc_r gmx_mm_erfc_pd
-
-#endif /* ifndef GMX_DOUBLE */
-
-#else
-/* We have GMX_SIMD_X86_AVX_256_OR_HIGHER and not GMX_USE_HALF_WIDTH_SIMD_HERE,
- * so we use 256-bit SIMD.
- */
-
-#ifndef GMX_DOUBLE
-
-#define GMX_SIMD_REAL_WIDTH 8
-
-#define gmx_simd_real_t __m256
-
-#define gmx_simd_bool_t __m256
-
-#define gmx_simd_int32_t __m256i
-#define GMX_SIMD_INT32_WIDTH 8
-
-#define gmx_simd_load_r _mm256_load_ps
-#define gmx_simd_load1_r(x) _mm256_set1_ps((x)[0])
-#define gmx_simd_set1_r _mm256_set1_ps
-#define gmx_simd_setzero_r _mm256_setzero_ps
-#define gmx_simd_store_r _mm256_store_ps
-
-#define gmx_simd_add_r _mm256_add_ps
-#define gmx_simd_sub_r _mm256_sub_ps
-#define gmx_simd_mul_r _mm256_mul_ps
-#define gmx_simd_fmadd_r(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b))
-#define gmx_simd_fnmadd_r(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b))
-#define gmx_simd_max_r _mm256_max_ps
-#define gmx_simd_blendzero_r _mm256_and_ps
-
-#define gmx_simd_round_r(x) _mm256_round_ps(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm256_floor_ps
-
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r _mm256_blendv_ps
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- gmx_simd_real_t sign_mask = _mm256_set1_ps(-0.0);
- return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
-};
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd_cmplt_r(x, y) _mm256_cmp_ps(x, y, 0x11)
-#define gmx_simd_and_b _mm256_and_ps
-#define gmx_simd_or_b _mm256_or_ps
-
-#define gmx_simd_anytrue_b _mm256_movemask_ps
-
-#define gmx_simd_cvtt_r2i _mm256_cvttps_epi32
-
-#define gmx_simd_rsqrt_r _mm256_rsqrt_ps
-#define gmx_simd_rcp_r _mm256_rcp_ps
-
-#define gmx_simd_exp_r gmx_mm256_exp_ps
-#define gmx_simd_sqrt_r gmx_mm256_sqrt_ps
-#define gmx_simd_sincos_r gmx_mm256_sincos_ps
-#define gmx_simd_acos_r gmx_mm256_acos_ps
-#define gmx_simd_atan2_r gmx_mm256_atan2_ps
-#define gmx_simd_erfc_r gmx_mm256_erfc_ps
-
-#else /* ifndef GMX_DOUBLE */
-
-#define GMX_SIMD_REAL_WIDTH 4
-
-#define gmx_simd_real_t __m256d
-
-#define gmx_simd_bool_t __m256d
-
-/* We use 128-bit integer registers because of missing 256-bit operations */
-#define gmx_simd_int32_t __m128i
-#define GMX_SIMD_INT32_WIDTH 4
-
-#define gmx_simd_load_r _mm256_load_pd
-#define gmx_simd_load1_r(x) _mm256_set1_pd((x)[0])
-#define gmx_simd_set1_r _mm256_set1_pd
-#define gmx_simd_setzero_r _mm256_setzero_pd
-#define gmx_simd_store_r _mm256_store_pd
-
-#define gmx_simd_add_r _mm256_add_pd
-#define gmx_simd_sub_r _mm256_sub_pd
-#define gmx_simd_mul_r _mm256_mul_pd
-#define gmx_simd_fmadd_r(a, b, c) _mm256_add_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd_fnmadd_r(a, b, c) _mm256_sub_pd(c, _mm256_mul_pd(a, b))
-#define gmx_simd_max_r _mm256_max_pd
-#define gmx_simd_blendzero_r _mm256_and_pd
-
-#define gmx_simd_round_r(x) _mm256_round_pd(x, 0x0)
-#define GMX_SIMD_HAVE_FLOOR
-#define gmx_simd_floor_r _mm256_floor_pd
-
-#define GMX_SIMD_HAVE_BLENDV
-#define gmx_simd_blendv_r _mm256_blendv_pd
-
-static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- gmx_simd_real_t sign_mask = _mm256_set1_pd(-0.0);
- return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
-};
-
-/* Less-than (we use ordered, non-signaling, but that's not required) */
-#define gmx_simd_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
-
-#define gmx_simd_and_b _mm256_and_pd
-#define gmx_simd_or_b _mm256_or_pd
-
-#define gmx_simd_anytrue_b _mm256_movemask_pd
-
-#define gmx_simd_cvtt_r2i _mm256_cvttpd_epi32
-
-#define gmx_simd_rsqrt_r(r) _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
-#define gmx_simd_rcp_r(r) _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
-
-#define gmx_simd_exp_r gmx_mm256_exp_pd
-#define gmx_simd_sqrt_r gmx_mm256_sqrt_pd
-#define gmx_simd_sincos_r gmx_mm256_sincos_pd
-#define gmx_simd_acos_r gmx_mm256_acos_pd
-#define gmx_simd_atan2_r gmx_mm256_atan2_pd
-#define gmx_simd_erfc_r gmx_mm256_erfc_pd
-
-#endif /* ifndef GMX_DOUBLE */
-
-#endif /* 128- or 256-bit x86 SIMD */
-
-#endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
-
-#endif /* GMX_TARGET_X86 */
-
-#ifdef GMX_SIMD_IBM_QPX
-
-/* This hack works on the compilers that can reach this code. A real
- solution with broader scope will be proposed in master branch. */
-#define gmx_always_inline __attribute__((always_inline))
-
-/* This is for the A2 core on BlueGene/Q that supports IBM's QPX
- vector built-in functions */
-#include <mass_simd.h>
-#define GMX_HAVE_SIMD_MACROS
-#ifdef __clang__
-#include <qpxmath.h>
-#endif
-
-/* No need to version the code by the precision, because the QPX AXU
- extends to and truncates from double precision for free. */
-
-#define GMX_SIMD_REAL_WIDTH 4
-typedef vector4double gmx_simd_real_t;
-typedef vector4double gmx_simd_bool_t;
-typedef vector4double gmx_simd_int32_t;
-#define GMX_SIMD_INT32_WIDTH 4
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load_r(const real *a)
-{
-#ifdef NDEBUG
- return vec_ld(0, (real *) a);
-#else
- return vec_lda(0, (real *) a);
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load1_r(const real *a)
-{
- return vec_splats(*a);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_set1_r(real a)
-{
- return vec_splats(a);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_setzero_r()
-{
- return vec_splats(0.0);
-}
-
-static gmx_inline void gmx_always_inline gmx_simd_store_r(real *a, gmx_simd_real_t b)
-{
-#ifdef NDEBUG
- vec_st(b, 0, a);
-#else
- vec_sta(b, 0, a);
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_add_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_add(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sub_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_sub(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_mul_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_mul(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return vec_madd(a, b, c);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fnmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return vec_nmsub(a, b, c);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_max_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_sel(b, a, vec_sub(a, b));
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendzero_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_sel(gmx_simd_setzero_r(), a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_cmplt_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_cmplt(a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_and_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
-{
- return vec_and(a, b);
-}
-
-static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_or_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
-{
- return vec_or(a, b);
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_round_r(gmx_simd_real_t a)
-{
- return vec_round(a);
-}
-
-#define GMX_SIMD_HAVE_FLOOR
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_floor_r(gmx_simd_real_t a)
-{
- return vec_floor(a);
-}
-
-#define GMX_SIMD_HAVE_BLENDV
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendv_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return vec_sel(b, a, gmx_simd_cmplt_r(gmx_simd_setzero_r(), c));
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
-{
- return vec_cpsgn(a, b);
-};
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
-{
- return vec_add(b, vec_sel(c, gmx_simd_setzero_r(), a));
-};
-
-static gmx_inline gmx_bool gmx_always_inline
-GMX_SIMD_IS_TRUE(real x)
-{
- return x >= 0.0;
-}
-
-static gmx_inline gmx_simd_int32_t gmx_always_inline gmx_simd_cvtt_r2i(gmx_simd_real_t a)
-{
- return vec_ctiwuz(a);
-}
-/* Don't want this, we have floor */
-/* #define gmx_simd_cvt_i2r vec_cvtepi32 */
-
-/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
- Architecture only promises 2^-8. So probably no need for
- Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rsqrt_r(gmx_simd_real_t a)
-{
- return vec_rsqrte(a);
-}
-
-/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
- Architecture only promises 2^-5. So probably no need for
- Newton-Raphson iterates at single or double. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rcp_r(gmx_simd_real_t a)
-{
- return vec_re(a);
-}
-
-/* Note that here, and below, we use the built-in SLEEF port when
- compiling on BlueGene/Q with clang */
-
-#define GMX_SIMD_HAVE_EXP
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_exp_r(gmx_simd_real_t a)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
- return xexpf(a);
-#else
- return xexp(a);
-#endif
-#else
-#ifndef GMX_DOUBLE
- return expf4(a);
-#else
- return expd4(a);
-#endif
-#endif
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sqrt_r(gmx_simd_real_t a)
-{
-#ifdef NDEBUG
- return vec_swsqrt_nochk(a);
-#else
- return vec_swsqrt(a);
-#endif
-}
-
-#define GMX_SIMD_HAVE_TRIGONOMETRIC
-static gmx_inline int gmx_always_inline gmx_simd_sincos_r(gmx_simd_real_t a, gmx_simd_real_t *b, gmx_simd_real_t *c)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
- xsincosf(a, b, c);
-#else
- xsincos(a, b, c);
-#endif
-#else
-#ifndef GMX_DOUBLE
- sincosf4(a, b, c);
-#else
- sincosd4(a, b, c);
-#endif
-#endif
- return 1;
-}
-
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_acos_r(gmx_simd_real_t a)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
- return xacosf(a);
-#else
- return xacos(a);
-#endif
-#else
-#ifndef GMX_DOUBLE
- return acosf4(a);
-#else
- return acosd4(a);
-#endif
-#endif
-}
-
-/* NB The order of parameters here is correct; the
- documentation of atan2[df]4 in SIMD MASS is wrong. */
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_atan2_r(gmx_simd_real_t a, gmx_simd_real_t b)
-{
-#ifdef __clang__
-#ifndef GMX_DOUBLE
- return xatan2f(a, b);
-#else
- return xatan2(a, b);
-#endif
-#else
-#ifndef GMX_DOUBLE
- return atan2f4(a, b);
-#else
- return atan2d4(a, b);
-#endif
-#endif
-}
-
-#define GMX_SIMD_HAVE_ERFC
-static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_erfc_r(gmx_simd_real_t a)
-{
- /* The BG/Q qpxmath.h vector math library intended for use with
- bgclang does not have erfc, so we need to use a function from
- mass_simd.h. If this changes, then the #include <mass_simd.h> can
- become conditional. */
-#ifndef GMX_DOUBLE
- return erfcf4(a);
-#else
- return erfcd4(a);
-#endif
-}
-
-/* TODO: gmx_mm_erfc_p[sd] should be generalized using gmx_*_pr, so that it just works on BlueGene */
-
-static gmx_inline int gmx_always_inline
-gmx_simd_anytrue_b(gmx_simd_bool_t a)
-{
- /* The "anytrue" is done solely on the QPX AXU (which is the only
- available FPU). This is awkward, because pretty much no
- "horizontal" SIMD-vector operations exist, unlike x86 where
- SSE4.1 added various kinds of horizontal operations. So we have
- to make do with shifting vector elements and operating on the
- results. This makes for lots of data dependency, but the main
- alternative of storing to memory and reloading is not going to
- help, either. OpenMP over 2 or 4 hardware threads per core will
- hide much of the latency from the data dependency. The
- vec_extract() lets the compiler correctly use a floating-point
- comparison on the zeroth vector element, which avoids needing
- memory at all.
- */
- gmx_simd_bool_t vec_shifted_left_0 = a;
- gmx_simd_bool_t vec_shifted_left_1 = vec_sldw(a, a, 1);
- gmx_simd_bool_t vec_shifted_left_2 = vec_sldw(a, a, 2);
- gmx_simd_bool_t vec_shifted_left_3 = vec_sldw(a, a, 3);
-
- gmx_simd_bool_t vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
- vec_or(vec_shifted_left_0, vec_shifted_left_1));
- return (0.0 < vec_extract(vec_return, 0));
-};
-
-#undef gmx_always_inline
-
-#endif /* GMX_SIMD_IBM_QPX */
-
-#ifdef __MIC__
-#include "general_x86_mic.h"
-#endif
-
-#ifdef GMX_HAVE_SIMD_MACROS
-/* Generic functions to extract a SIMD aligned pointer from a pointer x.
- * x should have at least GMX_SIMD_REAL_WIDTH elements extra compared
- * to how many you want to use, to avoid indexing outside the aligned region.
- */
-
-static gmx_inline real *
-gmx_simd_align_r(const real *x)
-{
- return (real *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(real)-1))));
-}
-
-static gmx_inline int *
-gmx_simd_align_i(const int *x)
-{
- return (int *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(int )-1))));
-}
-
-
-/* Include the math functions which only need the above macros,
- * generally these are the ones that don't need masking operations.
- */
-#ifdef GMX_DOUBLE
-#include "math_double.h"
-#else
-#include "math_single.h"
-#endif
-
-
-#endif /* GMX_HAVE_SIMD_MACROS */
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef GMX_SIMD_MACROS_REF_H
-#define GMX_SIMD_MACROS_REF_H
-
-/* This file contains a reference plain-C implementation of arbitrary width.
- * This code is only useful for testing and documentation.
- * The SIMD width is set by defining GMX_SIMD_REF_WIDTH before including.
- */
-
-
-#ifndef GMX_SIMD_REF_WIDTH
-#error "GMX_SIMD_REF_WIDTH should be defined before including gromacs/simd/macros_ref.h"
-#endif
-
-#include <math.h>
-
-/* float/double SIMD register type */
-typedef struct {
- real r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_pr;
-
-/* boolean SIMD register type */
-typedef struct {
- char r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_pb;
-
-/* integer SIMD register type, only for table indexing and exclusion masks */
-typedef struct {
- int r[GMX_SIMD_REF_WIDTH];
-} gmx_simd_ref_epi32;
-#define GMX_SIMD_REF_EPI32_WIDTH GMX_SIMD_REF_WIDTH
-
-/* Load GMX_SIMD_REF_WIDTH reals for memory starting at r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_load_pr(const real *r)
-{
- gmx_simd_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = r[i];
- }
-
- return a;
-}
-
-/* Set all SIMD register elements to *r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_load1_pr(const real *r)
-{
- gmx_simd_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = *r;
- }
-
- return a;
-}
-
-/* Set all SIMD register elements to r */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_set1_pr(real r)
-{
- gmx_simd_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = r;
- }
-
- return a;
-}
-
-/* Set all SIMD register elements to 0 */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_setzero_pr()
-{
- gmx_simd_ref_pr a;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- a.r[i] = 0.0;
- }
-
- return a;
-}
-
-static gmx_inline void
-gmx_simd_ref_store_pr(real *dest, gmx_simd_ref_pr src)
-{
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- dest[i] = src.r[i];
- }
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_add_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = a.r[i] + b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_sub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = a.r[i] - b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_mul_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = a.r[i]*b.r[i];
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_madd_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
- gmx_simd_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- d.r[i] = a.r[i]*b.r[i] + c.r[i];
- }
-
- return d;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_nmsub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
- gmx_simd_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- d.r[i] = -a.r[i]*b.r[i] + c.r[i];
- }
-
- return d;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_max_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
- }
-
- return c;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_blendzero_pr(gmx_simd_ref_pr a, gmx_simd_ref_pb b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (b.r[i] ? a.r[i] : 0.0);
- }
-
- return c;
-}
-
-/* Note that this reference implementation rounds away from zero,
- * whereas most SIMD intrinsics will round to nearest even. Since this
- * function is only used for periodic image calculations, the rounding
- * of mantissas close to 0.5 is irrelevant, except in testing. This
- * could be fixed by using rint/rintf, but the bigger problem is that
- * MSVC does not support full C99, and none of the round or rint
- * functions are defined. It's much easier to approximately implement
- * round() than rint(), so we do that and hope we never get bitten in
- * testing. (Thanks, Microsoft.)
- */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_round_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
-#ifdef _MSC_VER
- int temp = (a.r[i] >= 0.)
- ? (a.r[i] + 0.5)
- : (a.r[i] - 0.5);
- b.r[i] = (real) temp;
-#elif defined GMX_DOUBLE
- b.r[i] = round(a.r[i]);
-#else
- b.r[i] = roundf(a.r[i]);
-#endif
- }
-
- return b;
-}
-
-/* Not required, only used to speed up the nbnxn tabulated PME kernels */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_floor_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
-#ifdef GMX_DOUBLE
- b.r[i] = floor(a.r[i]);
-#else
- b.r[i] = floorf(a.r[i]);
-#endif
- }
-
- return b;
-}
-
-/* Not required, only used when blendv is faster than comparison */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_blendv_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
- gmx_simd_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- d.r[i] = (c.r[i] >= 0) ? a.r[i] : b.r[i];
- }
-
- return d;
-}
-
-/* Copy the sign of a to b, assumes b >= 0 for efficiency */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_cpsgn_nonneg_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (a.r[i] >= 0) ? b.r[i] : -b.r[i];
- }
-
- return c;
-}
-
-/* Very specific operation required in the non-bonded kernels */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_masknot_add_pr(gmx_simd_ref_pb a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
-{
- gmx_simd_ref_pr d;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- d.r[i] = a.r[i] ? b.r[i] : b.r[i] + c.r[i];
- }
-
- return d;
-}
-
-/* Comparison */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (a.r[i] < b.r[i]);
- }
-
- return c;
-}
-
-/* Logical AND on SIMD booleans. */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
-{
- gmx_simd_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (a.r[i] && b.r[i]);
- }
-
- return c;
-}
-
-/* Logical OR on SIMD booleans. */
-static gmx_inline gmx_simd_ref_pb
-gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
-{
- gmx_simd_ref_pb c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = (a.r[i] || b.r[i]);
- }
-
- return c;
-}
-
-/* Returns a single int (0/1) which tells if any of the booleans is True */
-static gmx_inline int
-gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a)
-{
- int anytrue;
- int i;
-
- anytrue = 0;
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- if (a.r[i])
- {
- anytrue = 1;
- }
- }
-
- return anytrue;
-}
-
-/* Conversions only used for PME table lookup */
-static gmx_inline gmx_simd_ref_epi32
-gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_epi32 b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- b.r[i] = (int)a.r[i];
- }
-
- return b;
-};
-
-/* These two function only need to be approximate, Newton-Raphson iteration
- * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
- */
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
-#ifdef GMX_DOUBLE
- b.r[i] = 1.0/sqrt(a.r[i]);
-#else
- b.r[i] = 1.0/sqrtf(a.r[i]);
-#endif
- }
-
- return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_rcp_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- b.r[i] = 1.0/a.r[i];
- }
-
- return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_exp_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
-#ifdef GMX_DOUBLE
- b.r[i] = exp(a.r[i]);
-#else
- b.r[i] = expf(a.r[i]);
-#endif
- }
-
- return b;
-};
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_sqrt_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
-#ifdef GMX_DOUBLE
- b.r[i] = sqrt(a.r[i]);
-#else
- b.r[i] = sqrtf(a.r[i]);
-#endif
- }
-
- return b;
-}
-
-static gmx_inline int
-gmx_simd_ref_sincos_pr(gmx_simd_ref_pr a,
- gmx_simd_ref_pr *s, gmx_simd_ref_pr *c)
-{
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- s->r[i] = sin(a.r[i]);
- c->r[i] = cos(a.r[i]);
- }
-
- return 0;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_acos_pr(gmx_simd_ref_pr a)
-{
- gmx_simd_ref_pr b;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- b.r[i] = acos(a.r[i]);
- }
-
- return b;
-}
-
-static gmx_inline gmx_simd_ref_pr
-gmx_simd_ref_atan2_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
-{
- gmx_simd_ref_pr c;
- int i;
-
- for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
- {
- c.r[i] = atan2(a.r[i], b.r[i]);
- }
-
- return c;
-}
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_MATH_DOUBLE_H_
-#define GMX_SIMD_MATH_DOUBLE_H_
-
-
-/* 1.0/sqrt(x) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_invsqrt_r(gmx_simd_real_t x)
-{
- const gmx_simd_real_t half = gmx_simd_set1_r(0.5);
- const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- gmx_simd_real_t lu = gmx_simd_rsqrt_r(x);
-
- lu = gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
- return gmx_simd_mul_r(gmx_simd_mul_r(half, lu), gmx_simd_fnmadd_r(gmx_simd_mul_r(lu, lu), x, three));
-}
-
-
-/* 1.0/x */
-static gmx_inline gmx_simd_real_t
-gmx_simd_inv_r(gmx_simd_real_t x)
-{
- const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- gmx_simd_real_t lu = gmx_simd_rcp_r(x);
-
- /* Perform two N-R steps for double precision */
- lu = gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
- return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
-{
- const gmx_simd_real_t FN10 = gmx_simd_set1_r(-8.0072854618360083154e-14);
- const gmx_simd_real_t FN9 = gmx_simd_set1_r(1.1859116242260148027e-11);
- const gmx_simd_real_t FN8 = gmx_simd_set1_r(-8.1490406329798423616e-10);
- const gmx_simd_real_t FN7 = gmx_simd_set1_r(3.4404793543907847655e-8);
- const gmx_simd_real_t FN6 = gmx_simd_set1_r(-9.9471420832602741006e-7);
- const gmx_simd_real_t FN5 = gmx_simd_set1_r(0.000020740315999115847456);
- const gmx_simd_real_t FN4 = gmx_simd_set1_r(-0.00031991745139313364005);
- const gmx_simd_real_t FN3 = gmx_simd_set1_r(0.0035074449373659008203);
- const gmx_simd_real_t FN2 = gmx_simd_set1_r(-0.031750380176100813405);
- const gmx_simd_real_t FN1 = gmx_simd_set1_r(0.13884101728898463426);
- const gmx_simd_real_t FN0 = gmx_simd_set1_r(-0.75225277815249618847);
-
- const gmx_simd_real_t FD5 = gmx_simd_set1_r(0.000016009278224355026701);
- const gmx_simd_real_t FD4 = gmx_simd_set1_r(0.00051055686934806966046);
- const gmx_simd_real_t FD3 = gmx_simd_set1_r(0.0081803507497974289008);
- const gmx_simd_real_t FD2 = gmx_simd_set1_r(0.077181146026670287235);
- const gmx_simd_real_t FD1 = gmx_simd_set1_r(0.41543303143712535988);
- const gmx_simd_real_t FD0 = gmx_simd_set1_r(1.0);
-
- gmx_simd_real_t z4;
- gmx_simd_real_t polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = gmx_simd_mul_r(z2, z2);
-
- polyFD1 = gmx_simd_fmadd_r(FD5, z4, FD3);
- polyFD1 = gmx_simd_fmadd_r(polyFD1, z4, FD1);
- polyFD1 = gmx_simd_mul_r(polyFD1, z2);
- polyFD0 = gmx_simd_fmadd_r(FD4, z4, FD2);
- polyFD0 = gmx_simd_fmadd_r(polyFD0, z4, FD0);
- polyFD0 = gmx_simd_add_r(polyFD0, polyFD1);
-
- polyFD0 = gmx_simd_inv_r(polyFD0);
-
- polyFN0 = gmx_simd_fmadd_r(FN10, z4, FN8);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN6);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN4);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN2);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN0);
- polyFN1 = gmx_simd_fmadd_r(FN9, z4, FN7);
- polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN5);
- polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN3);
- polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN1);
- polyFN0 = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
-
- return gmx_simd_mul_r(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
-{
- const gmx_simd_real_t VN9 = gmx_simd_set1_r(-9.3723776169321855475e-13);
- const gmx_simd_real_t VN8 = gmx_simd_set1_r(1.2280156762674215741e-10);
- const gmx_simd_real_t VN7 = gmx_simd_set1_r(-7.3562157912251309487e-9);
- const gmx_simd_real_t VN6 = gmx_simd_set1_r(2.6215886208032517509e-7);
- const gmx_simd_real_t VN5 = gmx_simd_set1_r(-4.9532491651265819499e-6);
- const gmx_simd_real_t VN4 = gmx_simd_set1_r(0.00025907400778966060389);
- const gmx_simd_real_t VN3 = gmx_simd_set1_r(0.0010585044856156469792);
- const gmx_simd_real_t VN2 = gmx_simd_set1_r(0.045247661136833092885);
- const gmx_simd_real_t VN1 = gmx_simd_set1_r(0.11643931522926034421);
- const gmx_simd_real_t VN0 = gmx_simd_set1_r(1.1283791671726767970);
-
- const gmx_simd_real_t VD5 = gmx_simd_set1_r(0.000021784709867336150342);
- const gmx_simd_real_t VD4 = gmx_simd_set1_r(0.00064293662010911388448);
- const gmx_simd_real_t VD3 = gmx_simd_set1_r(0.0096311444822588683504);
- const gmx_simd_real_t VD2 = gmx_simd_set1_r(0.085608012351550627051);
- const gmx_simd_real_t VD1 = gmx_simd_set1_r(0.43652499166614811084);
- const gmx_simd_real_t VD0 = gmx_simd_set1_r(1.0);
-
- gmx_simd_real_t z4;
- gmx_simd_real_t polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = gmx_simd_mul_r(z2, z2);
-
- polyVD1 = gmx_simd_fmadd_r(VD5, z4, VD3);
- polyVD0 = gmx_simd_fmadd_r(VD4, z4, VD2);
- polyVD1 = gmx_simd_fmadd_r(polyVD1, z4, VD1);
- polyVD0 = gmx_simd_fmadd_r(polyVD0, z4, VD0);
- polyVD0 = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
-
- polyVD0 = gmx_simd_inv_r(polyVD0);
-
- polyVN1 = gmx_simd_fmadd_r(VN9, z4, VN7);
- polyVN0 = gmx_simd_fmadd_r(VN8, z4, VN6);
- polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN5);
- polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN4);
- polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN3);
- polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN2);
- polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN1);
- polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN0);
- polyVN0 = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
-
- return gmx_simd_mul_r(polyVN0, polyVD0);
-}
-
-
-#endif
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#ifndef GMX_SIMD_MATH_SINGLE_H_
-#define GMX_SIMD_MATH_SINGLE_H_
-
-
-/* 1.0/sqrt(x) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_invsqrt_r(gmx_simd_real_t x)
-{
- /* This is one of the few cases where FMA adds a FLOP, but ends up with
- * less instructions in total when FMA is available in hardware.
- * Usually we would not optimize this far, but invsqrt is used often.
- */
-#ifdef GMX_SIMD_HAVE_FMA
- const gmx_simd_real_t half = gmx_simd_set1_r(0.5);
- const gmx_simd_real_t one = gmx_simd_set1_r(1.0);
-
- gmx_simd_real_t lu = gmx_simd_rsqrt_r(x);
-
- return gmx_simd_fmadd_r(gmx_simd_fnmadd_r(x, gmx_simd_mul_r(lu, lu), one), gmx_simd_mul_r(lu, half), lu);
-#else
- const gmx_simd_real_t half = gmx_simd_set1_r(0.5);
- const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
-
- gmx_simd_real_t lu = gmx_simd_rsqrt_r(x);
-
- return gmx_simd_mul_r(half, gmx_simd_mul_r(gmx_simd_sub_r(three, gmx_simd_mul_r(gmx_simd_mul_r(lu, lu), x)), lu));
-#endif
-}
-
-
-/* 1.0/x */
-static gmx_inline gmx_simd_real_t
-gmx_simd_inv_r(gmx_simd_real_t x)
-{
- const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
-
- gmx_simd_real_t lu = gmx_simd_rcp_r(x);
-
- return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
-{
- const gmx_simd_real_t FN6 = gmx_simd_set1_r(-1.7357322914161492954e-8f);
- const gmx_simd_real_t FN5 = gmx_simd_set1_r(1.4703624142580877519e-6f);
- const gmx_simd_real_t FN4 = gmx_simd_set1_r(-0.000053401640219807709149f);
- const gmx_simd_real_t FN3 = gmx_simd_set1_r(0.0010054721316683106153f);
- const gmx_simd_real_t FN2 = gmx_simd_set1_r(-0.019278317264888380590f);
- const gmx_simd_real_t FN1 = gmx_simd_set1_r(0.069670166153766424023f);
- const gmx_simd_real_t FN0 = gmx_simd_set1_r(-0.75225204789749321333f);
-
- const gmx_simd_real_t FD4 = gmx_simd_set1_r(0.0011193462567257629232f);
- const gmx_simd_real_t FD3 = gmx_simd_set1_r(0.014866955030185295499f);
- const gmx_simd_real_t FD2 = gmx_simd_set1_r(0.11583842382862377919f);
- const gmx_simd_real_t FD1 = gmx_simd_set1_r(0.50736591960530292870f);
- const gmx_simd_real_t FD0 = gmx_simd_set1_r(1.0f);
-
- gmx_simd_real_t z4;
- gmx_simd_real_t polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = gmx_simd_mul_r(z2, z2);
-
- polyFD0 = gmx_simd_fmadd_r(FD4, z4, FD2);
- polyFD1 = gmx_simd_fmadd_r(FD3, z4, FD1);
- polyFD0 = gmx_simd_fmadd_r(polyFD0, z4, FD0);
- polyFD0 = gmx_simd_fmadd_r(polyFD1, z2, polyFD0);
-
- polyFD0 = gmx_simd_inv_r(polyFD0);
-
- polyFN0 = gmx_simd_fmadd_r(FN6, z4, FN4);
- polyFN1 = gmx_simd_fmadd_r(FN5, z4, FN3);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN2);
- polyFN1 = gmx_simd_fmadd_r(polyFN1, z4, FN1);
- polyFN0 = gmx_simd_fmadd_r(polyFN0, z4, FN0);
- polyFN0 = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
-
- return gmx_simd_mul_r(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_simd_pmecorrF_r() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Add the result to 1/r, multiply by the product of the charges,
- * and you have your potential.
- */
-static gmx_simd_real_t
-gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
-{
- const gmx_simd_real_t VN6 = gmx_simd_set1_r(1.9296833005951166339e-8f);
- const gmx_simd_real_t VN5 = gmx_simd_set1_r(-1.4213390571557850962e-6f);
- const gmx_simd_real_t VN4 = gmx_simd_set1_r(0.000041603292906656984871f);
- const gmx_simd_real_t VN3 = gmx_simd_set1_r(-0.00013134036773265025626f);
- const gmx_simd_real_t VN2 = gmx_simd_set1_r(0.038657983986041781264f);
- const gmx_simd_real_t VN1 = gmx_simd_set1_r(0.11285044772717598220f);
- const gmx_simd_real_t VN0 = gmx_simd_set1_r(1.1283802385263030286f);
-
- const gmx_simd_real_t VD3 = gmx_simd_set1_r(0.0066752224023576045451f);
- const gmx_simd_real_t VD2 = gmx_simd_set1_r(0.078647795836373922256f);
- const gmx_simd_real_t VD1 = gmx_simd_set1_r(0.43336185284710920150f);
- const gmx_simd_real_t VD0 = gmx_simd_set1_r(1.0f);
-
- gmx_simd_real_t z4;
- gmx_simd_real_t polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = gmx_simd_mul_r(z2, z2);
-
- polyVD1 = gmx_simd_fmadd_r(VD3, z4, VD1);
- polyVD0 = gmx_simd_fmadd_r(VD2, z4, VD0);
- polyVD0 = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
-
- polyVD0 = gmx_simd_inv_r(polyVD0);
-
- polyVN0 = gmx_simd_fmadd_r(VN6, z4, VN4);
- polyVN1 = gmx_simd_fmadd_r(VN5, z4, VN3);
- polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN2);
- polyVN1 = gmx_simd_fmadd_r(polyVN1, z4, VN1);
- polyVN0 = gmx_simd_fmadd_r(polyVN0, z4, VN0);
- polyVN0 = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
-
- return gmx_simd_mul_r(polyVN0, polyVD0);
-}
-
-
-#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_AVX_128_FMA_DOUBLE_H
#define GMX_SIMD_MATH_AVX_128_FMA_DOUBLE_H
-#include <immintrin.h> /* AVX */
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#include <math.h>
-
-#include "general_x86_avx_128_fma.h"
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
- lu = _mm_mul_pd(_mm_mul_pd(half, lu), _mm_nmacc_pd(_mm_mul_pd(lu, lu), x, three));
- return _mm_mul_pd(_mm_mul_pd(half, lu), _mm_nmacc_pd(_mm_mul_pd(lu, lu), x, three));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128 halff = _mm_set1_ps(0.5f);
- const __m128 threef = _mm_set1_ps(3.0f);
-
- __m128 xf, luf;
- __m128d lu1, lu2;
-
- /* Do first N-R step in float for 2x throughput */
- xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
- luf = _mm_rsqrt_ps(xf);
-
- luf = _mm_mul_ps(_mm_mul_ps(halff, luf), _mm_nmacc_ps(_mm_mul_ps(luf, luf), xf, threef));
-
-
- lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
- lu1 = _mm_cvtps_pd(luf);
-
- *invsqrt1 = _mm_mul_pd(_mm_mul_pd(half, lu1), _mm_nmacc_pd(_mm_mul_pd(lu1, lu1), x1, three));
- *invsqrt2 = _mm_mul_pd(_mm_mul_pd(half, lu2), _mm_nmacc_pd(_mm_mul_pd(lu2, lu2), x2, three));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
- __m128d mask;
- __m128d res;
-
- mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
- res = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
- res = _mm_mul_pd(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
- const __m128d two = _mm_set1_pd(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
- /* Perform two N-R steps for double precision */
- lu = _mm_mul_pd(lu, _mm_nmacc_pd(lu, x, two));
- return _mm_mul_pd(lu, _mm_nmacc_pd(lu, x, two));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2);
- const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1);
- const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3);
- /* Q2 == 1.0 */
- const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2);
- const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d z, z2;
- __m128d PolyP, PolyQ;
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(x, intpart);
- z2 = _mm_mul_pd(z, z);
-
- PolyP = _mm_macc_pd(P2, z2, P1);
- PolyQ = _mm_add_pd(z2, Q1);
- PolyP = _mm_macc_pd(PolyP, z2, P0);
- PolyQ = _mm_macc_pd(PolyQ, z2, Q0);
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_macc_pd(two, z, one);
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
- const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1);
- const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6);
-
- const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4);
- const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2);
- /* P0 == 1.0 */
- const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6);
- const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3);
- const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1);
- /* Q0 == 2.0 */
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d x, z, z2;
- __m128d PolyP, PolyQ;
-
- x = _mm_mul_pd(exparg, argscale);
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
- z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
- z2 = _mm_mul_pd(z, z);
-
- PolyQ = _mm_macc_pd(Q3, z2, Q2);
- PolyP = _mm_macc_pd(P2, z2, P1);
- PolyQ = _mm_macc_pd(PolyQ, z2, Q1);
-
- PolyP = _mm_macc_pd(PolyP, z2, one);
- PolyQ = _mm_macc_pd(PolyQ, z2, two);
-
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_macc_pd(two, z, one);
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
- const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
-
- const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
- const __m128d corr2 = _mm_set1_pd(0.693359375);
-
- const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4);
- const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1);
- const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0);
- const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1);
- const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1);
- const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0);
-
- const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1);
- const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1);
- const __m128d Q2 = _mm_set1_pd(8.29875266912776603211e1);
- const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1);
- const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1);
-
- const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1);
- const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1);
- const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1);
-
- const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
- const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
- const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
-
- __m128d fexp;
- __m128i iexp;
-
- __m128d mask1, mask2;
- __m128d corr, t1, t2, q;
- __m128d zA, yA, xA, zB, yB, xB, z;
- __m128d polyR, polyS;
- __m128d polyP1, polyP2, polyQ1, polyQ2;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_pd(x, expmask);
- iexp = gmx_mm_castpd_si128(fexp);
- iexp = _mm_srli_epi64(iexp, 52);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
- iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
- fexp = _mm_cvtepi32_pd(iexp);
-
- x = _mm_andnot_pd(expmask, x);
- x = _mm_or_pd(x, one);
- x = _mm_mul_pd(x, half);
-
- mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
- mask2 = _mm_cmplt_pd(x, invsq2);
-
- fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
- /* If mask1 is set ('A') */
- zA = _mm_sub_pd(x, half);
- t1 = _mm_blendv_pd( zA, x, mask2 );
- zA = _mm_sub_pd(t1, half);
- t2 = _mm_blendv_pd( x, zA, mask2 );
- yA = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
- xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
- zA = _mm_mul_pd(xA, xA);
-
- /* EVALUATE POLY */
- polyR = _mm_macc_pd(R2, zA, R1);
- polyR = _mm_macc_pd(polyR, zA, R0);
-
- polyS = _mm_add_pd(zA, S2);
- polyS = _mm_macc_pd(polyS, zA, S1);
- polyS = _mm_macc_pd(polyS, zA, S0);
-
- q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
- zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
- zA = _mm_macc_pd(corr1, fexp, zA);
- zA = _mm_add_pd(zA, xA);
- zA = _mm_macc_pd(corr2, fexp, zA);
-
- /* If mask1 is not set ('B') */
- corr = _mm_and_pd(mask2, x);
- xB = _mm_add_pd(x, corr);
- xB = _mm_sub_pd(xB, one);
- zB = _mm_mul_pd(xB, xB);
-
- polyP1 = _mm_macc_pd(P5, zB, P3);
- polyP2 = _mm_macc_pd(P4, zB, P2);
- polyP1 = _mm_macc_pd(polyP1, zB, P1);
- polyP2 = _mm_macc_pd(polyP2, zB, P0);
- polyP1 = _mm_macc_pd(polyP1, xB, polyP2);
-
- polyQ2 = _mm_macc_pd(Q4, zB, Q2);
- polyQ1 = _mm_add_pd(zB, Q3);
- polyQ1 = _mm_macc_pd(polyQ1, zB, Q1);
- polyQ2 = _mm_macc_pd(polyQ2, zB, Q0);
- polyQ1 = _mm_macc_pd(polyQ1, xB, polyQ2);
-
- fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
- q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
- yB = _mm_macc_pd(_mm_mul_pd(xB, zB), q, _mm_mul_pd(corr1, fexp));
-
- yB = _mm_nmacc_pd(half, zB, yB);
- zB = _mm_add_pd(xB, yB);
- zB = _mm_macc_pd(corr2, fexp, zB);
-
- z = _mm_blendv_pd( zB, zA, mask1 );
-
- return z;
-}
-
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_macc_pd(CAP4, x4, CAP2);
- PolyAP1 = _mm_macc_pd(CAP3, x4, CAP1);
- PolyAP0 = _mm_macc_pd(PolyAP0, x4, CAP0);
- PolyAP0 = _mm_macc_pd(PolyAP1, x2, PolyAP0);
-
- PolyAQ1 = _mm_macc_pd(CAQ5, x4, CAQ3);
- PolyAQ0 = _mm_macc_pd(CAQ4, x4, CAQ2);
- PolyAQ1 = _mm_macc_pd(PolyAQ1, x4, CAQ1);
- PolyAQ0 = _mm_macc_pd(PolyAQ0, x4, one);
- PolyAQ0 = _mm_macc_pd(PolyAQ1, x2, PolyAQ0);
-
- res_erf = _mm_macc_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0), CAoffset);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_macc_pd(CBP6, t2, CBP4);
- PolyBP1 = _mm_macc_pd(CBP5, t2, CBP3);
- PolyBP0 = _mm_macc_pd(PolyBP0, t2, CBP2);
- PolyBP1 = _mm_macc_pd(PolyBP1, t2, CBP1);
- PolyBP0 = _mm_macc_pd(PolyBP0, t2, CBP0);
- PolyBP0 = _mm_macc_pd(PolyBP1, t, PolyBP0);
-
- PolyBQ1 = _mm_macc_pd(CBQ7, t2, CBQ5);
- PolyBQ0 = _mm_macc_pd(CBQ6, t2, CBQ4);
- PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ3);
- PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, CBQ2);
- PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ1);
- PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, one);
- PolyBQ0 = _mm_macc_pd(PolyBQ1, t, PolyBQ0);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_macc_pd(CCP6, w2, CCP4);
- PolyCP1 = _mm_macc_pd(CCP5, w2, CCP3);
- PolyCP0 = _mm_macc_pd(PolyCP0, w2, CCP2);
- PolyCP1 = _mm_macc_pd(PolyCP1, w2, CCP1);
- PolyCP0 = _mm_macc_pd(PolyCP0, w2, CCP0);
- PolyCP0 = _mm_macc_pd(PolyCP1, w, PolyCP0);
-
- PolyCQ0 = _mm_macc_pd(CCQ6, w2, CCQ4);
- PolyCQ1 = _mm_macc_pd(CCQ5, w2, CCQ3);
- PolyCQ0 = _mm_macc_pd(PolyCQ0, w2, CCQ2);
- PolyCQ1 = _mm_macc_pd(PolyCQ1, w2, CCQ1);
- PolyCQ0 = _mm_macc_pd(PolyCQ0, w2, one);
- PolyCQ0 = _mm_macc_pd(PolyCQ1, w, PolyCQ0);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_macc_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0), CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_macc_pd(CAP4, x4, CAP2);
- PolyAP1 = _mm_macc_pd(CAP3, x4, CAP1);
- PolyAP0 = _mm_macc_pd(PolyAP0, x4, CAP0);
- PolyAP0 = _mm_macc_pd(PolyAP1, x2, PolyAP0);
-
- PolyAQ1 = _mm_macc_pd(CAQ5, x4, CAQ3);
- PolyAQ0 = _mm_macc_pd(CAQ4, x4, CAQ2);
- PolyAQ1 = _mm_macc_pd(PolyAQ1, x4, CAQ1);
- PolyAQ0 = _mm_macc_pd(PolyAQ0, x4, one);
- PolyAQ0 = _mm_macc_pd(PolyAQ1, x2, PolyAQ0);
-
- res_erf = _mm_macc_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0), CAoffset);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_macc_pd(CBP6, t2, CBP4);
- PolyBP1 = _mm_macc_pd(CBP5, t2, CBP3);
- PolyBP0 = _mm_macc_pd(PolyBP0, t2, CBP2);
- PolyBP1 = _mm_macc_pd(PolyBP1, t2, CBP1);
- PolyBP0 = _mm_macc_pd(PolyBP0, t2, CBP0);
- PolyBP0 = _mm_macc_pd(PolyBP1, t, PolyBP0);
-
- PolyBQ1 = _mm_macc_pd(CBQ7, t2, CBQ5);
- PolyBQ0 = _mm_macc_pd(CBQ6, t2, CBQ4);
- PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ3);
- PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, CBQ2);
- PolyBQ1 = _mm_macc_pd(PolyBQ1, t2, CBQ1);
- PolyBQ0 = _mm_macc_pd(PolyBQ0, t2, one);
- PolyBQ0 = _mm_macc_pd(PolyBQ1, t, PolyBQ0);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_macc_pd(CCP6, w2, CCP4);
- PolyCP1 = _mm_macc_pd(CCP5, w2, CCP3);
- PolyCP0 = _mm_macc_pd(PolyCP0, w2, CCP2);
- PolyCP1 = _mm_macc_pd(PolyCP1, w2, CCP1);
- PolyCP0 = _mm_macc_pd(PolyCP0, w2, CCP0);
- PolyCP0 = _mm_macc_pd(PolyCP1, w, PolyCP0);
-
- PolyCQ0 = _mm_macc_pd(CCQ6, w2, CCQ4);
- PolyCQ1 = _mm_macc_pd(CCQ5, w2, CCQ3);
- PolyCQ0 = _mm_macc_pd(PolyCQ0, w2, CCQ2);
- PolyCQ1 = _mm_macc_pd(PolyCQ1, w2, CCQ1);
- PolyCQ0 = _mm_macc_pd(PolyCQ0, w2, one);
- PolyCQ0 = _mm_macc_pd(PolyCQ1, w, PolyCQ0);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_macc_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0), CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
+#include "simd_math.h"
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
- return res;
-}
-
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
- const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
- const __m128d FN9 = _mm_set1_pd(1.1859116242260148027e-11);
- const __m128d FN8 = _mm_set1_pd(-8.1490406329798423616e-10);
- const __m128d FN7 = _mm_set1_pd(3.4404793543907847655e-8);
- const __m128d FN6 = _mm_set1_pd(-9.9471420832602741006e-7);
- const __m128d FN5 = _mm_set1_pd(0.000020740315999115847456);
- const __m128d FN4 = _mm_set1_pd(-0.00031991745139313364005);
- const __m128d FN3 = _mm_set1_pd(0.0035074449373659008203);
- const __m128d FN2 = _mm_set1_pd(-0.031750380176100813405);
- const __m128d FN1 = _mm_set1_pd(0.13884101728898463426);
- const __m128d FN0 = _mm_set1_pd(-0.75225277815249618847);
-
- const __m128d FD5 = _mm_set1_pd(0.000016009278224355026701);
- const __m128d FD4 = _mm_set1_pd(0.00051055686934806966046);
- const __m128d FD3 = _mm_set1_pd(0.0081803507497974289008);
- const __m128d FD2 = _mm_set1_pd(0.077181146026670287235);
- const __m128d FD1 = _mm_set1_pd(0.41543303143712535988);
- const __m128d FD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyFD1 = _mm_macc_pd(FD5, z4, FD3);
- polyFD1 = _mm_macc_pd(polyFD1, z4, FD1);
- polyFD1 = _mm_mul_pd(polyFD1, z2);
- polyFD0 = _mm_macc_pd(FD4, z4, FD2);
- polyFD0 = _mm_macc_pd(polyFD0, z4, FD0);
- polyFD0 = _mm_add_pd(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_pd(polyFD0);
-
- polyFN0 = _mm_macc_pd(FN10, z4, FN8);
- polyFN0 = _mm_macc_pd(polyFN0, z4, FN6);
- polyFN0 = _mm_macc_pd(polyFN0, z4, FN4);
- polyFN0 = _mm_macc_pd(polyFN0, z4, FN2);
- polyFN0 = _mm_macc_pd(polyFN0, z4, FN0);
- polyFN1 = _mm_macc_pd(FN9, z4, FN7);
- polyFN1 = _mm_macc_pd(polyFN1, z4, FN5);
- polyFN1 = _mm_macc_pd(polyFN1, z4, FN3);
- polyFN1 = _mm_macc_pd(polyFN1, z4, FN1);
- polyFN0 = _mm_macc_pd(polyFN1, z2, polyFN0);
-
- return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- *
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
- const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13);
- const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10);
- const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9);
- const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7);
- const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6);
- const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389);
- const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792);
- const __m128d VN2 = _mm_set1_pd(0.045247661136833092885);
- const __m128d VN1 = _mm_set1_pd(0.11643931522926034421);
- const __m128d VN0 = _mm_set1_pd(1.1283791671726767970);
-
- const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342);
- const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448);
- const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504);
- const __m128d VD2 = _mm_set1_pd(0.085608012351550627051);
- const __m128d VD1 = _mm_set1_pd(0.43652499166614811084);
- const __m128d VD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyVD1 = _mm_macc_pd(VD5, z4, VD3);
- polyVD0 = _mm_macc_pd(VD4, z4, VD2);
- polyVD1 = _mm_macc_pd(polyVD1, z4, VD1);
- polyVD0 = _mm_macc_pd(polyVD0, z4, VD0);
- polyVD0 = _mm_macc_pd(polyVD1, z2, polyVD0);
-
- polyVD0 = gmx_mm_inv_pd(polyVD0);
-
- polyVN1 = _mm_macc_pd(VN9, z4, VN7);
- polyVN0 = _mm_macc_pd(VN8, z4, VN6);
- polyVN1 = _mm_macc_pd(polyVN1, z4, VN5);
- polyVN0 = _mm_macc_pd(polyVN0, z4, VN4);
- polyVN1 = _mm_macc_pd(polyVN1, z4, VN3);
- polyVN0 = _mm_macc_pd(polyVN0, z4, VN2);
- polyVN1 = _mm_macc_pd(polyVN1, z4, VN1);
- polyVN0 = _mm_macc_pd(polyVN0, z4, VN0);
- polyVN0 = _mm_macc_pd(polyVN1, z2, polyVN0);
-
- return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_pd(__m128d x,
- __m128d *sinval,
- __m128d *cosval)
-{
-#ifdef _MSC_VER
- __declspec(align(16))
- const double sintable[34] =
- {
- 1.00000000000000000e+00, 0.00000000000000000e+00,
- 9.95184726672196929e-01, 9.80171403295606036e-02,
- 9.80785280403230431e-01, 1.95090322016128248e-01,
- 9.56940335732208824e-01, 2.90284677254462331e-01,
- 9.23879532511286738e-01, 3.82683432365089782e-01,
- 8.81921264348355050e-01, 4.71396736825997642e-01,
- 8.31469612302545236e-01, 5.55570233019602178e-01,
- 7.73010453362736993e-01, 6.34393284163645488e-01,
- 7.07106781186547573e-01, 7.07106781186547462e-01,
- 6.34393284163645599e-01, 7.73010453362736882e-01,
- 5.55570233019602289e-01, 8.31469612302545125e-01,
- 4.71396736825997809e-01, 8.81921264348354939e-01,
- 3.82683432365089837e-01, 9.23879532511286738e-01,
- 2.90284677254462276e-01, 9.56940335732208935e-01,
- 1.95090322016128304e-01, 9.80785280403230431e-01,
- 9.80171403295607702e-02, 9.95184726672196818e-01,
- 0.0, 1.00000000000000000e+00
- };
-#else
- const __m128d sintable[17] =
- {
- _mm_set_pd( 0.0, 1.0 ),
- _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0), cos( 4.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0), cos( 5.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0), cos( 6.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0), cos( 7.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0), cos( 8.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0), cos( 9.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( 1.0, 0.0 )
- };
-#endif
-
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128i signbit_epi32 = _mm_set1_epi32(0x80000000);
-
- const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
- const __m128d invtabscale0 = _mm_set1_pd(9.81747508049011230469e-02);
- const __m128d invtabscale1 = _mm_set1_pd(1.96197799156550576057e-08);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i i32 = _mm_set1_epi32(32);
- const __m128i i16 = _mm_set1_epi32(16);
- const __m128i tabmask = _mm_set1_epi32(0x3F);
- const __m128d sinP7 = _mm_set1_pd(-1.0/5040.0);
- const __m128d sinP5 = _mm_set1_pd(1.0/120.0);
- const __m128d sinP3 = _mm_set1_pd(-1.0/6.0);
- const __m128d sinP1 = _mm_set1_pd(1.0);
-
- const __m128d cosP6 = _mm_set1_pd(-1.0/720.0);
- const __m128d cosP4 = _mm_set1_pd(1.0/24.0);
- const __m128d cosP2 = _mm_set1_pd(-1.0/2.0);
- const __m128d cosP0 = _mm_set1_pd(1.0);
-
- __m128d scalex;
- __m128i tabidx, corridx;
- __m128d xabs, z, z2, polySin, polyCos;
- __m128d xpoint;
- __m128d ypoint0, ypoint1;
-
- __m128d sinpoint, cospoint;
- __m128d xsign, ssign, csign;
- __m128i imask, sswapsign, cswapsign;
- __m128d minusone;
-
- xsign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- scalex = _mm_mul_pd(tabscale, xabs);
- tabidx = _mm_cvtpd_epi32(scalex);
-
- xpoint = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
- /* Extended precision arithmetics */
- z = _mm_nmacc_pd(invtabscale0, xpoint, xabs);
- z = _mm_nmacc_pd(invtabscale1, xpoint, z);
-
- /* Range reduction to 0..2*Pi */
- tabidx = _mm_and_si128(tabidx, tabmask);
-
- /* tabidx is now in range [0,..,64] */
- imask = _mm_cmpgt_epi32(tabidx, i32);
- sswapsign = imask;
- cswapsign = imask;
- corridx = _mm_and_si128(imask, i32);
- tabidx = _mm_sub_epi32(tabidx, corridx);
-
- /* tabidx is now in range [0..32] */
- imask = _mm_cmpgt_epi32(tabidx, i16);
- cswapsign = _mm_xor_si128(cswapsign, imask);
- corridx = _mm_sub_epi32(i32, tabidx);
- tabidx = _mm_blendv_epi8(tabidx, corridx, imask);
- /* tabidx is now in range [0..16] */
- ssign = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
- csign = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
- ypoint0 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
- ypoint1 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
- ypoint0 = sintable[_mm_extract_epi32(tabidx, 0)];
- ypoint1 = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
- sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
- cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
- sinpoint = _mm_mul_pd(sinpoint, ssign);
- cospoint = _mm_mul_pd(cospoint, csign);
-
- z2 = _mm_mul_pd(z, z);
-
- polySin = _mm_macc_pd(sinP7, z2, sinP5);
- polySin = _mm_macc_pd(polySin, z2, sinP3);
- polySin = _mm_macc_pd(polySin, z2, sinP1);
- polySin = _mm_mul_pd(polySin, z);
-
- polyCos = _mm_macc_pd(cosP6, z2, cosP4);
- polyCos = _mm_macc_pd(polyCos, z2, cosP2);
- polyCos = _mm_macc_pd(polyCos, z2, cosP0);
-
- *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
- *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return c;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
- __m128d sinval, cosval;
- __m128d tanval;
-
- gmx_mm_sincos_pd(x, &sinval, &cosval);
-
- tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
- return tanval;
-}
-
-
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.625);
- const __m128d limit2 = _mm_set1_pd(1e-8);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17);
-
- const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3);
- const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1);
- const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0);
- const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1);
- const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1);
- const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0);
-
- const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1);
- const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1);
- const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2);
- const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2);
- const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1);
-
- const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3);
- const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1);
- const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0);
- const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1);
- const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1);
-
- const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1);
- const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2);
- const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2);
- const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2);
-
- __m128d sign;
- __m128d mask;
- __m128d xabs;
- __m128d zz, ww, z, q, w, y, zz2, ww2;
- __m128d PA, PB;
- __m128d QA, QB;
- __m128d RA, RB;
- __m128d SA, SB;
- __m128d nom, denom;
-
- sign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- mask = _mm_cmpgt_pd(xabs, limit1);
-
- zz = _mm_sub_pd(one, xabs);
- ww = _mm_mul_pd(xabs, xabs);
- zz2 = _mm_mul_pd(zz, zz);
- ww2 = _mm_mul_pd(ww, ww);
-
- /* R */
- RA = _mm_macc_pd(R4, zz2, R2);
- RB = _mm_macc_pd(R3, zz2, R1);
- RA = _mm_macc_pd(RA, zz2, R0);
- RA = _mm_macc_pd(RB, zz, RA);
-
- /* S, SA = zz2 */
- SB = _mm_macc_pd(S3, zz2, S1);
- SA = _mm_add_pd(zz2, S2);
- SA = _mm_macc_pd(SA, zz2, S0);
- SA = _mm_macc_pd(SB, zz, SA);
-
- /* P */
- PA = _mm_macc_pd(P5, ww2, P3);
- PB = _mm_macc_pd(P4, ww2, P2);
- PA = _mm_macc_pd(PA, ww2, P1);
- PB = _mm_macc_pd(PB, ww2, P0);
- PA = _mm_macc_pd(PA, ww, PB);
-
- /* Q, QA = ww2 */
- QB = _mm_macc_pd(Q4, ww2, Q2);
- QA = _mm_add_pd(ww2, Q3);
- QA = _mm_macc_pd(QA, ww2, Q1);
- QB = _mm_macc_pd(QB, ww2, Q0);
- QA = _mm_macc_pd(QA, ww, QB);
-
- RA = _mm_mul_pd(RA, zz);
- PA = _mm_mul_pd(PA, ww);
-
- nom = _mm_blendv_pd( PA, RA, mask );
- denom = _mm_blendv_pd( QA, SA, mask );
-
- q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
- zz = _mm_add_pd(zz, zz);
- zz = gmx_mm_sqrt_pd(zz);
- z = _mm_sub_pd(quarterpi, zz);
- zz = _mm_mul_pd(zz, q);
- zz = _mm_sub_pd(zz, morebits);
- z = _mm_sub_pd(z, zz);
- z = _mm_add_pd(z, quarterpi);
-
- w = _mm_macc_pd(xabs, q, xabs);
-
- z = _mm_blendv_pd( w, z, mask );
-
- mask = _mm_cmpgt_pd(xabs, limit2);
- z = _mm_blendv_pd( xabs, z, mask );
-
- z = _mm_xor_pd(z, sign);
-
- return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d pi = _mm_set1_pd(M_PI);
- const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
- const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
- __m128d mask1;
-
- __m128d z, z1, z2;
-
- mask1 = _mm_cmpgt_pd(x, half);
- z1 = _mm_mul_pd(half, _mm_sub_pd(one, x));
- z1 = gmx_mm_sqrt_pd(z1);
- z = _mm_blendv_pd( x, z1, mask1 );
-
- z = gmx_mm_asin_pd(z);
-
- z1 = _mm_add_pd(z, z);
-
- z2 = _mm_sub_pd(quarterpi0, z);
- z2 = _mm_add_pd(z2, quarterpi1);
- z2 = _mm_add_pd(z2, quarterpi0);
-
- z = _mm_blendv_pd(z2, z1, mask1);
-
- return z;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.66);
- const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d mone = _mm_set1_pd(-1.0);
- const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
- const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
- const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
- const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
- const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
- const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
- const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
-
- const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
- const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
- const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
- const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
- const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
-
- __m128d sign;
- __m128d mask1, mask2;
- __m128d y, t1, t2;
- __m128d z, z2;
- __m128d P_A, P_B, Q_A, Q_B;
-
- sign = _mm_andnot_pd(signmask, x);
- x = _mm_and_pd(x, signmask);
-
- mask1 = _mm_cmpgt_pd(x, limit1);
- mask2 = _mm_cmpgt_pd(x, limit2);
-
- t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
- t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
- y = _mm_and_pd(mask1, quarterpi);
- y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
- x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
- x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
- z = _mm_mul_pd(x, x);
- z2 = _mm_mul_pd(z, z);
-
- P_A = _mm_macc_pd(P4, z2, P2);
- P_B = _mm_macc_pd(P3, z2, P1);
- P_A = _mm_macc_pd(P_A, z2, P0);
- P_A = _mm_macc_pd(P_B, z, P_A);
-
- /* Q_A = z2 */
- Q_B = _mm_macc_pd(Q4, z2, Q2);
- Q_A = _mm_add_pd(z2, Q3);
- Q_A = _mm_macc_pd(Q_A, z2, Q1);
- Q_B = _mm_macc_pd(Q_B, z2, Q0);
- Q_A = _mm_macc_pd(Q_A, z, Q_B);
-
- z = _mm_mul_pd(z, P_A);
- z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
- z = _mm_macc_pd(z, x, x);
-
- t1 = _mm_and_pd(mask1, morebits1);
- t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
- z = _mm_add_pd(z, t1);
- y = _mm_add_pd(y, z);
-
- y = _mm_xor_pd(y, sign);
-
- return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
- const __m128d pi = _mm_set1_pd(M_PI);
- const __m128d minuspi = _mm_set1_pd(-M_PI);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
- __m128d z, z1, z3, z4;
- __m128d w;
- __m128d maskx_lt, maskx_eq;
- __m128d masky_lt, masky_eq;
- __m128d mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd());
- masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd());
- maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd());
- masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
- z = _mm_mul_pd(y, gmx_mm_inv_pd(x));
- z = gmx_mm_atan_pd(z);
-
- mask1 = _mm_and_pd(maskx_eq, masky_lt);
- mask2 = _mm_andnot_pd(maskx_lt, masky_eq);
- mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_pd(masky_eq, maskx_lt);
-
- maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
- z = _mm_andnot_pd(maskall, z);
- z1 = _mm_and_pd(mask1, minushalfpi);
- z3 = _mm_and_pd(mask3, halfpi);
- z4 = _mm_and_pd(mask4, pi);
-
- z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
- w = _mm_blendv_pd(pi, minuspi, masky_lt);
- w = _mm_and_pd(w, maskx_lt);
-
- w = _mm_andnot_pd(maskall, w);
-
- z = _mm_add_pd(z, w);
- return z;
-}
+#define gmx_mm_invsqrt_pd gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd gmx_simd_inv_d
+#define gmx_mm_log_pd gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd gmx_simd_sincos_d
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
#define GMX_SIMD_MATH_AVX_128_FMA_SINGLE_H
-#include <immintrin.h> /* AVX */
-#ifdef HAVE_X86INTRIN_H
-#include <x86intrin.h> /* FMA */
-#endif
-#ifdef HAVE_INTRIN_H
-#include <intrin.h> /* FMA MSVC */
-#endif
-
-#include <math.h>
-
-#include "general_x86_avx_128_fma.h"
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
- const __m128 half = _mm_set1_ps(0.5);
- const __m128 one = _mm_set1_ps(1.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_macc_ps(_mm_nmacc_ps(x, _mm_mul_ps(lu, lu), one), _mm_mul_ps(lu, half), lu);
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
- __m128 mask;
- __m128 res;
-
- mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
- res = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
- res = _mm_mul_ps(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
- const __m128 two = _mm_set1_ps(2.0);
-
- __m128 lu = _mm_rcp_ps(x);
-
- return _mm_mul_ps(lu, _mm_nmacc_ps(lu, x, two));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
- return _mm_and_ps(x, signmask);
-}
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
- const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
- const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
- const __m128 corr2 = _mm_set1_ps(0.693359375f);
-
- const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
- const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
- const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
- const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
- const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
- const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
- const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
- const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
- const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
-
- __m128 fexp, fexp1;
- __m128i iexp;
- __m128 mask;
- __m128 x1, x2;
- __m128 y;
- __m128 pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_ps(x, expmask);
- iexp = gmx_mm_castps_si128(fexp);
- iexp = _mm_srli_epi32(iexp, 23);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
-
- x = _mm_andnot_ps(expmask, x);
- x = _mm_or_ps(x, one);
- x = _mm_mul_ps(x, half);
-
- mask = _mm_cmp_ps(x, invsq2, _CMP_LT_OQ);
-
- x = _mm_add_ps(x, _mm_and_ps(mask, x));
- x = _mm_sub_ps(x, one);
- iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
- x2 = _mm_mul_ps(x, x);
-
- pA = _mm_mul_ps(CA_1, x);
-
- pB = _mm_add_ps(x, CB_1);
- pC = _mm_add_ps(x, CC_1);
- pD = _mm_add_ps(x, CD_1);
- pE = _mm_add_ps(x, CE_1);
-
- pB = _mm_macc_ps(x, pB, CB_0);
- pC = _mm_macc_ps(x, pC, CC_0);
- pD = _mm_macc_ps(x, pD, CD_0);
- pE = _mm_macc_ps(x, pE, CE_0);
-
- pA = _mm_mul_ps(pA, pB);
- pC = _mm_mul_ps(pC, pD);
- pE = _mm_mul_ps(pE, x2);
- pA = _mm_mul_ps(pA, pC);
- y = _mm_mul_ps(pA, pE);
-
- fexp = _mm_cvtepi32_ps(iexp);
- y = _mm_macc_ps(fexp, corr1, y);
- y = _mm_nmacc_ps(half, x2, y);
-
- x2 = _mm_add_ps(x, y);
- x2 = _mm_macc_ps(fexp, corr2, x2);
-
- return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
-
- const __m128i expbase = _mm_set1_epi32(127);
- const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004);
- const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003);
- const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003);
- const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002);
- const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001);
- const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001);
- const __m128 CA0 = _mm_set1_ps(1.0f);
-
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
- __m128 x2;
- __m128 p0, p1;
-
- iexppart = _mm_cvtps_epi32(x);
- intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(x), _CMP_GE_OQ);
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- x = _mm_sub_ps(x, intpart);
- x2 = _mm_mul_ps(x, x);
-
- p0 = _mm_macc_ps(CA6, x2, CA4);
- p1 = _mm_macc_ps(CA5, x2, CA3);
- p0 = _mm_macc_ps(p0, x2, CA2);
- p1 = _mm_macc_ps(p1, x2, CA1);
- p0 = _mm_macc_ps(p0, x2, CA0);
- p0 = _mm_macc_ps(p1, x, p0);
- x = _mm_mul_ps(p0, fexppart);
-
- return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
- const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
- const __m128i expbase = _mm_set1_epi32(127);
-
- const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
- const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);
-
- const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
- const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
- const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
- const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
- const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
- const __m128 one = _mm_set1_ps(1.0f);
-
- __m128 y, x2;
- __m128 p0, p1;
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
-
- y = _mm_mul_ps(x, argscale);
-
- iexppart = _mm_cvtps_epi32(y);
- intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmp_ps(arglimit, gmx_mm_abs_ps(y), _CMP_GE_OQ);
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- /* Extended precision arithmetics */
- x = _mm_nmacc_ps(invargscale0, intpart, x);
- x = _mm_nmacc_ps(invargscale1, intpart, x);
-
- x2 = _mm_mul_ps(x, x);
-
- p1 = _mm_macc_ps(CC5, x2, CC3);
- p0 = _mm_macc_ps(CC4, x2, CC2);
- p1 = _mm_macc_ps(p1, x2, CC1);
- p0 = _mm_macc_ps(p0, x2, CC0);
- p0 = _mm_macc_ps(p1, x, p0);
- p0 = _mm_macc_ps(p0, x2, one);
-
- x = _mm_add_ps(x, p0);
-
- x = _mm_mul_ps(x, fexppart);
-
- return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_macc_ps(CA6, x4, CA4);
- pA1 = _mm_macc_ps(CA5, x4, CA3);
- pA0 = _mm_macc_ps(pA0, x4, CA2);
- pA1 = _mm_macc_ps(pA1, x4, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA0 = _mm_macc_ps(pA1, x2, pA0);
- /* Constant term must come last for precision reasons */
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
-
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_macc_ps(CD4, q, CD3);
- corr = _mm_macc_ps(corr, q, CD2);
- corr = _mm_macc_ps(corr, q, one);
- corr = _mm_macc_ps(corr, q, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_macc_ps(CB9, w2, CB7);
- pB0 = _mm_macc_ps(CB8, w2, CB6);
- pB1 = _mm_macc_ps(pB1, w2, CB5);
- pB0 = _mm_macc_ps(pB0, w2, CB4);
- pB1 = _mm_macc_ps(pB1, w2, CB3);
- pB0 = _mm_macc_ps(pB0, w2, CB2);
- pB1 = _mm_macc_ps(pB1, w2, CB1);
- pB0 = _mm_macc_ps(pB0, w2, CB0);
- pB0 = _mm_macc_ps(pB1, w, pB0);
-
- pC0 = _mm_macc_ps(CC10, t2, CC8);
- pC1 = _mm_macc_ps(CC9, t2, CC7);
- pC0 = _mm_macc_ps(pC0, t2, CC6);
- pC1 = _mm_macc_ps(pC1, t2, CC5);
- pC0 = _mm_macc_ps(pC0, t2, CC4);
- pC1 = _mm_macc_ps(pC1, t2, CC3);
- pC0 = _mm_macc_ps(pC0, t2, CC2);
- pC1 = _mm_macc_ps(pC1, t2, CC1);
-
- pC0 = _mm_macc_ps(pC0, t2, CC0);
- pC0 = _mm_macc_ps(pC1, t, pC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmp_ps(two, y, _CMP_LT_OQ);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
- res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_macc_ps(CA6, x4, CA4);
- pA1 = _mm_macc_ps(CA5, x4, CA3);
- pA0 = _mm_macc_ps(pA0, x4, CA2);
- pA1 = _mm_macc_ps(pA1, x4, CA1);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_macc_ps(pA0, x4, pA1);
- /* Constant term must come last for precision reasons */
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_macc_ps(CD4, q, CD3);
- corr = _mm_macc_ps(corr, q, CD2);
- corr = _mm_macc_ps(corr, q, one);
- corr = _mm_macc_ps(corr, q, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_macc_ps(CB9, w2, CB7);
- pB0 = _mm_macc_ps(CB8, w2, CB6);
- pB1 = _mm_macc_ps(pB1, w2, CB5);
- pB0 = _mm_macc_ps(pB0, w2, CB4);
- pB1 = _mm_macc_ps(pB1, w2, CB3);
- pB0 = _mm_macc_ps(pB0, w2, CB2);
- pB1 = _mm_macc_ps(pB1, w2, CB1);
- pB0 = _mm_macc_ps(pB0, w2, CB0);
- pB0 = _mm_macc_ps(pB1, w, pB0);
-
- pC0 = _mm_macc_ps(CC10, t2, CC8);
- pC1 = _mm_macc_ps(CC9, t2, CC7);
- pC0 = _mm_macc_ps(pC0, t2, CC6);
- pC1 = _mm_macc_ps(pC1, t2, CC5);
- pC0 = _mm_macc_ps(pC0, t2, CC4);
- pC1 = _mm_macc_ps(pC1, t2, CC3);
- pC0 = _mm_macc_ps(pC0, t2, CC2);
- pC1 = _mm_macc_ps(pC1, t2, CC1);
-
- pC0 = _mm_macc_ps(pC0, t2, CC0);
- pC0 = _mm_macc_ps(pC1, t, pC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmp_ps(two, y, _CMP_LT_OQ);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmp_ps(y, _mm_set1_ps(0.75f), _CMP_LT_OQ);
- res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
- const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
- const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
- const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
- const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
- const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
- const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
- const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);
-
- const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
- const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
- const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
- const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
- const __m128 FD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyFD0 = _mm_macc_ps(FD4, z4, FD2);
- polyFD1 = _mm_macc_ps(FD3, z4, FD1);
- polyFD0 = _mm_macc_ps(polyFD0, z4, FD0);
- polyFD0 = _mm_macc_ps(polyFD1, z2, polyFD0);
-
- polyFD0 = gmx_mm_inv_ps(polyFD0);
-
- polyFN0 = _mm_macc_ps(FN6, z4, FN4);
- polyFN1 = _mm_macc_ps(FN5, z4, FN3);
- polyFN0 = _mm_macc_ps(polyFN0, z4, FN2);
- polyFN1 = _mm_macc_ps(polyFN1, z4, FN1);
- polyFN0 = _mm_macc_ps(polyFN0, z4, FN0);
- polyFN0 = _mm_macc_ps(polyFN1, z2, polyFN0);
-
- return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Add the result to 1/r, multiply by the product of the charges,
- * and you have your potential.
- */
-static __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
- const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
- const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
- const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
- const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
- const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
- const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
- const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);
-
- const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
- const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
- const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
- const __m128 VD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyVN0, polyVN1, polyVD0, polyVD1;
+#include "simd_math.h"
- z4 = _mm_mul_ps(z2, z2);
-
- polyVD1 = _mm_macc_ps(VD3, z4, VD1);
- polyVD0 = _mm_macc_ps(VD2, z4, VD0);
- polyVD0 = _mm_macc_ps(polyVD1, z2, polyVD0);
-
- polyVD0 = gmx_mm_inv_ps(polyVD0);
-
- polyVN0 = _mm_macc_ps(VN6, z4, VN4);
- polyVN1 = _mm_macc_ps(VN5, z4, VN3);
- polyVN0 = _mm_macc_ps(polyVN0, z4, VN2);
- polyVN1 = _mm_macc_ps(polyVN1, z4, VN1);
- polyVN0 = _mm_macc_ps(polyVN0, z4, VN0);
- polyVN0 = _mm_macc_ps(polyVN1, z2, polyVN0);
-
- return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-
-static int
-gmx_mm_sincos_ps(__m128 x,
- __m128 *sinval,
- __m128 *cosval)
-{
- const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
- const __m128 half = _mm_set1_ps(0.5);
- const __m128 one = _mm_set1_ps(1.0);
-
- const __m128i izero = _mm_set1_epi32(0);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i itwo = _mm_set1_epi32(2);
- const __m128i ithree = _mm_set1_epi32(3);
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
- const __m128 CA1 = _mm_set1_ps(1.5703125f);
- const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
- const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
-
- const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
- const __m128 CC1 = _mm_set1_ps(0.0416566950f);
- const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
- const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
- const __m128 CS1 = _mm_set1_ps(0.0083320758f);
- const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
-
- __m128 y, y2;
- __m128 z;
- __m128i iz;
- __m128i offset_sin, offset_cos;
- __m128 tmp1, tmp2;
- __m128 mask_sin, mask_cos;
- __m128 tmp_sin, tmp_cos;
-
- y = _mm_mul_ps(x, two_over_pi);
- y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
- iz = _mm_cvttps_epi32(y);
- z = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
- offset_sin = _mm_and_si128(iz, ithree);
- offset_cos = _mm_add_epi32(iz, ione);
-
- /* Extended precision arithmethic to achieve full precision */
- y = _mm_nmacc_ps(z, CA1, x);
- y = _mm_nmacc_ps(z, CA2, y);
- y = _mm_nmacc_ps(z, CA3, y);
-
- y2 = _mm_mul_ps(y, y);
-
- tmp1 = _mm_macc_ps(CC0, y2, CC1);
- tmp2 = _mm_macc_ps(CS0, y2, CS1);
- tmp1 = _mm_macc_ps(tmp1, y2, CC2);
- tmp2 = _mm_macc_ps(tmp2, y2, CS2);
-
- tmp1 = _mm_macc_ps(tmp1, y2, one);
-
- tmp2 = _mm_macc_ps(tmp2, _mm_mul_ps(y, y2), y);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
- tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
- tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
- tmp1 = _mm_xor_ps(signbit, tmp_sin);
- tmp2 = _mm_xor_ps(signbit, tmp_cos);
-
- *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
- *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
+/* Temporary:
+ * Alias some old SSE definitions to the new SIMD definitions so we do not
+ * need to modify _all_ group kernels - they will be replaced by a new
+ * generic SIMD version soon anyway.
*/
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
- __m128 sinval, cosval;
- __m128 tanval;
-
- gmx_mm_sincos_ps(x, &sinval, &cosval);
-
- tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
- return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limitlow = _mm_set1_ps(1e-4f);
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
-
- const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
- const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
- const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
- const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
-
- __m128 sign;
- __m128 mask;
- __m128 xabs;
- __m128 z, z1, z2, q, q1, q2;
- __m128 pA, pB;
-
- sign = _mm_andnot_ps(signmask, x);
- xabs = _mm_and_ps(x, signmask);
-
- mask = _mm_cmp_ps(xabs, half, _CMP_GT_OQ);
-
- z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
- q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
- q1 = _mm_andnot_ps(_mm_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
-
- q2 = xabs;
- z2 = _mm_mul_ps(q2, q2);
-
- z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
- q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
- z2 = _mm_mul_ps(z, z);
-
- pA = _mm_macc_ps(CC5, z2, CC3);
- pB = _mm_macc_ps(CC4, z2, CC2);
-
- pA = _mm_macc_ps(pA, z2, CC1);
- pA = _mm_mul_ps(pA, z);
-
- z = _mm_macc_ps(pB, z2, pA);
-
- z = _mm_macc_ps(z, q, q);
-
- q2 = _mm_sub_ps(halfpi, z);
- q2 = _mm_sub_ps(q2, z);
-
- z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
- mask = _mm_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
- z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
- z = _mm_xor_ps(z, sign);
-
- return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 one_ps = _mm_set1_ps(1.0f);
- const __m128 half_ps = _mm_set1_ps(0.5f);
- const __m128 pi_ps = _mm_set1_ps(M_PI);
- const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
- __m128 mask1;
- __m128 mask2;
- __m128 xabs;
- __m128 z, z1, z2, z3;
-
- xabs = _mm_and_ps(x, signmask);
- mask1 = _mm_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
- mask2 = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_GT_OQ);
-
- z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
- z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
- z = _mm_andnot_ps(_mm_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
-
- z = _mm_blendv_ps(x, z, mask1);
- z = gmx_mm_asin_ps(z);
-
- z2 = _mm_add_ps(z, z);
- z1 = _mm_sub_ps(pi_ps, z2);
- z3 = _mm_sub_ps(halfpi_ps, z);
-
- z = _mm_blendv_ps(z1, z2, mask2);
- z = _mm_blendv_ps(z3, z, mask1);
-
- return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
- const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
- const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
- const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
- const __m128 mone = _mm_set1_ps(-1.0f);
- const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
- const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
- const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1);
- const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
-
- __m128 sign;
- __m128 mask1, mask2;
- __m128 y, z1, z2;
- __m128 x2, x4;
- __m128 sum1, sum2;
-
- sign = _mm_andnot_ps(signmask, x);
- x = _mm_and_ps(x, signmask);
-
- mask1 = _mm_cmp_ps(x, limit1, _CMP_GT_OQ);
- mask2 = _mm_cmp_ps(x, limit2, _CMP_GT_OQ);
-
- z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
- z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
- y = _mm_and_ps(mask1, quarterpi);
- y = _mm_blendv_ps(y, halfpi, mask2);
-
- x = _mm_blendv_ps(x, z1, mask1);
- x = _mm_blendv_ps(x, z2, mask2);
-
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- sum1 = _mm_macc_ps(CC9, x4, CC5);
- sum2 = _mm_macc_ps(CC7, x4, CC3);
- sum1 = _mm_mul_ps(sum1, x4);
- sum1 = _mm_macc_ps(sum2, x2, sum1);
-
- sum1 = _mm_sub_ps(sum1, mone);
- y = _mm_macc_ps(sum1, x, y);
-
- y = _mm_xor_ps(y, sign);
-
- return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
- const __m128 pi = _mm_set1_ps(M_PI);
- const __m128 minuspi = _mm_set1_ps(-M_PI);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
- const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
- __m128 z, z1, z3, z4;
- __m128 w;
- __m128 maskx_lt, maskx_eq;
- __m128 masky_lt, masky_eq;
- __m128 mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_LT_OQ);
- masky_lt = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_LT_OQ);
- maskx_eq = _mm_cmp_ps(x, _mm_setzero_ps(), _CMP_EQ_OQ);
- masky_eq = _mm_cmp_ps(y, _mm_setzero_ps(), _CMP_EQ_OQ);
-
- z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
- z = gmx_mm_atan_ps(z);
-
- mask1 = _mm_and_ps(maskx_eq, masky_lt);
- mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
- mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_ps(masky_eq, maskx_lt);
-
- maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
- z = _mm_andnot_ps(maskall, z);
- z1 = _mm_and_ps(mask1, minushalfpi);
- z3 = _mm_and_ps(mask3, halfpi);
- z4 = _mm_and_ps(mask4, pi);
-
- z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
- mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
- mask2 = _mm_and_ps(maskx_lt, masky_lt);
-
- w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
- w = _mm_andnot_ps(maskall, w);
-
- z = _mm_add_ps(z, w);
-
- return z;
-}
-
+#define gmx_mm_invsqrt_ps gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps gmx_simd_inv_f
+#define gmx_mm_log_ps gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps gmx_simd_sincos_f
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_AVX_256_DOUBLE_H
#define GMX_SIMD_MATH_AVX_256_DOUBLE_H
-#include <math.h>
+#include "simd_math.h"
-#include "general_x86_avx_256.h"
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x), 256 bit wide */
-static gmx_inline __m256d
-gmx_mm256_invsqrt_pd(__m256d x)
-{
- const __m256d half = _mm256_set1_pd(0.5);
- const __m256d three = _mm256_set1_pd(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m256d lu = _mm256_cvtps_pd(_mm_rsqrt_ps( _mm256_cvtpd_ps(x)));
-
- lu = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
- return _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm256_invsqrt_pair_pd(__m256d x1, __m256d x2, __m256d *invsqrt1, __m256d *invsqrt2)
-{
- const __m256d half = _mm256_set1_pd(0.5);
- const __m256d three = _mm256_set1_pd(3.0);
- const __m256 halff = _mm256_set1_ps(0.5f);
- const __m256 threef = _mm256_set1_ps(3.0f);
-
- __m256 xf, luf;
- __m256d lu1, lu2;
-
- /* Do first N-R step in float for 2x throughput */
- xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(x1)), _mm256_cvtpd_ps(x2), 0x1);
- luf = _mm256_rsqrt_ps(xf);
-
- luf = _mm256_mul_ps(halff, _mm256_mul_ps(_mm256_sub_ps(threef, _mm256_mul_ps(_mm256_mul_ps(luf, luf), xf)), luf));
-
- lu2 = _mm256_cvtps_pd(_mm256_extractf128_ps(luf, 0x1));
- lu1 = _mm256_cvtps_pd(_mm256_castps256_ps128(luf));
-
- *invsqrt1 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu1, lu1), x1)), lu1));
- *invsqrt2 = _mm256_mul_pd(half, _mm256_mul_pd(_mm256_sub_pd(three, _mm256_mul_pd(_mm256_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* 1.0/sqrt(x), 128 bit wide */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
- lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
- return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for two pairs to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128 halff = _mm_set1_ps(0.5f);
- const __m128 threef = _mm_set1_ps(3.0f);
-
- __m128 xf, luf;
- __m128d lu1, lu2;
-
- /* Do first N-R step in float for 2x throughput */
- xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
- luf = _mm_rsqrt_ps(xf);
- luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
- lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
- lu1 = _mm_cvtps_pd(luf);
-
- *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
- *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* sqrt(x) (256 bit)- Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m256d
-gmx_mm256_sqrt_pd(__m256d x)
-{
- __m256d mask;
- __m256d res;
-
- mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
- res = _mm256_andnot_pd(mask, gmx_mm256_invsqrt_pd(x));
-
- res = _mm256_mul_pd(x, res);
-
- return res;
-}
-
-/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
- __m128d mask;
- __m128d res;
-
- mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
- res = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
- res = _mm_mul_pd(x, res);
-
- return res;
-}
-
-
-/* 1.0/x, 256 bit wide */
-static gmx_inline __m256d
-gmx_mm256_inv_pd(__m256d x)
-{
- const __m256d two = _mm256_set1_pd(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m256d lu = _mm256_cvtps_pd(_mm_rcp_ps( _mm256_cvtpd_ps(x)));
-
- /* Perform two N-R steps for double precision */
- lu = _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
- return _mm256_mul_pd(lu, _mm256_sub_pd(two, _mm256_mul_pd(x, lu)));
-}
-
-/* 1.0/x, 128 bit */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
- const __m128d two = _mm_set1_pd(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
- /* Perform two N-R steps for double precision */
- lu = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
- return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-
-static gmx_inline __m256d
-gmx_mm256_abs_pd(__m256d x)
-{
- const __m256d signmask = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
- 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- return _mm256_and_pd(x, signmask);
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function, 256 bit
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m256d
-gmx_mm256_exp2_pd(__m256d x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m256d arglimit = _mm256_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m256d P2 = _mm256_set1_pd(2.30933477057345225087e-2);
- const __m256d P1 = _mm256_set1_pd(2.02020656693165307700e1);
- const __m256d P0 = _mm256_set1_pd(1.51390680115615096133e3);
- /* Q2 == 1.0 */
- const __m256d Q1 = _mm256_set1_pd(2.33184211722314911771e2);
- const __m256d Q0 = _mm256_set1_pd(4.36821166879210612817e3);
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d two = _mm256_set1_pd(2.0);
-
- __m256d valuemask;
- __m256i iexppart;
- __m128i iexppart128a, iexppart128b;
- __m256d fexppart;
- __m256d intpart;
- __m256d z, z2;
- __m256d PolyP, PolyQ;
-
- iexppart128a = _mm256_cvtpd_epi32(x);
- intpart = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* Add exponent bias */
- iexppart128a = _mm_add_epi32(iexppart128a, expbase);
-
- /* We now want to shift the exponent 52 positions left, but to achieve this we need
- * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
- * shift them, and then merge into a single __m256d.
- * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
- * It doesnt matter what we put in the 2nd/4th position, since that data will be
- * shifted out and replaced with zeros.
- */
- iexppart128b = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
- iexppart128a = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
-
- iexppart128b = _mm_slli_epi64(iexppart128b, 52);
- iexppart128a = _mm_slli_epi64(iexppart128a, 52);
-
- iexppart = _mm256_castsi128_si256(iexppart128a);
- iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-
- valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
- fexppart = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
-
- z = _mm256_sub_pd(x, intpart);
-
- z2 = _mm256_mul_pd(z, z);
-
- PolyP = _mm256_mul_pd(P2, z2);
- PolyP = _mm256_add_pd(PolyP, P1);
- PolyQ = _mm256_add_pd(z2, Q1);
- PolyP = _mm256_mul_pd(PolyP, z2);
- PolyQ = _mm256_mul_pd(PolyQ, z2);
- PolyP = _mm256_add_pd(PolyP, P0);
- PolyQ = _mm256_add_pd(PolyQ, Q0);
- PolyP = _mm256_mul_pd(PolyP, z);
-
- z = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
- z = _mm256_add_pd(one, _mm256_mul_pd(two, z));
-
- z = _mm256_mul_pd(z, fexppart);
-
- return z;
-}
-
-/* 2^x, 128 bit */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2);
- const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1);
- const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3);
- /* Q2 == 1.0 */
- const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2);
- const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d z, z2;
- __m128d PolyP, PolyQ;
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(x, intpart);
- z2 = _mm_mul_pd(z, z);
-
- PolyP = _mm_mul_pd(P2, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(z2, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P0);
- PolyQ = _mm_add_pd(PolyQ, Q0);
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-
-/* Exponential function, 256 bit. This could be calculated from 2^x as Exp(x)=2^(y),
- * where y=log2(e)*x, but there will then be a small rounding error since we lose
- * some precision due to the multiplication. This will then be magnified a lot by
- * the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m256d
-gmx_mm256_exp_pd(__m256d exparg)
-{
- const __m256d argscale = _mm256_set1_pd(1.4426950408889634073599);
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m256d arglimit = _mm256_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m256d invargscale0 = _mm256_set1_pd(6.93145751953125e-1);
- const __m256d invargscale1 = _mm256_set1_pd(1.42860682030941723212e-6);
-
- const __m256d P2 = _mm256_set1_pd(1.26177193074810590878e-4);
- const __m256d P1 = _mm256_set1_pd(3.02994407707441961300e-2);
- /* P0 == 1.0 */
- const __m256d Q3 = _mm256_set1_pd(3.00198505138664455042E-6);
- const __m256d Q2 = _mm256_set1_pd(2.52448340349684104192E-3);
- const __m256d Q1 = _mm256_set1_pd(2.27265548208155028766E-1);
- /* Q0 == 2.0 */
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d two = _mm256_set1_pd(2.0);
-
- __m256d valuemask;
- __m256i iexppart;
- __m128i iexppart128a, iexppart128b;
- __m256d fexppart;
- __m256d intpart;
- __m256d x, z, z2;
- __m256d PolyP, PolyQ;
-
- x = _mm256_mul_pd(exparg, argscale);
-
- iexppart128a = _mm256_cvtpd_epi32(x);
- intpart = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* Add exponent bias */
- iexppart128a = _mm_add_epi32(iexppart128a, expbase);
-
- /* We now want to shift the exponent 52 positions left, but to achieve this we need
- * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
- * shift them, and then merge into a single __m256d.
- * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
- * It doesnt matter what we put in the 2nd/4th position, since that data will be
- * shifted out and replaced with zeros.
- */
- iexppart128b = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
- iexppart128a = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
-
- iexppart128b = _mm_slli_epi64(iexppart128b, 52);
- iexppart128a = _mm_slli_epi64(iexppart128a, 52);
-
- iexppart = _mm256_castsi128_si256(iexppart128a);
- iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
-
- valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
- fexppart = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));
-
- z = _mm256_sub_pd(exparg, _mm256_mul_pd(invargscale0, intpart));
- z = _mm256_sub_pd(z, _mm256_mul_pd(invargscale1, intpart));
-
- z2 = _mm256_mul_pd(z, z);
-
- PolyQ = _mm256_mul_pd(Q3, z2);
- PolyQ = _mm256_add_pd(PolyQ, Q2);
- PolyP = _mm256_mul_pd(P2, z2);
- PolyQ = _mm256_mul_pd(PolyQ, z2);
- PolyP = _mm256_add_pd(PolyP, P1);
- PolyQ = _mm256_add_pd(PolyQ, Q1);
- PolyP = _mm256_mul_pd(PolyP, z2);
- PolyQ = _mm256_mul_pd(PolyQ, z2);
- PolyP = _mm256_add_pd(PolyP, one);
- PolyQ = _mm256_add_pd(PolyQ, two);
-
- PolyP = _mm256_mul_pd(PolyP, z);
-
- z = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
- z = _mm256_add_pd(one, _mm256_mul_pd(two, z));
-
- z = _mm256_mul_pd(z, fexppart);
-
- return z;
-}
-
-/* exp(), 128 bit */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
- const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1);
- const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6);
-
- const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4);
- const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2);
- /* P0 == 1.0 */
- const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6);
- const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3);
- const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1);
- /* Q0 == 2.0 */
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d x, z, z2;
- __m128d PolyP, PolyQ;
-
- x = _mm_mul_pd(exparg, argscale);
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
- z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
- z2 = _mm_mul_pd(z, z);
-
- PolyQ = _mm_mul_pd(Q3, z2);
- PolyQ = _mm_add_pd(PolyQ, Q2);
- PolyP = _mm_mul_pd(P2, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(PolyQ, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, one);
- PolyQ = _mm_add_pd(PolyQ, two);
-
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-
-static __m256d
-gmx_mm256_log_pd(__m256d x)
-{
- /* Same algorithm as cephes library */
- const __m256d expmask = _mm256_castsi256_pd( _mm256_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000,
- 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
- const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
- const __m256d half = _mm256_set1_pd(0.5);
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d two = _mm256_set1_pd(2.0);
- const __m256d invsq2 = _mm256_set1_pd(1.0/sqrt(2.0));
-
- const __m256d corr1 = _mm256_set1_pd(-2.121944400546905827679e-4);
- const __m256d corr2 = _mm256_set1_pd(0.693359375);
-
- const __m256d P5 = _mm256_set1_pd(1.01875663804580931796e-4);
- const __m256d P4 = _mm256_set1_pd(4.97494994976747001425e-1);
- const __m256d P3 = _mm256_set1_pd(4.70579119878881725854e0);
- const __m256d P2 = _mm256_set1_pd(1.44989225341610930846e1);
- const __m256d P1 = _mm256_set1_pd(1.79368678507819816313e1);
- const __m256d P0 = _mm256_set1_pd(7.70838733755885391666e0);
-
- const __m256d Q4 = _mm256_set1_pd(1.12873587189167450590e1);
- const __m256d Q3 = _mm256_set1_pd(4.52279145837532221105e1);
- const __m256d Q2 = _mm256_set1_pd(8.29875266912776603211e1);
- const __m256d Q1 = _mm256_set1_pd(7.11544750618563894466e1);
- const __m256d Q0 = _mm256_set1_pd(2.31251620126765340583e1);
-
- const __m256d R2 = _mm256_set1_pd(-7.89580278884799154124e-1);
- const __m256d R1 = _mm256_set1_pd(1.63866645699558079767e1);
- const __m256d R0 = _mm256_set1_pd(-6.41409952958715622951e1);
-
- const __m256d S2 = _mm256_set1_pd(-3.56722798256324312549E1);
- const __m256d S1 = _mm256_set1_pd(3.12093766372244180303E2);
- const __m256d S0 = _mm256_set1_pd(-7.69691943550460008604E2);
-
- __m256d fexp;
- __m256i iexp;
- __m128i iexp128a, iexp128b;
-
- __m256d mask1, mask2;
- __m256d corr, t1, t2, q;
- __m256d zA, yA, xA, zB, yB, xB, z;
- __m256d polyR, polyS;
- __m256d polyP1, polyP2, polyQ1, polyQ2;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm256_and_pd(x, expmask);
-
- iexp = _mm256_castpd_si256(fexp);
- iexp128b = _mm256_extractf128_si256(iexp, 0x1);
- iexp128a = _mm256_castsi256_si128(iexp);
-
- iexp128a = _mm_srli_epi64(iexp128a, 52);
- iexp128b = _mm_srli_epi64(iexp128b, 52);
- /* Merge into a single register */
- iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));
- iexp128b = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));
- iexp128a = _mm_or_si128(iexp128a, iexp128b);
- iexp128a = _mm_sub_epi32(iexp128a, expbase_m1);
-
- fexp = _mm256_cvtepi32_pd(iexp128a);
-
- x = _mm256_andnot_pd(expmask, x); /* Get mantissa */
- x = _mm256_or_pd(x, one);
- x = _mm256_mul_pd(x, half);
-
- mask1 = _mm256_cmp_pd(gmx_mm256_abs_pd(fexp), two, _CMP_GT_OQ);
- mask2 = _mm256_cmp_pd(x, invsq2, _CMP_LT_OQ);
-
- fexp = _mm256_sub_pd(fexp, _mm256_and_pd(mask2, one));
-
- /* If mask1 is set ('A') */
- zA = _mm256_sub_pd(x, half);
- t1 = _mm256_blendv_pd( zA, x, mask2 );
- zA = _mm256_sub_pd(t1, half);
- t2 = _mm256_blendv_pd( x, zA, mask2 );
- yA = _mm256_mul_pd(half, _mm256_add_pd(t2, one));
-
- xA = _mm256_mul_pd(zA, gmx_mm256_inv_pd(yA));
- zA = _mm256_mul_pd(xA, xA);
-
- /* EVALUATE POLY */
- polyR = _mm256_mul_pd(R2, zA);
- polyR = _mm256_add_pd(polyR, R1);
- polyR = _mm256_mul_pd(polyR, zA);
- polyR = _mm256_add_pd(polyR, R0);
-
- polyS = _mm256_add_pd(zA, S2);
- polyS = _mm256_mul_pd(polyS, zA);
- polyS = _mm256_add_pd(polyS, S1);
- polyS = _mm256_mul_pd(polyS, zA);
- polyS = _mm256_add_pd(polyS, S0);
-
- q = _mm256_mul_pd(polyR, gmx_mm256_inv_pd(polyS));
- zA = _mm256_mul_pd(_mm256_mul_pd(xA, zA), q);
-
- zA = _mm256_add_pd(zA, _mm256_mul_pd(corr1, fexp));
- zA = _mm256_add_pd(zA, xA);
- zA = _mm256_add_pd(zA, _mm256_mul_pd(corr2, fexp));
-
- /* If mask1 is not set ('B') */
- corr = _mm256_and_pd(mask2, x);
- xB = _mm256_add_pd(x, corr);
- xB = _mm256_sub_pd(xB, one);
- zB = _mm256_mul_pd(xB, xB);
-
- polyP1 = _mm256_mul_pd(P5, zB);
- polyP2 = _mm256_mul_pd(P4, zB);
- polyP1 = _mm256_add_pd(polyP1, P3);
- polyP2 = _mm256_add_pd(polyP2, P2);
- polyP1 = _mm256_mul_pd(polyP1, zB);
- polyP2 = _mm256_mul_pd(polyP2, zB);
- polyP1 = _mm256_add_pd(polyP1, P1);
- polyP2 = _mm256_add_pd(polyP2, P0);
- polyP1 = _mm256_mul_pd(polyP1, xB);
- polyP1 = _mm256_add_pd(polyP1, polyP2);
-
- polyQ2 = _mm256_mul_pd(Q4, zB);
- polyQ1 = _mm256_add_pd(zB, Q3);
- polyQ2 = _mm256_add_pd(polyQ2, Q2);
- polyQ1 = _mm256_mul_pd(polyQ1, zB);
- polyQ2 = _mm256_mul_pd(polyQ2, zB);
- polyQ1 = _mm256_add_pd(polyQ1, Q1);
- polyQ2 = _mm256_add_pd(polyQ2, Q0);
- polyQ1 = _mm256_mul_pd(polyQ1, xB);
- polyQ1 = _mm256_add_pd(polyQ1, polyQ2);
-
- fexp = _mm256_and_pd(fexp, _mm256_cmp_pd(fexp, _mm256_setzero_pd(), _CMP_NEQ_OQ));
-
- q = _mm256_mul_pd(polyP1, gmx_mm256_inv_pd(polyQ1));
- yB = _mm256_mul_pd(_mm256_mul_pd(xB, zB), q);
-
- yB = _mm256_add_pd(yB, _mm256_mul_pd(corr1, fexp));
- yB = _mm256_sub_pd(yB, _mm256_mul_pd(half, zB));
- zB = _mm256_add_pd(xB, yB);
- zB = _mm256_add_pd(zB, _mm256_mul_pd(corr2, fexp));
-
- z = _mm256_blendv_pd( zB, zA, mask1 );
-
- return z;
-}
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
- const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
-
- const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
- const __m128d corr2 = _mm_set1_pd(0.693359375);
-
- const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4);
- const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1);
- const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0);
- const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1);
- const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1);
- const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0);
-
- const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1);
- const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1);
- const __m128d Q2 = _mm_set1_pd(8.29875266912776603211e1);
- const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1);
- const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1);
-
- const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1);
- const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1);
- const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1);
-
- const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
- const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
- const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
-
- __m128d fexp;
- __m128i iexp;
-
- __m128d mask1, mask2;
- __m128d corr, t1, t2, q;
- __m128d zA, yA, xA, zB, yB, xB, z;
- __m128d polyR, polyS;
- __m128d polyP1, polyP2, polyQ1, polyQ2;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_pd(x, expmask);
- iexp = gmx_mm_castpd_si128(fexp);
- iexp = _mm_srli_epi64(iexp, 52);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
- iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
- fexp = _mm_cvtepi32_pd(iexp);
-
- x = _mm_andnot_pd(expmask, x);
- x = _mm_or_pd(x, one);
- x = _mm_mul_pd(x, half);
-
- mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
- mask2 = _mm_cmplt_pd(x, invsq2);
-
- fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
- /* If mask1 is set ('A') */
- zA = _mm_sub_pd(x, half);
- t1 = _mm_blendv_pd( zA, x, mask2 );
- zA = _mm_sub_pd(t1, half);
- t2 = _mm_blendv_pd( x, zA, mask2 );
- yA = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
- xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
- zA = _mm_mul_pd(xA, xA);
-
- /* EVALUATE POLY */
- polyR = _mm_mul_pd(R2, zA);
- polyR = _mm_add_pd(polyR, R1);
- polyR = _mm_mul_pd(polyR, zA);
- polyR = _mm_add_pd(polyR, R0);
-
- polyS = _mm_add_pd(zA, S2);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S1);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S0);
-
- q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
- zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
- zA = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
- zA = _mm_add_pd(zA, xA);
- zA = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
- /* If mask1 is not set ('B') */
- corr = _mm_and_pd(mask2, x);
- xB = _mm_add_pd(x, corr);
- xB = _mm_sub_pd(xB, one);
- zB = _mm_mul_pd(xB, xB);
-
- polyP1 = _mm_mul_pd(P5, zB);
- polyP2 = _mm_mul_pd(P4, zB);
- polyP1 = _mm_add_pd(polyP1, P3);
- polyP2 = _mm_add_pd(polyP2, P2);
- polyP1 = _mm_mul_pd(polyP1, zB);
- polyP2 = _mm_mul_pd(polyP2, zB);
- polyP1 = _mm_add_pd(polyP1, P1);
- polyP2 = _mm_add_pd(polyP2, P0);
- polyP1 = _mm_mul_pd(polyP1, xB);
- polyP1 = _mm_add_pd(polyP1, polyP2);
-
- polyQ2 = _mm_mul_pd(Q4, zB);
- polyQ1 = _mm_add_pd(zB, Q3);
- polyQ2 = _mm_add_pd(polyQ2, Q2);
- polyQ1 = _mm_mul_pd(polyQ1, zB);
- polyQ2 = _mm_mul_pd(polyQ2, zB);
- polyQ1 = _mm_add_pd(polyQ1, Q1);
- polyQ2 = _mm_add_pd(polyQ2, Q0);
- polyQ1 = _mm_mul_pd(polyQ1, xB);
- polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
- fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
- q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
- yB = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
- yB = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
- yB = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
- zB = _mm_add_pd(xB, yB);
- zB = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
- z = _mm_blendv_pd( zB, zA, mask1 );
-
- return z;
-}
-
-
-static __m256d
-gmx_mm256_erf_pd(__m256d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m256d CAP4 = _mm256_set1_pd(-0.431780540597889301512e-4);
- const __m256d CAP3 = _mm256_set1_pd(-0.00578562306260059236059);
- const __m256d CAP2 = _mm256_set1_pd(-0.028593586920219752446);
- const __m256d CAP1 = _mm256_set1_pd(-0.315924962948621698209);
- const __m256d CAP0 = _mm256_set1_pd(0.14952975608477029151);
-
- const __m256d CAQ5 = _mm256_set1_pd(-0.374089300177174709737e-5);
- const __m256d CAQ4 = _mm256_set1_pd(0.00015126584532155383535);
- const __m256d CAQ3 = _mm256_set1_pd(0.00536692680669480725423);
- const __m256d CAQ2 = _mm256_set1_pd(0.0668686825594046122636);
- const __m256d CAQ1 = _mm256_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m256d CAoffset = _mm256_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m256d CBP6 = _mm256_set1_pd(2.49650423685462752497647637088e-10);
- const __m256d CBP5 = _mm256_set1_pd(0.00119770193298159629350136085658);
- const __m256d CBP4 = _mm256_set1_pd(0.0164944422378370965881008942733);
- const __m256d CBP3 = _mm256_set1_pd(0.0984581468691775932063932439252);
- const __m256d CBP2 = _mm256_set1_pd(0.317364595806937763843589437418);
- const __m256d CBP1 = _mm256_set1_pd(0.554167062641455850932670067075);
- const __m256d CBP0 = _mm256_set1_pd(0.427583576155807163756925301060);
- const __m256d CBQ7 = _mm256_set1_pd(0.00212288829699830145976198384930);
- const __m256d CBQ6 = _mm256_set1_pd(0.0334810979522685300554606393425);
- const __m256d CBQ5 = _mm256_set1_pd(0.2361713785181450957579508850717);
- const __m256d CBQ4 = _mm256_set1_pd(0.955364736493055670530981883072);
- const __m256d CBQ3 = _mm256_set1_pd(2.36815675631420037315349279199);
- const __m256d CBQ2 = _mm256_set1_pd(3.55261649184083035537184223542);
- const __m256d CBQ1 = _mm256_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m256d CCP6 = _mm256_set1_pd(-2.8175401114513378771);
- const __m256d CCP5 = _mm256_set1_pd(-3.22729451764143718517);
- const __m256d CCP4 = _mm256_set1_pd(-2.5518551727311523996);
- const __m256d CCP3 = _mm256_set1_pd(-0.687717681153649930619);
- const __m256d CCP2 = _mm256_set1_pd(-0.212652252872804219852);
- const __m256d CCP1 = _mm256_set1_pd(0.0175389834052493308818);
- const __m256d CCP0 = _mm256_set1_pd(0.00628057170626964891937);
-
- const __m256d CCQ6 = _mm256_set1_pd(5.48409182238641741584);
- const __m256d CCQ5 = _mm256_set1_pd(13.5064170191802889145);
- const __m256d CCQ4 = _mm256_set1_pd(22.9367376522880577224);
- const __m256d CCQ3 = _mm256_set1_pd(15.930646027911794143);
- const __m256d CCQ2 = _mm256_set1_pd(11.0567237927800161565);
- const __m256d CCQ1 = _mm256_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m256d CCoffset = _mm256_set1_pd(0.5579090118408203125);
-
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d two = _mm256_set1_pd(2.0);
-
- const __m256d signbit = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
- 0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m256d xabs, x2, x4, t, t2, w, w2;
- __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m256d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm256_abs_pd(x);
- x2 = _mm256_mul_pd(x, x);
- x4 = _mm256_mul_pd(x2, x2);
-
- PolyAP0 = _mm256_mul_pd(CAP4, x4);
- PolyAP1 = _mm256_mul_pd(CAP3, x4);
- PolyAP0 = _mm256_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm256_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm256_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm256_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm256_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm256_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm256_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm256_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm256_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm256_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm256_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm256_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm256_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
- res_erf = _mm256_add_pd(CAoffset, res_erf);
- res_erf = _mm256_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm256_sub_pd(xabs, one);
- t2 = _mm256_mul_pd(t, t);
-
- PolyBP0 = _mm256_mul_pd(CBP6, t2);
- PolyBP1 = _mm256_mul_pd(CBP5, t2);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm256_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm256_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm256_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm256_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm256_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm256_mul_pd(PolyBP1, t);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm256_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
-
- res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm256_inv_pd(xabs);
- w2 = _mm256_mul_pd(w, w);
-
- PolyCP0 = _mm256_mul_pd(CCP6, w2);
- PolyCP1 = _mm256_mul_pd(CCP5, w2);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm256_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm256_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm256_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm256_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm256_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm256_mul_pd(PolyCP1, w);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm256_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm256_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm256_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm256_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm256_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm256_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm256_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm256_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm256_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) );
-
- res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
- res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm256_mul_pd(res_erfcC, w);
-
- mask = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
- res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm256_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
- res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
- res = _mm256_blendv_pd(_mm256_sub_pd(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-static __m256d
-gmx_mm256_erfc_pd(__m256d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m256d CAP4 = _mm256_set1_pd(-0.431780540597889301512e-4);
- const __m256d CAP3 = _mm256_set1_pd(-0.00578562306260059236059);
- const __m256d CAP2 = _mm256_set1_pd(-0.028593586920219752446);
- const __m256d CAP1 = _mm256_set1_pd(-0.315924962948621698209);
- const __m256d CAP0 = _mm256_set1_pd(0.14952975608477029151);
-
- const __m256d CAQ5 = _mm256_set1_pd(-0.374089300177174709737e-5);
- const __m256d CAQ4 = _mm256_set1_pd(0.00015126584532155383535);
- const __m256d CAQ3 = _mm256_set1_pd(0.00536692680669480725423);
- const __m256d CAQ2 = _mm256_set1_pd(0.0668686825594046122636);
- const __m256d CAQ1 = _mm256_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m256d CAoffset = _mm256_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m256d CBP6 = _mm256_set1_pd(2.49650423685462752497647637088e-10);
- const __m256d CBP5 = _mm256_set1_pd(0.00119770193298159629350136085658);
- const __m256d CBP4 = _mm256_set1_pd(0.0164944422378370965881008942733);
- const __m256d CBP3 = _mm256_set1_pd(0.0984581468691775932063932439252);
- const __m256d CBP2 = _mm256_set1_pd(0.317364595806937763843589437418);
- const __m256d CBP1 = _mm256_set1_pd(0.554167062641455850932670067075);
- const __m256d CBP0 = _mm256_set1_pd(0.427583576155807163756925301060);
- const __m256d CBQ7 = _mm256_set1_pd(0.00212288829699830145976198384930);
- const __m256d CBQ6 = _mm256_set1_pd(0.0334810979522685300554606393425);
- const __m256d CBQ5 = _mm256_set1_pd(0.2361713785181450957579508850717);
- const __m256d CBQ4 = _mm256_set1_pd(0.955364736493055670530981883072);
- const __m256d CBQ3 = _mm256_set1_pd(2.36815675631420037315349279199);
- const __m256d CBQ2 = _mm256_set1_pd(3.55261649184083035537184223542);
- const __m256d CBQ1 = _mm256_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m256d CCP6 = _mm256_set1_pd(-2.8175401114513378771);
- const __m256d CCP5 = _mm256_set1_pd(-3.22729451764143718517);
- const __m256d CCP4 = _mm256_set1_pd(-2.5518551727311523996);
- const __m256d CCP3 = _mm256_set1_pd(-0.687717681153649930619);
- const __m256d CCP2 = _mm256_set1_pd(-0.212652252872804219852);
- const __m256d CCP1 = _mm256_set1_pd(0.0175389834052493308818);
- const __m256d CCP0 = _mm256_set1_pd(0.00628057170626964891937);
-
- const __m256d CCQ6 = _mm256_set1_pd(5.48409182238641741584);
- const __m256d CCQ5 = _mm256_set1_pd(13.5064170191802889145);
- const __m256d CCQ4 = _mm256_set1_pd(22.9367376522880577224);
- const __m256d CCQ3 = _mm256_set1_pd(15.930646027911794143);
- const __m256d CCQ2 = _mm256_set1_pd(11.0567237927800161565);
- const __m256d CCQ1 = _mm256_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m256d CCoffset = _mm256_set1_pd(0.5579090118408203125);
-
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d two = _mm256_set1_pd(2.0);
-
- const __m256d signbit = _mm256_castsi256_pd( _mm256_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000,
- 0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m256d xabs, x2, x4, t, t2, w, w2;
- __m256d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m256d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m256d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m256d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m256d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm256_abs_pd(x);
- x2 = _mm256_mul_pd(x, x);
- x4 = _mm256_mul_pd(x2, x2);
-
- PolyAP0 = _mm256_mul_pd(CAP4, x4);
- PolyAP1 = _mm256_mul_pd(CAP3, x4);
- PolyAP0 = _mm256_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm256_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm256_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm256_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm256_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm256_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm256_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm256_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm256_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm256_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm256_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm256_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm256_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm256_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm256_mul_pd(PolyAP0, gmx_mm256_inv_pd(PolyAQ0));
- res_erf = _mm256_add_pd(CAoffset, res_erf);
- res_erf = _mm256_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm256_sub_pd(xabs, one);
- t2 = _mm256_mul_pd(t, t);
-
- PolyBP0 = _mm256_mul_pd(CBP6, t2);
- PolyBP1 = _mm256_mul_pd(CBP5, t2);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm256_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm256_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm256_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm256_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm256_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm256_mul_pd(PolyBP1, t);
- PolyBP0 = _mm256_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm256_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm256_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm256_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm256_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm256_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm256_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm256_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm256_mul_pd(PolyBP0, gmx_mm256_inv_pd(PolyBQ0));
-
- res_erfcB = _mm256_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm256_inv_pd(xabs);
- w2 = _mm256_mul_pd(w, w);
-
- PolyCP0 = _mm256_mul_pd(CCP6, w2);
- PolyCP1 = _mm256_mul_pd(CCP5, w2);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm256_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm256_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm256_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm256_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm256_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm256_mul_pd(PolyCP1, w);
- PolyCP0 = _mm256_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm256_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm256_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm256_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm256_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm256_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm256_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm256_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm256_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm256_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm256_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm256_exp_pd( _mm256_or_pd(signbit, x2) );
-
- res_erfcC = _mm256_mul_pd(PolyCP0, gmx_mm256_inv_pd(PolyCQ0));
- res_erfcC = _mm256_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm256_mul_pd(res_erfcC, w);
-
- mask = _mm256_cmp_pd(xabs, _mm256_set1_pd(4.5), _CMP_GT_OQ);
- res_erfc = _mm256_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm256_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
- res_erfc = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm256_cmp_pd(xabs, one, _CMP_LT_OQ);
- res = _mm256_blendv_pd(res_erfc, _mm256_sub_pd(one, res_erf), mask);
-
- return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static __m256d
-gmx_mm256_pmecorrF_pd(__m256d z2)
-{
- const __m256d FN10 = _mm256_set1_pd(-8.0072854618360083154e-14);
- const __m256d FN9 = _mm256_set1_pd(1.1859116242260148027e-11);
- const __m256d FN8 = _mm256_set1_pd(-8.1490406329798423616e-10);
- const __m256d FN7 = _mm256_set1_pd(3.4404793543907847655e-8);
- const __m256d FN6 = _mm256_set1_pd(-9.9471420832602741006e-7);
- const __m256d FN5 = _mm256_set1_pd(0.000020740315999115847456);
- const __m256d FN4 = _mm256_set1_pd(-0.00031991745139313364005);
- const __m256d FN3 = _mm256_set1_pd(0.0035074449373659008203);
- const __m256d FN2 = _mm256_set1_pd(-0.031750380176100813405);
- const __m256d FN1 = _mm256_set1_pd(0.13884101728898463426);
- const __m256d FN0 = _mm256_set1_pd(-0.75225277815249618847);
-
- const __m256d FD5 = _mm256_set1_pd(0.000016009278224355026701);
- const __m256d FD4 = _mm256_set1_pd(0.00051055686934806966046);
- const __m256d FD3 = _mm256_set1_pd(0.0081803507497974289008);
- const __m256d FD2 = _mm256_set1_pd(0.077181146026670287235);
- const __m256d FD1 = _mm256_set1_pd(0.41543303143712535988);
- const __m256d FD0 = _mm256_set1_pd(1.0);
-
- __m256d z4;
- __m256d polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm256_mul_pd(z2, z2);
-
- polyFD1 = _mm256_mul_pd(FD5, z4);
- polyFD0 = _mm256_mul_pd(FD4, z4);
- polyFD1 = _mm256_add_pd(polyFD1, FD3);
- polyFD0 = _mm256_add_pd(polyFD0, FD2);
- polyFD1 = _mm256_mul_pd(polyFD1, z4);
- polyFD0 = _mm256_mul_pd(polyFD0, z4);
- polyFD1 = _mm256_add_pd(polyFD1, FD1);
- polyFD0 = _mm256_add_pd(polyFD0, FD0);
- polyFD1 = _mm256_mul_pd(polyFD1, z2);
- polyFD0 = _mm256_add_pd(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm256_inv_pd(polyFD0);
-
- polyFN0 = _mm256_mul_pd(FN10, z4);
- polyFN1 = _mm256_mul_pd(FN9, z4);
- polyFN0 = _mm256_add_pd(polyFN0, FN8);
- polyFN1 = _mm256_add_pd(polyFN1, FN7);
- polyFN0 = _mm256_mul_pd(polyFN0, z4);
- polyFN1 = _mm256_mul_pd(polyFN1, z4);
- polyFN0 = _mm256_add_pd(polyFN0, FN6);
- polyFN1 = _mm256_add_pd(polyFN1, FN5);
- polyFN0 = _mm256_mul_pd(polyFN0, z4);
- polyFN1 = _mm256_mul_pd(polyFN1, z4);
- polyFN0 = _mm256_add_pd(polyFN0, FN4);
- polyFN1 = _mm256_add_pd(polyFN1, FN3);
- polyFN0 = _mm256_mul_pd(polyFN0, z4);
- polyFN1 = _mm256_mul_pd(polyFN1, z4);
- polyFN0 = _mm256_add_pd(polyFN0, FN2);
- polyFN1 = _mm256_add_pd(polyFN1, FN1);
- polyFN0 = _mm256_mul_pd(polyFN0, z4);
- polyFN1 = _mm256_mul_pd(polyFN1, z2);
- polyFN0 = _mm256_add_pd(polyFN0, FN0);
- polyFN0 = _mm256_add_pd(polyFN0, polyFN1);
-
- return _mm256_mul_pd(polyFN0, polyFD0);
-}
-
-
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
- const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
- const __m128d FN9 = _mm_set1_pd(1.1859116242260148027e-11);
- const __m128d FN8 = _mm_set1_pd(-8.1490406329798423616e-10);
- const __m128d FN7 = _mm_set1_pd(3.4404793543907847655e-8);
- const __m128d FN6 = _mm_set1_pd(-9.9471420832602741006e-7);
- const __m128d FN5 = _mm_set1_pd(0.000020740315999115847456);
- const __m128d FN4 = _mm_set1_pd(-0.00031991745139313364005);
- const __m128d FN3 = _mm_set1_pd(0.0035074449373659008203);
- const __m128d FN2 = _mm_set1_pd(-0.031750380176100813405);
- const __m128d FN1 = _mm_set1_pd(0.13884101728898463426);
- const __m128d FN0 = _mm_set1_pd(-0.75225277815249618847);
-
- const __m128d FD5 = _mm_set1_pd(0.000016009278224355026701);
- const __m128d FD4 = _mm_set1_pd(0.00051055686934806966046);
- const __m128d FD3 = _mm_set1_pd(0.0081803507497974289008);
- const __m128d FD2 = _mm_set1_pd(0.077181146026670287235);
- const __m128d FD1 = _mm_set1_pd(0.41543303143712535988);
- const __m128d FD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyFD1 = _mm_mul_pd(FD5, z4);
- polyFD0 = _mm_mul_pd(FD4, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD3);
- polyFD0 = _mm_add_pd(polyFD0, FD2);
- polyFD1 = _mm_mul_pd(polyFD1, z4);
- polyFD0 = _mm_mul_pd(polyFD0, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD1);
- polyFD0 = _mm_add_pd(polyFD0, FD0);
- polyFD1 = _mm_mul_pd(polyFD1, z2);
- polyFD0 = _mm_add_pd(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_pd(polyFD0);
-
- polyFN0 = _mm_mul_pd(FN10, z4);
- polyFN1 = _mm_mul_pd(FN9, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN8);
- polyFN1 = _mm_add_pd(polyFN1, FN7);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN6);
- polyFN1 = _mm_add_pd(polyFN1, FN5);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN4);
- polyFN1 = _mm_add_pd(polyFN1, FN3);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN2);
- polyFN1 = _mm_add_pd(polyFN1, FN1);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z2);
- polyFN0 = _mm_add_pd(polyFN0, FN0);
- polyFN0 = _mm_add_pd(polyFN0, polyFN1);
-
- return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- *
- */
-static __m256d
-gmx_mm256_pmecorrV_pd(__m256d z2)
-{
- const __m256d VN9 = _mm256_set1_pd(-9.3723776169321855475e-13);
- const __m256d VN8 = _mm256_set1_pd(1.2280156762674215741e-10);
- const __m256d VN7 = _mm256_set1_pd(-7.3562157912251309487e-9);
- const __m256d VN6 = _mm256_set1_pd(2.6215886208032517509e-7);
- const __m256d VN5 = _mm256_set1_pd(-4.9532491651265819499e-6);
- const __m256d VN4 = _mm256_set1_pd(0.00025907400778966060389);
- const __m256d VN3 = _mm256_set1_pd(0.0010585044856156469792);
- const __m256d VN2 = _mm256_set1_pd(0.045247661136833092885);
- const __m256d VN1 = _mm256_set1_pd(0.11643931522926034421);
- const __m256d VN0 = _mm256_set1_pd(1.1283791671726767970);
-
- const __m256d VD5 = _mm256_set1_pd(0.000021784709867336150342);
- const __m256d VD4 = _mm256_set1_pd(0.00064293662010911388448);
- const __m256d VD3 = _mm256_set1_pd(0.0096311444822588683504);
- const __m256d VD2 = _mm256_set1_pd(0.085608012351550627051);
- const __m256d VD1 = _mm256_set1_pd(0.43652499166614811084);
- const __m256d VD0 = _mm256_set1_pd(1.0);
-
- __m256d z4;
- __m256d polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm256_mul_pd(z2, z2);
-
- polyVD1 = _mm256_mul_pd(VD5, z4);
- polyVD0 = _mm256_mul_pd(VD4, z4);
- polyVD1 = _mm256_add_pd(polyVD1, VD3);
- polyVD0 = _mm256_add_pd(polyVD0, VD2);
- polyVD1 = _mm256_mul_pd(polyVD1, z4);
- polyVD0 = _mm256_mul_pd(polyVD0, z4);
- polyVD1 = _mm256_add_pd(polyVD1, VD1);
- polyVD0 = _mm256_add_pd(polyVD0, VD0);
- polyVD1 = _mm256_mul_pd(polyVD1, z2);
- polyVD0 = _mm256_add_pd(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm256_inv_pd(polyVD0);
-
- polyVN1 = _mm256_mul_pd(VN9, z4);
- polyVN0 = _mm256_mul_pd(VN8, z4);
- polyVN1 = _mm256_add_pd(polyVN1, VN7);
- polyVN0 = _mm256_add_pd(polyVN0, VN6);
- polyVN1 = _mm256_mul_pd(polyVN1, z4);
- polyVN0 = _mm256_mul_pd(polyVN0, z4);
- polyVN1 = _mm256_add_pd(polyVN1, VN5);
- polyVN0 = _mm256_add_pd(polyVN0, VN4);
- polyVN1 = _mm256_mul_pd(polyVN1, z4);
- polyVN0 = _mm256_mul_pd(polyVN0, z4);
- polyVN1 = _mm256_add_pd(polyVN1, VN3);
- polyVN0 = _mm256_add_pd(polyVN0, VN2);
- polyVN1 = _mm256_mul_pd(polyVN1, z4);
- polyVN0 = _mm256_mul_pd(polyVN0, z4);
- polyVN1 = _mm256_add_pd(polyVN1, VN1);
- polyVN0 = _mm256_add_pd(polyVN0, VN0);
- polyVN1 = _mm256_mul_pd(polyVN1, z2);
- polyVN0 = _mm256_add_pd(polyVN0, polyVN1);
-
- return _mm256_mul_pd(polyVN0, polyVD0);
-}
-
-
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
- const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13);
- const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10);
- const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9);
- const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7);
- const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6);
- const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389);
- const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792);
- const __m128d VN2 = _mm_set1_pd(0.045247661136833092885);
- const __m128d VN1 = _mm_set1_pd(0.11643931522926034421);
- const __m128d VN0 = _mm_set1_pd(1.1283791671726767970);
-
- const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342);
- const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448);
- const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504);
- const __m128d VD2 = _mm_set1_pd(0.085608012351550627051);
- const __m128d VD1 = _mm_set1_pd(0.43652499166614811084);
- const __m128d VD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyVD1 = _mm_mul_pd(VD5, z4);
- polyVD0 = _mm_mul_pd(VD4, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD3);
- polyVD0 = _mm_add_pd(polyVD0, VD2);
- polyVD1 = _mm_mul_pd(polyVD1, z4);
- polyVD0 = _mm_mul_pd(polyVD0, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD1);
- polyVD0 = _mm_add_pd(polyVD0, VD0);
- polyVD1 = _mm_mul_pd(polyVD1, z2);
- polyVD0 = _mm_add_pd(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_pd(polyVD0);
-
- polyVN1 = _mm_mul_pd(VN9, z4);
- polyVN0 = _mm_mul_pd(VN8, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN7);
- polyVN0 = _mm_add_pd(polyVN0, VN6);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN5);
- polyVN0 = _mm_add_pd(polyVN0, VN4);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN3);
- polyVN0 = _mm_add_pd(polyVN0, VN2);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN1);
- polyVN0 = _mm_add_pd(polyVN0, VN0);
- polyVN1 = _mm_mul_pd(polyVN1, z2);
- polyVN0 = _mm_add_pd(polyVN0, polyVN1);
-
- return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-
-static int
-gmx_mm256_sincos_pd(__m256d x,
- __m256d *sinval,
- __m256d *cosval)
-{
-#ifdef _MSC_VER
- __declspec(align(16))
- const double sintable[34] =
- {
- 1.00000000000000000e+00, 0.00000000000000000e+00,
- 9.95184726672196929e-01, 9.80171403295606036e-02,
- 9.80785280403230431e-01, 1.95090322016128248e-01,
- 9.56940335732208824e-01, 2.90284677254462331e-01,
- 9.23879532511286738e-01, 3.82683432365089782e-01,
- 8.81921264348355050e-01, 4.71396736825997642e-01,
- 8.31469612302545236e-01, 5.55570233019602178e-01,
- 7.73010453362736993e-01, 6.34393284163645488e-01,
- 7.07106781186547573e-01, 7.07106781186547462e-01,
- 6.34393284163645599e-01, 7.73010453362736882e-01,
- 5.55570233019602289e-01, 8.31469612302545125e-01,
- 4.71396736825997809e-01, 8.81921264348354939e-01,
- 3.82683432365089837e-01, 9.23879532511286738e-01,
- 2.90284677254462276e-01, 9.56940335732208935e-01,
- 1.95090322016128304e-01, 9.80785280403230431e-01,
- 9.80171403295607702e-02, 9.95184726672196818e-01,
- 0.0, 1.00000000000000000e+00
- };
-#else
- const __m128d sintable[17] =
- {
- _mm_set_pd( 0.0, 1.0 ),
- _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0), cos( 4.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0), cos( 5.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0), cos( 6.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0), cos( 7.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0), cos( 8.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0), cos( 9.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( 1.0, 0.0 )
- };
-#endif
-
- const __m256d signmask = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
- 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- const __m256d tabscale = _mm256_set1_pd(32.0/M_PI);
- const __m256d invtabscale0 = _mm256_set1_pd(9.81747508049011230469e-02);
- const __m256d invtabscale1 = _mm256_set1_pd(1.96197799156550576057e-08);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i i32 = _mm_set1_epi32(32);
- const __m128i i16 = _mm_set1_epi32(16);
- const __m128i tabmask = _mm_set1_epi32(0x3F);
- const __m256d sinP7 = _mm256_set1_pd(-1.0/5040.0);
- const __m256d sinP5 = _mm256_set1_pd(1.0/120.0);
- const __m256d sinP3 = _mm256_set1_pd(-1.0/6.0);
- const __m256d sinP1 = _mm256_set1_pd(1.0);
-
- const __m256d cosP6 = _mm256_set1_pd(-1.0/720.0);
- const __m256d cosP4 = _mm256_set1_pd(1.0/24.0);
- const __m256d cosP2 = _mm256_set1_pd(-1.0/2.0);
- const __m256d cosP0 = _mm256_set1_pd(1.0);
-
- __m256d scalex;
- __m128i tabidx, corridx;
- __m256d xabs, z, z2, polySin, polyCos;
- __m256d xpoint;
- __m256d t1, t2;
-
- __m256d sinpoint, cospoint;
- __m256d xsign, ssign, csign;
- __m128i imask, sswapsign, cswapsign;
-
- xsign = _mm256_andnot_pd(signmask, x);
- xabs = _mm256_and_pd(x, signmask);
-
- scalex = _mm256_mul_pd(tabscale, xabs);
- tabidx = _mm256_cvtpd_epi32(scalex);
-
- xpoint = _mm256_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
- /* Extended precision arithmetics */
- z = _mm256_sub_pd(xabs, _mm256_mul_pd(invtabscale0, xpoint));
- z = _mm256_sub_pd(z, _mm256_mul_pd(invtabscale1, xpoint));
-
- /* Range reduction to 0..2*Pi */
- tabidx = _mm_and_si128(tabidx, tabmask);
-
- /* tabidx is now in range [0,..,64] */
- imask = _mm_cmpgt_epi32(tabidx, i32);
- sswapsign = imask;
- cswapsign = imask;
- corridx = _mm_and_si128(imask, i32);
- tabidx = _mm_sub_epi32(tabidx, corridx);
-
- /* tabidx is now in range [0..32] */
- imask = _mm_cmpgt_epi32(tabidx, i16);
- cswapsign = _mm_xor_si128(cswapsign, imask);
- corridx = _mm_sub_epi32(i32, tabidx);
- tabidx = _mm_blendv_epi8(tabidx, corridx, imask);
- /* tabidx is now in range [0..16] */
- ssign = _mm256_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
- csign = _mm256_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
- /* First lookup into table */
-#ifdef _MSC_VER
- t1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0))),
- _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 2)), 0x1);
- t2 = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1))),
- _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 3)), 0x1);
-#else
- t1 = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 0)]),
- sintable[_mm_extract_epi32(tabidx, 2)], 0x1);
- t2 = _mm256_insertf128_pd(_mm256_castpd128_pd256(sintable[_mm_extract_epi32(tabidx, 1)]),
- sintable[_mm_extract_epi32(tabidx, 3)], 0x1);
-#endif
-
- sinpoint = _mm256_unpackhi_pd(t1, t2);
- cospoint = _mm256_unpacklo_pd(t1, t2);
-
- sinpoint = _mm256_mul_pd(sinpoint, ssign);
- cospoint = _mm256_mul_pd(cospoint, csign);
-
- z2 = _mm256_mul_pd(z, z);
-
- polySin = _mm256_mul_pd(sinP7, z2);
- polySin = _mm256_add_pd(polySin, sinP5);
- polySin = _mm256_mul_pd(polySin, z2);
- polySin = _mm256_add_pd(polySin, sinP3);
- polySin = _mm256_mul_pd(polySin, z2);
- polySin = _mm256_add_pd(polySin, sinP1);
- polySin = _mm256_mul_pd(polySin, z);
-
- polyCos = _mm256_mul_pd(cosP6, z2);
- polyCos = _mm256_add_pd(polyCos, cosP4);
- polyCos = _mm256_mul_pd(polyCos, z2);
- polyCos = _mm256_add_pd(polyCos, cosP2);
- polyCos = _mm256_mul_pd(polyCos, z2);
- polyCos = _mm256_add_pd(polyCos, cosP0);
-
- *sinval = _mm256_xor_pd(_mm256_add_pd( _mm256_mul_pd(sinpoint, polyCos), _mm256_mul_pd(cospoint, polySin) ), xsign);
- *cosval = _mm256_sub_pd( _mm256_mul_pd(cospoint, polyCos), _mm256_mul_pd(sinpoint, polySin) );
-
- return 0;
-}
-
-static int
-gmx_mm_sincos_pd(__m128d x,
- __m128d *sinval,
- __m128d *cosval)
-{
-#ifdef _MSC_VER
- __declspec(align(16))
- const double sintable[34] =
- {
- 1.00000000000000000e+00, 0.00000000000000000e+00,
- 9.95184726672196929e-01, 9.80171403295606036e-02,
- 9.80785280403230431e-01, 1.95090322016128248e-01,
- 9.56940335732208824e-01, 2.90284677254462331e-01,
- 9.23879532511286738e-01, 3.82683432365089782e-01,
- 8.81921264348355050e-01, 4.71396736825997642e-01,
- 8.31469612302545236e-01, 5.55570233019602178e-01,
- 7.73010453362736993e-01, 6.34393284163645488e-01,
- 7.07106781186547573e-01, 7.07106781186547462e-01,
- 6.34393284163645599e-01, 7.73010453362736882e-01,
- 5.55570233019602289e-01, 8.31469612302545125e-01,
- 4.71396736825997809e-01, 8.81921264348354939e-01,
- 3.82683432365089837e-01, 9.23879532511286738e-01,
- 2.90284677254462276e-01, 9.56940335732208935e-01,
- 1.95090322016128304e-01, 9.80785280403230431e-01,
- 9.80171403295607702e-02, 9.95184726672196818e-01,
- 0.0, 1.00000000000000000e+00
- };
-#else
- const __m128d sintable[17] =
- {
- _mm_set_pd( 0.0, 1.0 ),
- _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0), cos( 4.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0), cos( 5.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0), cos( 6.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0), cos( 7.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0), cos( 8.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0), cos( 9.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( 1.0, 0.0 )
- };
-#endif
-
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
- const __m128d invtabscale0 = _mm_set1_pd(9.81747508049011230469e-02);
- const __m128d invtabscale1 = _mm_set1_pd(1.96197799156550576057e-08);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i i32 = _mm_set1_epi32(32);
- const __m128i i16 = _mm_set1_epi32(16);
- const __m128i tabmask = _mm_set1_epi32(0x3F);
- const __m128d sinP7 = _mm_set1_pd(-1.0/5040.0);
- const __m128d sinP5 = _mm_set1_pd(1.0/120.0);
- const __m128d sinP3 = _mm_set1_pd(-1.0/6.0);
- const __m128d sinP1 = _mm_set1_pd(1.0);
-
- const __m128d cosP6 = _mm_set1_pd(-1.0/720.0);
- const __m128d cosP4 = _mm_set1_pd(1.0/24.0);
- const __m128d cosP2 = _mm_set1_pd(-1.0/2.0);
- const __m128d cosP0 = _mm_set1_pd(1.0);
-
- __m128d scalex;
- __m128i tabidx, corridx;
- __m128d xabs, z, z2, polySin, polyCos;
- __m128d xpoint;
- __m128d ypoint0, ypoint1;
-
- __m128d sinpoint, cospoint;
- __m128d xsign, ssign, csign;
- __m128i imask, sswapsign, cswapsign;
-
- xsign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- scalex = _mm_mul_pd(tabscale, xabs);
- tabidx = _mm_cvtpd_epi32(scalex);
-
- xpoint = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
- /* Extended precision arithmetics */
- z = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
- z = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
- /* Range reduction to 0..2*Pi */
- tabidx = _mm_and_si128(tabidx, tabmask);
-
- /* tabidx is now in range [0,..,64] */
- imask = _mm_cmpgt_epi32(tabidx, i32);
- sswapsign = imask;
- cswapsign = imask;
- corridx = _mm_and_si128(imask, i32);
- tabidx = _mm_sub_epi32(tabidx, corridx);
-
- /* tabidx is now in range [0..32] */
- imask = _mm_cmpgt_epi32(tabidx, i16);
- cswapsign = _mm_xor_si128(cswapsign, imask);
- corridx = _mm_sub_epi32(i32, tabidx);
- tabidx = _mm_blendv_epi8(tabidx, corridx, imask);
- /* tabidx is now in range [0..16] */
- ssign = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
- csign = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
- ypoint0 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
- ypoint1 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
- ypoint0 = sintable[_mm_extract_epi32(tabidx, 0)];
- ypoint1 = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
- sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
- cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
- sinpoint = _mm_mul_pd(sinpoint, ssign);
- cospoint = _mm_mul_pd(cospoint, csign);
-
- z2 = _mm_mul_pd(z, z);
-
- polySin = _mm_mul_pd(sinP7, z2);
- polySin = _mm_add_pd(polySin, sinP5);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP3);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP1);
- polySin = _mm_mul_pd(polySin, z);
-
- polyCos = _mm_mul_pd(cosP6, z2);
- polyCos = _mm_add_pd(polyCos, cosP4);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP2);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP0);
-
- *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
- *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
- return 0;
-}
-
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m256d
-gmx_mm256_sin_pd(__m256d x)
-{
- __m256d s, c;
- gmx_mm256_sincos_pd(x, &s, &c);
- return s;
-}
-
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m256d
-gmx_mm256_cos_pd(__m256d x)
-{
- __m256d s, c;
- gmx_mm256_sincos_pd(x, &s, &c);
- return c;
-}
-
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return c;
-}
-
-
-static __m256d
-gmx_mm256_tan_pd(__m256d x)
-{
- __m256d sinval, cosval;
- __m256d tanval;
-
- gmx_mm256_sincos_pd(x, &sinval, &cosval);
-
- tanval = _mm256_mul_pd(sinval, gmx_mm256_inv_pd(cosval));
-
- return tanval;
-}
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
- __m128d sinval, cosval;
- __m128d tanval;
-
- gmx_mm_sincos_pd(x, &sinval, &cosval);
-
- tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
- return tanval;
-}
-
-
-static __m256d
-gmx_mm256_asin_pd(__m256d x)
-{
- /* Same algorithm as cephes library */
- const __m256d signmask = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
- 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m256d limit1 = _mm256_set1_pd(0.625);
- const __m256d limit2 = _mm256_set1_pd(1e-8);
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
- const __m256d morebits = _mm256_set1_pd(6.123233995736765886130e-17);
-
- const __m256d P5 = _mm256_set1_pd(4.253011369004428248960e-3);
- const __m256d P4 = _mm256_set1_pd(-6.019598008014123785661e-1);
- const __m256d P3 = _mm256_set1_pd(5.444622390564711410273e0);
- const __m256d P2 = _mm256_set1_pd(-1.626247967210700244449e1);
- const __m256d P1 = _mm256_set1_pd(1.956261983317594739197e1);
- const __m256d P0 = _mm256_set1_pd(-8.198089802484824371615e0);
-
- const __m256d Q4 = _mm256_set1_pd(-1.474091372988853791896e1);
- const __m256d Q3 = _mm256_set1_pd(7.049610280856842141659e1);
- const __m256d Q2 = _mm256_set1_pd(-1.471791292232726029859e2);
- const __m256d Q1 = _mm256_set1_pd(1.395105614657485689735e2);
- const __m256d Q0 = _mm256_set1_pd(-4.918853881490881290097e1);
-
- const __m256d R4 = _mm256_set1_pd(2.967721961301243206100e-3);
- const __m256d R3 = _mm256_set1_pd(-5.634242780008963776856e-1);
- const __m256d R2 = _mm256_set1_pd(6.968710824104713396794e0);
- const __m256d R1 = _mm256_set1_pd(-2.556901049652824852289e1);
- const __m256d R0 = _mm256_set1_pd(2.853665548261061424989e1);
-
- const __m256d S3 = _mm256_set1_pd(-2.194779531642920639778e1);
- const __m256d S2 = _mm256_set1_pd(1.470656354026814941758e2);
- const __m256d S1 = _mm256_set1_pd(-3.838770957603691357202e2);
- const __m256d S0 = _mm256_set1_pd(3.424398657913078477438e2);
-
- __m256d sign;
- __m256d mask;
- __m256d xabs;
- __m256d zz, ww, z, q, w, zz2, ww2;
- __m256d PA, PB;
- __m256d QA, QB;
- __m256d RA, RB;
- __m256d SA, SB;
- __m256d nom, denom;
-
- sign = _mm256_andnot_pd(signmask, x);
- xabs = _mm256_and_pd(x, signmask);
-
- mask = _mm256_cmp_pd(xabs, limit1, _CMP_GT_OQ);
-
- zz = _mm256_sub_pd(one, xabs);
- ww = _mm256_mul_pd(xabs, xabs);
- zz2 = _mm256_mul_pd(zz, zz);
- ww2 = _mm256_mul_pd(ww, ww);
-
- /* R */
- RA = _mm256_mul_pd(R4, zz2);
- RB = _mm256_mul_pd(R3, zz2);
- RA = _mm256_add_pd(RA, R2);
- RB = _mm256_add_pd(RB, R1);
- RA = _mm256_mul_pd(RA, zz2);
- RB = _mm256_mul_pd(RB, zz);
- RA = _mm256_add_pd(RA, R0);
- RA = _mm256_add_pd(RA, RB);
-
- /* S, SA = zz2 */
- SB = _mm256_mul_pd(S3, zz2);
- SA = _mm256_add_pd(zz2, S2);
- SB = _mm256_add_pd(SB, S1);
- SA = _mm256_mul_pd(SA, zz2);
- SB = _mm256_mul_pd(SB, zz);
- SA = _mm256_add_pd(SA, S0);
- SA = _mm256_add_pd(SA, SB);
-
- /* P */
- PA = _mm256_mul_pd(P5, ww2);
- PB = _mm256_mul_pd(P4, ww2);
- PA = _mm256_add_pd(PA, P3);
- PB = _mm256_add_pd(PB, P2);
- PA = _mm256_mul_pd(PA, ww2);
- PB = _mm256_mul_pd(PB, ww2);
- PA = _mm256_add_pd(PA, P1);
- PB = _mm256_add_pd(PB, P0);
- PA = _mm256_mul_pd(PA, ww);
- PA = _mm256_add_pd(PA, PB);
-
- /* Q, QA = ww2 */
- QB = _mm256_mul_pd(Q4, ww2);
- QA = _mm256_add_pd(ww2, Q3);
- QB = _mm256_add_pd(QB, Q2);
- QA = _mm256_mul_pd(QA, ww2);
- QB = _mm256_mul_pd(QB, ww2);
- QA = _mm256_add_pd(QA, Q1);
- QB = _mm256_add_pd(QB, Q0);
- QA = _mm256_mul_pd(QA, ww);
- QA = _mm256_add_pd(QA, QB);
-
- RA = _mm256_mul_pd(RA, zz);
- PA = _mm256_mul_pd(PA, ww);
-
- nom = _mm256_blendv_pd( PA, RA, mask );
- denom = _mm256_blendv_pd( QA, SA, mask );
-
- q = _mm256_mul_pd( nom, gmx_mm256_inv_pd(denom) );
-
- zz = _mm256_add_pd(zz, zz);
- zz = gmx_mm256_sqrt_pd(zz);
- z = _mm256_sub_pd(quarterpi, zz);
- zz = _mm256_mul_pd(zz, q);
- zz = _mm256_sub_pd(zz, morebits);
- z = _mm256_sub_pd(z, zz);
- z = _mm256_add_pd(z, quarterpi);
-
- w = _mm256_mul_pd(xabs, q);
- w = _mm256_add_pd(w, xabs);
-
- z = _mm256_blendv_pd( w, z, mask );
-
- mask = _mm256_cmp_pd(xabs, limit2, _CMP_GT_OQ);
- z = _mm256_blendv_pd( xabs, z, mask );
-
- z = _mm256_xor_pd(z, sign);
-
- return z;
-}
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.625);
- const __m128d limit2 = _mm_set1_pd(1e-8);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17);
-
- const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3);
- const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1);
- const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0);
- const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1);
- const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1);
- const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0);
-
- const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1);
- const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1);
- const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2);
- const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2);
- const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1);
-
- const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3);
- const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1);
- const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0);
- const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1);
- const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1);
-
- const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1);
- const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2);
- const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2);
- const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2);
-
- __m128d sign;
- __m128d mask;
- __m128d xabs;
- __m128d zz, ww, z, q, w, zz2, ww2;
- __m128d PA, PB;
- __m128d QA, QB;
- __m128d RA, RB;
- __m128d SA, SB;
- __m128d nom, denom;
-
- sign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- mask = _mm_cmpgt_pd(xabs, limit1);
-
- zz = _mm_sub_pd(one, xabs);
- ww = _mm_mul_pd(xabs, xabs);
- zz2 = _mm_mul_pd(zz, zz);
- ww2 = _mm_mul_pd(ww, ww);
-
- /* R */
- RA = _mm_mul_pd(R4, zz2);
- RB = _mm_mul_pd(R3, zz2);
- RA = _mm_add_pd(RA, R2);
- RB = _mm_add_pd(RB, R1);
- RA = _mm_mul_pd(RA, zz2);
- RB = _mm_mul_pd(RB, zz);
- RA = _mm_add_pd(RA, R0);
- RA = _mm_add_pd(RA, RB);
-
- /* S, SA = zz2 */
- SB = _mm_mul_pd(S3, zz2);
- SA = _mm_add_pd(zz2, S2);
- SB = _mm_add_pd(SB, S1);
- SA = _mm_mul_pd(SA, zz2);
- SB = _mm_mul_pd(SB, zz);
- SA = _mm_add_pd(SA, S0);
- SA = _mm_add_pd(SA, SB);
-
- /* P */
- PA = _mm_mul_pd(P5, ww2);
- PB = _mm_mul_pd(P4, ww2);
- PA = _mm_add_pd(PA, P3);
- PB = _mm_add_pd(PB, P2);
- PA = _mm_mul_pd(PA, ww2);
- PB = _mm_mul_pd(PB, ww2);
- PA = _mm_add_pd(PA, P1);
- PB = _mm_add_pd(PB, P0);
- PA = _mm_mul_pd(PA, ww);
- PA = _mm_add_pd(PA, PB);
-
- /* Q, QA = ww2 */
- QB = _mm_mul_pd(Q4, ww2);
- QA = _mm_add_pd(ww2, Q3);
- QB = _mm_add_pd(QB, Q2);
- QA = _mm_mul_pd(QA, ww2);
- QB = _mm_mul_pd(QB, ww2);
- QA = _mm_add_pd(QA, Q1);
- QB = _mm_add_pd(QB, Q0);
- QA = _mm_mul_pd(QA, ww);
- QA = _mm_add_pd(QA, QB);
-
- RA = _mm_mul_pd(RA, zz);
- PA = _mm_mul_pd(PA, ww);
-
- nom = _mm_blendv_pd( PA, RA, mask );
- denom = _mm_blendv_pd( QA, SA, mask );
-
- q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
- zz = _mm_add_pd(zz, zz);
- zz = gmx_mm_sqrt_pd(zz);
- z = _mm_sub_pd(quarterpi, zz);
- zz = _mm_mul_pd(zz, q);
- zz = _mm_sub_pd(zz, morebits);
- z = _mm_sub_pd(z, zz);
- z = _mm_add_pd(z, quarterpi);
-
- w = _mm_mul_pd(xabs, q);
- w = _mm_add_pd(w, xabs);
-
- z = _mm_blendv_pd( w, z, mask );
-
- mask = _mm_cmpgt_pd(xabs, limit2);
- z = _mm_blendv_pd( xabs, z, mask );
-
- z = _mm_xor_pd(z, sign);
-
- return z;
-}
-
-
-static __m256d
-gmx_mm256_acos_pd(__m256d x)
-{
- const __m256d one = _mm256_set1_pd(1.0);
- const __m256d half = _mm256_set1_pd(0.5);
- const __m256d quarterpi0 = _mm256_set1_pd(7.85398163397448309616e-1);
- const __m256d quarterpi1 = _mm256_set1_pd(6.123233995736765886130e-17);
-
-
- __m256d mask1;
-
- __m256d z, z1, z2;
-
- mask1 = _mm256_cmp_pd(x, half, _CMP_GT_OQ);
- z1 = _mm256_mul_pd(half, _mm256_sub_pd(one, x));
- z1 = gmx_mm256_sqrt_pd(z1);
- z = _mm256_blendv_pd( x, z1, mask1 );
-
- z = gmx_mm256_asin_pd(z);
-
- z1 = _mm256_add_pd(z, z);
-
- z2 = _mm256_sub_pd(quarterpi0, z);
- z2 = _mm256_add_pd(z2, quarterpi1);
- z2 = _mm256_add_pd(z2, quarterpi0);
-
- z = _mm256_blendv_pd(z2, z1, mask1);
-
- return z;
-}
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
- const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
- __m128d mask1;
-
- __m128d z, z1, z2;
-
- mask1 = _mm_cmpgt_pd(x, half);
- z1 = _mm_mul_pd(half, _mm_sub_pd(one, x));
- z1 = gmx_mm_sqrt_pd(z1);
- z = _mm_blendv_pd( x, z1, mask1 );
-
- z = gmx_mm_asin_pd(z);
-
- z1 = _mm_add_pd(z, z);
-
- z2 = _mm_sub_pd(quarterpi0, z);
- z2 = _mm_add_pd(z2, quarterpi1);
- z2 = _mm_add_pd(z2, quarterpi0);
-
- z = _mm_blendv_pd(z2, z1, mask1);
-
- return z;
-}
-
-
-static __m256d
-gmx_mm256_atan_pd(__m256d x)
-{
- /* Same algorithm as cephes library */
- const __m256d signmask = _mm256_castsi256_pd( _mm256_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
- 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m256d limit1 = _mm256_set1_pd(0.66);
- const __m256d limit2 = _mm256_set1_pd(2.41421356237309504880);
- const __m256d quarterpi = _mm256_set1_pd(M_PI/4.0);
- const __m256d halfpi = _mm256_set1_pd(M_PI/2.0);
- const __m256d mone = _mm256_set1_pd(-1.0);
- const __m256d morebits1 = _mm256_set1_pd(0.5*6.123233995736765886130E-17);
- const __m256d morebits2 = _mm256_set1_pd(6.123233995736765886130E-17);
-
- const __m256d P4 = _mm256_set1_pd(-8.750608600031904122785E-1);
- const __m256d P3 = _mm256_set1_pd(-1.615753718733365076637E1);
- const __m256d P2 = _mm256_set1_pd(-7.500855792314704667340E1);
- const __m256d P1 = _mm256_set1_pd(-1.228866684490136173410E2);
- const __m256d P0 = _mm256_set1_pd(-6.485021904942025371773E1);
-
- const __m256d Q4 = _mm256_set1_pd(2.485846490142306297962E1);
- const __m256d Q3 = _mm256_set1_pd(1.650270098316988542046E2);
- const __m256d Q2 = _mm256_set1_pd(4.328810604912902668951E2);
- const __m256d Q1 = _mm256_set1_pd(4.853903996359136964868E2);
- const __m256d Q0 = _mm256_set1_pd(1.945506571482613964425E2);
-
- __m256d sign;
- __m256d mask1, mask2;
- __m256d y, t1, t2;
- __m256d z, z2;
- __m256d P_A, P_B, Q_A, Q_B;
-
- sign = _mm256_andnot_pd(signmask, x);
- x = _mm256_and_pd(x, signmask);
-
- mask1 = _mm256_cmp_pd(x, limit1, _CMP_GT_OQ);
- mask2 = _mm256_cmp_pd(x, limit2, _CMP_GT_OQ);
-
- t1 = _mm256_mul_pd(_mm256_add_pd(x, mone), gmx_mm256_inv_pd(_mm256_sub_pd(x, mone)));
- t2 = _mm256_mul_pd(mone, gmx_mm256_inv_pd(x));
-
- y = _mm256_and_pd(mask1, quarterpi);
- y = _mm256_or_pd( _mm256_and_pd(mask2, halfpi), _mm256_andnot_pd(mask2, y) );
-
- x = _mm256_or_pd( _mm256_and_pd(mask1, t1), _mm256_andnot_pd(mask1, x) );
- x = _mm256_or_pd( _mm256_and_pd(mask2, t2), _mm256_andnot_pd(mask2, x) );
-
- z = _mm256_mul_pd(x, x);
- z2 = _mm256_mul_pd(z, z);
-
- P_A = _mm256_mul_pd(P4, z2);
- P_B = _mm256_mul_pd(P3, z2);
- P_A = _mm256_add_pd(P_A, P2);
- P_B = _mm256_add_pd(P_B, P1);
- P_A = _mm256_mul_pd(P_A, z2);
- P_B = _mm256_mul_pd(P_B, z);
- P_A = _mm256_add_pd(P_A, P0);
- P_A = _mm256_add_pd(P_A, P_B);
-
- /* Q_A = z2 */
- Q_B = _mm256_mul_pd(Q4, z2);
- Q_A = _mm256_add_pd(z2, Q3);
- Q_B = _mm256_add_pd(Q_B, Q2);
- Q_A = _mm256_mul_pd(Q_A, z2);
- Q_B = _mm256_mul_pd(Q_B, z2);
- Q_A = _mm256_add_pd(Q_A, Q1);
- Q_B = _mm256_add_pd(Q_B, Q0);
- Q_A = _mm256_mul_pd(Q_A, z);
- Q_A = _mm256_add_pd(Q_A, Q_B);
-
- z = _mm256_mul_pd(z, P_A);
- z = _mm256_mul_pd(z, gmx_mm256_inv_pd(Q_A));
- z = _mm256_mul_pd(z, x);
- z = _mm256_add_pd(z, x);
-
- t1 = _mm256_and_pd(mask1, morebits1);
- t1 = _mm256_or_pd( _mm256_and_pd(mask2, morebits2), _mm256_andnot_pd(mask2, t1) );
-
- z = _mm256_add_pd(z, t1);
- y = _mm256_add_pd(y, z);
-
- y = _mm256_xor_pd(y, sign);
-
- return y;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.66);
- const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d mone = _mm_set1_pd(-1.0);
- const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
- const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
- const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
- const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
- const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
- const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
- const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
-
- const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
- const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
- const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
- const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
- const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
-
- __m128d sign;
- __m128d mask1, mask2;
- __m128d y, t1, t2;
- __m128d z, z2;
- __m128d P_A, P_B, Q_A, Q_B;
-
- sign = _mm_andnot_pd(signmask, x);
- x = _mm_and_pd(x, signmask);
-
- mask1 = _mm_cmpgt_pd(x, limit1);
- mask2 = _mm_cmpgt_pd(x, limit2);
-
- t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
- t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
- y = _mm_and_pd(mask1, quarterpi);
- y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
- x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
- x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
- z = _mm_mul_pd(x, x);
- z2 = _mm_mul_pd(z, z);
-
- P_A = _mm_mul_pd(P4, z2);
- P_B = _mm_mul_pd(P3, z2);
- P_A = _mm_add_pd(P_A, P2);
- P_B = _mm_add_pd(P_B, P1);
- P_A = _mm_mul_pd(P_A, z2);
- P_B = _mm_mul_pd(P_B, z);
- P_A = _mm_add_pd(P_A, P0);
- P_A = _mm_add_pd(P_A, P_B);
-
- /* Q_A = z2 */
- Q_B = _mm_mul_pd(Q4, z2);
- Q_A = _mm_add_pd(z2, Q3);
- Q_B = _mm_add_pd(Q_B, Q2);
- Q_A = _mm_mul_pd(Q_A, z2);
- Q_B = _mm_mul_pd(Q_B, z2);
- Q_A = _mm_add_pd(Q_A, Q1);
- Q_B = _mm_add_pd(Q_B, Q0);
- Q_A = _mm_mul_pd(Q_A, z);
- Q_A = _mm_add_pd(Q_A, Q_B);
-
- z = _mm_mul_pd(z, P_A);
- z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
- z = _mm_mul_pd(z, x);
- z = _mm_add_pd(z, x);
-
- t1 = _mm_and_pd(mask1, morebits1);
- t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
- z = _mm_add_pd(z, t1);
- y = _mm_add_pd(y, z);
-
- y = _mm_xor_pd(y, sign);
-
- return y;
-}
-
-
-
-static __m256d
-gmx_mm256_atan2_pd(__m256d y, __m256d x)
-{
- const __m256d pi = _mm256_set1_pd(M_PI);
- const __m256d minuspi = _mm256_set1_pd(-M_PI);
- const __m256d halfpi = _mm256_set1_pd(M_PI/2.0);
- const __m256d minushalfpi = _mm256_set1_pd(-M_PI/2.0);
-
- __m256d z, z1, z3, z4;
- __m256d w;
- __m256d maskx_lt, maskx_eq;
- __m256d masky_lt, masky_eq;
- __m256d mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LT_OQ);
- masky_lt = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_LT_OQ);
- maskx_eq = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
- masky_eq = _mm256_cmp_pd(y, _mm256_setzero_pd(), _CMP_EQ_OQ);
-
- z = _mm256_mul_pd(y, gmx_mm256_inv_pd(x));
- z = gmx_mm256_atan_pd(z);
-
- mask1 = _mm256_and_pd(maskx_eq, masky_lt);
- mask2 = _mm256_andnot_pd(maskx_lt, masky_eq);
- mask3 = _mm256_andnot_pd( _mm256_or_pd(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm256_and_pd(masky_eq, maskx_lt);
-
- maskall = _mm256_or_pd( _mm256_or_pd(mask1, mask2), _mm256_or_pd(mask3, mask4) );
-
- z = _mm256_andnot_pd(maskall, z);
- z1 = _mm256_and_pd(mask1, minushalfpi);
- z3 = _mm256_and_pd(mask3, halfpi);
- z4 = _mm256_and_pd(mask4, pi);
-
- z = _mm256_or_pd( _mm256_or_pd(z, z1), _mm256_or_pd(z3, z4) );
-
- w = _mm256_blendv_pd(pi, minuspi, masky_lt);
- w = _mm256_and_pd(w, maskx_lt);
-
- w = _mm256_andnot_pd(maskall, w);
-
- z = _mm256_add_pd(z, w);
-
- return z;
-}
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
- const __m128d pi = _mm_set1_pd(M_PI);
- const __m128d minuspi = _mm_set1_pd(-M_PI);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
- __m128d z, z1, z3, z4;
- __m128d w;
- __m128d maskx_lt, maskx_eq;
- __m128d masky_lt, masky_eq;
- __m128d mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd());
- masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd());
- maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd());
- masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
- z = _mm_mul_pd(y, gmx_mm_inv_pd(x));
- z = gmx_mm_atan_pd(z);
-
- mask1 = _mm_and_pd(maskx_eq, masky_lt);
- mask2 = _mm_andnot_pd(maskx_lt, masky_eq);
- mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_pd(masky_eq, maskx_lt);
-
- maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
- z = _mm_andnot_pd(maskall, z);
- z1 = _mm_and_pd(mask1, minushalfpi);
- z3 = _mm_and_pd(mask3, halfpi);
- z4 = _mm_and_pd(mask4, pi);
-
- z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
- w = _mm_blendv_pd(pi, minuspi, masky_lt);
- w = _mm_and_pd(w, maskx_lt);
-
- w = _mm_andnot_pd(maskall, w);
-
- z = _mm_add_pd(z, w);
- return z;
-}
+#define gmx_mm256_invsqrt_pd gmx_simd_invsqrt_d
+#define gmx_mm256_inv_pd gmx_simd_inv_d
+#define gmx_mm256_log_pd gmx_simd_log_d
+#define gmx_mm256_pmecorrF_pd gmx_simd_pmecorrF_d
+#define gmx_mm256_pmecorrV_pd gmx_simd_pmecorrV_d
+#define gmx_mm256_sincos_pd gmx_simd_sincos_d
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_AVX_256_SINGLE_H
#define GMX_SIMD_MATH_AVX_256_SINGLE_H
-#include <math.h>
+#include "simd_math.h"
-#include "general_x86_avx_256.h"
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x), 256-bit wide version */
-static gmx_inline __m256
-gmx_mm256_invsqrt_ps(__m256 x)
-{
- const __m256 half = _mm256_set1_ps(0.5f);
- const __m256 three = _mm256_set1_ps(3.0f);
-
- __m256 lu = _mm256_rsqrt_ps(x);
-
- return _mm256_mul_ps(half, _mm256_mul_ps(_mm256_sub_ps(three, _mm256_mul_ps(_mm256_mul_ps(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), 128-bit wide version */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
- const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
- const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-
-/* sqrt(x) (256 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m256
-gmx_mm256_sqrt_ps(__m256 x)
-{
- __m256 mask;
- __m256 res;
-
- mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
- res = _mm256_andnot_ps(mask, gmx_mm256_invsqrt_ps(x));
-
- res = _mm256_mul_ps(x, res);
-
- return res;
-}
-
-/* sqrt(x) (128 bit) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
- __m128 mask;
- __m128 res;
-
- mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
- res = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
- res = _mm_mul_ps(x, res);
-
- return res;
-}
-
-
-/* 1.0/x, 256-bit wide */
-static gmx_inline __m256
-gmx_mm256_inv_ps(__m256 x)
-{
- const __m256 two = _mm256_set1_ps(2.0f);
-
- __m256 lu = _mm256_rcp_ps(x);
-
- return _mm256_mul_ps(lu, _mm256_sub_ps(two, _mm256_mul_ps(lu, x)));
-}
-
-/* 1.0/x, 128-bit wide */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
- __m128 lu = _mm_rcp_ps(x);
-
- return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-
-static gmx_inline __m256
-gmx_mm256_abs_ps(__m256 x)
-{
- const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
-
- return _mm256_and_ps(x, signmask);
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
- return _mm_and_ps(x, signmask);
-}
-
-
-static __m256
-gmx_mm256_log_ps(__m256 x)
-{
- const __m256 expmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7F800000) );
- const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
- const __m256 half = _mm256_set1_ps(0.5f);
- const __m256 one = _mm256_set1_ps(1.0f);
- const __m256 invsq2 = _mm256_set1_ps(1.0f/sqrt(2.0f));
- const __m256 corr1 = _mm256_set1_ps(-2.12194440e-4f);
- const __m256 corr2 = _mm256_set1_ps(0.693359375f);
-
- const __m256 CA_1 = _mm256_set1_ps(0.070376836292f);
- const __m256 CB_0 = _mm256_set1_ps(1.6714950086782716f);
- const __m256 CB_1 = _mm256_set1_ps(-2.452088066061482f);
- const __m256 CC_0 = _mm256_set1_ps(1.5220770854701728f);
- const __m256 CC_1 = _mm256_set1_ps(-1.3422238433233642f);
- const __m256 CD_0 = _mm256_set1_ps(1.386218787509749f);
- const __m256 CD_1 = _mm256_set1_ps(0.35075468953796346f);
- const __m256 CE_0 = _mm256_set1_ps(1.3429983063133937f);
- const __m256 CE_1 = _mm256_set1_ps(1.807420826584643f);
-
- __m256 fexp;
- __m256i iexp;
- __m128i iexp128a, iexp128b;
- __m256 mask;
- __m256i imask;
- __m128i imask128a, imask128b;
- __m256 x2;
- __m256 y;
- __m256 pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm256_and_ps(x, expmask);
- iexp = _mm256_castps_si256(fexp);
-
- iexp128b = _mm256_extractf128_si256(iexp, 0x1);
- iexp128a = _mm256_castsi256_si128(iexp);
-
- iexp128a = _mm_srli_epi32(iexp128a, 23);
- iexp128b = _mm_srli_epi32(iexp128b, 23);
- iexp128a = _mm_sub_epi32(iexp128a, expbase_m1);
- iexp128b = _mm_sub_epi32(iexp128b, expbase_m1);
-
- x = _mm256_andnot_ps(expmask, x);
- x = _mm256_or_ps(x, one);
- x = _mm256_mul_ps(x, half);
-
- mask = _mm256_cmp_ps(x, invsq2, _CMP_LT_OQ);
-
- x = _mm256_add_ps(x, _mm256_and_ps(mask, x));
- x = _mm256_sub_ps(x, one);
-
- imask = _mm256_castps_si256(mask);
-
- imask128b = _mm256_extractf128_si256(imask, 0x1);
- imask128a = _mm256_castsi256_si128(imask);
-
- iexp128a = _mm_add_epi32(iexp128a, imask128a);
- iexp128b = _mm_add_epi32(iexp128b, imask128b);
-
- iexp = _mm256_castsi128_si256(iexp128a);
- iexp = _mm256_insertf128_si256(iexp, iexp128b, 0x1);
-
- x2 = _mm256_mul_ps(x, x);
-
- pA = _mm256_mul_ps(CA_1, x);
- pB = _mm256_mul_ps(CB_1, x);
- pC = _mm256_mul_ps(CC_1, x);
- pD = _mm256_mul_ps(CD_1, x);
- pE = _mm256_mul_ps(CE_1, x);
- tB = _mm256_add_ps(CB_0, x2);
- tC = _mm256_add_ps(CC_0, x2);
- tD = _mm256_add_ps(CD_0, x2);
- tE = _mm256_add_ps(CE_0, x2);
- pB = _mm256_add_ps(pB, tB);
- pC = _mm256_add_ps(pC, tC);
- pD = _mm256_add_ps(pD, tD);
- pE = _mm256_add_ps(pE, tE);
-
- pA = _mm256_mul_ps(pA, pB);
- pC = _mm256_mul_ps(pC, pD);
- pE = _mm256_mul_ps(pE, x2);
- pA = _mm256_mul_ps(pA, pC);
- y = _mm256_mul_ps(pA, pE);
-
- fexp = _mm256_cvtepi32_ps(iexp);
- y = _mm256_add_ps(y, _mm256_mul_ps(fexp, corr1));
-
- y = _mm256_sub_ps(y, _mm256_mul_ps(half, x2));
- x2 = _mm256_add_ps(x, y);
-
- x2 = _mm256_add_ps(x2, _mm256_mul_ps(fexp, corr2));
-
- return x2;
-}
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
- const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
- const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
- const __m128 corr2 = _mm_set1_ps(0.693359375f);
-
- const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
- const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
- const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
- const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
- const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
- const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
- const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
- const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
- const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
-
- __m128 fexp;
- __m128i iexp;
- __m128 mask;
- __m128 x2;
- __m128 y;
- __m128 pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_ps(x, expmask);
- iexp = gmx_mm_castps_si128(fexp);
- iexp = _mm_srli_epi32(iexp, 23);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
-
- x = _mm_andnot_ps(expmask, x);
- x = _mm_or_ps(x, one);
- x = _mm_mul_ps(x, half);
-
- mask = _mm_cmplt_ps(x, invsq2);
-
- x = _mm_add_ps(x, _mm_and_ps(mask, x));
- x = _mm_sub_ps(x, one);
- iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
- x2 = _mm_mul_ps(x, x);
-
- pA = _mm_mul_ps(CA_1, x);
- pB = _mm_mul_ps(CB_1, x);
- pC = _mm_mul_ps(CC_1, x);
- pD = _mm_mul_ps(CD_1, x);
- pE = _mm_mul_ps(CE_1, x);
- tB = _mm_add_ps(CB_0, x2);
- tC = _mm_add_ps(CC_0, x2);
- tD = _mm_add_ps(CD_0, x2);
- tE = _mm_add_ps(CE_0, x2);
- pB = _mm_add_ps(pB, tB);
- pC = _mm_add_ps(pC, tC);
- pD = _mm_add_ps(pD, tD);
- pE = _mm_add_ps(pE, tE);
-
- pA = _mm_mul_ps(pA, pB);
- pC = _mm_mul_ps(pC, pD);
- pE = _mm_mul_ps(pE, x2);
- pA = _mm_mul_ps(pA, pC);
- y = _mm_mul_ps(pA, pE);
-
- fexp = _mm_cvtepi32_ps(iexp);
- y = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
- y = _mm_sub_ps(y, _mm_mul_ps(half, x2));
- x2 = _mm_add_ps(x, y);
-
- x2 = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
- return x2;
-}
-
-
-/*
- * 2^x function, 256-bit wide
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m256
-gmx_mm256_exp2_ps(__m256 x)
-{
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m256 arglimit = _mm256_set1_ps(126.0f);
-
- const __m128i expbase = _mm_set1_epi32(127);
- const __m256 CC6 = _mm256_set1_ps(1.535336188319500E-004);
- const __m256 CC5 = _mm256_set1_ps(1.339887440266574E-003);
- const __m256 CC4 = _mm256_set1_ps(9.618437357674640E-003);
- const __m256 CC3 = _mm256_set1_ps(5.550332471162809E-002);
- const __m256 CC2 = _mm256_set1_ps(2.402264791363012E-001);
- const __m256 CC1 = _mm256_set1_ps(6.931472028550421E-001);
- const __m256 CC0 = _mm256_set1_ps(1.0f);
-
- __m256 p0, p1;
- __m256 valuemask;
- __m256i iexppart;
- __m128i iexppart128a, iexppart128b;
- __m256 fexppart;
- __m256 intpart;
- __m256 x2;
-
-
- iexppart = _mm256_cvtps_epi32(x);
- intpart = _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
-
- iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
- iexppart128a = _mm256_castsi256_si128(iexppart);
-
- iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
- iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
-
- iexppart = _mm256_castsi128_si256(iexppart128a);
- iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
- valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(x), _CMP_GE_OQ);
- fexppart = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
-
- x = _mm256_sub_ps(x, intpart);
- x2 = _mm256_mul_ps(x, x);
-
- p0 = _mm256_mul_ps(CC6, x2);
- p1 = _mm256_mul_ps(CC5, x2);
- p0 = _mm256_add_ps(p0, CC4);
- p1 = _mm256_add_ps(p1, CC3);
- p0 = _mm256_mul_ps(p0, x2);
- p1 = _mm256_mul_ps(p1, x2);
- p0 = _mm256_add_ps(p0, CC2);
- p1 = _mm256_add_ps(p1, CC1);
- p0 = _mm256_mul_ps(p0, x2);
- p1 = _mm256_mul_ps(p1, x);
- p0 = _mm256_add_ps(p0, CC0);
- p0 = _mm256_add_ps(p0, p1);
- x = _mm256_mul_ps(p0, fexppart);
-
- return x;
-}
-
-
-/* 2^x, 128 bit wide */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
-
- const __m128i expbase = _mm_set1_epi32(127);
- const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004);
- const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003);
- const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003);
- const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002);
- const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001);
- const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001);
- const __m128 CA0 = _mm_set1_ps(1.0f);
-
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
- __m128 x2;
- __m128 p0, p1;
-
- iexppart = _mm_cvtps_epi32(x);
- intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- x = _mm_sub_ps(x, intpart);
- x2 = _mm_mul_ps(x, x);
-
- p0 = _mm_mul_ps(CA6, x2);
- p1 = _mm_mul_ps(CA5, x2);
- p0 = _mm_add_ps(p0, CA4);
- p1 = _mm_add_ps(p1, CA3);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_add_ps(p0, CA2);
- p1 = _mm_add_ps(p1, CA1);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, CA0);
- p0 = _mm_add_ps(p0, p1);
- x = _mm_mul_ps(p0, fexppart);
-
- return x;
-}
-
-
-/* Exponential function, 256 bit wide. This could be calculated from 2^x as Exp(x)=2^(y),
- * where y=log2(e)*x, but there will then be a small rounding error since we lose some
- * precision due to the multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- */
-static __m256
-gmx_mm256_exp_ps(__m256 exparg)
-{
- const __m256 argscale = _mm256_set1_ps(1.44269504088896341f);
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m256 arglimit = _mm256_set1_ps(126.0f);
- const __m128i expbase = _mm_set1_epi32(127);
-
- const __m256 invargscale0 = _mm256_set1_ps(0.693359375f);
- const __m256 invargscale1 = _mm256_set1_ps(-2.12194440e-4f);
-
- const __m256 CE5 = _mm256_set1_ps(1.9875691500e-4f);
- const __m256 CE4 = _mm256_set1_ps(1.3981999507e-3f);
- const __m256 CE3 = _mm256_set1_ps(8.3334519073e-3f);
- const __m256 CE2 = _mm256_set1_ps(4.1665795894e-2f);
- const __m256 CE1 = _mm256_set1_ps(1.6666665459e-1f);
- const __m256 CE0 = _mm256_set1_ps(5.0000001201e-1f);
- const __m256 one = _mm256_set1_ps(1.0f);
-
- __m256 exparg2, exp2arg;
- __m256 pE0, pE1;
- __m256 valuemask;
- __m256i iexppart;
- __m128i iexppart128a, iexppart128b;
- __m256 fexppart;
- __m256 intpart;
-
- exp2arg = _mm256_mul_ps(exparg, argscale);
-
- iexppart = _mm256_cvtps_epi32(exp2arg);
- intpart = _mm256_round_ps(exp2arg, _MM_FROUND_TO_NEAREST_INT);
-
- iexppart128b = _mm256_extractf128_si256(iexppart, 0x1);
- iexppart128a = _mm256_castsi256_si128(iexppart);
-
- iexppart128a = _mm_slli_epi32(_mm_add_epi32(iexppart128a, expbase), 23);
- iexppart128b = _mm_slli_epi32(_mm_add_epi32(iexppart128b, expbase), 23);
-
- iexppart = _mm256_castsi128_si256(iexppart128a);
- iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);
- valuemask = _mm256_cmp_ps(arglimit, gmx_mm256_abs_ps(exp2arg), _CMP_GE_OQ);
- fexppart = _mm256_and_ps(valuemask, _mm256_castsi256_ps(iexppart));
-
- /* Extended precision arithmetics */
- exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale0, intpart));
- exparg = _mm256_sub_ps(exparg, _mm256_mul_ps(invargscale1, intpart));
-
- exparg2 = _mm256_mul_ps(exparg, exparg);
-
- pE1 = _mm256_mul_ps(CE5, exparg2);
- pE0 = _mm256_mul_ps(CE4, exparg2);
- pE1 = _mm256_add_ps(pE1, CE3);
- pE0 = _mm256_add_ps(pE0, CE2);
- pE1 = _mm256_mul_ps(pE1, exparg2);
- pE0 = _mm256_mul_ps(pE0, exparg2);
- pE1 = _mm256_add_ps(pE1, CE1);
- pE0 = _mm256_add_ps(pE0, CE0);
- pE1 = _mm256_mul_ps(pE1, exparg);
- pE0 = _mm256_add_ps(pE0, pE1);
- pE0 = _mm256_mul_ps(pE0, exparg2);
- exparg = _mm256_add_ps(exparg, one);
- exparg = _mm256_add_ps(exparg, pE0);
-
- exparg = _mm256_mul_ps(exparg, fexppart);
-
- return exparg;
-}
-
-
-/* exp(), 128 bit wide. */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
- const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
- const __m128i expbase = _mm_set1_epi32(127);
-
- const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
- const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);
-
- const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
- const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
- const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
- const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
- const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
- const __m128 one = _mm_set1_ps(1.0f);
-
- __m128 y, x2;
- __m128 p0, p1;
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
-
- y = _mm_mul_ps(x, argscale);
-
- iexppart = _mm_cvtps_epi32(y);
- intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- /* Extended precision arithmetics */
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
- x2 = _mm_mul_ps(x, x);
-
- p1 = _mm_mul_ps(CC5, x2);
- p0 = _mm_mul_ps(CC4, x2);
- p1 = _mm_add_ps(p1, CC3);
- p0 = _mm_add_ps(p0, CC2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_add_ps(p1, CC1);
- p0 = _mm_add_ps(p0, CC0);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, p1);
- p0 = _mm_mul_ps(p0, x2);
- x = _mm_add_ps(x, one);
- x = _mm_add_ps(x, p0);
-
- x = _mm_mul_ps(x, fexppart);
-
- return x;
-}
-
-
-
-/* FULL precision erf(), 256-bit wide. Only errors in LSB */
-static __m256
-gmx_mm256_erf_ps(__m256 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
- const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
- const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
- const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
- const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
- const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
- const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
- const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
- const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
- const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
- const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
- const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
- const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
- const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
- const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
- const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
- const __m256 CC9 = _mm256_set1_ps(0.21376355144663348f);
- const __m256 CC8 = _mm256_set1_ps(-0.3473187200259257f);
- const __m256 CC7 = _mm256_set1_ps(0.016690861551248114f);
- const __m256 CC6 = _mm256_set1_ps(0.7560973182491192f);
- const __m256 CC5 = _mm256_set1_ps(-1.2137903600145787f);
- const __m256 CC4 = _mm256_set1_ps(0.8411872321232948f);
- const __m256 CC3 = _mm256_set1_ps(-0.08670413896296343f);
- const __m256 CC2 = _mm256_set1_ps(-0.27124782687240334f);
- const __m256 CC1 = _mm256_set1_ps(-0.0007502488047806069f);
- const __m256 CC0 = _mm256_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
- const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
- const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);
-
- const __m256 sieve = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
- const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
- const __m256 one = _mm256_set1_ps(1.0f);
- const __m256 two = _mm256_set1_ps(2.0f);
-
- __m256 x2, x4, y;
- __m256 z, q, t, t2, w, w2;
- __m256 pA0, pA1, pB0, pB1, pC0, pC1;
- __m256 expmx2, corr;
- __m256 res_erf, res_erfc, res;
- __m256 mask;
-
- /* Calculate erf() */
- x2 = _mm256_mul_ps(x, x);
- x4 = _mm256_mul_ps(x2, x2);
-
- pA0 = _mm256_mul_ps(CA6, x4);
- pA1 = _mm256_mul_ps(CA5, x4);
- pA0 = _mm256_add_ps(pA0, CA4);
- pA1 = _mm256_add_ps(pA1, CA3);
- pA0 = _mm256_mul_ps(pA0, x4);
- pA1 = _mm256_mul_ps(pA1, x4);
- pA0 = _mm256_add_ps(pA0, CA2);
- pA1 = _mm256_add_ps(pA1, CA1);
- pA0 = _mm256_mul_ps(pA0, x4);
- pA1 = _mm256_mul_ps(pA1, x2);
- pA0 = _mm256_add_ps(pA0, pA1);
- pA0 = _mm256_add_ps(pA0, CA0);
-
- res_erf = _mm256_mul_ps(x, pA0);
-
- /* Calculate erfc */
-
- y = gmx_mm256_abs_ps(x);
- t = gmx_mm256_inv_ps(y);
- w = _mm256_sub_ps(t, one);
- t2 = _mm256_mul_ps(t, t);
- w2 = _mm256_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm256_and_ps(y, sieve);
- q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
-
- corr = _mm256_mul_ps(CD4, q);
- corr = _mm256_add_ps(corr, CD3);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, CD2);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, one);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, one);
-
- expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
- expmx2 = _mm256_mul_ps(expmx2, corr);
-
- pB1 = _mm256_mul_ps(CB9, w2);
- pB0 = _mm256_mul_ps(CB8, w2);
- pB1 = _mm256_add_ps(pB1, CB7);
- pB0 = _mm256_add_ps(pB0, CB6);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB5);
- pB0 = _mm256_add_ps(pB0, CB4);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB3);
- pB0 = _mm256_add_ps(pB0, CB2);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB1);
- pB1 = _mm256_mul_ps(pB1, w);
- pB0 = _mm256_add_ps(pB0, pB1);
- pB0 = _mm256_add_ps(pB0, CB0);
-
- pC0 = _mm256_mul_ps(CC10, t2);
- pC1 = _mm256_mul_ps(CC9, t2);
- pC0 = _mm256_add_ps(pC0, CC8);
- pC1 = _mm256_add_ps(pC1, CC7);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC6);
- pC1 = _mm256_add_ps(pC1, CC5);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC4);
- pC1 = _mm256_add_ps(pC1, CC3);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC2);
- pC1 = _mm256_add_ps(pC1, CC1);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t);
- pC0 = _mm256_add_ps(pC0, pC1);
- pC0 = _mm256_add_ps(pC0, CC0);
- pC0 = _mm256_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
- res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm256_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
- res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
- res = _mm256_blendv_ps(_mm256_sub_ps(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-/* erf(), 128 bit wide */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
-
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-
-
-/* FULL precision erfc(), 256 bit wide. Only errors in LSB */
-static __m256
-gmx_mm256_erfc_ps(__m256 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m256 CA6 = _mm256_set1_ps(7.853861353153693e-5f);
- const __m256 CA5 = _mm256_set1_ps(-8.010193625184903e-4f);
- const __m256 CA4 = _mm256_set1_ps(5.188327685732524e-3f);
- const __m256 CA3 = _mm256_set1_ps(-2.685381193529856e-2f);
- const __m256 CA2 = _mm256_set1_ps(1.128358514861418e-1f);
- const __m256 CA1 = _mm256_set1_ps(-3.761262582423300e-1f);
- const __m256 CA0 = _mm256_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m256 CB9 = _mm256_set1_ps(-0.0018629930017603923f);
- const __m256 CB8 = _mm256_set1_ps(0.003909821287598495f);
- const __m256 CB7 = _mm256_set1_ps(-0.0052094582210355615f);
- const __m256 CB6 = _mm256_set1_ps(0.005685614362160572f);
- const __m256 CB5 = _mm256_set1_ps(-0.0025367682853477272f);
- const __m256 CB4 = _mm256_set1_ps(-0.010199799682318782f);
- const __m256 CB3 = _mm256_set1_ps(0.04369575504816542f);
- const __m256 CB2 = _mm256_set1_ps(-0.11884063474674492f);
- const __m256 CB1 = _mm256_set1_ps(0.2732120154030589f);
- const __m256 CB0 = _mm256_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m256 CC10 = _mm256_set1_ps(-0.0445555913112064f);
- const __m256 CC9 = _mm256_set1_ps(0.21376355144663348f);
- const __m256 CC8 = _mm256_set1_ps(-0.3473187200259257f);
- const __m256 CC7 = _mm256_set1_ps(0.016690861551248114f);
- const __m256 CC6 = _mm256_set1_ps(0.7560973182491192f);
- const __m256 CC5 = _mm256_set1_ps(-1.2137903600145787f);
- const __m256 CC4 = _mm256_set1_ps(0.8411872321232948f);
- const __m256 CC3 = _mm256_set1_ps(-0.08670413896296343f);
- const __m256 CC2 = _mm256_set1_ps(-0.27124782687240334f);
- const __m256 CC1 = _mm256_set1_ps(-0.0007502488047806069f);
- const __m256 CC0 = _mm256_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m256 CD2 = _mm256_set1_ps(0.5000066608081202f);
- const __m256 CD3 = _mm256_set1_ps(0.1664795422874624f);
- const __m256 CD4 = _mm256_set1_ps(0.04379839977652482f);
-
- const __m256 sieve = _mm256_castsi256_ps( _mm256_set1_epi32(0xfffff000) );
- const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
- const __m256 one = _mm256_set1_ps(1.0f);
- const __m256 two = _mm256_set1_ps(2.0f);
-
- __m256 x2, x4, y;
- __m256 z, q, t, t2, w, w2;
- __m256 pA0, pA1, pB0, pB1, pC0, pC1;
- __m256 expmx2, corr;
- __m256 res_erf, res_erfc, res;
- __m256 mask;
-
- /* Calculate erf() */
- x2 = _mm256_mul_ps(x, x);
- x4 = _mm256_mul_ps(x2, x2);
-
- pA0 = _mm256_mul_ps(CA6, x4);
- pA1 = _mm256_mul_ps(CA5, x4);
- pA0 = _mm256_add_ps(pA0, CA4);
- pA1 = _mm256_add_ps(pA1, CA3);
- pA0 = _mm256_mul_ps(pA0, x4);
- pA1 = _mm256_mul_ps(pA1, x4);
- pA0 = _mm256_add_ps(pA0, CA2);
- pA1 = _mm256_add_ps(pA1, CA1);
- pA0 = _mm256_mul_ps(pA0, x4);
- pA1 = _mm256_mul_ps(pA1, x2);
- pA0 = _mm256_add_ps(pA0, pA1);
- pA0 = _mm256_add_ps(pA0, CA0);
-
- res_erf = _mm256_mul_ps(x, pA0);
-
- /* Calculate erfc */
- y = gmx_mm256_abs_ps(x);
- t = gmx_mm256_inv_ps(y);
- w = _mm256_sub_ps(t, one);
- t2 = _mm256_mul_ps(t, t);
- w2 = _mm256_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm256_and_ps(y, sieve);
- q = _mm256_mul_ps( _mm256_sub_ps(z, y), _mm256_add_ps(z, y) );
-
- corr = _mm256_mul_ps(CD4, q);
- corr = _mm256_add_ps(corr, CD3);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, CD2);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, one);
- corr = _mm256_mul_ps(corr, q);
- corr = _mm256_add_ps(corr, one);
-
- expmx2 = gmx_mm256_exp_ps( _mm256_or_ps( signbit, _mm256_mul_ps(z, z) ) );
- expmx2 = _mm256_mul_ps(expmx2, corr);
-
- pB1 = _mm256_mul_ps(CB9, w2);
- pB0 = _mm256_mul_ps(CB8, w2);
- pB1 = _mm256_add_ps(pB1, CB7);
- pB0 = _mm256_add_ps(pB0, CB6);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB5);
- pB0 = _mm256_add_ps(pB0, CB4);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB3);
- pB0 = _mm256_add_ps(pB0, CB2);
- pB1 = _mm256_mul_ps(pB1, w2);
- pB0 = _mm256_mul_ps(pB0, w2);
- pB1 = _mm256_add_ps(pB1, CB1);
- pB1 = _mm256_mul_ps(pB1, w);
- pB0 = _mm256_add_ps(pB0, pB1);
- pB0 = _mm256_add_ps(pB0, CB0);
-
- pC0 = _mm256_mul_ps(CC10, t2);
- pC1 = _mm256_mul_ps(CC9, t2);
- pC0 = _mm256_add_ps(pC0, CC8);
- pC1 = _mm256_add_ps(pC1, CC7);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC6);
- pC1 = _mm256_add_ps(pC1, CC5);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC4);
- pC1 = _mm256_add_ps(pC1, CC3);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t2);
- pC0 = _mm256_add_ps(pC0, CC2);
- pC1 = _mm256_add_ps(pC1, CC1);
- pC0 = _mm256_mul_ps(pC0, t2);
- pC1 = _mm256_mul_ps(pC1, t);
- pC0 = _mm256_add_ps(pC0, pC1);
- pC0 = _mm256_add_ps(pC0, CC0);
- pC0 = _mm256_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm256_cmp_ps(two, y, _CMP_LT_OQ);
- res_erfc = _mm256_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm256_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
- res_erfc = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm256_cmp_ps(y, _mm256_set1_ps(0.75f), _CMP_LT_OQ);
- res = _mm256_blendv_ps(res_erfc, _mm256_sub_ps(one, res_erf), mask);
-
- return res;
-}
-
-
-/* erfc(), 128 bit wide */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
-
- return res;
-}
-
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static __m256
-gmx_mm256_pmecorrF_ps(__m256 z2)
-{
- const __m256 FN6 = _mm256_set1_ps(-1.7357322914161492954e-8f);
- const __m256 FN5 = _mm256_set1_ps(1.4703624142580877519e-6f);
- const __m256 FN4 = _mm256_set1_ps(-0.000053401640219807709149f);
- const __m256 FN3 = _mm256_set1_ps(0.0010054721316683106153f);
- const __m256 FN2 = _mm256_set1_ps(-0.019278317264888380590f);
- const __m256 FN1 = _mm256_set1_ps(0.069670166153766424023f);
- const __m256 FN0 = _mm256_set1_ps(-0.75225204789749321333f);
-
- const __m256 FD4 = _mm256_set1_ps(0.0011193462567257629232f);
- const __m256 FD3 = _mm256_set1_ps(0.014866955030185295499f);
- const __m256 FD2 = _mm256_set1_ps(0.11583842382862377919f);
- const __m256 FD1 = _mm256_set1_ps(0.50736591960530292870f);
- const __m256 FD0 = _mm256_set1_ps(1.0f);
-
- __m256 z4;
- __m256 polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm256_mul_ps(z2, z2);
-
- polyFD0 = _mm256_mul_ps(FD4, z4);
- polyFD1 = _mm256_mul_ps(FD3, z4);
- polyFD0 = _mm256_add_ps(polyFD0, FD2);
- polyFD1 = _mm256_add_ps(polyFD1, FD1);
- polyFD0 = _mm256_mul_ps(polyFD0, z4);
- polyFD1 = _mm256_mul_ps(polyFD1, z2);
- polyFD0 = _mm256_add_ps(polyFD0, FD0);
- polyFD0 = _mm256_add_ps(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm256_inv_ps(polyFD0);
-
- polyFN0 = _mm256_mul_ps(FN6, z4);
- polyFN1 = _mm256_mul_ps(FN5, z4);
- polyFN0 = _mm256_add_ps(polyFN0, FN4);
- polyFN1 = _mm256_add_ps(polyFN1, FN3);
- polyFN0 = _mm256_mul_ps(polyFN0, z4);
- polyFN1 = _mm256_mul_ps(polyFN1, z4);
- polyFN0 = _mm256_add_ps(polyFN0, FN2);
- polyFN1 = _mm256_add_ps(polyFN1, FN1);
- polyFN0 = _mm256_mul_ps(polyFN0, z4);
- polyFN1 = _mm256_mul_ps(polyFN1, z2);
- polyFN0 = _mm256_add_ps(polyFN0, FN0);
- polyFN0 = _mm256_add_ps(polyFN0, polyFN1);
-
- return _mm256_mul_ps(polyFN0, polyFD0);
-}
-
-
-static __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
- const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
- const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
- const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
- const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
- const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
- const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
- const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);
-
- const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
- const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
- const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
- const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
- const __m128 FD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyFD0 = _mm_mul_ps(FD4, z4);
- polyFD1 = _mm_mul_ps(FD3, z4);
- polyFD0 = _mm_add_ps(polyFD0, FD2);
- polyFD1 = _mm_add_ps(polyFD1, FD1);
- polyFD0 = _mm_mul_ps(polyFD0, z4);
- polyFD1 = _mm_mul_ps(polyFD1, z2);
- polyFD0 = _mm_add_ps(polyFD0, FD0);
- polyFD0 = _mm_add_ps(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_ps(polyFD0);
-
- polyFN0 = _mm_mul_ps(FN6, z4);
- polyFN1 = _mm_mul_ps(FN5, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN4);
- polyFN1 = _mm_add_ps(polyFN1, FN3);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN2);
- polyFN1 = _mm_add_ps(polyFN1, FN1);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z2);
- polyFN0 = _mm_add_ps(polyFN0, FN0);
- polyFN0 = _mm_add_ps(polyFN0, polyFN1);
-
- return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- */
-static __m256
-gmx_mm256_pmecorrV_ps(__m256 z2)
-{
- const __m256 VN6 = _mm256_set1_ps(1.9296833005951166339e-8f);
- const __m256 VN5 = _mm256_set1_ps(-1.4213390571557850962e-6f);
- const __m256 VN4 = _mm256_set1_ps(0.000041603292906656984871f);
- const __m256 VN3 = _mm256_set1_ps(-0.00013134036773265025626f);
- const __m256 VN2 = _mm256_set1_ps(0.038657983986041781264f);
- const __m256 VN1 = _mm256_set1_ps(0.11285044772717598220f);
- const __m256 VN0 = _mm256_set1_ps(1.1283802385263030286f);
-
- const __m256 VD3 = _mm256_set1_ps(0.0066752224023576045451f);
- const __m256 VD2 = _mm256_set1_ps(0.078647795836373922256f);
- const __m256 VD1 = _mm256_set1_ps(0.43336185284710920150f);
- const __m256 VD0 = _mm256_set1_ps(1.0f);
-
- __m256 z4;
- __m256 polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm256_mul_ps(z2, z2);
-
- polyVD1 = _mm256_mul_ps(VD3, z4);
- polyVD0 = _mm256_mul_ps(VD2, z4);
- polyVD1 = _mm256_add_ps(polyVD1, VD1);
- polyVD0 = _mm256_add_ps(polyVD0, VD0);
- polyVD1 = _mm256_mul_ps(polyVD1, z2);
- polyVD0 = _mm256_add_ps(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm256_inv_ps(polyVD0);
-
- polyVN0 = _mm256_mul_ps(VN6, z4);
- polyVN1 = _mm256_mul_ps(VN5, z4);
- polyVN0 = _mm256_add_ps(polyVN0, VN4);
- polyVN1 = _mm256_add_ps(polyVN1, VN3);
- polyVN0 = _mm256_mul_ps(polyVN0, z4);
- polyVN1 = _mm256_mul_ps(polyVN1, z4);
- polyVN0 = _mm256_add_ps(polyVN0, VN2);
- polyVN1 = _mm256_add_ps(polyVN1, VN1);
- polyVN0 = _mm256_mul_ps(polyVN0, z4);
- polyVN1 = _mm256_mul_ps(polyVN1, z2);
- polyVN0 = _mm256_add_ps(polyVN0, VN0);
- polyVN0 = _mm256_add_ps(polyVN0, polyVN1);
-
- return _mm256_mul_ps(polyVN0, polyVD0);
-}
-
-
-static __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
- const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
- const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
- const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
- const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
- const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
- const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
- const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);
-
- const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
- const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
- const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
- const __m128 VD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyVD1 = _mm_mul_ps(VD3, z4);
- polyVD0 = _mm_mul_ps(VD2, z4);
- polyVD1 = _mm_add_ps(polyVD1, VD1);
- polyVD0 = _mm_add_ps(polyVD0, VD0);
- polyVD1 = _mm_mul_ps(polyVD1, z2);
- polyVD0 = _mm_add_ps(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_ps(polyVD0);
-
- polyVN0 = _mm_mul_ps(VN6, z4);
- polyVN1 = _mm_mul_ps(VN5, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN4);
- polyVN1 = _mm_add_ps(polyVN1, VN3);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN2);
- polyVN1 = _mm_add_ps(polyVN1, VN1);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z2);
- polyVN0 = _mm_add_ps(polyVN0, VN0);
- polyVN0 = _mm_add_ps(polyVN0, polyVN1);
-
- return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm256_sincos_ps(__m256 x,
- __m256 *sinval,
- __m256 *cosval)
-{
- const __m256 two_over_pi = _mm256_set1_ps(2.0f/(float)M_PI);
- const __m256 half = _mm256_set1_ps(0.5f);
- const __m256 one = _mm256_set1_ps(1.0f);
- const __m256 zero = _mm256_setzero_ps();
-
- const __m128i ione = _mm_set1_epi32(1);
-
- const __m256 mask_one = _mm256_castsi256_ps(_mm256_set1_epi32(1));
- const __m256 mask_two = _mm256_castsi256_ps(_mm256_set1_epi32(2));
- const __m256 mask_three = _mm256_castsi256_ps(_mm256_set1_epi32(3));
-
- const __m256 CA1 = _mm256_set1_ps(1.5703125f);
- const __m256 CA2 = _mm256_set1_ps(4.837512969970703125e-4f);
- const __m256 CA3 = _mm256_set1_ps(7.54978995489188216e-8f);
-
- const __m256 CC0 = _mm256_set1_ps(-0.0013602249f);
- const __m256 CC1 = _mm256_set1_ps(0.0416566950f);
- const __m256 CC2 = _mm256_set1_ps(-0.4999990225f);
- const __m256 CS0 = _mm256_set1_ps(-0.0001950727f);
- const __m256 CS1 = _mm256_set1_ps(0.0083320758f);
- const __m256 CS2 = _mm256_set1_ps(-0.1666665247f);
-
- const __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
-
- __m256 y, y2;
- __m256 z;
- __m256i iz;
- __m128i iz_high, iz_low;
- __m256 offset_sin, offset_cos;
- __m256 mask_sin, mask_cos;
- __m256 tmp1, tmp2;
- __m256 tmp_sin, tmp_cos;
-
- y = _mm256_mul_ps(x, two_over_pi);
- y = _mm256_add_ps(y, _mm256_or_ps(_mm256_and_ps(y, signbit), half));
-
- iz = _mm256_cvttps_epi32(y);
- z = _mm256_round_ps(y, _MM_FROUND_TO_ZERO);
-
- offset_sin = _mm256_and_ps(_mm256_castsi256_ps(iz), mask_three);
-
- iz_high = _mm256_extractf128_si256(iz, 0x1);
- iz_low = _mm256_castsi256_si128(iz);
- iz_low = _mm_add_epi32(iz_low, ione);
- iz_high = _mm_add_epi32(iz_high, ione);
- iz = _mm256_castsi128_si256(iz_low);
- iz = _mm256_insertf128_si256(iz, iz_high, 0x1);
- offset_cos = _mm256_castsi256_ps(iz);
-
- /* Extended precision arithmethic to achieve full precision */
- y = _mm256_mul_ps(z, CA1);
- tmp1 = _mm256_mul_ps(z, CA2);
- tmp2 = _mm256_mul_ps(z, CA3);
- y = _mm256_sub_ps(x, y);
- y = _mm256_sub_ps(y, tmp1);
- y = _mm256_sub_ps(y, tmp2);
-
- y2 = _mm256_mul_ps(y, y);
-
- tmp1 = _mm256_mul_ps(CC0, y2);
- tmp1 = _mm256_add_ps(tmp1, CC1);
- tmp2 = _mm256_mul_ps(CS0, y2);
- tmp2 = _mm256_add_ps(tmp2, CS1);
- tmp1 = _mm256_mul_ps(tmp1, y2);
- tmp1 = _mm256_add_ps(tmp1, CC2);
- tmp2 = _mm256_mul_ps(tmp2, y2);
- tmp2 = _mm256_add_ps(tmp2, CS2);
-
- tmp1 = _mm256_mul_ps(tmp1, y2);
- tmp1 = _mm256_add_ps(tmp1, one);
-
- tmp2 = _mm256_mul_ps(tmp2, _mm256_mul_ps(y, y2));
- tmp2 = _mm256_add_ps(tmp2, y);
-
-#ifdef __INTEL_COMPILER
- /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
- mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_one))), zero, _CMP_EQ_OQ);
- mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_one))), zero, _CMP_EQ_OQ);
-#else
- mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_one), zero, _CMP_EQ_OQ);
- mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_one), zero, _CMP_EQ_OQ);
-#endif
- tmp_sin = _mm256_blendv_ps(tmp1, tmp2, mask_sin);
- tmp_cos = _mm256_blendv_ps(tmp1, tmp2, mask_cos);
-
- tmp1 = _mm256_xor_ps(signbit, tmp_sin);
- tmp2 = _mm256_xor_ps(signbit, tmp_cos);
-
-#ifdef __INTEL_COMPILER
- /* Intel Compiler version 12.1.3 20120130 is buggy if optimization is enabled unless we cast explicitly! */
- mask_sin = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_sin, mask_two))), zero, _CMP_EQ_OQ);
- mask_cos = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(offset_cos, mask_two))), zero, _CMP_EQ_OQ);
-#else
- mask_sin = _mm256_cmp_ps( _mm256_and_ps(offset_sin, mask_two), zero, _CMP_EQ_OQ);
- mask_cos = _mm256_cmp_ps( _mm256_and_ps(offset_cos, mask_two), zero, _CMP_EQ_OQ);
-
-#endif
- *sinval = _mm256_blendv_ps(tmp1, tmp_sin, mask_sin);
- *cosval = _mm256_blendv_ps(tmp2, tmp_cos, mask_cos);
-
- return 0;
-}
-
-static int
-gmx_mm_sincos_ps(__m128 x,
- __m128 *sinval,
- __m128 *cosval)
-{
- const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
- const __m128 half = _mm_set1_ps(0.5);
- const __m128 one = _mm_set1_ps(1.0);
-
- const __m128i izero = _mm_set1_epi32(0);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i itwo = _mm_set1_epi32(2);
- const __m128i ithree = _mm_set1_epi32(3);
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
- const __m128 CA1 = _mm_set1_ps(1.5703125f);
- const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
- const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
-
- const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
- const __m128 CC1 = _mm_set1_ps(0.0416566950f);
- const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
- const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
- const __m128 CS1 = _mm_set1_ps(0.0083320758f);
- const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
-
- __m128 y, y2;
- __m128 z;
- __m128i iz;
- __m128i offset_sin, offset_cos;
- __m128 tmp1, tmp2;
- __m128 mask_sin, mask_cos;
- __m128 tmp_sin, tmp_cos;
-
- y = _mm_mul_ps(x, two_over_pi);
- y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
- iz = _mm_cvttps_epi32(y);
- z = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
- offset_sin = _mm_and_si128(iz, ithree);
- offset_cos = _mm_add_epi32(iz, ione);
-
- /* Extended precision arithmethic to achieve full precision */
- y = _mm_mul_ps(z, CA1);
- tmp1 = _mm_mul_ps(z, CA2);
- tmp2 = _mm_mul_ps(z, CA3);
- y = _mm_sub_ps(x, y);
- y = _mm_sub_ps(y, tmp1);
- y = _mm_sub_ps(y, tmp2);
-
- y2 = _mm_mul_ps(y, y);
-
- tmp1 = _mm_mul_ps(CC0, y2);
- tmp1 = _mm_add_ps(tmp1, CC1);
- tmp2 = _mm_mul_ps(CS0, y2);
- tmp2 = _mm_add_ps(tmp2, CS1);
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, CC2);
- tmp2 = _mm_mul_ps(tmp2, y2);
- tmp2 = _mm_add_ps(tmp2, CS2);
-
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, one);
-
- tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
- tmp2 = _mm_add_ps(tmp2, y);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
- tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
- tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
- tmp1 = _mm_xor_ps(signbit, tmp_sin);
- tmp2 = _mm_xor_ps(signbit, tmp_cos);
-
- *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
- *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
- return 0;
-}
-
-
-
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m256
-gmx_mm256_sin_ps(__m256 x)
-{
- __m256 s, c;
- gmx_mm256_sincos_ps(x, &s, &c);
- return s;
-}
-
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return s;
-}
-
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m256
-gmx_mm256_cos_ps(__m256 x)
-{
- __m256 s, c;
- gmx_mm256_sincos_ps(x, &s, &c);
- return c;
-}
-
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return c;
-}
-
-
-static __m256
-gmx_mm256_tan_ps(__m256 x)
-{
- __m256 sinval, cosval;
- __m256 tanval;
-
- gmx_mm256_sincos_ps(x, &sinval, &cosval);
-
- tanval = _mm256_mul_ps(sinval, gmx_mm256_inv_ps(cosval));
-
- return tanval;
-}
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
- __m128 sinval, cosval;
- __m128 tanval;
-
- gmx_mm_sincos_ps(x, &sinval, &cosval);
-
- tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
- return tanval;
-}
-
-
-static __m256
-gmx_mm256_asin_ps(__m256 x)
-{
- const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
- const __m256 limitlow = _mm256_set1_ps(1e-4f);
- const __m256 half = _mm256_set1_ps(0.5f);
- const __m256 one = _mm256_set1_ps(1.0f);
- const __m256 halfpi = _mm256_set1_ps((float)M_PI/2.0f);
-
- const __m256 CC5 = _mm256_set1_ps(4.2163199048E-2f);
- const __m256 CC4 = _mm256_set1_ps(2.4181311049E-2f);
- const __m256 CC3 = _mm256_set1_ps(4.5470025998E-2f);
- const __m256 CC2 = _mm256_set1_ps(7.4953002686E-2f);
- const __m256 CC1 = _mm256_set1_ps(1.6666752422E-1f);
-
- __m256 sign;
- __m256 mask;
- __m256 xabs;
- __m256 z, z1, z2, q, q1, q2;
- __m256 pA, pB;
-
- sign = _mm256_andnot_ps(signmask, x);
- xabs = _mm256_and_ps(x, signmask);
-
- mask = _mm256_cmp_ps(xabs, half, _CMP_GT_OQ);
-
- z1 = _mm256_mul_ps(half, _mm256_sub_ps(one, xabs));
- q1 = _mm256_mul_ps(z1, gmx_mm256_invsqrt_ps(z1));
- q1 = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one, _CMP_EQ_OQ), q1);
-
- q2 = xabs;
- z2 = _mm256_mul_ps(q2, q2);
-
- z = _mm256_blendv_ps(z2, z1, mask);
- q = _mm256_blendv_ps(q2, q1, mask);
-
- z2 = _mm256_mul_ps(z, z);
-
- pA = _mm256_mul_ps(CC5, z2);
- pB = _mm256_mul_ps(CC4, z2);
-
- pA = _mm256_add_ps(pA, CC3);
- pB = _mm256_add_ps(pB, CC2);
-
- pA = _mm256_mul_ps(pA, z2);
- pB = _mm256_mul_ps(pB, z2);
-
- pA = _mm256_add_ps(pA, CC1);
- pA = _mm256_mul_ps(pA, z);
-
- z = _mm256_add_ps(pA, pB);
- z = _mm256_mul_ps(z, q);
- z = _mm256_add_ps(z, q);
-
- q2 = _mm256_sub_ps(halfpi, z);
- q2 = _mm256_sub_ps(q2, z);
-
- z = _mm256_blendv_ps(z, q2, mask);
-
- mask = _mm256_cmp_ps(xabs, limitlow, _CMP_GT_OQ);
- z = _mm256_blendv_ps(xabs, z, mask);
-
- z = _mm256_xor_ps(z, sign);
-
- return z;
-}
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limitlow = _mm_set1_ps(1e-4f);
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
-
- const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
- const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
- const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
- const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
-
- __m128 sign;
- __m128 mask;
- __m128 xabs;
- __m128 z, z1, z2, q, q1, q2;
- __m128 pA, pB;
-
- sign = _mm_andnot_ps(signmask, x);
- xabs = _mm_and_ps(x, signmask);
-
- mask = _mm_cmpgt_ps(xabs, half);
-
- z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
- q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
- q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
- q2 = xabs;
- z2 = _mm_mul_ps(q2, q2);
-
- z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
- q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
- z2 = _mm_mul_ps(z, z);
-
- pA = _mm_mul_ps(CC5, z2);
- pB = _mm_mul_ps(CC4, z2);
-
- pA = _mm_add_ps(pA, CC3);
- pB = _mm_add_ps(pB, CC2);
-
- pA = _mm_mul_ps(pA, z2);
- pB = _mm_mul_ps(pB, z2);
-
- pA = _mm_add_ps(pA, CC1);
- pA = _mm_mul_ps(pA, z);
-
- z = _mm_add_ps(pA, pB);
- z = _mm_mul_ps(z, q);
- z = _mm_add_ps(z, q);
-
- q2 = _mm_sub_ps(halfpi, z);
- q2 = _mm_sub_ps(q2, z);
-
- z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
- mask = _mm_cmpgt_ps(xabs, limitlow);
- z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
- z = _mm_xor_ps(z, sign);
-
- return z;
-}
-
-
-static __m256
-gmx_mm256_acos_ps(__m256 x)
-{
- const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
- const __m256 one_ps = _mm256_set1_ps(1.0f);
- const __m256 half_ps = _mm256_set1_ps(0.5f);
- const __m256 pi_ps = _mm256_set1_ps((float)M_PI);
- const __m256 halfpi_ps = _mm256_set1_ps((float)M_PI/2.0f);
-
- __m256 mask1;
- __m256 mask2;
- __m256 xabs;
- __m256 z, z1, z2, z3;
-
- xabs = _mm256_and_ps(x, signmask);
- mask1 = _mm256_cmp_ps(xabs, half_ps, _CMP_GT_OQ);
- mask2 = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_GT_OQ);
-
- z = _mm256_mul_ps(half_ps, _mm256_sub_ps(one_ps, xabs));
- z = _mm256_mul_ps(z, gmx_mm256_invsqrt_ps(z));
- z = _mm256_andnot_ps(_mm256_cmp_ps(xabs, one_ps, _CMP_EQ_OQ), z);
-
- z = _mm256_blendv_ps(x, z, mask1);
- z = gmx_mm256_asin_ps(z);
-
- z2 = _mm256_add_ps(z, z);
- z1 = _mm256_sub_ps(pi_ps, z2);
- z3 = _mm256_sub_ps(halfpi_ps, z);
-
- z = _mm256_blendv_ps(z1, z2, mask2);
- z = _mm256_blendv_ps(z3, z, mask1);
-
- return z;
-}
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 one_ps = _mm_set1_ps(1.0f);
- const __m128 half_ps = _mm_set1_ps(0.5f);
- const __m128 pi_ps = _mm_set1_ps(M_PI);
- const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
- __m128 mask1;
- __m128 mask2;
- __m128 xabs;
- __m128 z, z1, z2, z3;
-
- xabs = _mm_and_ps(x, signmask);
- mask1 = _mm_cmpgt_ps(xabs, half_ps);
- mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
-
- z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
- z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
- z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
- z = _mm_blendv_ps(x, z, mask1);
- z = gmx_mm_asin_ps(z);
-
- z2 = _mm_add_ps(z, z);
- z1 = _mm_sub_ps(pi_ps, z2);
- z3 = _mm_sub_ps(halfpi_ps, z);
-
- z = _mm_blendv_ps(z1, z2, mask2);
- z = _mm_blendv_ps(z3, z, mask1);
-
- return z;
-}
-
-
-static __m256
-gmx_mm256_atan_ps(__m256 x)
-{
- const __m256 signmask = _mm256_castsi256_ps( _mm256_set1_epi32(0x7FFFFFFF) );
- const __m256 limit1 = _mm256_set1_ps(0.414213562373095f);
- const __m256 limit2 = _mm256_set1_ps(2.414213562373095f);
- const __m256 quarterpi = _mm256_set1_ps(0.785398163397448f);
- const __m256 halfpi = _mm256_set1_ps(1.570796326794896f);
- const __m256 mone = _mm256_set1_ps(-1.0f);
- const __m256 CC3 = _mm256_set1_ps(-3.33329491539E-1f);
- const __m256 CC5 = _mm256_set1_ps(1.99777106478E-1f);
- const __m256 CC7 = _mm256_set1_ps(-1.38776856032E-1);
- const __m256 CC9 = _mm256_set1_ps(8.05374449538e-2f);
-
- __m256 sign;
- __m256 mask1, mask2;
- __m256 y, z1, z2;
- __m256 x2, x4;
- __m256 sum1, sum2;
-
- sign = _mm256_andnot_ps(signmask, x);
- x = _mm256_and_ps(x, signmask);
-
- mask1 = _mm256_cmp_ps(x, limit1, _CMP_GT_OQ);
- mask2 = _mm256_cmp_ps(x, limit2, _CMP_GT_OQ);
-
- z1 = _mm256_mul_ps(_mm256_add_ps(x, mone), gmx_mm256_inv_ps(_mm256_sub_ps(x, mone)));
- z2 = _mm256_mul_ps(mone, gmx_mm256_inv_ps(x));
-
- y = _mm256_and_ps(mask1, quarterpi);
- y = _mm256_blendv_ps(y, halfpi, mask2);
-
- x = _mm256_blendv_ps(x, z1, mask1);
- x = _mm256_blendv_ps(x, z2, mask2);
-
- x2 = _mm256_mul_ps(x, x);
- x4 = _mm256_mul_ps(x2, x2);
-
- sum1 = _mm256_mul_ps(CC9, x4);
- sum2 = _mm256_mul_ps(CC7, x4);
- sum1 = _mm256_add_ps(sum1, CC5);
- sum2 = _mm256_add_ps(sum2, CC3);
- sum1 = _mm256_mul_ps(sum1, x4);
- sum2 = _mm256_mul_ps(sum2, x2);
-
- sum1 = _mm256_add_ps(sum1, sum2);
- sum1 = _mm256_sub_ps(sum1, mone);
- sum1 = _mm256_mul_ps(sum1, x);
- y = _mm256_add_ps(y, sum1);
-
- y = _mm256_xor_ps(y, sign);
-
- return y;
-}
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
- const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
- const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
- const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
- const __m128 mone = _mm_set1_ps(-1.0f);
- const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
- const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
- const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1);
- const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
-
- __m128 sign;
- __m128 mask1, mask2;
- __m128 y, z1, z2;
- __m128 x2, x4;
- __m128 sum1, sum2;
-
- sign = _mm_andnot_ps(signmask, x);
- x = _mm_and_ps(x, signmask);
-
- mask1 = _mm_cmpgt_ps(x, limit1);
- mask2 = _mm_cmpgt_ps(x, limit2);
-
- z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
- z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
- y = _mm_and_ps(mask1, quarterpi);
- y = _mm_blendv_ps(y, halfpi, mask2);
-
- x = _mm_blendv_ps(x, z1, mask1);
- x = _mm_blendv_ps(x, z2, mask2);
-
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- sum1 = _mm_mul_ps(CC9, x4);
- sum2 = _mm_mul_ps(CC7, x4);
- sum1 = _mm_add_ps(sum1, CC5);
- sum2 = _mm_add_ps(sum2, CC3);
- sum1 = _mm_mul_ps(sum1, x4);
- sum2 = _mm_mul_ps(sum2, x2);
-
- sum1 = _mm_add_ps(sum1, sum2);
- sum1 = _mm_sub_ps(sum1, mone);
- sum1 = _mm_mul_ps(sum1, x);
- y = _mm_add_ps(y, sum1);
-
- y = _mm_xor_ps(y, sign);
-
- return y;
-}
-
-
-static __m256
-gmx_mm256_atan2_ps(__m256 y, __m256 x)
-{
- const __m256 pi = _mm256_set1_ps( (float) M_PI);
- const __m256 minuspi = _mm256_set1_ps( (float) -M_PI);
- const __m256 halfpi = _mm256_set1_ps( (float) M_PI/2.0f);
- const __m256 minushalfpi = _mm256_set1_ps( (float) -M_PI/2.0f);
-
- __m256 z, z1, z3, z4;
- __m256 w;
- __m256 maskx_lt, maskx_eq;
- __m256 masky_lt, masky_eq;
- __m256 mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
- masky_lt = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OQ);
- maskx_eq = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
- masky_eq = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OQ);
-
- z = _mm256_mul_ps(y, gmx_mm256_inv_ps(x));
- z = gmx_mm256_atan_ps(z);
-
- mask1 = _mm256_and_ps(maskx_eq, masky_lt);
- mask2 = _mm256_andnot_ps(maskx_lt, masky_eq);
- mask3 = _mm256_andnot_ps( _mm256_or_ps(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm256_and_ps(maskx_lt, masky_eq);
- maskall = _mm256_or_ps( _mm256_or_ps(mask1, mask2), _mm256_or_ps(mask3, mask4) );
-
- z = _mm256_andnot_ps(maskall, z);
- z1 = _mm256_and_ps(mask1, minushalfpi);
- z3 = _mm256_and_ps(mask3, halfpi);
- z4 = _mm256_and_ps(mask4, pi);
-
- z = _mm256_or_ps( _mm256_or_ps(z, z1), _mm256_or_ps(z3, z4) );
-
- w = _mm256_blendv_ps(pi, minuspi, masky_lt);
- w = _mm256_and_ps(w, maskx_lt);
-
- w = _mm256_andnot_ps(maskall, w);
-
- z = _mm256_add_ps(z, w);
-
- return z;
-}
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
- const __m128 pi = _mm_set1_ps(M_PI);
- const __m128 minuspi = _mm_set1_ps(-M_PI);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
- const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
- __m128 z, z1, z3, z4;
- __m128 w;
- __m128 maskx_lt, maskx_eq;
- __m128 masky_lt, masky_eq;
- __m128 mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
- masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
- maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
- masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
- z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
- z = gmx_mm_atan_ps(z);
-
- mask1 = _mm_and_ps(maskx_eq, masky_lt);
- mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
- mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_ps(masky_eq, maskx_lt);
-
- maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
- z = _mm_andnot_ps(maskall, z);
- z1 = _mm_and_ps(mask1, minushalfpi);
- z3 = _mm_and_ps(mask3, halfpi);
- z4 = _mm_and_ps(mask4, pi);
-
- z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
- mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
- mask2 = _mm_and_ps(maskx_lt, masky_lt);
-
- w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
- w = _mm_andnot_ps(maskall, w);
-
- z = _mm_add_ps(z, w);
- return z;
-}
+#define gmx_mm256_invsqrt_ps gmx_simd_invsqrt_f
+#define gmx_mm256_inv_ps gmx_simd_inv_f
+#define gmx_mm256_log_ps gmx_simd_log_f
+#define gmx_mm256_pmecorrF_ps gmx_simd_pmecorrF_f
+#define gmx_mm256_pmecorrV_ps gmx_simd_pmecorrV_f
+#define gmx_mm256_sincos_ps gmx_simd_sincos_f
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_SSE2_DOUBLE_H
#define GMX_SIMD_MATH_SSE2_DOUBLE_H
+#include "simd_math.h"
-#include <stdio.h>
-#include <math.h>
-
-#include "general_x86_sse2.h"
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
- lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
- return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128 halff = _mm_set1_ps(0.5f);
- const __m128 threef = _mm_set1_ps(3.0f);
-
- __m128 xf, luf;
- __m128d lu1, lu2;
-
- /* Do first N-R step in float for 2x throughput */
- xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
- luf = _mm_rsqrt_ps(xf);
- luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
- lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
- lu1 = _mm_cvtps_pd(luf);
-
- *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
- *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
- __m128d mask;
- __m128d res;
-
- mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
- res = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
- res = _mm_mul_pd(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
- const __m128d two = _mm_set1_pd(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
- /* Perform two N-R steps for double precision */
- lu = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
- return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2);
- const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1);
- const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3);
- /* Q2 == 1.0 */
- const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2);
- const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d z, z2;
- __m128d PolyP, PolyQ;
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_cvtepi32_pd(iexppart);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(x, intpart);
- z2 = _mm_mul_pd(z, z);
-
- PolyP = _mm_mul_pd(P2, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(z2, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P0);
- PolyQ = _mm_add_pd(PolyQ, Q0);
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- */
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
- const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1);
- const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6);
-
- const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4);
- const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2);
- /* P0 == 1.0 */
- const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6);
- const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3);
- const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1);
- /* Q0 == 2.0 */
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d x, z, z2;
- __m128d PolyP, PolyQ;
-
- x = _mm_mul_pd(exparg, argscale);
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_cvtepi32_pd(iexppart);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
- z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
- z2 = _mm_mul_pd(z, z);
-
- PolyQ = _mm_mul_pd(Q3, z2);
- PolyQ = _mm_add_pd(PolyQ, Q2);
- PolyP = _mm_mul_pd(P2, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(PolyQ, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, one);
- PolyQ = _mm_add_pd(PolyQ, two);
-
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
- const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
-
- const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
- const __m128d corr2 = _mm_set1_pd(0.693359375);
-
- const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4);
- const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1);
- const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0);
- const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1);
- const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1);
- const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0);
-
- const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1);
- const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1);
- const __m128d Q2 = _mm_set1_pd(8.29875266912776603211e1);
- const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1);
- const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1);
-
- const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1);
- const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1);
- const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1);
-
- const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
- const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
- const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
-
- __m128d fexp;
- __m128i iexp;
-
- __m128d mask1, mask2;
- __m128d corr, t1, t2, q;
- __m128d zA, yA, xA, zB, yB, xB, z;
- __m128d polyR, polyS;
- __m128d polyP1, polyP2, polyQ1, polyQ2;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_pd(x, expmask);
- iexp = gmx_mm_castpd_si128(fexp);
- iexp = _mm_srli_epi64(iexp, 52);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
- iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
- fexp = _mm_cvtepi32_pd(iexp);
-
- x = _mm_andnot_pd(expmask, x);
- x = _mm_or_pd(x, one);
- x = _mm_mul_pd(x, half);
-
- mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
- mask2 = _mm_cmplt_pd(x, invsq2);
-
- fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
- /* If mask1 is set ('A') */
- zA = _mm_sub_pd(x, half);
- t1 = _mm_or_pd( _mm_andnot_pd(mask2, zA), _mm_and_pd(mask2, x) );
- zA = _mm_sub_pd(t1, half);
- t2 = _mm_or_pd( _mm_andnot_pd(mask2, x), _mm_and_pd(mask2, zA) );
- yA = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
- xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
- zA = _mm_mul_pd(xA, xA);
-
- /* EVALUATE POLY */
- polyR = _mm_mul_pd(R2, zA);
- polyR = _mm_add_pd(polyR, R1);
- polyR = _mm_mul_pd(polyR, zA);
- polyR = _mm_add_pd(polyR, R0);
-
- polyS = _mm_add_pd(zA, S2);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S1);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S0);
-
- q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
- zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
- zA = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
- zA = _mm_add_pd(zA, xA);
- zA = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
- /* If mask1 is not set ('B') */
- corr = _mm_and_pd(mask2, x);
- xB = _mm_add_pd(x, corr);
- xB = _mm_sub_pd(xB, one);
- zB = _mm_mul_pd(xB, xB);
-
- polyP1 = _mm_mul_pd(P5, zB);
- polyP2 = _mm_mul_pd(P4, zB);
- polyP1 = _mm_add_pd(polyP1, P3);
- polyP2 = _mm_add_pd(polyP2, P2);
- polyP1 = _mm_mul_pd(polyP1, zB);
- polyP2 = _mm_mul_pd(polyP2, zB);
- polyP1 = _mm_add_pd(polyP1, P1);
- polyP2 = _mm_add_pd(polyP2, P0);
- polyP1 = _mm_mul_pd(polyP1, xB);
- polyP1 = _mm_add_pd(polyP1, polyP2);
-
- polyQ2 = _mm_mul_pd(Q4, zB);
- polyQ1 = _mm_add_pd(zB, Q3);
- polyQ2 = _mm_add_pd(polyQ2, Q2);
- polyQ1 = _mm_mul_pd(polyQ1, zB);
- polyQ2 = _mm_mul_pd(polyQ2, zB);
- polyQ1 = _mm_add_pd(polyQ1, Q1);
- polyQ2 = _mm_add_pd(polyQ2, Q0);
- polyQ1 = _mm_mul_pd(polyQ1, xB);
- polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
- fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
- q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
- yB = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
- yB = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
- yB = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
- zB = _mm_add_pd(xB, yB);
- zB = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
- z = _mm_or_pd( _mm_andnot_pd(mask1, zB), _mm_and_pd(mask1, zA) );
-
- return z;
-}
-
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_or_pd(_mm_andnot_pd(mask, _mm_sub_pd(one, res_erfc)), _mm_and_pd(mask, res_erf));
-
- return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfcB), _mm_and_pd(mask, res_erfcC));
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(two, res_erfc)));
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_or_pd(_mm_andnot_pd(mask, res_erfc), _mm_and_pd(mask, _mm_sub_pd(one, res_erf)));
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
- const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
- const __m128d FN9 = _mm_set1_pd(1.1859116242260148027e-11);
- const __m128d FN8 = _mm_set1_pd(-8.1490406329798423616e-10);
- const __m128d FN7 = _mm_set1_pd(3.4404793543907847655e-8);
- const __m128d FN6 = _mm_set1_pd(-9.9471420832602741006e-7);
- const __m128d FN5 = _mm_set1_pd(0.000020740315999115847456);
- const __m128d FN4 = _mm_set1_pd(-0.00031991745139313364005);
- const __m128d FN3 = _mm_set1_pd(0.0035074449373659008203);
- const __m128d FN2 = _mm_set1_pd(-0.031750380176100813405);
- const __m128d FN1 = _mm_set1_pd(0.13884101728898463426);
- const __m128d FN0 = _mm_set1_pd(-0.75225277815249618847);
-
- const __m128d FD5 = _mm_set1_pd(0.000016009278224355026701);
- const __m128d FD4 = _mm_set1_pd(0.00051055686934806966046);
- const __m128d FD3 = _mm_set1_pd(0.0081803507497974289008);
- const __m128d FD2 = _mm_set1_pd(0.077181146026670287235);
- const __m128d FD1 = _mm_set1_pd(0.41543303143712535988);
- const __m128d FD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyFD1 = _mm_mul_pd(FD5, z4);
- polyFD0 = _mm_mul_pd(FD4, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD3);
- polyFD0 = _mm_add_pd(polyFD0, FD2);
- polyFD1 = _mm_mul_pd(polyFD1, z4);
- polyFD0 = _mm_mul_pd(polyFD0, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD1);
- polyFD0 = _mm_add_pd(polyFD0, FD0);
- polyFD1 = _mm_mul_pd(polyFD1, z2);
- polyFD0 = _mm_add_pd(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_pd(polyFD0);
-
- polyFN0 = _mm_mul_pd(FN10, z4);
- polyFN1 = _mm_mul_pd(FN9, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN8);
- polyFN1 = _mm_add_pd(polyFN1, FN7);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN6);
- polyFN1 = _mm_add_pd(polyFN1, FN5);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN4);
- polyFN1 = _mm_add_pd(polyFN1, FN3);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN2);
- polyFN1 = _mm_add_pd(polyFN1, FN1);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z2);
- polyFN0 = _mm_add_pd(polyFN0, FN0);
- polyFN0 = _mm_add_pd(polyFN0, polyFN1);
-
- return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- *
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
- const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13);
- const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10);
- const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9);
- const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7);
- const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6);
- const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389);
- const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792);
- const __m128d VN2 = _mm_set1_pd(0.045247661136833092885);
- const __m128d VN1 = _mm_set1_pd(0.11643931522926034421);
- const __m128d VN0 = _mm_set1_pd(1.1283791671726767970);
-
- const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342);
- const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448);
- const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504);
- const __m128d VD2 = _mm_set1_pd(0.085608012351550627051);
- const __m128d VD1 = _mm_set1_pd(0.43652499166614811084);
- const __m128d VD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyVD1 = _mm_mul_pd(VD5, z4);
- polyVD0 = _mm_mul_pd(VD4, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD3);
- polyVD0 = _mm_add_pd(polyVD0, VD2);
- polyVD1 = _mm_mul_pd(polyVD1, z4);
- polyVD0 = _mm_mul_pd(polyVD0, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD1);
- polyVD0 = _mm_add_pd(polyVD0, VD0);
- polyVD1 = _mm_mul_pd(polyVD1, z2);
- polyVD0 = _mm_add_pd(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_pd(polyVD0);
-
- polyVN1 = _mm_mul_pd(VN9, z4);
- polyVN0 = _mm_mul_pd(VN8, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN7);
- polyVN0 = _mm_add_pd(polyVN0, VN6);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN5);
- polyVN0 = _mm_add_pd(polyVN0, VN4);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN3);
- polyVN0 = _mm_add_pd(polyVN0, VN2);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN1);
- polyVN0 = _mm_add_pd(polyVN0, VN0);
- polyVN1 = _mm_mul_pd(polyVN1, z2);
- polyVN0 = _mm_add_pd(polyVN0, polyVN1);
-
- return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-
-static int
-gmx_mm_sincos_pd(__m128d x,
- __m128d *sinval,
- __m128d *cosval)
-{
-#ifdef _MSC_VER
- __declspec(align(16))
- const double sintable[34] =
- {
- 1.00000000000000000e+00, 0.00000000000000000e+00,
- 9.95184726672196929e-01, 9.80171403295606036e-02,
- 9.80785280403230431e-01, 1.95090322016128248e-01,
- 9.56940335732208824e-01, 2.90284677254462331e-01,
- 9.23879532511286738e-01, 3.82683432365089782e-01,
- 8.81921264348355050e-01, 4.71396736825997642e-01,
- 8.31469612302545236e-01, 5.55570233019602178e-01,
- 7.73010453362736993e-01, 6.34393284163645488e-01,
- 7.07106781186547573e-01, 7.07106781186547462e-01,
- 6.34393284163645599e-01, 7.73010453362736882e-01,
- 5.55570233019602289e-01, 8.31469612302545125e-01,
- 4.71396736825997809e-01, 8.81921264348354939e-01,
- 3.82683432365089837e-01, 9.23879532511286738e-01,
- 2.90284677254462276e-01, 9.56940335732208935e-01,
- 1.95090322016128304e-01, 9.80785280403230431e-01,
- 9.80171403295607702e-02, 9.95184726672196818e-01,
- 0.0, 1.00000000000000000e+00
- };
-#else
- const __m128d sintable[17] =
- {
- _mm_set_pd( 0.0, 1.0 ),
- _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0), cos( 4.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0), cos( 5.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0), cos( 6.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0), cos( 7.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0), cos( 8.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0), cos( 9.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( 1.0, 0.0 )
- };
-#endif
-
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
- const __m128d invtabscale0 = _mm_set1_pd(9.81747508049011230469e-02);
- const __m128d invtabscale1 = _mm_set1_pd(1.96197799156550576057e-08);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i i32 = _mm_set1_epi32(32);
- const __m128i i16 = _mm_set1_epi32(16);
- const __m128i tabmask = _mm_set1_epi32(0x3F);
- const __m128d sinP7 = _mm_set1_pd(-1.0/5040.0);
- const __m128d sinP5 = _mm_set1_pd(1.0/120.0);
- const __m128d sinP3 = _mm_set1_pd(-1.0/6.0);
- const __m128d sinP1 = _mm_set1_pd(1.0);
-
- const __m128d cosP6 = _mm_set1_pd(-1.0/720.0);
- const __m128d cosP4 = _mm_set1_pd(1.0/24.0);
- const __m128d cosP2 = _mm_set1_pd(-1.0/2.0);
- const __m128d cosP0 = _mm_set1_pd(1.0);
-
- __m128d scalex;
- __m128i tabidx, corridx;
- __m128d xabs, z, z2, polySin, polyCos;
- __m128d xpoint;
- __m128d ypoint0, ypoint1;
-
- __m128d sinpoint, cospoint;
- __m128d xsign, ssign, csign;
- __m128i imask, sswapsign, cswapsign;
-
- xsign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- scalex = _mm_mul_pd(tabscale, xabs);
- tabidx = _mm_cvtpd_epi32(scalex);
-
- xpoint = _mm_cvtepi32_pd(tabidx);
-
- /* Extended precision arithmetics */
- z = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
- z = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
- /* Range reduction to 0..2*Pi */
- tabidx = _mm_and_si128(tabidx, tabmask);
-
- /* tabidx is now in range [0,..,64] */
- imask = _mm_cmpgt_epi32(tabidx, i32);
- sswapsign = imask;
- cswapsign = imask;
- corridx = _mm_and_si128(imask, i32);
- tabidx = _mm_sub_epi32(tabidx, corridx);
-
- /* tabidx is now in range [0..32] */
- imask = _mm_cmpgt_epi32(tabidx, i16);
- cswapsign = _mm_xor_si128(cswapsign, imask);
- corridx = _mm_sub_epi32(i32, tabidx);
- tabidx = _mm_or_si128( _mm_and_si128(imask, corridx), _mm_andnot_si128(imask, tabidx) );
- /* tabidx is now in range [0..16] */
- ssign = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
- csign = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
- ypoint0 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 0));
- ypoint1 = _mm_load_pd(sintable + 2*gmx_mm_extract_epi32(tabidx, 1));
-#else
- ypoint0 = sintable[gmx_mm_extract_epi32(tabidx, 0)];
- ypoint1 = sintable[gmx_mm_extract_epi32(tabidx, 1)];
-#endif
- sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
- cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
- sinpoint = _mm_mul_pd(sinpoint, ssign);
- cospoint = _mm_mul_pd(cospoint, csign);
-
- z2 = _mm_mul_pd(z, z);
-
- polySin = _mm_mul_pd(sinP7, z2);
- polySin = _mm_add_pd(polySin, sinP5);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP3);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP1);
- polySin = _mm_mul_pd(polySin, z);
-
- polyCos = _mm_mul_pd(cosP6, z2);
- polyCos = _mm_add_pd(polyCos, cosP4);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP2);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP0);
-
- *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
- *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return c;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
- __m128d sinval, cosval;
- __m128d tanval;
-
- gmx_mm_sincos_pd(x, &sinval, &cosval);
-
- tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
- return tanval;
-}
-
-
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.625);
- const __m128d limit2 = _mm_set1_pd(1e-8);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17);
-
- const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3);
- const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1);
- const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0);
- const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1);
- const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1);
- const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0);
-
- const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1);
- const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1);
- const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2);
- const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2);
- const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1);
-
- const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3);
- const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1);
- const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0);
- const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1);
- const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1);
-
- const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1);
- const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2);
- const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2);
- const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2);
-
- __m128d sign;
- __m128d mask;
- __m128d xabs;
- __m128d zz, ww, z, q, w, zz2, ww2;
- __m128d PA, PB;
- __m128d QA, QB;
- __m128d RA, RB;
- __m128d SA, SB;
- __m128d nom, denom;
-
- sign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- mask = _mm_cmpgt_pd(xabs, limit1);
-
- zz = _mm_sub_pd(one, xabs);
- ww = _mm_mul_pd(xabs, xabs);
- zz2 = _mm_mul_pd(zz, zz);
- ww2 = _mm_mul_pd(ww, ww);
-
- /* R */
- RA = _mm_mul_pd(R4, zz2);
- RB = _mm_mul_pd(R3, zz2);
- RA = _mm_add_pd(RA, R2);
- RB = _mm_add_pd(RB, R1);
- RA = _mm_mul_pd(RA, zz2);
- RB = _mm_mul_pd(RB, zz);
- RA = _mm_add_pd(RA, R0);
- RA = _mm_add_pd(RA, RB);
-
- /* S, SA = zz2 */
- SB = _mm_mul_pd(S3, zz2);
- SA = _mm_add_pd(zz2, S2);
- SB = _mm_add_pd(SB, S1);
- SA = _mm_mul_pd(SA, zz2);
- SB = _mm_mul_pd(SB, zz);
- SA = _mm_add_pd(SA, S0);
- SA = _mm_add_pd(SA, SB);
-
- /* P */
- PA = _mm_mul_pd(P5, ww2);
- PB = _mm_mul_pd(P4, ww2);
- PA = _mm_add_pd(PA, P3);
- PB = _mm_add_pd(PB, P2);
- PA = _mm_mul_pd(PA, ww2);
- PB = _mm_mul_pd(PB, ww2);
- PA = _mm_add_pd(PA, P1);
- PB = _mm_add_pd(PB, P0);
- PA = _mm_mul_pd(PA, ww);
- PA = _mm_add_pd(PA, PB);
-
- /* Q, QA = ww2 */
- QB = _mm_mul_pd(Q4, ww2);
- QA = _mm_add_pd(ww2, Q3);
- QB = _mm_add_pd(QB, Q2);
- QA = _mm_mul_pd(QA, ww2);
- QB = _mm_mul_pd(QB, ww2);
- QA = _mm_add_pd(QA, Q1);
- QB = _mm_add_pd(QB, Q0);
- QA = _mm_mul_pd(QA, ww);
- QA = _mm_add_pd(QA, QB);
-
- RA = _mm_mul_pd(RA, zz);
- PA = _mm_mul_pd(PA, ww);
-
- nom = _mm_or_pd( _mm_andnot_pd(mask, PA), _mm_and_pd(mask, RA) );
- denom = _mm_or_pd( _mm_andnot_pd(mask, QA), _mm_and_pd(mask, SA) );
-
- q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
- zz = _mm_add_pd(zz, zz);
- zz = gmx_mm_sqrt_pd(zz);
- z = _mm_sub_pd(quarterpi, zz);
- zz = _mm_mul_pd(zz, q);
- zz = _mm_sub_pd(zz, morebits);
- z = _mm_sub_pd(z, zz);
- z = _mm_add_pd(z, quarterpi);
-
- w = _mm_mul_pd(xabs, q);
- w = _mm_add_pd(w, xabs);
-
- z = _mm_or_pd( _mm_andnot_pd(mask, w), _mm_and_pd(mask, z) );
-
- mask = _mm_cmpgt_pd(xabs, limit2);
- z = _mm_or_pd( _mm_andnot_pd(mask, xabs), _mm_and_pd(mask, z) );
-
- z = _mm_xor_pd(z, sign);
-
- return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
- const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
- __m128d mask1;
-
- __m128d z, z1, z2;
-
- mask1 = _mm_cmpgt_pd(x, half);
- z1 = _mm_mul_pd(half, _mm_sub_pd(one, x));
- z1 = gmx_mm_sqrt_pd(z1);
- z = _mm_or_pd( _mm_andnot_pd(mask1, x), _mm_and_pd(mask1, z1) );
-
- z = gmx_mm_asin_pd(z);
-
- z1 = _mm_add_pd(z, z);
-
- z2 = _mm_sub_pd(quarterpi0, z);
- z2 = _mm_add_pd(z2, quarterpi1);
- z2 = _mm_add_pd(z2, quarterpi0);
-
- z = _mm_or_pd(_mm_andnot_pd(mask1, z2), _mm_and_pd(mask1, z1));
-
- return z;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.66);
- const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d mone = _mm_set1_pd(-1.0);
- const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
- const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
- const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
- const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
- const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
- const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
- const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
-
- const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
- const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
- const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
- const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
- const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
-
- __m128d sign;
- __m128d mask1, mask2;
- __m128d y, t1, t2;
- __m128d z, z2;
- __m128d P_A, P_B, Q_A, Q_B;
-
- sign = _mm_andnot_pd(signmask, x);
- x = _mm_and_pd(x, signmask);
-
- mask1 = _mm_cmpgt_pd(x, limit1);
- mask2 = _mm_cmpgt_pd(x, limit2);
-
- t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
- t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
- y = _mm_and_pd(mask1, quarterpi);
- y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
- x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
- x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
- z = _mm_mul_pd(x, x);
- z2 = _mm_mul_pd(z, z);
-
- P_A = _mm_mul_pd(P4, z2);
- P_B = _mm_mul_pd(P3, z2);
- P_A = _mm_add_pd(P_A, P2);
- P_B = _mm_add_pd(P_B, P1);
- P_A = _mm_mul_pd(P_A, z2);
- P_B = _mm_mul_pd(P_B, z);
- P_A = _mm_add_pd(P_A, P0);
- P_A = _mm_add_pd(P_A, P_B);
-
- /* Q_A = z2 */
- Q_B = _mm_mul_pd(Q4, z2);
- Q_A = _mm_add_pd(z2, Q3);
- Q_B = _mm_add_pd(Q_B, Q2);
- Q_A = _mm_mul_pd(Q_A, z2);
- Q_B = _mm_mul_pd(Q_B, z2);
- Q_A = _mm_add_pd(Q_A, Q1);
- Q_B = _mm_add_pd(Q_B, Q0);
- Q_A = _mm_mul_pd(Q_A, z);
- Q_A = _mm_add_pd(Q_A, Q_B);
-
- z = _mm_mul_pd(z, P_A);
- z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
- z = _mm_mul_pd(z, x);
- z = _mm_add_pd(z, x);
-
- t1 = _mm_and_pd(mask1, morebits1);
- t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
- z = _mm_add_pd(z, t1);
- y = _mm_add_pd(y, z);
-
- y = _mm_xor_pd(y, sign);
-
- return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
- const __m128d pi = _mm_set1_pd(M_PI);
- const __m128d minuspi = _mm_set1_pd(-M_PI);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
- __m128d z, z1, z3, z4;
- __m128d w;
- __m128d maskx_lt, maskx_eq;
- __m128d masky_lt, masky_eq;
- __m128d mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd());
- masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd());
- maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd());
- masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
- z = _mm_mul_pd(y, gmx_mm_inv_pd(x));
- z = gmx_mm_atan_pd(z);
-
- mask1 = _mm_and_pd(maskx_eq, masky_lt);
- mask2 = _mm_andnot_pd(maskx_lt, masky_eq);
- mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_pd(masky_eq, maskx_lt);
-
- maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
- z = _mm_andnot_pd(maskall, z);
- z1 = _mm_and_pd(mask1, minushalfpi);
- z3 = _mm_and_pd(mask3, halfpi);
- z4 = _mm_and_pd(mask4, pi);
-
- z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
- w = _mm_or_pd(_mm_andnot_pd(masky_lt, pi), _mm_and_pd(masky_lt, minuspi));
- w = _mm_and_pd(w, maskx_lt);
-
- w = _mm_andnot_pd(maskall, w);
-
- z = _mm_add_pd(z, w);
- return z;
-}
+#define gmx_mm_invsqrt_pd gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd gmx_simd_inv_d
+#define gmx_mm_log_pd gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd gmx_simd_sincos_d
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_SSE2_SINGLE_H
#define GMX_SIMD_MATH_SSE2_SINGLE_H
+#include "simd_math.h"
-#include <stdio.h>
-#include <math.h>
-
-#include "general_x86_sse2.h"
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
- const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
- const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
- __m128 mask;
- __m128 res;
-
- mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
- res = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
- res = _mm_mul_ps(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
- __m128 lu = _mm_rcp_ps(x);
-
- return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
- return _mm_and_ps(x, signmask);
-}
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
- const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
- const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
- const __m128 corr2 = _mm_set1_ps(0.693359375f);
-
- const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
- const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
- const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
- const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
- const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
- const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
- const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
- const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
- const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
-
- __m128 fexp;
- __m128i iexp;
- __m128 mask;
- __m128 x2;
- __m128 y;
- __m128 pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_ps(x, expmask);
- iexp = gmx_mm_castps_si128(fexp);
- iexp = _mm_srli_epi32(iexp, 23);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
-
- x = _mm_andnot_ps(expmask, x);
- x = _mm_or_ps(x, one);
- x = _mm_mul_ps(x, half);
-
- mask = _mm_cmplt_ps(x, invsq2);
-
- x = _mm_add_ps(x, _mm_and_ps(mask, x));
- x = _mm_sub_ps(x, one);
- iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
- x2 = _mm_mul_ps(x, x);
-
- pA = _mm_mul_ps(CA_1, x);
- pB = _mm_mul_ps(CB_1, x);
- pC = _mm_mul_ps(CC_1, x);
- pD = _mm_mul_ps(CD_1, x);
- pE = _mm_mul_ps(CE_1, x);
- tB = _mm_add_ps(CB_0, x2);
- tC = _mm_add_ps(CC_0, x2);
- tD = _mm_add_ps(CD_0, x2);
- tE = _mm_add_ps(CE_0, x2);
- pB = _mm_add_ps(pB, tB);
- pC = _mm_add_ps(pC, tC);
- pD = _mm_add_ps(pD, tD);
- pE = _mm_add_ps(pE, tE);
-
- pA = _mm_mul_ps(pA, pB);
- pC = _mm_mul_ps(pC, pD);
- pE = _mm_mul_ps(pE, x2);
- pA = _mm_mul_ps(pA, pC);
- y = _mm_mul_ps(pA, pE);
-
- fexp = _mm_cvtepi32_ps(iexp);
- y = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
- y = _mm_sub_ps(y, _mm_mul_ps(half, x2));
- x2 = _mm_add_ps(x, y);
-
- x2 = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
- return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
-
- const __m128i expbase = _mm_set1_epi32(127);
- const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004);
- const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003);
- const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003);
- const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002);
- const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001);
- const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001);
- const __m128 CA0 = _mm_set1_ps(1.0f);
-
-
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
- __m128 x2;
- __m128 p0, p1;
-
- iexppart = _mm_cvtps_epi32(x);
- intpart = _mm_cvtepi32_ps(iexppart);
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- x = _mm_sub_ps(x, intpart);
- x2 = _mm_mul_ps(x, x);
-
- p0 = _mm_mul_ps(CA6, x2);
- p1 = _mm_mul_ps(CA5, x2);
- p0 = _mm_add_ps(p0, CA4);
- p1 = _mm_add_ps(p1, CA3);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_add_ps(p0, CA2);
- p1 = _mm_add_ps(p1, CA1);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, CA0);
- p0 = _mm_add_ps(p0, p1);
- x = _mm_mul_ps(p0, fexppart);
-
- return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
- const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
- const __m128i expbase = _mm_set1_epi32(127);
-
- const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
- const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);
-
- const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
- const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
- const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
- const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
- const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
- const __m128 one = _mm_set1_ps(1.0f);
-
- __m128 y, x2;
- __m128 p0, p1;
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
-
- y = _mm_mul_ps(x, argscale);
-
- iexppart = _mm_cvtps_epi32(y);
- intpart = _mm_cvtepi32_ps(iexppart);
-
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- /* Extended precision arithmetics */
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
- x2 = _mm_mul_ps(x, x);
-
- p1 = _mm_mul_ps(CC5, x2);
- p0 = _mm_mul_ps(CC4, x2);
- p1 = _mm_add_ps(p1, CC3);
- p0 = _mm_add_ps(p0, CC2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_add_ps(p1, CC1);
- p0 = _mm_add_ps(p0, CC0);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, p1);
- p0 = _mm_mul_ps(p0, x2);
- x = _mm_add_ps(x, one);
- x = _mm_add_ps(x, p0);
-
- x = _mm_mul_ps(x, fexppart);
-
- return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
-
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc),
- _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_or_ps(_mm_andnot_ps(mask, _mm_sub_ps(one, res_erfc)), _mm_and_ps(mask, res_erf));
-
- return res;
-}
-
-
-
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_or_ps(_mm_andnot_ps(mask, pB0), _mm_and_ps(mask, pC0));
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(two, res_erfc)));
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_or_ps(_mm_andnot_ps(mask, res_erfc), _mm_and_ps(mask, _mm_sub_ps(one, res_erf)));
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static gmx_inline __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
- const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
- const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
- const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
- const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
- const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
- const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
- const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);
-
- const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
- const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
- const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
- const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
- const __m128 FD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyFD0 = _mm_mul_ps(FD4, z4);
- polyFD1 = _mm_mul_ps(FD3, z4);
- polyFD0 = _mm_add_ps(polyFD0, FD2);
- polyFD1 = _mm_add_ps(polyFD1, FD1);
- polyFD0 = _mm_mul_ps(polyFD0, z4);
- polyFD1 = _mm_mul_ps(polyFD1, z2);
- polyFD0 = _mm_add_ps(polyFD0, FD0);
- polyFD0 = _mm_add_ps(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_ps(polyFD0);
-
- polyFN0 = _mm_mul_ps(FN6, z4);
- polyFN1 = _mm_mul_ps(FN5, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN4);
- polyFN1 = _mm_add_ps(polyFN1, FN3);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN2);
- polyFN1 = _mm_add_ps(polyFN1, FN1);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z2);
- polyFN0 = _mm_add_ps(polyFN0, FN0);
- polyFN0 = _mm_add_ps(polyFN0, polyFN1);
-
- return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- */
-static gmx_inline __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
- const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
- const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
- const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
- const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
- const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
- const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
- const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);
-
- const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
- const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
- const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
- const __m128 VD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyVD1 = _mm_mul_ps(VD3, z4);
- polyVD0 = _mm_mul_ps(VD2, z4);
- polyVD1 = _mm_add_ps(polyVD1, VD1);
- polyVD0 = _mm_add_ps(polyVD0, VD0);
- polyVD1 = _mm_mul_ps(polyVD1, z2);
- polyVD0 = _mm_add_ps(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_ps(polyVD0);
-
- polyVN0 = _mm_mul_ps(VN6, z4);
- polyVN1 = _mm_mul_ps(VN5, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN4);
- polyVN1 = _mm_add_ps(polyVN1, VN3);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN2);
- polyVN1 = _mm_add_ps(polyVN1, VN1);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z2);
- polyVN0 = _mm_add_ps(polyVN0, VN0);
- polyVN0 = _mm_add_ps(polyVN0, polyVN1);
-
- return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_ps(__m128 x,
- __m128 *sinval,
- __m128 *cosval)
-{
- const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
- const __m128 half = _mm_set1_ps(0.5);
- const __m128 one = _mm_set1_ps(1.0);
-
- const __m128i izero = _mm_set1_epi32(0);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i itwo = _mm_set1_epi32(2);
- const __m128i ithree = _mm_set1_epi32(3);
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
- const __m128 CA1 = _mm_set1_ps(1.5703125f);
- const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
- const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
-
- const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
- const __m128 CC1 = _mm_set1_ps(0.0416566950f);
- const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
- const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
- const __m128 CS1 = _mm_set1_ps(0.0083320758f);
- const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
-
- __m128 y, y2;
- __m128 z;
- __m128i iz;
- __m128i offset_sin, offset_cos;
- __m128 tmp1, tmp2;
- __m128 mask_sin, mask_cos;
- __m128 tmp_sin, tmp_cos;
-
- y = _mm_mul_ps(x, two_over_pi);
- y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
- iz = _mm_cvttps_epi32(y);
- z = _mm_cvtepi32_ps(iz);
-
- offset_sin = _mm_and_si128(iz, ithree);
- offset_cos = _mm_add_epi32(iz, ione);
-
- /* Extended precision arithmethic to achieve full precision */
- y = _mm_mul_ps(z, CA1);
- tmp1 = _mm_mul_ps(z, CA2);
- tmp2 = _mm_mul_ps(z, CA3);
- y = _mm_sub_ps(x, y);
- y = _mm_sub_ps(y, tmp1);
- y = _mm_sub_ps(y, tmp2);
-
- y2 = _mm_mul_ps(y, y);
-
- tmp1 = _mm_mul_ps(CC0, y2);
- tmp1 = _mm_add_ps(tmp1, CC1);
- tmp2 = _mm_mul_ps(CS0, y2);
- tmp2 = _mm_add_ps(tmp2, CS1);
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, CC2);
- tmp2 = _mm_mul_ps(tmp2, y2);
- tmp2 = _mm_add_ps(tmp2, CS2);
-
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, one);
-
- tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
- tmp2 = _mm_add_ps(tmp2, y);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
- tmp_sin = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp2) );
- tmp_cos = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp1), _mm_and_ps(mask_cos, tmp2) );
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
- tmp1 = _mm_xor_ps(signbit, tmp_sin);
- tmp2 = _mm_xor_ps(signbit, tmp_cos);
-
- *sinval = _mm_or_ps( _mm_andnot_ps(mask_sin, tmp1), _mm_and_ps(mask_sin, tmp_sin) );
- *cosval = _mm_or_ps( _mm_andnot_ps(mask_cos, tmp2), _mm_and_ps(mask_cos, tmp_cos) );
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
- __m128 sinval, cosval;
- __m128 tanval;
-
- gmx_mm_sincos_ps(x, &sinval, &cosval);
-
- tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
- return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limitlow = _mm_set1_ps(1e-4f);
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
-
- const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
- const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
- const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
- const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
-
- __m128 sign;
- __m128 mask;
- __m128 xabs;
- __m128 z, z1, z2, q, q1, q2;
- __m128 pA, pB;
-
- sign = _mm_andnot_ps(signmask, x);
- xabs = _mm_and_ps(x, signmask);
-
- mask = _mm_cmpgt_ps(xabs, half);
-
- z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
- q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
- q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
- q2 = xabs;
- z2 = _mm_mul_ps(q2, q2);
-
- z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
- q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
- z2 = _mm_mul_ps(z, z);
-
- pA = _mm_mul_ps(CC5, z2);
- pB = _mm_mul_ps(CC4, z2);
-
- pA = _mm_add_ps(pA, CC3);
- pB = _mm_add_ps(pB, CC2);
-
- pA = _mm_mul_ps(pA, z2);
- pB = _mm_mul_ps(pB, z2);
-
- pA = _mm_add_ps(pA, CC1);
- pA = _mm_mul_ps(pA, z);
-
- z = _mm_add_ps(pA, pB);
- z = _mm_mul_ps(z, q);
- z = _mm_add_ps(z, q);
-
- q2 = _mm_sub_ps(halfpi, z);
- q2 = _mm_sub_ps(q2, z);
-
- z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
- mask = _mm_cmpgt_ps(xabs, limitlow);
- z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
- z = _mm_xor_ps(z, sign);
-
- return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 one_ps = _mm_set1_ps(1.0f);
- const __m128 half_ps = _mm_set1_ps(0.5f);
- const __m128 pi_ps = _mm_set1_ps(M_PI);
- const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
- __m128 mask1;
- __m128 mask2;
- __m128 xabs;
- __m128 z, z1, z2, z3;
-
- xabs = _mm_and_ps(x, signmask);
- mask1 = _mm_cmpgt_ps(xabs, half_ps);
- mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
-
- z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
- z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
- z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
- z = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, x) );
- z = gmx_mm_asin_ps(z);
-
- z2 = _mm_add_ps(z, z);
- z1 = _mm_sub_ps(pi_ps, z2);
- z3 = _mm_sub_ps(halfpi_ps, z);
-
- z = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, z1) );
- z = _mm_or_ps( _mm_and_ps(mask1, z), _mm_andnot_ps(mask1, z3) );
-
- return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
- const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
- const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
- const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
- const __m128 mone = _mm_set1_ps(-1.0f);
- const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
- const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
- const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1);
- const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
-
- __m128 sign;
- __m128 mask1, mask2;
- __m128 y, z1, z2;
- __m128 x2, x4;
- __m128 sum1, sum2;
-
- sign = _mm_andnot_ps(signmask, x);
- x = _mm_and_ps(x, signmask);
-
- mask1 = _mm_cmpgt_ps(x, limit1);
- mask2 = _mm_cmpgt_ps(x, limit2);
-
- z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
- z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
- y = _mm_and_ps(mask1, quarterpi);
- y = _mm_or_ps( _mm_and_ps(mask2, halfpi), _mm_andnot_ps(mask2, y) );
-
- x = _mm_or_ps( _mm_and_ps(mask1, z1), _mm_andnot_ps(mask1, x) );
- x = _mm_or_ps( _mm_and_ps(mask2, z2), _mm_andnot_ps(mask2, x) );
-
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- sum1 = _mm_mul_ps(CC9, x4);
- sum2 = _mm_mul_ps(CC7, x4);
- sum1 = _mm_add_ps(sum1, CC5);
- sum2 = _mm_add_ps(sum2, CC3);
- sum1 = _mm_mul_ps(sum1, x4);
- sum2 = _mm_mul_ps(sum2, x2);
-
- sum1 = _mm_add_ps(sum1, sum2);
- sum1 = _mm_sub_ps(sum1, mone);
- sum1 = _mm_mul_ps(sum1, x);
- y = _mm_add_ps(y, sum1);
-
- y = _mm_xor_ps(y, sign);
-
- return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
- const __m128 pi = _mm_set1_ps(M_PI);
- const __m128 minuspi = _mm_set1_ps(-M_PI);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
- const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
- __m128 z, z1, z3, z4;
- __m128 w;
- __m128 maskx_lt, maskx_eq;
- __m128 masky_lt, masky_eq;
- __m128 mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
- masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
- maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
- masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
- z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
- z = gmx_mm_atan_ps(z);
-
- mask1 = _mm_and_ps(maskx_eq, masky_lt);
- mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
- mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_ps(masky_eq, maskx_lt);
-
- maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
- z = _mm_andnot_ps(maskall, z);
- z1 = _mm_and_ps(mask1, minushalfpi);
- z3 = _mm_and_ps(mask3, halfpi);
- z4 = _mm_and_ps(mask4, pi);
-
- z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
- mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
- mask2 = _mm_and_ps(maskx_lt, masky_lt);
-
- w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
- w = _mm_andnot_ps(maskall, w);
-
- z = _mm_add_ps(z, w);
-
- return z;
-}
+#define gmx_mm_invsqrt_ps gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps gmx_simd_inv_f
+#define gmx_mm_log_ps gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps gmx_simd_sincos_f
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_SSE4_1_DOUBLE_H
#define GMX_SIMD_MATH_SSE4_1_DOUBLE_H
-#include <stdio.h>
-#include <math.h>
+#include "simd_math.h"
-#include "general_x86_sse4_1.h"
-
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_invsqrt_pd(__m128d x)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rsqrt_ps( _mm_cvtpd_ps(x)));
-
- lu = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
- return _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu, lu), x)), lu));
-}
-
-/* 1.0/sqrt(x), done for a pair of arguments to improve throughput */
-static void
-gmx_mm_invsqrt_pair_pd(__m128d x1, __m128d x2, __m128d *invsqrt1, __m128d *invsqrt2)
-{
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d three = _mm_set1_pd(3.0);
- const __m128 halff = _mm_set1_ps(0.5f);
- const __m128 threef = _mm_set1_ps(3.0f);
-
- __m128 xf, luf;
- __m128d lu1, lu2;
-
- /* Do first N-R step in float for 2x throughput */
- xf = _mm_shuffle_ps(_mm_cvtpd_ps(x1), _mm_cvtpd_ps(x2), _MM_SHUFFLE(1, 0, 1, 0));
- luf = _mm_rsqrt_ps(xf);
- luf = _mm_mul_ps(halff, _mm_mul_ps(_mm_sub_ps(threef, _mm_mul_ps(_mm_mul_ps(luf, luf), xf)), luf));
-
- lu2 = _mm_cvtps_pd(_mm_shuffle_ps(luf, luf, _MM_SHUFFLE(3, 2, 3, 2)));
- lu1 = _mm_cvtps_pd(luf);
-
- *invsqrt1 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu1, lu1), x1)), lu1));
- *invsqrt2 = _mm_mul_pd(half, _mm_mul_pd(_mm_sub_pd(three, _mm_mul_pd(_mm_mul_pd(lu2, lu2), x2)), lu2));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128d
-gmx_mm_sqrt_pd(__m128d x)
-{
- __m128d mask;
- __m128d res;
-
- mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
- res = _mm_andnot_pd(mask, gmx_mm_invsqrt_pd(x));
-
- res = _mm_mul_pd(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128d
-gmx_mm_inv_pd(__m128d x)
-{
- const __m128d two = _mm_set1_pd(2.0);
-
- /* Lookup instruction only exists in single precision, convert back and forth... */
- __m128d lu = _mm_cvtps_pd(_mm_rcp_ps( _mm_cvtpd_ps(x)));
-
- /* Perform two N-R steps for double precision */
- lu = _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
- return _mm_mul_pd(lu, _mm_sub_pd(two, _mm_mul_pd(x, lu)));
-}
-
-static gmx_inline __m128d
-gmx_mm_abs_pd(__m128d x)
-{
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- return _mm_and_pd(x, signmask);
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5].
- *
- * The approximation on [-0.5,0.5] is a rational Padé approximation, 1+2*P(x^2)/(Q(x^2)-P(x^2)),
- * according to the same algorithm as used in the Cephes/netlib math routines.
- */
-static __m128d
-gmx_mm_exp2_pd(__m128d x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d P2 = _mm_set1_pd(2.30933477057345225087e-2);
- const __m128d P1 = _mm_set1_pd(2.02020656693165307700e1);
- const __m128d P0 = _mm_set1_pd(1.51390680115615096133e3);
- /* Q2 == 1.0 */
- const __m128d Q1 = _mm_set1_pd(2.33184211722314911771e2);
- const __m128d Q0 = _mm_set1_pd(4.36821166879210612817e3);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d z, z2;
- __m128d PolyP, PolyQ;
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(x, intpart);
- z2 = _mm_mul_pd(z, z);
-
- PolyP = _mm_mul_pd(P2, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(z2, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P0);
- PolyQ = _mm_add_pd(PolyQ, Q0);
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a Padé approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static __m128d
-gmx_mm_exp_pd(__m128d exparg)
-{
- const __m128d argscale = _mm_set1_pd(1.4426950408889634073599);
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128d arglimit = _mm_set1_pd(1022.0);
- const __m128i expbase = _mm_set1_epi32(1023);
-
- const __m128d invargscale0 = _mm_set1_pd(6.93145751953125e-1);
- const __m128d invargscale1 = _mm_set1_pd(1.42860682030941723212e-6);
-
- const __m128d P2 = _mm_set1_pd(1.26177193074810590878e-4);
- const __m128d P1 = _mm_set1_pd(3.02994407707441961300e-2);
- /* P0 == 1.0 */
- const __m128d Q3 = _mm_set1_pd(3.00198505138664455042E-6);
- const __m128d Q2 = _mm_set1_pd(2.52448340349684104192E-3);
- const __m128d Q1 = _mm_set1_pd(2.27265548208155028766E-1);
- /* Q0 == 2.0 */
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- __m128d valuemask;
- __m128i iexppart;
- __m128d fexppart;
- __m128d intpart;
- __m128d x, z, z2;
- __m128d PolyP, PolyQ;
-
- x = _mm_mul_pd(exparg, argscale);
-
- iexppart = _mm_cvtpd_epi32(x);
- intpart = _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
-
- /* The two lowest elements of iexppart now contains 32-bit numbers with a correctly biased exponent.
- * To be able to shift it into the exponent for a double precision number we first need to
- * shuffle so that the lower half contains the first element, and the upper half the second.
- * This should really be done as a zero-extension, but since the next instructions will shift
- * the registers left by 52 bits it doesn't matter what we put there - it will be shifted out.
- * (thus we just use element 2 from iexppart).
- */
- iexppart = _mm_shuffle_epi32(iexppart, _MM_SHUFFLE(2, 1, 2, 0));
-
- /* Do the shift operation on the 64-bit registers */
- iexppart = _mm_add_epi32(iexppart, expbase);
- iexppart = _mm_slli_epi64(iexppart, 52);
-
- valuemask = _mm_cmpge_pd(arglimit, gmx_mm_abs_pd(x));
- fexppart = _mm_and_pd(valuemask, gmx_mm_castsi128_pd(iexppart));
-
- z = _mm_sub_pd(exparg, _mm_mul_pd(invargscale0, intpart));
- z = _mm_sub_pd(z, _mm_mul_pd(invargscale1, intpart));
-
- z2 = _mm_mul_pd(z, z);
-
- PolyQ = _mm_mul_pd(Q3, z2);
- PolyQ = _mm_add_pd(PolyQ, Q2);
- PolyP = _mm_mul_pd(P2, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, P1);
- PolyQ = _mm_add_pd(PolyQ, Q1);
- PolyP = _mm_mul_pd(PolyP, z2);
- PolyQ = _mm_mul_pd(PolyQ, z2);
- PolyP = _mm_add_pd(PolyP, one);
- PolyQ = _mm_add_pd(PolyQ, two);
-
- PolyP = _mm_mul_pd(PolyP, z);
-
- z = _mm_mul_pd(PolyP, gmx_mm_inv_pd(_mm_sub_pd(PolyQ, PolyP)));
- z = _mm_add_pd(one, _mm_mul_pd(two, z));
-
- z = _mm_mul_pd(z, fexppart);
-
- return z;
-}
-
-
-
-static __m128d
-gmx_mm_log_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d expmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
-
- const __m128i expbase_m1 = _mm_set1_epi32(1023-1); /* We want non-IEEE format */
-
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d invsq2 = _mm_set1_pd(1.0/sqrt(2.0));
-
- const __m128d corr1 = _mm_set1_pd(-2.121944400546905827679e-4);
- const __m128d corr2 = _mm_set1_pd(0.693359375);
-
- const __m128d P5 = _mm_set1_pd(1.01875663804580931796e-4);
- const __m128d P4 = _mm_set1_pd(4.97494994976747001425e-1);
- const __m128d P3 = _mm_set1_pd(4.70579119878881725854e0);
- const __m128d P2 = _mm_set1_pd(1.44989225341610930846e1);
- const __m128d P1 = _mm_set1_pd(1.79368678507819816313e1);
- const __m128d P0 = _mm_set1_pd(7.70838733755885391666e0);
-
- const __m128d Q4 = _mm_set1_pd(1.12873587189167450590e1);
- const __m128d Q3 = _mm_set1_pd(4.52279145837532221105e1);
- const __m128d Q2 = _mm_set1_pd(8.29875266912776603211e1);
- const __m128d Q1 = _mm_set1_pd(7.11544750618563894466e1);
- const __m128d Q0 = _mm_set1_pd(2.31251620126765340583e1);
-
- const __m128d R2 = _mm_set1_pd(-7.89580278884799154124e-1);
- const __m128d R1 = _mm_set1_pd(1.63866645699558079767e1);
- const __m128d R0 = _mm_set1_pd(-6.41409952958715622951e1);
-
- const __m128d S2 = _mm_set1_pd(-3.56722798256324312549E1);
- const __m128d S1 = _mm_set1_pd(3.12093766372244180303E2);
- const __m128d S0 = _mm_set1_pd(-7.69691943550460008604E2);
-
- __m128d fexp;
- __m128i iexp;
-
- __m128d mask1, mask2;
- __m128d corr, t1, t2, q;
- __m128d zA, yA, xA, zB, yB, xB, z;
- __m128d polyR, polyS;
- __m128d polyP1, polyP2, polyQ1, polyQ2;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_pd(x, expmask);
- iexp = gmx_mm_castpd_si128(fexp);
- iexp = _mm_srli_epi64(iexp, 52);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
- iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(1, 1, 2, 0) );
- fexp = _mm_cvtepi32_pd(iexp);
-
- x = _mm_andnot_pd(expmask, x);
- x = _mm_or_pd(x, one);
- x = _mm_mul_pd(x, half);
-
- mask1 = _mm_cmpgt_pd(gmx_mm_abs_pd(fexp), two);
- mask2 = _mm_cmplt_pd(x, invsq2);
-
- fexp = _mm_sub_pd(fexp, _mm_and_pd(mask2, one));
-
- /* If mask1 is set ('A') */
- zA = _mm_sub_pd(x, half);
- t1 = _mm_blendv_pd( zA, x, mask2 );
- zA = _mm_sub_pd(t1, half);
- t2 = _mm_blendv_pd( x, zA, mask2 );
- yA = _mm_mul_pd(half, _mm_add_pd(t2, one));
-
- xA = _mm_mul_pd(zA, gmx_mm_inv_pd(yA));
- zA = _mm_mul_pd(xA, xA);
-
- /* EVALUATE POLY */
- polyR = _mm_mul_pd(R2, zA);
- polyR = _mm_add_pd(polyR, R1);
- polyR = _mm_mul_pd(polyR, zA);
- polyR = _mm_add_pd(polyR, R0);
-
- polyS = _mm_add_pd(zA, S2);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S1);
- polyS = _mm_mul_pd(polyS, zA);
- polyS = _mm_add_pd(polyS, S0);
-
- q = _mm_mul_pd(polyR, gmx_mm_inv_pd(polyS));
- zA = _mm_mul_pd(_mm_mul_pd(xA, zA), q);
-
- zA = _mm_add_pd(zA, _mm_mul_pd(corr1, fexp));
- zA = _mm_add_pd(zA, xA);
- zA = _mm_add_pd(zA, _mm_mul_pd(corr2, fexp));
-
- /* If mask1 is not set ('B') */
- corr = _mm_and_pd(mask2, x);
- xB = _mm_add_pd(x, corr);
- xB = _mm_sub_pd(xB, one);
- zB = _mm_mul_pd(xB, xB);
-
- polyP1 = _mm_mul_pd(P5, zB);
- polyP2 = _mm_mul_pd(P4, zB);
- polyP1 = _mm_add_pd(polyP1, P3);
- polyP2 = _mm_add_pd(polyP2, P2);
- polyP1 = _mm_mul_pd(polyP1, zB);
- polyP2 = _mm_mul_pd(polyP2, zB);
- polyP1 = _mm_add_pd(polyP1, P1);
- polyP2 = _mm_add_pd(polyP2, P0);
- polyP1 = _mm_mul_pd(polyP1, xB);
- polyP1 = _mm_add_pd(polyP1, polyP2);
-
- polyQ2 = _mm_mul_pd(Q4, zB);
- polyQ1 = _mm_add_pd(zB, Q3);
- polyQ2 = _mm_add_pd(polyQ2, Q2);
- polyQ1 = _mm_mul_pd(polyQ1, zB);
- polyQ2 = _mm_mul_pd(polyQ2, zB);
- polyQ1 = _mm_add_pd(polyQ1, Q1);
- polyQ2 = _mm_add_pd(polyQ2, Q0);
- polyQ1 = _mm_mul_pd(polyQ1, xB);
- polyQ1 = _mm_add_pd(polyQ1, polyQ2);
-
- fexp = _mm_and_pd(fexp, _mm_cmpneq_pd(fexp, _mm_setzero_pd()));
-
- q = _mm_mul_pd(polyP1, gmx_mm_inv_pd(polyQ1));
- yB = _mm_mul_pd(_mm_mul_pd(xB, zB), q);
-
- yB = _mm_add_pd(yB, _mm_mul_pd(corr1, fexp));
- yB = _mm_sub_pd(yB, _mm_mul_pd(half, zB));
- zB = _mm_add_pd(xB, yB);
- zB = _mm_add_pd(zB, _mm_mul_pd(corr2, fexp));
-
- z = _mm_blendv_pd( zB, zA, mask1 );
-
- return z;
-}
-
-
-static __m128d
-gmx_mm_erf_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(_mm_sub_pd(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-static __m128d
-gmx_mm_erfc_pd(__m128d x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
- const __m128d CAP4 = _mm_set1_pd(-0.431780540597889301512e-4);
- const __m128d CAP3 = _mm_set1_pd(-0.00578562306260059236059);
- const __m128d CAP2 = _mm_set1_pd(-0.028593586920219752446);
- const __m128d CAP1 = _mm_set1_pd(-0.315924962948621698209);
- const __m128d CAP0 = _mm_set1_pd(0.14952975608477029151);
-
- const __m128d CAQ5 = _mm_set1_pd(-0.374089300177174709737e-5);
- const __m128d CAQ4 = _mm_set1_pd(0.00015126584532155383535);
- const __m128d CAQ3 = _mm_set1_pd(0.00536692680669480725423);
- const __m128d CAQ2 = _mm_set1_pd(0.0668686825594046122636);
- const __m128d CAQ1 = _mm_set1_pd(0.402604990869284362773);
- /* CAQ0 == 1.0 */
- const __m128d CAoffset = _mm_set1_pd(0.9788494110107421875);
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
- const __m128d CBP6 = _mm_set1_pd(2.49650423685462752497647637088e-10);
- const __m128d CBP5 = _mm_set1_pd(0.00119770193298159629350136085658);
- const __m128d CBP4 = _mm_set1_pd(0.0164944422378370965881008942733);
- const __m128d CBP3 = _mm_set1_pd(0.0984581468691775932063932439252);
- const __m128d CBP2 = _mm_set1_pd(0.317364595806937763843589437418);
- const __m128d CBP1 = _mm_set1_pd(0.554167062641455850932670067075);
- const __m128d CBP0 = _mm_set1_pd(0.427583576155807163756925301060);
- const __m128d CBQ7 = _mm_set1_pd(0.00212288829699830145976198384930);
- const __m128d CBQ6 = _mm_set1_pd(0.0334810979522685300554606393425);
- const __m128d CBQ5 = _mm_set1_pd(0.2361713785181450957579508850717);
- const __m128d CBQ4 = _mm_set1_pd(0.955364736493055670530981883072);
- const __m128d CBQ3 = _mm_set1_pd(2.36815675631420037315349279199);
- const __m128d CBQ2 = _mm_set1_pd(3.55261649184083035537184223542);
- const __m128d CBQ1 = _mm_set1_pd(2.93501136050160872574376997993);
- /* CBQ0 == 1.0 */
-
- /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
- const __m128d CCP6 = _mm_set1_pd(-2.8175401114513378771);
- const __m128d CCP5 = _mm_set1_pd(-3.22729451764143718517);
- const __m128d CCP4 = _mm_set1_pd(-2.5518551727311523996);
- const __m128d CCP3 = _mm_set1_pd(-0.687717681153649930619);
- const __m128d CCP2 = _mm_set1_pd(-0.212652252872804219852);
- const __m128d CCP1 = _mm_set1_pd(0.0175389834052493308818);
- const __m128d CCP0 = _mm_set1_pd(0.00628057170626964891937);
-
- const __m128d CCQ6 = _mm_set1_pd(5.48409182238641741584);
- const __m128d CCQ5 = _mm_set1_pd(13.5064170191802889145);
- const __m128d CCQ4 = _mm_set1_pd(22.9367376522880577224);
- const __m128d CCQ3 = _mm_set1_pd(15.930646027911794143);
- const __m128d CCQ2 = _mm_set1_pd(11.0567237927800161565);
- const __m128d CCQ1 = _mm_set1_pd(2.79257750980575282228);
- /* CCQ0 == 1.0 */
- const __m128d CCoffset = _mm_set1_pd(0.5579090118408203125);
-
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d two = _mm_set1_pd(2.0);
-
- const __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000) );
-
- __m128d xabs, x2, x4, t, t2, w, w2;
- __m128d PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
- __m128d PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
- __m128d PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
- __m128d res_erf, res_erfcB, res_erfcC, res_erfc, res;
- __m128d mask, expmx2;
-
- /* Calculate erf() */
- xabs = gmx_mm_abs_pd(x);
- x2 = _mm_mul_pd(x, x);
- x4 = _mm_mul_pd(x2, x2);
-
- PolyAP0 = _mm_mul_pd(CAP4, x4);
- PolyAP1 = _mm_mul_pd(CAP3, x4);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP2);
- PolyAP1 = _mm_add_pd(PolyAP1, CAP1);
- PolyAP0 = _mm_mul_pd(PolyAP0, x4);
- PolyAP1 = _mm_mul_pd(PolyAP1, x2);
- PolyAP0 = _mm_add_pd(PolyAP0, CAP0);
- PolyAP0 = _mm_add_pd(PolyAP0, PolyAP1);
-
- PolyAQ1 = _mm_mul_pd(CAQ5, x4);
- PolyAQ0 = _mm_mul_pd(CAQ4, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ3);
- PolyAQ0 = _mm_add_pd(PolyAQ0, CAQ2);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x4);
- PolyAQ0 = _mm_mul_pd(PolyAQ0, x4);
- PolyAQ1 = _mm_add_pd(PolyAQ1, CAQ1);
- PolyAQ0 = _mm_add_pd(PolyAQ0, one);
- PolyAQ1 = _mm_mul_pd(PolyAQ1, x2);
- PolyAQ0 = _mm_add_pd(PolyAQ0, PolyAQ1);
-
- res_erf = _mm_mul_pd(PolyAP0, gmx_mm_inv_pd(PolyAQ0));
- res_erf = _mm_add_pd(CAoffset, res_erf);
- res_erf = _mm_mul_pd(x, res_erf);
-
- /* Calculate erfc() in range [1,4.5] */
- t = _mm_sub_pd(xabs, one);
- t2 = _mm_mul_pd(t, t);
-
- PolyBP0 = _mm_mul_pd(CBP6, t2);
- PolyBP1 = _mm_mul_pd(CBP5, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP4);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP3);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t2);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP2);
- PolyBP1 = _mm_add_pd(PolyBP1, CBP1);
- PolyBP0 = _mm_mul_pd(PolyBP0, t2);
- PolyBP1 = _mm_mul_pd(PolyBP1, t);
- PolyBP0 = _mm_add_pd(PolyBP0, CBP0);
- PolyBP0 = _mm_add_pd(PolyBP0, PolyBP1);
-
- PolyBQ1 = _mm_mul_pd(CBQ7, t2);
- PolyBQ0 = _mm_mul_pd(CBQ6, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ5);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ4);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ3);
- PolyBQ0 = _mm_add_pd(PolyBQ0, CBQ2);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t2);
- PolyBQ0 = _mm_mul_pd(PolyBQ0, t2);
- PolyBQ1 = _mm_add_pd(PolyBQ1, CBQ1);
- PolyBQ0 = _mm_add_pd(PolyBQ0, one);
- PolyBQ1 = _mm_mul_pd(PolyBQ1, t);
- PolyBQ0 = _mm_add_pd(PolyBQ0, PolyBQ1);
-
- res_erfcB = _mm_mul_pd(PolyBP0, gmx_mm_inv_pd(PolyBQ0));
-
- res_erfcB = _mm_mul_pd(res_erfcB, xabs);
-
- /* Calculate erfc() in range [4.5,inf] */
- w = gmx_mm_inv_pd(xabs);
- w2 = _mm_mul_pd(w, w);
-
- PolyCP0 = _mm_mul_pd(CCP6, w2);
- PolyCP1 = _mm_mul_pd(CCP5, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP4);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP3);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w2);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP2);
- PolyCP1 = _mm_add_pd(PolyCP1, CCP1);
- PolyCP0 = _mm_mul_pd(PolyCP0, w2);
- PolyCP1 = _mm_mul_pd(PolyCP1, w);
- PolyCP0 = _mm_add_pd(PolyCP0, CCP0);
- PolyCP0 = _mm_add_pd(PolyCP0, PolyCP1);
-
- PolyCQ0 = _mm_mul_pd(CCQ6, w2);
- PolyCQ1 = _mm_mul_pd(CCQ5, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ4);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ3);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w2);
- PolyCQ0 = _mm_add_pd(PolyCQ0, CCQ2);
- PolyCQ1 = _mm_add_pd(PolyCQ1, CCQ1);
- PolyCQ0 = _mm_mul_pd(PolyCQ0, w2);
- PolyCQ1 = _mm_mul_pd(PolyCQ1, w);
- PolyCQ0 = _mm_add_pd(PolyCQ0, one);
- PolyCQ0 = _mm_add_pd(PolyCQ0, PolyCQ1);
-
- expmx2 = gmx_mm_exp_pd( _mm_or_pd(signbit, x2) );
-
- res_erfcC = _mm_mul_pd(PolyCP0, gmx_mm_inv_pd(PolyCQ0));
- res_erfcC = _mm_add_pd(res_erfcC, CCoffset);
- res_erfcC = _mm_mul_pd(res_erfcC, w);
-
- mask = _mm_cmpgt_pd(xabs, _mm_set1_pd(4.5));
- res_erfc = _mm_blendv_pd(res_erfcB, res_erfcC, mask);
-
- res_erfc = _mm_mul_pd(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_pd(x, _mm_setzero_pd());
- res_erfc = _mm_blendv_pd(res_erfc, _mm_sub_pd(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_pd(xabs, one);
- res = _mm_blendv_pd(res_erfc, _mm_sub_pd(one, res_erf), mask);
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static __m128d
-gmx_mm_pmecorrF_pd(__m128d z2)
-{
- const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
- const __m128d FN9 = _mm_set1_pd(1.1859116242260148027e-11);
- const __m128d FN8 = _mm_set1_pd(-8.1490406329798423616e-10);
- const __m128d FN7 = _mm_set1_pd(3.4404793543907847655e-8);
- const __m128d FN6 = _mm_set1_pd(-9.9471420832602741006e-7);
- const __m128d FN5 = _mm_set1_pd(0.000020740315999115847456);
- const __m128d FN4 = _mm_set1_pd(-0.00031991745139313364005);
- const __m128d FN3 = _mm_set1_pd(0.0035074449373659008203);
- const __m128d FN2 = _mm_set1_pd(-0.031750380176100813405);
- const __m128d FN1 = _mm_set1_pd(0.13884101728898463426);
- const __m128d FN0 = _mm_set1_pd(-0.75225277815249618847);
-
- const __m128d FD5 = _mm_set1_pd(0.000016009278224355026701);
- const __m128d FD4 = _mm_set1_pd(0.00051055686934806966046);
- const __m128d FD3 = _mm_set1_pd(0.0081803507497974289008);
- const __m128d FD2 = _mm_set1_pd(0.077181146026670287235);
- const __m128d FD1 = _mm_set1_pd(0.41543303143712535988);
- const __m128d FD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyFD1 = _mm_mul_pd(FD5, z4);
- polyFD0 = _mm_mul_pd(FD4, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD3);
- polyFD0 = _mm_add_pd(polyFD0, FD2);
- polyFD1 = _mm_mul_pd(polyFD1, z4);
- polyFD0 = _mm_mul_pd(polyFD0, z4);
- polyFD1 = _mm_add_pd(polyFD1, FD1);
- polyFD0 = _mm_add_pd(polyFD0, FD0);
- polyFD1 = _mm_mul_pd(polyFD1, z2);
- polyFD0 = _mm_add_pd(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_pd(polyFD0);
-
- polyFN0 = _mm_mul_pd(FN10, z4);
- polyFN1 = _mm_mul_pd(FN9, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN8);
- polyFN1 = _mm_add_pd(polyFN1, FN7);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN6);
- polyFN1 = _mm_add_pd(polyFN1, FN5);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN4);
- polyFN1 = _mm_add_pd(polyFN1, FN3);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z4);
- polyFN0 = _mm_add_pd(polyFN0, FN2);
- polyFN1 = _mm_add_pd(polyFN1, FN1);
- polyFN0 = _mm_mul_pd(polyFN0, z4);
- polyFN1 = _mm_mul_pd(polyFN1, z2);
- polyFN0 = _mm_add_pd(polyFN0, FN0);
- polyFN0 = _mm_add_pd(polyFN0, polyFN1);
-
- return _mm_mul_pd(polyFN0, polyFD0);
-}
-
-
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
- *
- */
-static __m128d
-gmx_mm_pmecorrV_pd(__m128d z2)
-{
- const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13);
- const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10);
- const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9);
- const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7);
- const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6);
- const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389);
- const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792);
- const __m128d VN2 = _mm_set1_pd(0.045247661136833092885);
- const __m128d VN1 = _mm_set1_pd(0.11643931522926034421);
- const __m128d VN0 = _mm_set1_pd(1.1283791671726767970);
-
- const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342);
- const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448);
- const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504);
- const __m128d VD2 = _mm_set1_pd(0.085608012351550627051);
- const __m128d VD1 = _mm_set1_pd(0.43652499166614811084);
- const __m128d VD0 = _mm_set1_pd(1.0);
-
- __m128d z4;
- __m128d polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_pd(z2, z2);
-
- polyVD1 = _mm_mul_pd(VD5, z4);
- polyVD0 = _mm_mul_pd(VD4, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD3);
- polyVD0 = _mm_add_pd(polyVD0, VD2);
- polyVD1 = _mm_mul_pd(polyVD1, z4);
- polyVD0 = _mm_mul_pd(polyVD0, z4);
- polyVD1 = _mm_add_pd(polyVD1, VD1);
- polyVD0 = _mm_add_pd(polyVD0, VD0);
- polyVD1 = _mm_mul_pd(polyVD1, z2);
- polyVD0 = _mm_add_pd(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_pd(polyVD0);
-
- polyVN1 = _mm_mul_pd(VN9, z4);
- polyVN0 = _mm_mul_pd(VN8, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN7);
- polyVN0 = _mm_add_pd(polyVN0, VN6);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN5);
- polyVN0 = _mm_add_pd(polyVN0, VN4);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN3);
- polyVN0 = _mm_add_pd(polyVN0, VN2);
- polyVN1 = _mm_mul_pd(polyVN1, z4);
- polyVN0 = _mm_mul_pd(polyVN0, z4);
- polyVN1 = _mm_add_pd(polyVN1, VN1);
- polyVN0 = _mm_add_pd(polyVN0, VN0);
- polyVN1 = _mm_mul_pd(polyVN1, z2);
- polyVN0 = _mm_add_pd(polyVN0, polyVN1);
-
- return _mm_mul_pd(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_pd(__m128d x,
- __m128d *sinval,
- __m128d *cosval)
-{
-#ifdef _MSC_VER
- __declspec(align(16))
- const double sintable[34] =
- {
- 1.00000000000000000e+00, 0.00000000000000000e+00,
- 9.95184726672196929e-01, 9.80171403295606036e-02,
- 9.80785280403230431e-01, 1.95090322016128248e-01,
- 9.56940335732208824e-01, 2.90284677254462331e-01,
- 9.23879532511286738e-01, 3.82683432365089782e-01,
- 8.81921264348355050e-01, 4.71396736825997642e-01,
- 8.31469612302545236e-01, 5.55570233019602178e-01,
- 7.73010453362736993e-01, 6.34393284163645488e-01,
- 7.07106781186547573e-01, 7.07106781186547462e-01,
- 6.34393284163645599e-01, 7.73010453362736882e-01,
- 5.55570233019602289e-01, 8.31469612302545125e-01,
- 4.71396736825997809e-01, 8.81921264348354939e-01,
- 3.82683432365089837e-01, 9.23879532511286738e-01,
- 2.90284677254462276e-01, 9.56940335732208935e-01,
- 1.95090322016128304e-01, 9.80785280403230431e-01,
- 9.80171403295607702e-02, 9.95184726672196818e-01,
- 0.0, 1.00000000000000000e+00
- };
-#else
- const __m128d sintable[17] =
- {
- _mm_set_pd( 0.0, 1.0 ),
- _mm_set_pd( sin( 1.0 * (M_PI/2.0) / 16.0), cos( 1.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 2.0 * (M_PI/2.0) / 16.0), cos( 2.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 3.0 * (M_PI/2.0) / 16.0), cos( 3.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 4.0 * (M_PI/2.0) / 16.0), cos( 4.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 5.0 * (M_PI/2.0) / 16.0), cos( 5.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 6.0 * (M_PI/2.0) / 16.0), cos( 6.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 7.0 * (M_PI/2.0) / 16.0), cos( 7.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 8.0 * (M_PI/2.0) / 16.0), cos( 8.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 9.0 * (M_PI/2.0) / 16.0), cos( 9.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 10.0 * (M_PI/2.0) / 16.0), cos( 10.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 11.0 * (M_PI/2.0) / 16.0), cos( 11.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 12.0 * (M_PI/2.0) / 16.0), cos( 12.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 13.0 * (M_PI/2.0) / 16.0), cos( 13.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 14.0 * (M_PI/2.0) / 16.0), cos( 14.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( sin( 15.0 * (M_PI/2.0) / 16.0), cos( 15.0 * (M_PI/2.0) / 16.0) ),
- _mm_set_pd( 1.0, 0.0 )
- };
-#endif
-
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
-
- const __m128d tabscale = _mm_set1_pd(32.0/M_PI);
- const __m128d invtabscale0 = _mm_set1_pd(9.81747508049011230469e-02);
- const __m128d invtabscale1 = _mm_set1_pd(1.96197799156550576057e-08);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i i32 = _mm_set1_epi32(32);
- const __m128i i16 = _mm_set1_epi32(16);
- const __m128i tabmask = _mm_set1_epi32(0x3F);
- const __m128d sinP7 = _mm_set1_pd(-1.0/5040.0);
- const __m128d sinP5 = _mm_set1_pd(1.0/120.0);
- const __m128d sinP3 = _mm_set1_pd(-1.0/6.0);
- const __m128d sinP1 = _mm_set1_pd(1.0);
-
- const __m128d cosP6 = _mm_set1_pd(-1.0/720.0);
- const __m128d cosP4 = _mm_set1_pd(1.0/24.0);
- const __m128d cosP2 = _mm_set1_pd(-1.0/2.0);
- const __m128d cosP0 = _mm_set1_pd(1.0);
-
- __m128d scalex;
- __m128i tabidx, corridx;
- __m128d xabs, z, z2, polySin, polyCos;
- __m128d xpoint;
- __m128d ypoint0, ypoint1;
-
- __m128d sinpoint, cospoint;
- __m128d xsign, ssign, csign;
- __m128i imask, sswapsign, cswapsign;
-
- xsign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- scalex = _mm_mul_pd(tabscale, xabs);
- tabidx = _mm_cvtpd_epi32(scalex);
-
- xpoint = _mm_round_pd(scalex, _MM_FROUND_TO_NEAREST_INT);
-
- /* Extended precision arithmetics */
- z = _mm_sub_pd(xabs, _mm_mul_pd(invtabscale0, xpoint));
- z = _mm_sub_pd(z, _mm_mul_pd(invtabscale1, xpoint));
-
- /* Range reduction to 0..2*Pi */
- tabidx = _mm_and_si128(tabidx, tabmask);
-
- /* tabidx is now in range [0,..,64] */
- imask = _mm_cmpgt_epi32(tabidx, i32);
- sswapsign = imask;
- cswapsign = imask;
- corridx = _mm_and_si128(imask, i32);
- tabidx = _mm_sub_epi32(tabidx, corridx);
-
- /* tabidx is now in range [0..32] */
- imask = _mm_cmpgt_epi32(tabidx, i16);
- cswapsign = _mm_xor_si128(cswapsign, imask);
- corridx = _mm_sub_epi32(i32, tabidx);
- tabidx = _mm_blendv_epi8(tabidx, corridx, imask);
- /* tabidx is now in range [0..16] */
- ssign = _mm_cvtepi32_pd( _mm_or_si128( sswapsign, ione ) );
- csign = _mm_cvtepi32_pd( _mm_or_si128( cswapsign, ione ) );
-
-#ifdef _MSC_VER
- ypoint0 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 0));
- ypoint1 = _mm_load_pd(sintable + 2*_mm_extract_epi32(tabidx, 1));
-#else
- ypoint0 = sintable[_mm_extract_epi32(tabidx, 0)];
- ypoint1 = sintable[_mm_extract_epi32(tabidx, 1)];
-#endif
- sinpoint = _mm_unpackhi_pd(ypoint0, ypoint1);
- cospoint = _mm_unpacklo_pd(ypoint0, ypoint1);
-
- sinpoint = _mm_mul_pd(sinpoint, ssign);
- cospoint = _mm_mul_pd(cospoint, csign);
-
- z2 = _mm_mul_pd(z, z);
-
- polySin = _mm_mul_pd(sinP7, z2);
- polySin = _mm_add_pd(polySin, sinP5);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP3);
- polySin = _mm_mul_pd(polySin, z2);
- polySin = _mm_add_pd(polySin, sinP1);
- polySin = _mm_mul_pd(polySin, z);
-
- polyCos = _mm_mul_pd(cosP6, z2);
- polyCos = _mm_add_pd(polyCos, cosP4);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP2);
- polyCos = _mm_mul_pd(polyCos, z2);
- polyCos = _mm_add_pd(polyCos, cosP0);
-
- *sinval = _mm_xor_pd(_mm_add_pd( _mm_mul_pd(sinpoint, polyCos), _mm_mul_pd(cospoint, polySin) ), xsign);
- *cosval = _mm_sub_pd( _mm_mul_pd(cospoint, polyCos), _mm_mul_pd(sinpoint, polySin) );
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_sin_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128d
-gmx_mm_cos_pd(__m128d x)
-{
- __m128d s, c;
- gmx_mm_sincos_pd(x, &s, &c);
- return c;
-}
-
-
-
-static __m128d
-gmx_mm_tan_pd(__m128d x)
-{
- __m128d sinval, cosval;
- __m128d tanval;
-
- gmx_mm_sincos_pd(x, &sinval, &cosval);
-
- tanval = _mm_mul_pd(sinval, gmx_mm_inv_pd(cosval));
-
- return tanval;
-}
-
-
-
-static __m128d
-gmx_mm_asin_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.625);
- const __m128d limit2 = _mm_set1_pd(1e-8);
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d morebits = _mm_set1_pd(6.123233995736765886130e-17);
-
- const __m128d P5 = _mm_set1_pd(4.253011369004428248960e-3);
- const __m128d P4 = _mm_set1_pd(-6.019598008014123785661e-1);
- const __m128d P3 = _mm_set1_pd(5.444622390564711410273e0);
- const __m128d P2 = _mm_set1_pd(-1.626247967210700244449e1);
- const __m128d P1 = _mm_set1_pd(1.956261983317594739197e1);
- const __m128d P0 = _mm_set1_pd(-8.198089802484824371615e0);
-
- const __m128d Q4 = _mm_set1_pd(-1.474091372988853791896e1);
- const __m128d Q3 = _mm_set1_pd(7.049610280856842141659e1);
- const __m128d Q2 = _mm_set1_pd(-1.471791292232726029859e2);
- const __m128d Q1 = _mm_set1_pd(1.395105614657485689735e2);
- const __m128d Q0 = _mm_set1_pd(-4.918853881490881290097e1);
-
- const __m128d R4 = _mm_set1_pd(2.967721961301243206100e-3);
- const __m128d R3 = _mm_set1_pd(-5.634242780008963776856e-1);
- const __m128d R2 = _mm_set1_pd(6.968710824104713396794e0);
- const __m128d R1 = _mm_set1_pd(-2.556901049652824852289e1);
- const __m128d R0 = _mm_set1_pd(2.853665548261061424989e1);
-
- const __m128d S3 = _mm_set1_pd(-2.194779531642920639778e1);
- const __m128d S2 = _mm_set1_pd(1.470656354026814941758e2);
- const __m128d S1 = _mm_set1_pd(-3.838770957603691357202e2);
- const __m128d S0 = _mm_set1_pd(3.424398657913078477438e2);
-
- __m128d sign;
- __m128d mask;
- __m128d xabs;
- __m128d zz, ww, z, q, w, zz2, ww2;
- __m128d PA, PB;
- __m128d QA, QB;
- __m128d RA, RB;
- __m128d SA, SB;
- __m128d nom, denom;
-
- sign = _mm_andnot_pd(signmask, x);
- xabs = _mm_and_pd(x, signmask);
-
- mask = _mm_cmpgt_pd(xabs, limit1);
-
- zz = _mm_sub_pd(one, xabs);
- ww = _mm_mul_pd(xabs, xabs);
- zz2 = _mm_mul_pd(zz, zz);
- ww2 = _mm_mul_pd(ww, ww);
-
- /* R */
- RA = _mm_mul_pd(R4, zz2);
- RB = _mm_mul_pd(R3, zz2);
- RA = _mm_add_pd(RA, R2);
- RB = _mm_add_pd(RB, R1);
- RA = _mm_mul_pd(RA, zz2);
- RB = _mm_mul_pd(RB, zz);
- RA = _mm_add_pd(RA, R0);
- RA = _mm_add_pd(RA, RB);
-
- /* S, SA = zz2 */
- SB = _mm_mul_pd(S3, zz2);
- SA = _mm_add_pd(zz2, S2);
- SB = _mm_add_pd(SB, S1);
- SA = _mm_mul_pd(SA, zz2);
- SB = _mm_mul_pd(SB, zz);
- SA = _mm_add_pd(SA, S0);
- SA = _mm_add_pd(SA, SB);
-
- /* P */
- PA = _mm_mul_pd(P5, ww2);
- PB = _mm_mul_pd(P4, ww2);
- PA = _mm_add_pd(PA, P3);
- PB = _mm_add_pd(PB, P2);
- PA = _mm_mul_pd(PA, ww2);
- PB = _mm_mul_pd(PB, ww2);
- PA = _mm_add_pd(PA, P1);
- PB = _mm_add_pd(PB, P0);
- PA = _mm_mul_pd(PA, ww);
- PA = _mm_add_pd(PA, PB);
-
- /* Q, QA = ww2 */
- QB = _mm_mul_pd(Q4, ww2);
- QA = _mm_add_pd(ww2, Q3);
- QB = _mm_add_pd(QB, Q2);
- QA = _mm_mul_pd(QA, ww2);
- QB = _mm_mul_pd(QB, ww2);
- QA = _mm_add_pd(QA, Q1);
- QB = _mm_add_pd(QB, Q0);
- QA = _mm_mul_pd(QA, ww);
- QA = _mm_add_pd(QA, QB);
-
- RA = _mm_mul_pd(RA, zz);
- PA = _mm_mul_pd(PA, ww);
-
- nom = _mm_blendv_pd( PA, RA, mask );
- denom = _mm_blendv_pd( QA, SA, mask );
-
- q = _mm_mul_pd( nom, gmx_mm_inv_pd(denom) );
-
- zz = _mm_add_pd(zz, zz);
- zz = gmx_mm_sqrt_pd(zz);
- z = _mm_sub_pd(quarterpi, zz);
- zz = _mm_mul_pd(zz, q);
- zz = _mm_sub_pd(zz, morebits);
- z = _mm_sub_pd(z, zz);
- z = _mm_add_pd(z, quarterpi);
-
- w = _mm_mul_pd(xabs, q);
- w = _mm_add_pd(w, xabs);
-
- z = _mm_blendv_pd( w, z, mask );
-
- mask = _mm_cmpgt_pd(xabs, limit2);
- z = _mm_blendv_pd( xabs, z, mask );
-
- z = _mm_xor_pd(z, sign);
-
- return z;
-}
-
-
-static __m128d
-gmx_mm_acos_pd(__m128d x)
-{
- const __m128d one = _mm_set1_pd(1.0);
- const __m128d half = _mm_set1_pd(0.5);
- const __m128d quarterpi0 = _mm_set1_pd(7.85398163397448309616e-1);
- const __m128d quarterpi1 = _mm_set1_pd(6.123233995736765886130e-17);
-
-
- __m128d mask1;
-
- __m128d z, z1, z2;
-
- mask1 = _mm_cmpgt_pd(x, half);
- z1 = _mm_mul_pd(half, _mm_sub_pd(one, x));
- z1 = gmx_mm_sqrt_pd(z1);
- z = _mm_blendv_pd( x, z1, mask1 );
-
- z = gmx_mm_asin_pd(z);
-
- z1 = _mm_add_pd(z, z);
-
- z2 = _mm_sub_pd(quarterpi0, z);
- z2 = _mm_add_pd(z2, quarterpi1);
- z2 = _mm_add_pd(z2, quarterpi0);
-
- z = _mm_blendv_pd(z2, z1, mask1);
-
- return z;
-}
-
-static __m128d
-gmx_mm_atan_pd(__m128d x)
-{
- /* Same algorithm as cephes library */
- const __m128d signmask = gmx_mm_castsi128_pd( _mm_set_epi32(0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF) );
- const __m128d limit1 = _mm_set1_pd(0.66);
- const __m128d limit2 = _mm_set1_pd(2.41421356237309504880);
- const __m128d quarterpi = _mm_set1_pd(M_PI/4.0);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d mone = _mm_set1_pd(-1.0);
- const __m128d morebits1 = _mm_set1_pd(0.5*6.123233995736765886130E-17);
- const __m128d morebits2 = _mm_set1_pd(6.123233995736765886130E-17);
-
- const __m128d P4 = _mm_set1_pd(-8.750608600031904122785E-1);
- const __m128d P3 = _mm_set1_pd(-1.615753718733365076637E1);
- const __m128d P2 = _mm_set1_pd(-7.500855792314704667340E1);
- const __m128d P1 = _mm_set1_pd(-1.228866684490136173410E2);
- const __m128d P0 = _mm_set1_pd(-6.485021904942025371773E1);
-
- const __m128d Q4 = _mm_set1_pd(2.485846490142306297962E1);
- const __m128d Q3 = _mm_set1_pd(1.650270098316988542046E2);
- const __m128d Q2 = _mm_set1_pd(4.328810604912902668951E2);
- const __m128d Q1 = _mm_set1_pd(4.853903996359136964868E2);
- const __m128d Q0 = _mm_set1_pd(1.945506571482613964425E2);
-
- __m128d sign;
- __m128d mask1, mask2;
- __m128d y, t1, t2;
- __m128d z, z2;
- __m128d P_A, P_B, Q_A, Q_B;
-
- sign = _mm_andnot_pd(signmask, x);
- x = _mm_and_pd(x, signmask);
-
- mask1 = _mm_cmpgt_pd(x, limit1);
- mask2 = _mm_cmpgt_pd(x, limit2);
-
- t1 = _mm_mul_pd(_mm_add_pd(x, mone), gmx_mm_inv_pd(_mm_sub_pd(x, mone)));
- t2 = _mm_mul_pd(mone, gmx_mm_inv_pd(x));
-
- y = _mm_and_pd(mask1, quarterpi);
- y = _mm_or_pd( _mm_and_pd(mask2, halfpi), _mm_andnot_pd(mask2, y) );
-
- x = _mm_or_pd( _mm_and_pd(mask1, t1), _mm_andnot_pd(mask1, x) );
- x = _mm_or_pd( _mm_and_pd(mask2, t2), _mm_andnot_pd(mask2, x) );
-
- z = _mm_mul_pd(x, x);
- z2 = _mm_mul_pd(z, z);
-
- P_A = _mm_mul_pd(P4, z2);
- P_B = _mm_mul_pd(P3, z2);
- P_A = _mm_add_pd(P_A, P2);
- P_B = _mm_add_pd(P_B, P1);
- P_A = _mm_mul_pd(P_A, z2);
- P_B = _mm_mul_pd(P_B, z);
- P_A = _mm_add_pd(P_A, P0);
- P_A = _mm_add_pd(P_A, P_B);
-
- /* Q_A = z2 */
- Q_B = _mm_mul_pd(Q4, z2);
- Q_A = _mm_add_pd(z2, Q3);
- Q_B = _mm_add_pd(Q_B, Q2);
- Q_A = _mm_mul_pd(Q_A, z2);
- Q_B = _mm_mul_pd(Q_B, z2);
- Q_A = _mm_add_pd(Q_A, Q1);
- Q_B = _mm_add_pd(Q_B, Q0);
- Q_A = _mm_mul_pd(Q_A, z);
- Q_A = _mm_add_pd(Q_A, Q_B);
-
- z = _mm_mul_pd(z, P_A);
- z = _mm_mul_pd(z, gmx_mm_inv_pd(Q_A));
- z = _mm_mul_pd(z, x);
- z = _mm_add_pd(z, x);
-
- t1 = _mm_and_pd(mask1, morebits1);
- t1 = _mm_or_pd( _mm_and_pd(mask2, morebits2), _mm_andnot_pd(mask2, t1) );
-
- z = _mm_add_pd(z, t1);
- y = _mm_add_pd(y, z);
-
- y = _mm_xor_pd(y, sign);
-
- return y;
-}
-
-
-static __m128d
-gmx_mm_atan2_pd(__m128d y, __m128d x)
-{
- const __m128d pi = _mm_set1_pd(M_PI);
- const __m128d minuspi = _mm_set1_pd(-M_PI);
- const __m128d halfpi = _mm_set1_pd(M_PI/2.0);
- const __m128d minushalfpi = _mm_set1_pd(-M_PI/2.0);
-
- __m128d z, z1, z3, z4;
- __m128d w;
- __m128d maskx_lt, maskx_eq;
- __m128d masky_lt, masky_eq;
- __m128d mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_pd(x, _mm_setzero_pd());
- masky_lt = _mm_cmplt_pd(y, _mm_setzero_pd());
- maskx_eq = _mm_cmpeq_pd(x, _mm_setzero_pd());
- masky_eq = _mm_cmpeq_pd(y, _mm_setzero_pd());
-
- z = _mm_mul_pd(y, gmx_mm_inv_pd(x));
- z = gmx_mm_atan_pd(z);
-
- mask1 = _mm_and_pd(maskx_eq, masky_lt);
- mask2 = _mm_andnot_pd(maskx_lt, masky_eq);
- mask3 = _mm_andnot_pd( _mm_or_pd(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_pd(masky_eq, maskx_lt);
-
- maskall = _mm_or_pd( _mm_or_pd(mask1, mask2), _mm_or_pd(mask3, mask4) );
-
- z = _mm_andnot_pd(maskall, z);
- z1 = _mm_and_pd(mask1, minushalfpi);
- z3 = _mm_and_pd(mask3, halfpi);
- z4 = _mm_and_pd(mask4, pi);
-
- z = _mm_or_pd( _mm_or_pd(z, z1), _mm_or_pd(z3, z4) );
-
- w = _mm_blendv_pd(pi, minuspi, masky_lt);
- w = _mm_and_pd(w, maskx_lt);
-
- w = _mm_andnot_pd(maskall, w);
-
- z = _mm_add_pd(z, w);
- return z;
-}
+#define gmx_mm_invsqrt_pd gmx_simd_invsqrt_d
+#define gmx_mm_inv_pd gmx_simd_inv_d
+#define gmx_mm_log_pd gmx_simd_log_d
+#define gmx_mm_pmecorrF_pd gmx_simd_pmecorrF_d
+#define gmx_mm_pmecorrV_pd gmx_simd_pmecorrV_d
+#define gmx_mm_sincos_pd gmx_simd_sincos_d
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_MATH_SSE4_1_SINGLE_H
#define GMX_SIMD_MATH_SSE4_1_SINGLE_H
-#include <stdio.h>
-#include <math.h>
+#include "simd_math.h"
-#include "general_x86_sse4_1.h"
-
-
-
-#ifndef M_PI
-# define M_PI 3.14159265358979323846264338327950288
-#endif
-
-
-
-
-/************************
- * *
- * Simple math routines *
- * *
- ************************/
-
-/* 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_invsqrt_ps(__m128 x)
-{
- const __m128 half = _mm_set_ps(0.5, 0.5, 0.5, 0.5);
- const __m128 three = _mm_set_ps(3.0, 3.0, 3.0, 3.0);
-
- __m128 lu = _mm_rsqrt_ps(x);
-
- return _mm_mul_ps(half, _mm_mul_ps(_mm_sub_ps(three, _mm_mul_ps(_mm_mul_ps(lu, lu), x)), lu));
-}
-
-/* sqrt(x) - Do NOT use this (but rather invsqrt) if you actually need 1.0/sqrt(x) */
-static gmx_inline __m128
-gmx_mm_sqrt_ps(__m128 x)
-{
- __m128 mask;
- __m128 res;
-
- mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
- res = _mm_andnot_ps(mask, gmx_mm_invsqrt_ps(x));
-
- res = _mm_mul_ps(x, res);
-
- return res;
-}
-
-/* 1.0/x */
-static gmx_inline __m128
-gmx_mm_inv_ps(__m128 x)
-{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-
- __m128 lu = _mm_rcp_ps(x);
-
- return _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, x)));
-}
-
-static gmx_inline __m128
-gmx_mm_abs_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
-
- return _mm_and_ps(x, signmask);
-}
-
-
-
-static __m128
-gmx_mm_log_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 expmask = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
- const __m128i expbase_m1 = _mm_set1_epi32(127-1); /* We want non-IEEE format */
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 invsq2 = _mm_set1_ps(1.0f/sqrt(2.0f));
- const __m128 corr1 = _mm_set1_ps(-2.12194440e-4f);
- const __m128 corr2 = _mm_set1_ps(0.693359375f);
-
- const __m128 CA_1 = _mm_set1_ps(0.070376836292f);
- const __m128 CB_0 = _mm_set1_ps(1.6714950086782716f);
- const __m128 CB_1 = _mm_set1_ps(-2.452088066061482f);
- const __m128 CC_0 = _mm_set1_ps(1.5220770854701728f);
- const __m128 CC_1 = _mm_set1_ps(-1.3422238433233642f);
- const __m128 CD_0 = _mm_set1_ps(1.386218787509749f);
- const __m128 CD_1 = _mm_set1_ps(0.35075468953796346f);
- const __m128 CE_0 = _mm_set1_ps(1.3429983063133937f);
- const __m128 CE_1 = _mm_set1_ps(1.807420826584643f);
-
- __m128 fexp;
- __m128i iexp;
- __m128 mask;
- __m128 x2;
- __m128 y;
- __m128 pA, pB, pC, pD, pE, tB, tC, tD, tE;
-
- /* Separate x into exponent and mantissa, with a mantissa in the range [0.5..1[ (not IEEE754 standard!) */
- fexp = _mm_and_ps(x, expmask);
- iexp = gmx_mm_castps_si128(fexp);
- iexp = _mm_srli_epi32(iexp, 23);
- iexp = _mm_sub_epi32(iexp, expbase_m1);
-
- x = _mm_andnot_ps(expmask, x);
- x = _mm_or_ps(x, one);
- x = _mm_mul_ps(x, half);
-
- mask = _mm_cmplt_ps(x, invsq2);
-
- x = _mm_add_ps(x, _mm_and_ps(mask, x));
- x = _mm_sub_ps(x, one);
- iexp = _mm_add_epi32(iexp, gmx_mm_castps_si128(mask)); /* 0xFFFFFFFF = -1 as int */
-
- x2 = _mm_mul_ps(x, x);
-
- pA = _mm_mul_ps(CA_1, x);
- pB = _mm_mul_ps(CB_1, x);
- pC = _mm_mul_ps(CC_1, x);
- pD = _mm_mul_ps(CD_1, x);
- pE = _mm_mul_ps(CE_1, x);
- tB = _mm_add_ps(CB_0, x2);
- tC = _mm_add_ps(CC_0, x2);
- tD = _mm_add_ps(CD_0, x2);
- tE = _mm_add_ps(CE_0, x2);
- pB = _mm_add_ps(pB, tB);
- pC = _mm_add_ps(pC, tC);
- pD = _mm_add_ps(pD, tD);
- pE = _mm_add_ps(pE, tE);
-
- pA = _mm_mul_ps(pA, pB);
- pC = _mm_mul_ps(pC, pD);
- pE = _mm_mul_ps(pE, x2);
- pA = _mm_mul_ps(pA, pC);
- y = _mm_mul_ps(pA, pE);
-
- fexp = _mm_cvtepi32_ps(iexp);
- y = _mm_add_ps(y, _mm_mul_ps(fexp, corr1));
-
- y = _mm_sub_ps(y, _mm_mul_ps(half, x2));
- x2 = _mm_add_ps(x, y);
-
- x2 = _mm_add_ps(x2, _mm_mul_ps(fexp, corr2));
-
- return x2;
-}
-
-
-/*
- * 2^x function.
- *
- * The 2^w term is calculated from a (6,0)-th order (no denominator) Minimax polynomia on the interval
- * [-0.5,0.5]. The coefficiencts of this was derived in Mathematica using the command:
- *
- * MiniMaxApproximation[(2^x), {x, {-0.5, 0.5}, 6, 0}, WorkingPrecision -> 15]
- *
- * The largest-magnitude exponent we can represent in IEEE single-precision binary format
- * is 2^-126 for small numbers and 2^127 for large ones. To avoid wrap-around problems, we set the
- * result to zero if the argument falls outside this range. For small numbers this is just fine, but
- * for large numbers you could be fancy and return the smallest/largest IEEE single-precision
- * number instead. That would take a few extra cycles and not really help, since something is
- * wrong if you are using single precision to work with numbers that cannot really be represented
- * in single precision.
- *
- * The accuracy is at least 23 bits.
- */
-static __m128
-gmx_mm_exp2_ps(__m128 x)
-{
- /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
-
- const __m128i expbase = _mm_set1_epi32(127);
- const __m128 CA6 = _mm_set1_ps(1.535336188319500E-004);
- const __m128 CA5 = _mm_set1_ps(1.339887440266574E-003);
- const __m128 CA4 = _mm_set1_ps(9.618437357674640E-003);
- const __m128 CA3 = _mm_set1_ps(5.550332471162809E-002);
- const __m128 CA2 = _mm_set1_ps(2.402264791363012E-001);
- const __m128 CA1 = _mm_set1_ps(6.931472028550421E-001);
- const __m128 CA0 = _mm_set1_ps(1.0f);
-
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
- __m128 x2;
- __m128 p0, p1;
-
- iexppart = _mm_cvtps_epi32(x);
- intpart = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(x));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- x = _mm_sub_ps(x, intpart);
- x2 = _mm_mul_ps(x, x);
-
- p0 = _mm_mul_ps(CA6, x2);
- p1 = _mm_mul_ps(CA5, x2);
- p0 = _mm_add_ps(p0, CA4);
- p1 = _mm_add_ps(p1, CA3);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_add_ps(p0, CA2);
- p1 = _mm_add_ps(p1, CA1);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, CA0);
- p0 = _mm_add_ps(p0, p1);
- x = _mm_mul_ps(p0, fexppart);
-
- return x;
-}
-
-
-/* Exponential function. This could be calculated from 2^x as Exp(x)=2^(y), where y=log2(e)*x,
- * but there will then be a small rounding error since we lose some precision due to the
- * multiplication. This will then be magnified a lot by the exponential.
- *
- * Instead, we calculate the fractional part directly as a minimax approximation of
- * Exp(z) on [-0.5,0.5]. We use extended precision arithmetics to calculate the fraction
- * remaining after 2^y, which avoids the precision-loss.
- * The final result is correct to within 1 LSB over the entire argument range.
- */
-static __m128
-gmx_mm_exp_ps(__m128 x)
-{
- const __m128 argscale = _mm_set1_ps(1.44269504088896341f);
- /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
- const __m128 arglimit = _mm_set1_ps(126.0f);
- const __m128i expbase = _mm_set1_epi32(127);
-
- const __m128 invargscale0 = _mm_set1_ps(0.693359375f);
- const __m128 invargscale1 = _mm_set1_ps(-2.12194440e-4f);
-
- const __m128 CC5 = _mm_set1_ps(1.9875691500e-4f);
- const __m128 CC4 = _mm_set1_ps(1.3981999507e-3f);
- const __m128 CC3 = _mm_set1_ps(8.3334519073e-3f);
- const __m128 CC2 = _mm_set1_ps(4.1665795894e-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666665459e-1f);
- const __m128 CC0 = _mm_set1_ps(5.0000001201e-1f);
- const __m128 one = _mm_set1_ps(1.0f);
-
- __m128 y, x2;
- __m128 p0, p1;
- __m128 valuemask;
- __m128i iexppart;
- __m128 fexppart;
- __m128 intpart;
-
- y = _mm_mul_ps(x, argscale);
-
- iexppart = _mm_cvtps_epi32(y);
- intpart = _mm_round_ps(y, _MM_FROUND_TO_NEAREST_INT);
-
- iexppart = _mm_slli_epi32(_mm_add_epi32(iexppart, expbase), 23);
- valuemask = _mm_cmpge_ps(arglimit, gmx_mm_abs_ps(y));
- fexppart = _mm_and_ps(valuemask, gmx_mm_castsi128_ps(iexppart));
-
- /* Extended precision arithmetics */
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale0, intpart));
- x = _mm_sub_ps(x, _mm_mul_ps(invargscale1, intpart));
-
- x2 = _mm_mul_ps(x, x);
-
- p1 = _mm_mul_ps(CC5, x2);
- p0 = _mm_mul_ps(CC4, x2);
- p1 = _mm_add_ps(p1, CC3);
- p0 = _mm_add_ps(p0, CC2);
- p1 = _mm_mul_ps(p1, x2);
- p0 = _mm_mul_ps(p0, x2);
- p1 = _mm_add_ps(p1, CC1);
- p0 = _mm_add_ps(p0, CC0);
- p1 = _mm_mul_ps(p1, x);
- p0 = _mm_add_ps(p0, p1);
- p0 = _mm_mul_ps(p0, x2);
- x = _mm_add_ps(x, one);
- x = _mm_add_ps(x, p0);
-
- x = _mm_mul_ps(x, fexppart);
-
- return x;
-}
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erf_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
-
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_blendv_ps(_mm_sub_ps(one, res_erfc), res_erf, mask);
-
- return res;
-}
-
-
-/* FULL precision. Only errors in LSB */
-static __m128
-gmx_mm_erfc_ps(__m128 x)
-{
- /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
- const __m128 CA6 = _mm_set1_ps(7.853861353153693e-5f);
- const __m128 CA5 = _mm_set1_ps(-8.010193625184903e-4f);
- const __m128 CA4 = _mm_set1_ps(5.188327685732524e-3f);
- const __m128 CA3 = _mm_set1_ps(-2.685381193529856e-2f);
- const __m128 CA2 = _mm_set1_ps(1.128358514861418e-1f);
- const __m128 CA1 = _mm_set1_ps(-3.761262582423300e-1f);
- const __m128 CA0 = _mm_set1_ps(1.128379165726710f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
- const __m128 CB9 = _mm_set1_ps(-0.0018629930017603923f);
- const __m128 CB8 = _mm_set1_ps(0.003909821287598495f);
- const __m128 CB7 = _mm_set1_ps(-0.0052094582210355615f);
- const __m128 CB6 = _mm_set1_ps(0.005685614362160572f);
- const __m128 CB5 = _mm_set1_ps(-0.0025367682853477272f);
- const __m128 CB4 = _mm_set1_ps(-0.010199799682318782f);
- const __m128 CB3 = _mm_set1_ps(0.04369575504816542f);
- const __m128 CB2 = _mm_set1_ps(-0.11884063474674492f);
- const __m128 CB1 = _mm_set1_ps(0.2732120154030589f);
- const __m128 CB0 = _mm_set1_ps(0.42758357702025784f);
- /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
- const __m128 CC10 = _mm_set1_ps(-0.0445555913112064f);
- const __m128 CC9 = _mm_set1_ps(0.21376355144663348f);
- const __m128 CC8 = _mm_set1_ps(-0.3473187200259257f);
- const __m128 CC7 = _mm_set1_ps(0.016690861551248114f);
- const __m128 CC6 = _mm_set1_ps(0.7560973182491192f);
- const __m128 CC5 = _mm_set1_ps(-1.2137903600145787f);
- const __m128 CC4 = _mm_set1_ps(0.8411872321232948f);
- const __m128 CC3 = _mm_set1_ps(-0.08670413896296343f);
- const __m128 CC2 = _mm_set1_ps(-0.27124782687240334f);
- const __m128 CC1 = _mm_set1_ps(-0.0007502488047806069f);
- const __m128 CC0 = _mm_set1_ps(0.5642114853803148f);
-
- /* Coefficients for expansion of exp(x) in [0,0.1] */
- /* CD0 and CD1 are both 1.0, so no need to declare them separately */
- const __m128 CD2 = _mm_set1_ps(0.5000066608081202f);
- const __m128 CD3 = _mm_set1_ps(0.1664795422874624f);
- const __m128 CD4 = _mm_set1_ps(0.04379839977652482f);
-
- const __m128 sieve = gmx_mm_castsi128_ps( _mm_set1_epi32(0xfffff000) );
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 two = _mm_set1_ps(2.0f);
-
- __m128 x2, x4, y;
- __m128 z, q, t, t2, w, w2;
- __m128 pA0, pA1, pB0, pB1, pC0, pC1;
- __m128 expmx2, corr;
- __m128 res_erf, res_erfc, res;
- __m128 mask;
-
- /* Calculate erf() */
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- pA0 = _mm_mul_ps(CA6, x4);
- pA1 = _mm_mul_ps(CA5, x4);
- pA0 = _mm_add_ps(pA0, CA4);
- pA1 = _mm_add_ps(pA1, CA3);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x4);
- pA0 = _mm_add_ps(pA0, CA2);
- pA1 = _mm_add_ps(pA1, CA1);
- pA0 = _mm_mul_ps(pA0, x4);
- pA1 = _mm_mul_ps(pA1, x2);
- pA0 = _mm_add_ps(pA0, pA1);
- pA0 = _mm_add_ps(pA0, CA0);
-
- res_erf = _mm_mul_ps(x, pA0);
-
- /* Calculate erfc */
- y = gmx_mm_abs_ps(x);
- t = gmx_mm_inv_ps(y);
- w = _mm_sub_ps(t, one);
- t2 = _mm_mul_ps(t, t);
- w2 = _mm_mul_ps(w, w);
- /*
- * We cannot simply calculate exp(-x2) directly in single precision, since
- * that will lose a couple of bits of precision due to the multiplication.
- * Instead, we introduce x=z+w, where the last 12 bits of precision are in w.
- * Then we get exp(-x2) = exp(-z2)*exp((z-x)*(z+x)).
- *
- * The only drawback with this is that it requires TWO separate exponential
- * evaluations, which would be horrible performance-wise. However, the argument
- * for the second exp() call is always small, so there we simply use a
- * low-order minimax expansion on [0,0.1].
- */
-
- z = _mm_and_ps(y, sieve);
- q = _mm_mul_ps( _mm_sub_ps(z, y), _mm_add_ps(z, y) );
-
- corr = _mm_mul_ps(CD4, q);
- corr = _mm_add_ps(corr, CD3);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, CD2);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
- corr = _mm_mul_ps(corr, q);
- corr = _mm_add_ps(corr, one);
-
- expmx2 = gmx_mm_exp_ps( _mm_or_ps( signbit, _mm_mul_ps(z, z) ) );
- expmx2 = _mm_mul_ps(expmx2, corr);
-
- pB1 = _mm_mul_ps(CB9, w2);
- pB0 = _mm_mul_ps(CB8, w2);
- pB1 = _mm_add_ps(pB1, CB7);
- pB0 = _mm_add_ps(pB0, CB6);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB5);
- pB0 = _mm_add_ps(pB0, CB4);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB3);
- pB0 = _mm_add_ps(pB0, CB2);
- pB1 = _mm_mul_ps(pB1, w2);
- pB0 = _mm_mul_ps(pB0, w2);
- pB1 = _mm_add_ps(pB1, CB1);
- pB1 = _mm_mul_ps(pB1, w);
- pB0 = _mm_add_ps(pB0, pB1);
- pB0 = _mm_add_ps(pB0, CB0);
-
- pC0 = _mm_mul_ps(CC10, t2);
- pC1 = _mm_mul_ps(CC9, t2);
- pC0 = _mm_add_ps(pC0, CC8);
- pC1 = _mm_add_ps(pC1, CC7);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC6);
- pC1 = _mm_add_ps(pC1, CC5);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC4);
- pC1 = _mm_add_ps(pC1, CC3);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t2);
- pC0 = _mm_add_ps(pC0, CC2);
- pC1 = _mm_add_ps(pC1, CC1);
- pC0 = _mm_mul_ps(pC0, t2);
- pC1 = _mm_mul_ps(pC1, t);
- pC0 = _mm_add_ps(pC0, pC1);
- pC0 = _mm_add_ps(pC0, CC0);
- pC0 = _mm_mul_ps(pC0, t);
-
- /* SELECT pB0 or pC0 for erfc() */
- mask = _mm_cmplt_ps(two, y);
- res_erfc = _mm_blendv_ps(pB0, pC0, mask);
- res_erfc = _mm_mul_ps(res_erfc, expmx2);
-
- /* erfc(x<0) = 2-erfc(|x|) */
- mask = _mm_cmplt_ps(x, _mm_setzero_ps());
- res_erfc = _mm_blendv_ps(res_erfc, _mm_sub_ps(two, res_erfc), mask);
-
- /* Select erf() or erfc() */
- mask = _mm_cmplt_ps(y, _mm_set1_ps(0.75f));
- res = _mm_blendv_ps(res_erfc, _mm_sub_ps(one, res_erf), mask);
-
- return res;
-}
-
-
-/* Calculate the force correction due to PME analytically.
- *
- * This routine is meant to enable analytical evaluation of the
- * direct-space PME electrostatic force to avoid tables.
- *
- * The direct-space potential should be Erfc(beta*r)/r, but there
- * are some problems evaluating that:
- *
- * First, the error function is difficult (read: expensive) to
- * approxmiate accurately for intermediate to large arguments, and
- * this happens already in ranges of beta*r that occur in simulations.
- * Second, we now try to avoid calculating potentials in Gromacs but
- * use forces directly.
- *
- * We can simply things slight by noting that the PME part is really
- * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
- *
- * V= 1/r - Erf(beta*r)/r
- *
- * The first term we already have from the inverse square root, so
- * that we can leave out of this routine.
- *
- * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
- * the argument beta*r will be in the range 0.15 to ~4. Use your
- * favorite plotting program to realize how well-behaved Erf(z)/z is
- * in this range!
- *
- * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
- * However, it turns out it is more efficient to approximate f(z)/z and
- * then only use even powers. This is another minor optimization, since
- * we actually WANT f(z)/z, because it is going to be multiplied by
- * the vector between the two atoms to get the vectorial force. The
- * fastest flops are the ones we can avoid calculating!
- *
- * So, here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * 2*exp(-z^2) erf(z)
- * ------------ - --------
- * sqrt(Pi)*z^2 z^3
- *
- * 5. Multiply the entire expression by beta^3. This will get you
- *
- * beta^3*2*exp(-z^2) beta^3*erf(z)
- * ------------------ - ---------------
- * sqrt(Pi)*z^2 z^3
- *
- * or, switching back to r (z=r*beta):
- *
- * 2*beta*exp(-r^2*beta^2) erf(r*beta)
- * ----------------------- - -----------
- * sqrt(Pi)*r^2 r^3
- *
- *
- * With a bit of math exercise you should be able to confirm that
- * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
- *
- * 6. Add the result to 1/r^3, multiply by the product of the charges,
- * and you have your force (divided by r). A final multiplication
- * with the vector connecting the two particles and you have your
- * vectorial force to add to the particles.
- *
- */
-static gmx_inline __m128
-gmx_mm_pmecorrF_ps(__m128 z2)
-{
- const __m128 FN6 = _mm_set1_ps(-1.7357322914161492954e-8f);
- const __m128 FN5 = _mm_set1_ps(1.4703624142580877519e-6f);
- const __m128 FN4 = _mm_set1_ps(-0.000053401640219807709149f);
- const __m128 FN3 = _mm_set1_ps(0.0010054721316683106153f);
- const __m128 FN2 = _mm_set1_ps(-0.019278317264888380590f);
- const __m128 FN1 = _mm_set1_ps(0.069670166153766424023f);
- const __m128 FN0 = _mm_set1_ps(-0.75225204789749321333f);
-
- const __m128 FD4 = _mm_set1_ps(0.0011193462567257629232f);
- const __m128 FD3 = _mm_set1_ps(0.014866955030185295499f);
- const __m128 FD2 = _mm_set1_ps(0.11583842382862377919f);
- const __m128 FD1 = _mm_set1_ps(0.50736591960530292870f);
- const __m128 FD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyFD0 = _mm_mul_ps(FD4, z4);
- polyFD1 = _mm_mul_ps(FD3, z4);
- polyFD0 = _mm_add_ps(polyFD0, FD2);
- polyFD1 = _mm_add_ps(polyFD1, FD1);
- polyFD0 = _mm_mul_ps(polyFD0, z4);
- polyFD1 = _mm_mul_ps(polyFD1, z2);
- polyFD0 = _mm_add_ps(polyFD0, FD0);
- polyFD0 = _mm_add_ps(polyFD0, polyFD1);
-
- polyFD0 = gmx_mm_inv_ps(polyFD0);
-
- polyFN0 = _mm_mul_ps(FN6, z4);
- polyFN1 = _mm_mul_ps(FN5, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN4);
- polyFN1 = _mm_add_ps(polyFN1, FN3);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z4);
- polyFN0 = _mm_add_ps(polyFN0, FN2);
- polyFN1 = _mm_add_ps(polyFN1, FN1);
- polyFN0 = _mm_mul_ps(polyFN0, z4);
- polyFN1 = _mm_mul_ps(polyFN1, z2);
- polyFN0 = _mm_add_ps(polyFN0, FN0);
- polyFN0 = _mm_add_ps(polyFN0, polyFN1);
-
- return _mm_mul_ps(polyFN0, polyFD0);
-}
-
-
-/* Calculate the potential correction due to PME analytically.
- *
- * See gmx_mm256_pmecorrF_ps() for details about the approximation.
- *
- * This routine calculates Erf(z)/z, although you should provide z^2
- * as the input argument.
- *
- * Here's how it should be used:
- *
- * 1. Calculate r^2.
- * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
- * 3. Evaluate this routine with z^2 as the argument.
- * 4. The return value is the expression:
- *
- *
- * erf(z)
- * --------
- * z
- *
- * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
- *
- * erf(r*beta)
- * -----------
- * r
- *
- * 6. Subtract the result from 1/r, multiply by the product of the charges,
- * and you have your potential.
+/* Temporary:
+ * Alias some old SSE definitions to new SIMD definitions so we don't need
+ * to modify _all_ group kernels - they will anyway be replaced with a new
+ * generic SIMD version soon.
*/
-static gmx_inline __m128
-gmx_mm_pmecorrV_ps(__m128 z2)
-{
- const __m128 VN6 = _mm_set1_ps(1.9296833005951166339e-8f);
- const __m128 VN5 = _mm_set1_ps(-1.4213390571557850962e-6f);
- const __m128 VN4 = _mm_set1_ps(0.000041603292906656984871f);
- const __m128 VN3 = _mm_set1_ps(-0.00013134036773265025626f);
- const __m128 VN2 = _mm_set1_ps(0.038657983986041781264f);
- const __m128 VN1 = _mm_set1_ps(0.11285044772717598220f);
- const __m128 VN0 = _mm_set1_ps(1.1283802385263030286f);
-
- const __m128 VD3 = _mm_set1_ps(0.0066752224023576045451f);
- const __m128 VD2 = _mm_set1_ps(0.078647795836373922256f);
- const __m128 VD1 = _mm_set1_ps(0.43336185284710920150f);
- const __m128 VD0 = _mm_set1_ps(1.0f);
-
- __m128 z4;
- __m128 polyVN0, polyVN1, polyVD0, polyVD1;
-
- z4 = _mm_mul_ps(z2, z2);
-
- polyVD1 = _mm_mul_ps(VD3, z4);
- polyVD0 = _mm_mul_ps(VD2, z4);
- polyVD1 = _mm_add_ps(polyVD1, VD1);
- polyVD0 = _mm_add_ps(polyVD0, VD0);
- polyVD1 = _mm_mul_ps(polyVD1, z2);
- polyVD0 = _mm_add_ps(polyVD0, polyVD1);
-
- polyVD0 = gmx_mm_inv_ps(polyVD0);
-
- polyVN0 = _mm_mul_ps(VN6, z4);
- polyVN1 = _mm_mul_ps(VN5, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN4);
- polyVN1 = _mm_add_ps(polyVN1, VN3);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z4);
- polyVN0 = _mm_add_ps(polyVN0, VN2);
- polyVN1 = _mm_add_ps(polyVN1, VN1);
- polyVN0 = _mm_mul_ps(polyVN0, z4);
- polyVN1 = _mm_mul_ps(polyVN1, z2);
- polyVN0 = _mm_add_ps(polyVN0, VN0);
- polyVN0 = _mm_add_ps(polyVN0, polyVN1);
-
- return _mm_mul_ps(polyVN0, polyVD0);
-}
-
-
-static int
-gmx_mm_sincos_ps(__m128 x,
- __m128 *sinval,
- __m128 *cosval)
-{
- const __m128 two_over_pi = _mm_set1_ps(2.0/M_PI);
- const __m128 half = _mm_set1_ps(0.5);
- const __m128 one = _mm_set1_ps(1.0);
-
- const __m128i izero = _mm_set1_epi32(0);
- const __m128i ione = _mm_set1_epi32(1);
- const __m128i itwo = _mm_set1_epi32(2);
- const __m128i ithree = _mm_set1_epi32(3);
- const __m128 signbit = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
-
- const __m128 CA1 = _mm_set1_ps(1.5703125f);
- const __m128 CA2 = _mm_set1_ps(4.837512969970703125e-4f);
- const __m128 CA3 = _mm_set1_ps(7.54978995489188216e-8f);
-
- const __m128 CC0 = _mm_set1_ps(-0.0013602249f);
- const __m128 CC1 = _mm_set1_ps(0.0416566950f);
- const __m128 CC2 = _mm_set1_ps(-0.4999990225f);
- const __m128 CS0 = _mm_set1_ps(-0.0001950727f);
- const __m128 CS1 = _mm_set1_ps(0.0083320758f);
- const __m128 CS2 = _mm_set1_ps(-0.1666665247f);
-
- __m128 y, y2;
- __m128 z;
- __m128i iz;
- __m128i offset_sin, offset_cos;
- __m128 tmp1, tmp2;
- __m128 mask_sin, mask_cos;
- __m128 tmp_sin, tmp_cos;
-
- y = _mm_mul_ps(x, two_over_pi);
- y = _mm_add_ps(y, _mm_or_ps(_mm_and_ps(y, signbit), half));
-
- iz = _mm_cvttps_epi32(y);
- z = _mm_round_ps(y, _MM_FROUND_TO_ZERO);
-
- offset_sin = _mm_and_si128(iz, ithree);
- offset_cos = _mm_add_epi32(iz, ione);
-
- /* Extended precision arithmethic to achieve full precision */
- y = _mm_mul_ps(z, CA1);
- tmp1 = _mm_mul_ps(z, CA2);
- tmp2 = _mm_mul_ps(z, CA3);
- y = _mm_sub_ps(x, y);
- y = _mm_sub_ps(y, tmp1);
- y = _mm_sub_ps(y, tmp2);
-
- y2 = _mm_mul_ps(y, y);
-
- tmp1 = _mm_mul_ps(CC0, y2);
- tmp1 = _mm_add_ps(tmp1, CC1);
- tmp2 = _mm_mul_ps(CS0, y2);
- tmp2 = _mm_add_ps(tmp2, CS1);
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, CC2);
- tmp2 = _mm_mul_ps(tmp2, y2);
- tmp2 = _mm_add_ps(tmp2, CS2);
-
- tmp1 = _mm_mul_ps(tmp1, y2);
- tmp1 = _mm_add_ps(tmp1, one);
-
- tmp2 = _mm_mul_ps(tmp2, _mm_mul_ps(y, y2));
- tmp2 = _mm_add_ps(tmp2, y);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, ione), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, ione), izero));
-
- tmp_sin = _mm_blendv_ps(tmp1, tmp2, mask_sin);
- tmp_cos = _mm_blendv_ps(tmp1, tmp2, mask_cos);
-
- mask_sin = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_sin, itwo), izero));
- mask_cos = gmx_mm_castsi128_ps(_mm_cmpeq_epi32( _mm_and_si128(offset_cos, itwo), izero));
-
- tmp1 = _mm_xor_ps(signbit, tmp_sin);
- tmp2 = _mm_xor_ps(signbit, tmp_cos);
-
- *sinval = _mm_blendv_ps(tmp1, tmp_sin, mask_sin);
- *cosval = _mm_blendv_ps(tmp2, tmp_cos, mask_cos);
-
- return 0;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_sin_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return s;
-}
-
-/*
- * IMPORTANT: Do NOT call both sin & cos if you need both results, since each of them
- * will then call the sincos() routine and waste a factor 2 in performance!
- */
-static __m128
-gmx_mm_cos_ps(__m128 x)
-{
- __m128 s, c;
- gmx_mm_sincos_ps(x, &s, &c);
- return c;
-}
-
-
-static __m128
-gmx_mm_tan_ps(__m128 x)
-{
- __m128 sinval, cosval;
- __m128 tanval;
-
- gmx_mm_sincos_ps(x, &sinval, &cosval);
-
- tanval = _mm_mul_ps(sinval, gmx_mm_inv_ps(cosval));
-
- return tanval;
-}
-
-
-static __m128
-gmx_mm_asin_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limitlow = _mm_set1_ps(1e-4f);
- const __m128 half = _mm_set1_ps(0.5f);
- const __m128 one = _mm_set1_ps(1.0f);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0f);
-
- const __m128 CC5 = _mm_set1_ps(4.2163199048E-2f);
- const __m128 CC4 = _mm_set1_ps(2.4181311049E-2f);
- const __m128 CC3 = _mm_set1_ps(4.5470025998E-2f);
- const __m128 CC2 = _mm_set1_ps(7.4953002686E-2f);
- const __m128 CC1 = _mm_set1_ps(1.6666752422E-1f);
-
- __m128 sign;
- __m128 mask;
- __m128 xabs;
- __m128 z, z1, z2, q, q1, q2;
- __m128 pA, pB;
-
- sign = _mm_andnot_ps(signmask, x);
- xabs = _mm_and_ps(x, signmask);
-
- mask = _mm_cmpgt_ps(xabs, half);
-
- z1 = _mm_mul_ps(half, _mm_sub_ps(one, xabs));
- q1 = _mm_mul_ps(z1, gmx_mm_invsqrt_ps(z1));
- q1 = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one), q1);
-
- q2 = xabs;
- z2 = _mm_mul_ps(q2, q2);
-
- z = _mm_or_ps( _mm_and_ps(mask, z1), _mm_andnot_ps(mask, z2) );
- q = _mm_or_ps( _mm_and_ps(mask, q1), _mm_andnot_ps(mask, q2) );
-
- z2 = _mm_mul_ps(z, z);
-
- pA = _mm_mul_ps(CC5, z2);
- pB = _mm_mul_ps(CC4, z2);
-
- pA = _mm_add_ps(pA, CC3);
- pB = _mm_add_ps(pB, CC2);
-
- pA = _mm_mul_ps(pA, z2);
- pB = _mm_mul_ps(pB, z2);
-
- pA = _mm_add_ps(pA, CC1);
- pA = _mm_mul_ps(pA, z);
-
- z = _mm_add_ps(pA, pB);
- z = _mm_mul_ps(z, q);
- z = _mm_add_ps(z, q);
-
- q2 = _mm_sub_ps(halfpi, z);
- q2 = _mm_sub_ps(q2, z);
-
- z = _mm_or_ps( _mm_and_ps(mask, q2), _mm_andnot_ps(mask, z) );
-
- mask = _mm_cmpgt_ps(xabs, limitlow);
- z = _mm_or_ps( _mm_and_ps(mask, z), _mm_andnot_ps(mask, xabs) );
-
- z = _mm_xor_ps(z, sign);
-
- return z;
-}
-
-
-static __m128
-gmx_mm_acos_ps(__m128 x)
-{
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 one_ps = _mm_set1_ps(1.0f);
- const __m128 half_ps = _mm_set1_ps(0.5f);
- const __m128 pi_ps = _mm_set1_ps(M_PI);
- const __m128 halfpi_ps = _mm_set1_ps(M_PI/2.0f);
-
- __m128 mask1;
- __m128 mask2;
- __m128 xabs;
- __m128 z, z1, z2, z3;
-
- xabs = _mm_and_ps(x, signmask);
- mask1 = _mm_cmpgt_ps(xabs, half_ps);
- mask2 = _mm_cmpgt_ps(x, _mm_setzero_ps());
-
- z = _mm_mul_ps(half_ps, _mm_sub_ps(one_ps, xabs));
- z = _mm_mul_ps(z, gmx_mm_invsqrt_ps(z));
- z = _mm_andnot_ps(_mm_cmpeq_ps(xabs, one_ps), z);
-
- z = _mm_blendv_ps(x, z, mask1);
- z = gmx_mm_asin_ps(z);
-
- z2 = _mm_add_ps(z, z);
- z1 = _mm_sub_ps(pi_ps, z2);
- z3 = _mm_sub_ps(halfpi_ps, z);
-
- z = _mm_blendv_ps(z1, z2, mask2);
- z = _mm_blendv_ps(z3, z, mask1);
-
- return z;
-}
-
-
-static __m128
-gmx_mm_atan_ps(__m128 x)
-{
- /* Same algorithm as cephes library */
- const __m128 signmask = gmx_mm_castsi128_ps( _mm_set1_epi32(0x7FFFFFFF) );
- const __m128 limit1 = _mm_set1_ps(0.414213562373095f);
- const __m128 limit2 = _mm_set1_ps(2.414213562373095f);
- const __m128 quarterpi = _mm_set1_ps(0.785398163397448f);
- const __m128 halfpi = _mm_set1_ps(1.570796326794896f);
- const __m128 mone = _mm_set1_ps(-1.0f);
- const __m128 CC3 = _mm_set1_ps(-3.33329491539E-1f);
- const __m128 CC5 = _mm_set1_ps(1.99777106478E-1f);
- const __m128 CC7 = _mm_set1_ps(-1.38776856032E-1);
- const __m128 CC9 = _mm_set1_ps(8.05374449538e-2f);
-
- __m128 sign;
- __m128 mask1, mask2;
- __m128 y, z1, z2;
- __m128 x2, x4;
- __m128 sum1, sum2;
-
- sign = _mm_andnot_ps(signmask, x);
- x = _mm_and_ps(x, signmask);
-
- mask1 = _mm_cmpgt_ps(x, limit1);
- mask2 = _mm_cmpgt_ps(x, limit2);
-
- z1 = _mm_mul_ps(_mm_add_ps(x, mone), gmx_mm_inv_ps(_mm_sub_ps(x, mone)));
- z2 = _mm_mul_ps(mone, gmx_mm_inv_ps(x));
-
- y = _mm_and_ps(mask1, quarterpi);
- y = _mm_blendv_ps(y, halfpi, mask2);
-
- x = _mm_blendv_ps(x, z1, mask1);
- x = _mm_blendv_ps(x, z2, mask2);
-
- x2 = _mm_mul_ps(x, x);
- x4 = _mm_mul_ps(x2, x2);
-
- sum1 = _mm_mul_ps(CC9, x4);
- sum2 = _mm_mul_ps(CC7, x4);
- sum1 = _mm_add_ps(sum1, CC5);
- sum2 = _mm_add_ps(sum2, CC3);
- sum1 = _mm_mul_ps(sum1, x4);
- sum2 = _mm_mul_ps(sum2, x2);
-
- sum1 = _mm_add_ps(sum1, sum2);
- sum1 = _mm_sub_ps(sum1, mone);
- sum1 = _mm_mul_ps(sum1, x);
- y = _mm_add_ps(y, sum1);
-
- y = _mm_xor_ps(y, sign);
-
- return y;
-}
-
-
-static __m128
-gmx_mm_atan2_ps(__m128 y, __m128 x)
-{
- const __m128 pi = _mm_set1_ps(M_PI);
- const __m128 minuspi = _mm_set1_ps(-M_PI);
- const __m128 halfpi = _mm_set1_ps(M_PI/2.0);
- const __m128 minushalfpi = _mm_set1_ps(-M_PI/2.0);
-
- __m128 z, z1, z3, z4;
- __m128 w;
- __m128 maskx_lt, maskx_eq;
- __m128 masky_lt, masky_eq;
- __m128 mask1, mask2, mask3, mask4, maskall;
-
- maskx_lt = _mm_cmplt_ps(x, _mm_setzero_ps());
- masky_lt = _mm_cmplt_ps(y, _mm_setzero_ps());
- maskx_eq = _mm_cmpeq_ps(x, _mm_setzero_ps());
- masky_eq = _mm_cmpeq_ps(y, _mm_setzero_ps());
-
- z = _mm_mul_ps(y, gmx_mm_inv_ps(x));
- z = gmx_mm_atan_ps(z);
-
- mask1 = _mm_and_ps(maskx_eq, masky_lt);
- mask2 = _mm_andnot_ps(maskx_lt, masky_eq);
- mask3 = _mm_andnot_ps( _mm_or_ps(masky_lt, masky_eq), maskx_eq);
- mask4 = _mm_and_ps(masky_eq, maskx_lt);
-
- maskall = _mm_or_ps( _mm_or_ps(mask1, mask2), _mm_or_ps(mask3, mask4) );
-
- z = _mm_andnot_ps(maskall, z);
- z1 = _mm_and_ps(mask1, minushalfpi);
- z3 = _mm_and_ps(mask3, halfpi);
- z4 = _mm_and_ps(mask4, pi);
-
- z = _mm_or_ps( _mm_or_ps(z, z1), _mm_or_ps(z3, z4) );
-
- mask1 = _mm_andnot_ps(masky_lt, maskx_lt);
- mask2 = _mm_and_ps(maskx_lt, masky_lt);
-
- w = _mm_or_ps( _mm_and_ps(mask1, pi), _mm_and_ps(mask2, minuspi) );
- w = _mm_andnot_ps(maskall, w);
-
- z = _mm_add_ps(z, w);
-
- return z;
-}
-
+#define gmx_mm_invsqrt_ps gmx_simd_invsqrt_f
+#define gmx_mm_inv_ps gmx_simd_inv_f
+#define gmx_mm_log_ps gmx_simd_log_f
+#define gmx_mm_pmecorrF_ps gmx_simd_pmecorrF_f
+#define gmx_mm_pmecorrV_ps gmx_simd_pmecorrV_f
+#define gmx_mm_sincos_ps gmx_simd_sincos_f
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal
+ * \defgroup module_simd SIMD intrinsics interface (simd)
+ * \ingroup group_utilitymodules
+ *
+ * \brief Provides an architecture-independent way of doing SIMD coding.
+ *
+ * Start by consulting the overview Doxygen SIMD module documentation which is
+ * available in the internal library documentation (but not the public API),
+ * and then the details are documented in simd.h and the reference
+ * implementation impl_reference.h.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ */
+
+#ifndef GMX_SIMD_SIMD_H
+#define GMX_SIMD_SIMD_H
+
+/*! \libinternal \file
+ *
+ * \brief Definitions, capabilities, and wrappers for SIMD module.
+ *
+ * The macros in this file are intended to be used for writing
+ * architecture-independent SIMD intrinsics code.
+ * To support a new architecture, adding a new sub-include with macros here
+ * should be (nearly) all that is needed.
+ *
+ * The defines in this top-level file will set default Gromacs real precision
+ * operations to either single or double precision based on whether
+ * GMX_DOUBLE is defined. The actual implementation - including e.g.
+ * conversion operations specifically between single and double - is documented
+ * in impl_reference.h.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stddef.h>
+#include "gromacs/legacyheaders/types/simple.h"
+
+/* Forward declarations so memory allocation can be used in implementations */
+static gmx_inline float * gmx_simd_align_f(float *p);
+static gmx_inline double * gmx_simd_align_d(double *p);
+static gmx_inline int * gmx_simd_align_fi(int *p);
+static gmx_inline int * gmx_simd_align_di(int *p);
+static gmx_inline float * gmx_simd4_align_f(float *p);
+static gmx_inline double * gmx_simd4_align_d(double *p);
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name SIMD predefined macros to describe high-level capabilities
+ *
+ * These macros are used to describe the features available in default
+ * Gromacs real precision. They are set from the lower-level implementation
+ * files that have macros describing single and double precision individually,
+ * as well as the implementation details.
+ * \{
+ */
+
+/*! \brief
+ * GMX_SIMD indicates that some sort of SIMD support is present in software.
+ *
+ * It is disabled if no architecture has been selected, not even the reference SIMD.
+ */
+#define GMX_SIMD
+
+
+/* Intel MIC is a bit special since it is a co-processor. This means the rest
+ * of GROMACS (which runs on the CPU) should use a default SIMD set like AVX,
+ * while the part running on the coprocessor defines __MIC__. All functions in
+ * this SIMD module are static, so it will work perfectly fine to include this
+ * file with different SIMD definitions for different files.
+ */
+#if defined __MIC__
+# include "gromacs/simd/impl_intel_mic/impl_intel_mic.h"
+#elif defined GMX_SIMD_X86_AVX2_256
+# include "gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256.h"
+#elif defined GMX_SIMD_X86_AVX_256
+# include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h"
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+# include "gromacs/simd/impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
+#elif defined GMX_SIMD_X86_SSE4_1
+# include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h"
+#elif defined GMX_SIMD_X86_SSE2
+# include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
+#elif defined GMX_SIMD_IBM_QPX
+# include "gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h"
+#elif (defined GMX_SIMD_REFERENCE) || (defined DOXYGEN)
+/* Plain C SIMD reference implementation, also serves as documentation.
+ * For now this code path will also be taken for Sparc64_HPC_ACE since we have
+ * not yet added the verlet kernel extensions there. The group kernels do not
+ * depend on this file, so they will still be accelerated with SIMD.
+ */
+# include "gromacs/simd/impl_reference/impl_reference.h"
+#else
+/* Turn off the GMX_SIMD flag if we do not even have reference support */
+# undef GMX_SIMD
+#endif
+
+/*! \brief
+ * SIMD4 width is always 4, but use this for clarity in definitions.
+ *
+ * It improves code readability to allocate e.g. 2*GMX_SIMD4_WIDTH instead of 8.
+ */
+#define GMX_SIMD4_WIDTH 4
+
+/*! \} */
+
+/*! \name SIMD memory alignment operations
+ * \{
+ */
+
+/*! \brief
+ * Align a float pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want single precision even when GMX_DOUBLE is set), but use the
+ * \ref gmx_simd_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD_FLOAT_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float fp SIMD.
+ * If \ref GMX_SIMD_HAVE_FLOAT is not set, p will be returned unchanged.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_FLOAT_WIDTH float elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ */
+static gmx_inline float *
+gmx_simd_align_f(float *p)
+{
+# ifdef GMX_SIMD_HAVE_FLOAT
+ return (float *)(((size_t)((p)+GMX_SIMD_FLOAT_WIDTH-1)) & (~((size_t)(GMX_SIMD_FLOAT_WIDTH*sizeof(float)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \brief
+ * Align a double pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want double precision even when GMX_DOUBLE is not set), but use the
+ * \ref gmx_simd_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD_DOUBLE_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double fp SIMD.
+ * If \ref GMX_SIMD_HAVE_DOUBLE is not set, p will be returned unchanged.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_DOUBLE_WIDTH double elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ */
+static gmx_inline double *
+gmx_simd_align_d(double *p)
+{
+# ifdef GMX_SIMD_HAVE_DOUBLE
+ return (double *)(((size_t)((p)+GMX_SIMD_DOUBLE_WIDTH-1)) & (~((size_t)(GMX_SIMD_DOUBLE_WIDTH*sizeof(double)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \brief
+ * Align a (float) integer pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want integers corresponding to single precision even when GMX_DOUBLE is
+ * set), but use the \ref gmx_simd_align_i macro to align integer memory
+ * corresponding to Gromacs default floating-point precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD_FINT32_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float-integer SIMD.
+ * If \ref GMX_SIMD_HAVE_FINT32 is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd_fint32_t. You
+ * should have allocated an extra \ref GMX_SIMD_FINT32_WIDTH * sizeof(int) bytes. The
+ * reason why we need to separate float-integer vs. double-integer is that the
+ * width of registers after conversions from the floating-point types might not
+ * be identical, or even supported, in both cases.
+ */
+static gmx_inline int *
+gmx_simd_align_fi(int *p)
+{
+# ifdef GMX_SIMD_HAVE_FINT32
+ return (int *)(((size_t)((p)+GMX_SIMD_FINT32_WIDTH-1)) & (~((size_t)(GMX_SIMD_FINT32_WIDTH*sizeof(int)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \brief
+ * Align a (double) integer pointer for usage with SIMD instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want integers corresponding to double precision even when GMX_DOUBLE is
+ * not set), but use the \ref gmx_simd_align_i macro to align integer memory
+ * corresponding to Gromacs default floating-point precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD_DINT32_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double-integer SIMD.
+ * If \ref GMX_SIMD_HAVE_DINT32 is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd_dint32_t. You
+ * should have allocated an extra \ref GMX_SIMD_DINT32_WIDTH*sizeof(int) bytes. The
+ * reason why we need to separate float-integer vs. double-integer is that the
+ * width of registers after conversions from the floating-point types might not
+ * be identical, or even supported, in both cases.
+ */
+static gmx_inline int *
+gmx_simd_align_di(int *p)
+{
+# ifdef GMX_SIMD_HAVE_DINT32
+ return (int *)(((size_t)((p)+GMX_SIMD_DINT32_WIDTH-1)) & (~((size_t)(GMX_SIMD_DINT32_WIDTH*sizeof(int)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \brief
+ * Align a float pointer for usage with SIMD4 instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want single precision even when GMX_DOUBLE is set), but use the
+ * \ref gmx_simd4_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD4_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing float SIMD.
+ * If \ref GMX_SIMD4_HAVE_FLOAT is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd4_float_t.
+ * You should have allocated an extra \ref GMX_SIMD4_WIDTH * sizeof(float) bytes.
+ */
+static gmx_inline float *
+gmx_simd4_align_f(float *p)
+{
+# ifdef GMX_SIMD4_HAVE_FLOAT
+ return (float *)(((size_t)((p)+GMX_SIMD4_WIDTH-1)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(float)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \brief
+ * Align a double pointer for usage with SIMD4 instructions.
+ *
+ * You should typically \a not call this function directly (unless you explicitly
+ * want double precision even when GMX_DOUBLE is not set), but use the
+ * \ref gmx_simd4_align_r macro to align memory in default Gromacs real precision.
+ *
+ * \param p Pointer to memory, allocate at least \ref GMX_SIMD4_WIDTH extra elements.
+ *
+ * \return Aligned pointer (>=p) suitable for loading/storing double SIMD.
+ * If \ref GMX_SIMD4_HAVE_DOUBLE is not set, p will be returned unchanged.
+ *
+ * This routine provides aligned memory for usage with \ref gmx_simd4_double_t.
+ * You should have allocated an extra \ref GMX_SIMD4_WIDTH * sizeof(double) bytes.
+ */
+static gmx_inline double *
+gmx_simd4_align_d(double *p)
+{
+# ifdef GMX_SIMD4_HAVE_DOUBLE
+ return (double *)(((size_t)((p)+GMX_SIMD4_WIDTH-1)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(double)-1))));
+# else
+ return p;
+# endif
+}
+
+/*! \} */
+
+
+/* Define Gromacs "real" precision macros depending on Gromacs config. Note
+ * that conversions float-to-double and v.v. are not included here since they
+ * are not precision-dependent - find them in the implementation files.
+ */
+#ifdef GMX_DOUBLE
+/* Double floating-point. The documentation is in the float part below */
+# define gmx_simd_real_t gmx_simd_double_t
+# define gmx_simd_load_r gmx_simd_load_d
+# define gmx_simd_load1_r gmx_simd_load1_d
+# define gmx_simd_set1_r gmx_simd_set1_d
+# define gmx_simd_store_r gmx_simd_store_d
+# define gmx_simd_loadu_r gmx_simd_loadu_d
+# define gmx_simd_storeu_r gmx_simd_storeu_d
+# define gmx_simd_setzero_r gmx_simd_setzero_d
+# define gmx_simd_add_r gmx_simd_add_d
+# define gmx_simd_sub_r gmx_simd_sub_d
+# define gmx_simd_mul_r gmx_simd_mul_d
+# define gmx_simd_fmadd_r gmx_simd_fmadd_d
+# define gmx_simd_fmsub_r gmx_simd_fmsub_d
+# define gmx_simd_fnmadd_r gmx_simd_fnmadd_d
+# define gmx_simd_fnmsub_r gmx_simd_fnmsub_d
+# define gmx_simd_and_r gmx_simd_and_d
+# define gmx_simd_andnot_r gmx_simd_andnot_d
+# define gmx_simd_or_r gmx_simd_or_d
+# define gmx_simd_xor_r gmx_simd_xor_d
+# define gmx_simd_rsqrt_r gmx_simd_rsqrt_d
+# define gmx_simd_rcp_r gmx_simd_rcp_d
+# define gmx_simd_fabs_r gmx_simd_fabs_d
+# define gmx_simd_fneg_r gmx_simd_fneg_d
+# define gmx_simd_max_r gmx_simd_max_d
+# define gmx_simd_min_r gmx_simd_min_d
+# define gmx_simd_round_r gmx_simd_round_d
+# define gmx_simd_trunc_r gmx_simd_trunc_d
+# define gmx_simd_fraction_r gmx_simd_fraction_d
+# define gmx_simd_get_exponent_r gmx_simd_get_exponent_d
+# define gmx_simd_get_mantissa_r gmx_simd_get_mantissa_d
+# define gmx_simd_set_exponent_r gmx_simd_set_exponent_d
+/* Double integer and conversions */
+# define gmx_simd_int32_t gmx_simd_dint32_t
+# define gmx_simd_load_i gmx_simd_load_di
+# define gmx_simd_set1_i gmx_simd_set1_di
+# define gmx_simd_store_i gmx_simd_store_di
+# define gmx_simd_loadu_i gmx_simd_loadu_di
+# define gmx_simd_storeu_i gmx_simd_storeu_di
+# define gmx_simd_setzero_i gmx_simd_setzero_di
+# define gmx_simd_cvt_r2i gmx_simd_cvt_d2i
+# define gmx_simd_cvtt_r2i gmx_simd_cvtt_d2i
+# define gmx_simd_cvt_i2r gmx_simd_cvt_i2d
+# define gmx_simd_extract_i gmx_simd_extract_di
+# define gmx_simd_slli_i gmx_simd_slli_di
+# define gmx_simd_srli_i gmx_simd_srli_di
+# define gmx_simd_and_i gmx_simd_and_di
+# define gmx_simd_andnot_i gmx_simd_andnot_di
+# define gmx_simd_or_i gmx_simd_or_di
+# define gmx_simd_xor_i gmx_simd_xor_di
+# define gmx_simd_add_i gmx_simd_add_di
+# define gmx_simd_sub_i gmx_simd_sub_di
+# define gmx_simd_mul_i gmx_simd_mul_di
+/* Double booleans and selection */
+# define gmx_simd_bool_t gmx_simd_dbool_t
+# define gmx_simd_cmpeq_r gmx_simd_cmpeq_d
+# define gmx_simd_cmplt_r gmx_simd_cmplt_d
+# define gmx_simd_cmple_r gmx_simd_cmple_d
+# define gmx_simd_and_b gmx_simd_and_db
+# define gmx_simd_or_b gmx_simd_or_db
+# define gmx_simd_anytrue_b gmx_simd_anytrue_db
+# define gmx_simd_blendzero_r gmx_simd_blendzero_d
+# define gmx_simd_blendnotzero_r gmx_simd_blendnotzero_d
+# define gmx_simd_blendv_r gmx_simd_blendv_d
+# define gmx_simd_reduce_r gmx_simd_reduce_d
+# define gmx_simd_ibool_t gmx_simd_dibool_t
+# define gmx_simd_cmpeq_i gmx_simd_cmpeq_di
+# define gmx_simd_cmplt_i gmx_simd_cmplt_di
+# define gmx_simd_and_ib gmx_simd_and_dib
+# define gmx_simd_or_ib gmx_simd_or_dib
+# define gmx_simd_anytrue_ib gmx_simd_anytrue_dib
+# define gmx_simd_blendzero_i gmx_simd_blendzero_di
+# define gmx_simd_blendnotzero_i gmx_simd_blendnotzero_di
+# define gmx_simd_blendv_i gmx_simd_blendv_di
+/* Conversions between integer and double floating-point booleans */
+# define gmx_simd_cvt_b2ib gmx_simd_cvt_db2dib
+# define gmx_simd_cvt_ib2b gmx_simd_cvt_dib2db
+
+/* SIMD4 double fp - we only support a subset of SIMD instructions for SIMD4 */
+# define gmx_simd4_real_t gmx_simd4_double_t
+# define gmx_simd4_load_r gmx_simd4_load_d
+# define gmx_simd4_load1_r gmx_simd4_load1_d
+# define gmx_simd4_set1_r gmx_simd4_set1_d
+# define gmx_simd4_store_r gmx_simd4_store_d
+# define gmx_simd4_loadu_r gmx_simd4_loadu_d
+# define gmx_simd4_storeu_r gmx_simd4_storeu_d
+# define gmx_simd4_setzero_r gmx_simd4_setzero_d
+# define gmx_simd4_add_r gmx_simd4_add_d
+# define gmx_simd4_sub_r gmx_simd4_sub_d
+# define gmx_simd4_mul_r gmx_simd4_mul_d
+# define gmx_simd4_fmadd_r gmx_simd4_fmadd_d
+# define gmx_simd4_fmsub_r gmx_simd4_fmsub_d
+# define gmx_simd4_fnmadd_r gmx_simd4_fnmadd_d
+# define gmx_simd4_fnmsub_r gmx_simd4_fnmsub_d
+# define gmx_simd4_and_r gmx_simd4_and_d
+# define gmx_simd4_andnot_r gmx_simd4_andnot_d
+# define gmx_simd4_or_r gmx_simd4_or_d
+# define gmx_simd4_xor_r gmx_simd4_xor_d
+# define gmx_simd4_rsqrt_r gmx_simd4_rsqrt_d
+# define gmx_simd4_fabs_r gmx_simd4_fabs_d
+# define gmx_simd4_fneg_r gmx_simd4_fneg_d
+# define gmx_simd4_max_r gmx_simd4_max_d
+# define gmx_simd4_min_r gmx_simd4_min_d
+# define gmx_simd4_round_r gmx_simd4_round_d
+# define gmx_simd4_trunc_r gmx_simd4_trunc_d
+# define gmx_simd4_dotproduct3_r gmx_simd4_dotproduct3_d
+# define gmx_simd4_bool_t gmx_simd4_dbool_t
+# define gmx_simd4_cmpeq_r gmx_simd4_cmpeq_d
+# define gmx_simd4_cmplt_r gmx_simd4_cmplt_d
+# define gmx_simd4_cmple_r gmx_simd4_cmple_d
+# define gmx_simd4_and_b gmx_simd4_and_db
+# define gmx_simd4_or_b gmx_simd4_or_db
+# define gmx_simd4_anytrue_b gmx_simd4_anytrue_db
+# define gmx_simd4_blendzero_r gmx_simd4_blendzero_d
+# define gmx_simd4_blendnotzero_r gmx_simd4_blendnotzero_d
+# define gmx_simd4_blendv_r gmx_simd4_blendv_d
+# define gmx_simd4_reduce_r gmx_simd4_reduce_d
+
+/* Memory allocation */
+# define gmx_simd_align_r gmx_simd_align_d
+# define gmx_simd_align_i gmx_simd_align_di
+# define gmx_simd4_align_r gmx_simd4_align_d
+
+# ifdef GMX_SIMD_HAVE_DOUBLE
+# define GMX_SIMD_HAVE_REAL
+# define GMX_SIMD_REAL_WIDTH GMX_SIMD_DOUBLE_WIDTH
+# endif
+# ifdef GMX_SIMD_HAVE_DINT32
+# define GMX_SIMD_HAVE_INT32
+# define GMX_SIMD_INT32_WIDTH GMX_SIMD_DINT32_WIDTH
+# endif
+# ifdef GMX_SIMD_HAVE_DINT32_EXTRACT
+# define GMX_SIMD_HAVE_INT32_EXTRACT
+# endif
+# ifdef GMX_SIMD_HAVE_DINT32_LOGICAL
+# define GMX_SIMD_HAVE_INT32_LOGICAL
+# endif
+# ifdef GMX_SIMD_HAVE_DINT32_ARITHMETICS
+# define GMX_SIMD_HAVE_INT32_ARITHMETICS
+# endif
+# ifdef GMX_SIMD4_HAVE_DOUBLE
+# define GMX_SIMD4_HAVE_REAL
+# endif
+
+#else /* GMX_DOUBLE */
+
+/*! \name SIMD data types
+ *
+ * The actual storage of these types is implementation dependent. The
+ * documentation is generated from the reference implementation, but for
+ * normal usage this will likely not be what you are using.
+ * \{
+ */
+/*! \brief Real precision floating-point SIMD datatype.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_REAL is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_double_t
+ * internally, otherwise \ref gmx_simd_float_t.
+ */
+# define gmx_simd_real_t gmx_simd_float_t
+
+/*! \brief 32-bit integer SIMD type.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_INT32 is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dint32_t
+ * internally, otherwise \ref gmx_simd_fint32_t. This might seem a strange
+ * implementation detail, but it is because some SIMD implementations use
+ * different types/widths of integers registers when converting from
+ * double vs. single precision floating point. As long as you just use
+ * this type you will not have to worry about precision.
+ */
+# define gmx_simd_int32_t gmx_simd_fint32_t
+
+/*! \brief Boolean SIMD type for usage with \ref gmx_simd_real_t.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_REAL is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dbool_t
+ * internally, otherwise \ref gmx_simd_fbool_t. This is necessary since some
+ * SIMD implementations use bitpatterns for marking truth, so single-
+ * vs. double precision booleans are not necessarily exchangeable.
+ * As long as you just use this type you will not have to worry about precision.
+ *
+ * See \ref gmx_simd_ibool_t for an explanation of real vs. integer booleans.
+ */
+# define gmx_simd_bool_t gmx_simd_fbool_t
+
+/*! \brief Boolean SIMD type for usage with \ref gmx_simd_int32_t.
+ *
+ * This type is only available if \ref GMX_SIMD_HAVE_INT32 is defined.
+ *
+ * If GMX_DOUBLE is defined, this will be set to \ref gmx_simd_dibool_t
+ * internally, otherwise \ref gmx_simd_fibool_t. This is necessary since some
+ * SIMD implementations use bitpatterns for marking truth, so single-
+ * vs. double precision booleans are not necessarily exchangeable, and while
+ * a double-precision boolean might be represented with a 64-bit mask, the
+ * corresponding integer might only use a 32-bit mask.
+ *
+ * We provide conversion routines for these cases, so the only thing you need to
+ * keep in mind is to use \ref gmx_simd_bool_t when working with
+ * \ref gmx_simd_real_t while you pick \ref gmx_simd_ibool_t when working with
+ * \ref gmx_simd_int32_t.
+ *
+ * To convert between them, use \ref gmx_simd_cvt_b2ib and \ref gmx_simd_cvt_ib2b.
+ */
+# define gmx_simd_ibool_t gmx_simd_fibool_t
+
+
+/*! \}
+ * \name SIMD load/store operations on gmx_simd_real_t
+ *
+ * \note Unaligned load/stores are only available when
+ * \ref GMX_SIMD_HAVE_LOADU and \ref GMX_SIMD_HAVE_STOREU are set, respectively.
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_REAL_WIDTH values from aligned memory to \ref gmx_simd_real_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load_d,
+ * otherwise \ref gmx_simd_load_f.
+ *
+ * \copydetails gmx_simd_load_f
+ */
+# define gmx_simd_load_r gmx_simd_load_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t from single value in memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load1_d,
+ * otherwise \ref gmx_simd_load1_f.
+ *
+ * \copydetails gmx_simd_load1_f
+ */
+# define gmx_simd_load1_r gmx_simd_load1_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t from a scalar.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set1_d,
+ * otherwise \ref gmx_simd_set1_f.
+ *
+ * \copydetails gmx_simd_set1_f
+ */
+# define gmx_simd_set1_r gmx_simd_set1_f
+
+/*! \brief Store \ref GMX_SIMD_REAL_WIDTH values from \ref gmx_simd_real_t to aligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_store_d,
+ * otherwise \ref gmx_simd_store_f.
+ *
+ * \copydetails gmx_simd_store_f
+ */
+# define gmx_simd_store_r gmx_simd_store_f
+
+/*! \brief Load \ref GMX_SIMD_REAL_WIDTH values from unaligned memory to \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_loadu_d,
+ * otherwise \ref gmx_simd_loadu_f.
+ *
+ * \copydetails gmx_simd_loadu_f
+ */
+# define gmx_simd_loadu_r gmx_simd_loadu_f
+
+/*! \brief Store \ref GMX_SIMD_REAL_WIDTH values from \ref gmx_simd_real_t to unaligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_storeu_d,
+ * otherwise \ref gmx_simd_storeu_f.
+ *
+ * \copydetails gmx_simd_storeu_f
+ */
+# define gmx_simd_storeu_r gmx_simd_storeu_f
+
+/*! \brief Set all elements in \ref gmx_simd_real_t to 0.0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_setzero_d,
+ * otherwise \ref gmx_simd_setzero_f.
+ *
+ * \copydetails gmx_simd_setzero_f
+ */
+# define gmx_simd_setzero_r gmx_simd_setzero_f
+
+/*! \}
+ * \name SIMD load/store operations on gmx_simd_int32_t
+ *
+ * \note Unaligned load/stores are only available when
+ * \ref GMX_SIMD_HAVE_LOADU and \ref GMX_SIMD_HAVE_STOREU are set, respectively.
+ * \{
+ */
+
+/*! \brief Load \ref GMX_SIMD_INT32_WIDTH values from aligned memory to \ref gmx_simd_int32_t .
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_load_di ,
+ * otherwise \ref gmx_simd_load_fi .
+ *
+ * \copydetails gmx_simd_load_fi
+ */
+# define gmx_simd_load_i gmx_simd_load_fi
+
+/*! \brief Set all elements in \ref gmx_simd_int32_t from a single integer.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set1_di ,
+ * otherwise \ref gmx_simd_set1_fi .
+ *
+ * \copydetails gmx_simd_set1_fi
+ */
+# define gmx_simd_set1_i gmx_simd_set1_fi
+
+/*! \brief Store \ref GMX_SIMD_INT32_WIDTH values from \ref gmx_simd_int32_t to aligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_store_di ,
+ * otherwise \ref gmx_simd_store_fi .
+ *
+ * \copydetails gmx_simd_store_fi
+ */
+# define gmx_simd_store_i gmx_simd_store_fi
+
+/*! \brief Load \ref GMX_SIMD_INT32_WIDTH values from unaligned memory to \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_loadu_di ,
+ * otherwise \ref gmx_simd_loadu_fi .
+ *
+ * \copydetails gmx_simd_loadu_fi
+ */
+# define gmx_simd_loadu_i gmx_simd_loadu_fi
+
+/*! \brief Store \ref GMX_SIMD_INT32_WIDTH values from \ref gmx_simd_int32_t to unaligned memory.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_storeu_di ,
+ * otherwise \ref gmx_simd_storeu_fi .
+ *
+ * \copydetails gmx_simd_storeu_fi
+ */
+# define gmx_simd_storeu_i gmx_simd_storeu_fi
+
+/*! \brief Extract single integer from \ref gmx_simd_int32_t element.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_extract_di ,
+ * otherwise \ref gmx_simd_extract_fi .
+ *
+ * \copydetails gmx_simd_extract_fi
+ */
+# define gmx_simd_extract_i gmx_simd_extract_fi
+
+/*! \brief Set all elements in \ref gmx_simd_int32_t to 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_setzero_di ,
+ * otherwise \ref gmx_simd_setzero_fi .
+ *
+ * \copydetails gmx_simd_setzero_fi
+ */
+# define gmx_simd_setzero_i gmx_simd_setzero_fi
+
+
+/*! \}
+ * \name SIMD floating-point logical operations on gmx_simd_real_t
+ *
+ * These instructions are available if \ref GMX_SIMD_HAVE_LOGICAL is defined.
+ * \{
+ */
+
+/*! \brief Bitwise \a and on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_d,
+ * otherwise \ref gmx_simd_and_f.
+ *
+ * \copydetails gmx_simd_and_f
+ */
+# define gmx_simd_and_r gmx_simd_and_f
+
+/*! \brief Bitwise \a and-not on two \ref gmx_simd_real_t; 1st arg is complemented.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_andnot_d,
+ * otherwise \ref gmx_simd_andnot_f.
+ *
+ * \copydetails gmx_simd_andnot_f
+ */
+# define gmx_simd_andnot_r gmx_simd_andnot_f
+
+/*! \brief Bitwise \a or on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_d,
+ * otherwise \ref gmx_simd_or_f.
+ *
+ * \copydetails gmx_simd_or_f
+ */
+# define gmx_simd_or_r gmx_simd_or_f
+
+/*! \brief Bitwise \a exclusive-or on two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_xor_d,
+ * otherwise \ref gmx_simd_xor_f.
+ *
+ * \copydetails gmx_simd_xor_f
+ */
+# define gmx_simd_xor_r gmx_simd_xor_f
+
+/*! \}
+ * \name SIMD floating-point arithmetic operations on gmx_simd_real_t
+ * \{
+ */
+
+/*! \brief SIMD a+b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_add_d,
+ * otherwise \ref gmx_simd_add_f.
+ *
+ * \copydetails gmx_simd_add_f
+ */
+# define gmx_simd_add_r gmx_simd_add_f
+
+/*! \brief SIMD a-b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_sub_d,
+ * otherwise \ref gmx_simd_sub_f.
+ *
+ * \copydetails gmx_simd_sub_f
+ */
+# define gmx_simd_sub_r gmx_simd_sub_f
+
+/*! \brief SIMD a*b for two \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_mul_d,
+ * otherwise \ref gmx_simd_mul_f.
+ *
+ * \copydetails gmx_simd_mul_f
+ */
+# define gmx_simd_mul_r gmx_simd_mul_f
+
+/*! \brief SIMD a*b+c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fmadd_d,
+ * otherwise \ref gmx_simd_fmadd_f.
+ *
+ * \copydetails gmx_simd_fmadd_f
+ */
+# define gmx_simd_fmadd_r gmx_simd_fmadd_f
+
+/*! \brief SIMD a*b-c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fmsub_d,
+ * otherwise \ref gmx_simd_fmsub_f.
+ *
+ * \copydetails gmx_simd_fmsub_f
+ */
+# define gmx_simd_fmsub_r gmx_simd_fmsub_f
+
+/*! \brief SIMD -a*b+c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fnmadd_d,
+ * otherwise \ref gmx_simd_fnmadd_f.
+ *
+ * \copydetails gmx_simd_fnmadd_f
+ */
+# define gmx_simd_fnmadd_r gmx_simd_fnmadd_f
+
+/*! \brief SIMD -a*b-c for three \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fnmsub_d,
+ * otherwise \ref gmx_simd_fnmsub_f.
+ *
+ * \copydetails gmx_simd_fnmsub_f
+ */
+# define gmx_simd_fnmsub_r gmx_simd_fnmsub_f
+
+/*! \brief SIMD table lookup for 1/sqrt(x) approximation.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_rsqrt_d,
+ * otherwise \ref gmx_simd_rsqrt_f.
+ *
+ * \copydetails gmx_simd_rsqrt_f
+ */
+# define gmx_simd_rsqrt_r gmx_simd_rsqrt_f
+
+/*! \brief SIMD table lookup for 1/x approximation.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_rcp_d,
+ * otherwise \ref gmx_simd_rcp_f.
+ *
+ * \copydetails gmx_simd_rcp_f
+ */
+# define gmx_simd_rcp_r gmx_simd_rcp_f
+
+/*! \brief SIMD fabs(x) for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fabs_d,
+ * otherwise \ref gmx_simd_fabs_f.
+ *
+ * \copydetails gmx_simd_fabs_f
+ */
+# define gmx_simd_fabs_r gmx_simd_fabs_f
+
+/*! \brief SIMD -x for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fneg_d,
+ * otherwise \ref gmx_simd_fneg_f.
+ *
+ * \copydetails gmx_simd_fneg_f
+ */
+# define gmx_simd_fneg_r gmx_simd_fneg_f
+
+/*! \brief SIMD max(a,b) for each element in \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_max_d,
+ * otherwise \ref gmx_simd_max_f.
+ *
+ * \copydetails gmx_simd_max_f
+ */
+# define gmx_simd_max_r gmx_simd_max_f
+
+/*! \brief SIMD min(a,b) for each element in \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_min_d,
+ * otherwise \ref gmx_simd_min_f.
+ *
+ * \copydetails gmx_simd_min_f
+ */
+# define gmx_simd_min_r gmx_simd_min_f
+
+/*! \brief Round \ref gmx_simd_real_t to nearest int, return \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_round_d,
+ * otherwise \ref gmx_simd_round_f.
+ *
+ * \copydetails gmx_simd_round_f
+ */
+# define gmx_simd_round_r gmx_simd_round_f
+
+/*! \brief Truncate \ref gmx_simd_real_t towards 0, return \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_trunc_d,
+ * otherwise \ref gmx_simd_trunc_f.
+ *
+ * \copydetails gmx_simd_trunc_f
+ */
+# define gmx_simd_trunc_r gmx_simd_trunc_f
+
+/*! \brief SIMD Fraction, i.e. x-trunc(x) for \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_fraction_d,
+ * otherwise \ref gmx_simd_fraction_f.
+ *
+ * \copydetails gmx_simd_fraction_f
+ */
+# define gmx_simd_fraction_r gmx_simd_fraction_f
+
+/*! \brief Return the FP exponent of a SIMD \ref gmx_simd_real_t as a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_get_exponent_d,
+ * otherwise \ref gmx_simd_get_exponent_f.
+ *
+ * \copydetails gmx_simd_get_exponent_f
+ */
+# define gmx_simd_get_exponent_r gmx_simd_get_exponent_f
+
+/*! \brief Return the FP mantissa of a SIMD \ref gmx_simd_real_t as a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_get_mantissa_d,
+ * otherwise \ref gmx_simd_get_mantissa_f.
+ *
+ * \copydetails gmx_simd_get_mantissa_f
+ */
+# define gmx_simd_get_mantissa_r gmx_simd_get_mantissa_f
+
+/*! \brief Set the exponent of a SIMD \ref gmx_simd_real_t from a \ref gmx_simd_real_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_set_exponent_d,
+ * otherwise \ref gmx_simd_set_exponent_f.
+ *
+ * \copydetails gmx_simd_set_exponent_f
+ */
+# define gmx_simd_set_exponent_r gmx_simd_set_exponent_f
+
+/*! \}
+ * \name SIMD comparison, boolean, and select operations for gmx_simd_real_t
+ * \{
+ */
+
+/*! \brief SIMD a==b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmpeq_d,
+ * otherwise \ref gmx_simd_cmpeq_f.
+ *
+ * \copydetails gmx_simd_cmpeq_f
+ */
+# define gmx_simd_cmpeq_r gmx_simd_cmpeq_f
+
+/*! \brief SIMD a<b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmplt_d,
+ * otherwise \ref gmx_simd_cmplt_f.
+ *
+ * \copydetails gmx_simd_cmplt_f
+ */
+# define gmx_simd_cmplt_r gmx_simd_cmplt_f
+
+/*! \brief SIMD a<=b for \ref gmx_simd_real_t. Returns a \ref gmx_simd_bool_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmple_d,
+ * otherwise \ref gmx_simd_cmple_f.
+ *
+ * \copydetails gmx_simd_cmple_f
+ */
+# define gmx_simd_cmple_r gmx_simd_cmple_f
+
+/*! \brief For each element, the result boolean is true if both arguments are true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_db,
+ * otherwise \ref gmx_simd_and_fb.
+ *
+ * \copydetails gmx_simd_and_fb
+ */
+# define gmx_simd_and_b gmx_simd_and_fb
+
+/*! \brief For each element, the result boolean is true if either argument is true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_db,
+ * otherwise \ref gmx_simd_or_fb.
+ *
+ * \copydetails gmx_simd_or_fb
+ */
+# define gmx_simd_or_b gmx_simd_or_fb
+
+/*! \brief Return nonzero if any element in gmx_simd_bool_t is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_anytrue_db,
+ * otherwise \ref gmx_simd_anytrue_fb.
+ *
+ * \copydetails gmx_simd_anytrue_fb
+ */
+# define gmx_simd_anytrue_b gmx_simd_anytrue_fb
+
+/*! \brief Selects elements from \ref gmx_simd_real_t where boolean is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendzero_d,
+ * otherwise \ref gmx_simd_blendzero_f.
+ *
+ * \copydetails gmx_simd_blendzero_f
+ *
+ * \sa gmx_simd_blendzero_i
+ */
+# define gmx_simd_blendzero_r gmx_simd_blendzero_f
+
+/*! \brief Selects elements from \ref gmx_simd_real_t where boolean is false, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendnotzero_d,
+ * otherwise \ref gmx_simd_blendnotzero_f.
+ *
+ * \copydetails gmx_simd_blendnotzero_f
+ */
+# define gmx_simd_blendnotzero_r gmx_simd_blendnotzero_f
+
+/*! \brief Selects from 2nd real SIMD arg where boolean is true, otherwise 1st arg.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendv_d,
+ * otherwise \ref gmx_simd_blendv_f.
+ *
+ * \copydetails gmx_simd_blendv_f
+ */
+# define gmx_simd_blendv_r gmx_simd_blendv_f
+
+/*! \brief Return sum of all elements in SIMD floating-point variable.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_reduce_d,
+ * otherwise \ref gmx_simd_reduce_f.
+ *
+ * \copydetails gmx_simd_reduce_f
+ */
+# define gmx_simd_reduce_r gmx_simd_reduce_f
+
+/*! \}
+ * \name SIMD integer logical operations on gmx_simd_int32_t
+ *
+ * These instructions are available if \ref GMX_SIMD_HAVE_INT32_LOGICAL is defined.
+ * \{
+ */
+
+/*! \brief Shift each element in \ref gmx_simd_int32_t left by immediate
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_slli_di,
+ * otherwise \ref gmx_simd_slli_fi.
+ *
+ * \copydetails gmx_simd_slli_fi
+ */
+# define gmx_simd_slli_i gmx_simd_slli_fi
+
+/*! \brief Shift each element in \ref gmx_simd_int32_t right by immediate
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_srli_di,
+ * otherwise \ref gmx_simd_srli_fi.
+ *
+ * \copydetails gmx_simd_srli_fi
+ */
+# define gmx_simd_srli_i gmx_simd_srli_fi
+
+/*! \brief Bitwise \a and on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_di,
+ * otherwise \ref gmx_simd_and_fi.
+ *
+ * \copydetails gmx_simd_and_fi
+ */
+# define gmx_simd_and_i gmx_simd_and_fi
+
+/*! \brief Bitwise \a and-not on two \ref gmx_simd_int32_t; 1st arg is complemented.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_andnot_di,
+ * otherwise \ref gmx_simd_andnot_fi.
+ *
+ * \copydetails gmx_simd_andnot_fi
+ */
+# define gmx_simd_andnot_i gmx_simd_andnot_fi
+
+/*! \brief Bitwise \a or on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_di,
+ * otherwise \ref gmx_simd_or_fi.
+ *
+ * \copydetails gmx_simd_or_fi
+ */
+# define gmx_simd_or_i gmx_simd_or_fi
+
+/*! \brief Bitwise \a xor on two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_xor_di,
+ * otherwise \ref gmx_simd_xor_fi.
+ *
+ * \copydetails gmx_simd_xor_fi
+ */
+# define gmx_simd_xor_i gmx_simd_xor_fi
+
+/*! \}
+ * \name SIMD integer arithmetic operations on gmx_simd_int32_t
+ *
+ * These instructions are available if \ref GMX_SIMD_HAVE_INT32_ARITHMETICS is defined.
+ * \{
+ */
+
+/*! \brief SIMD a+b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_add_di,
+ * otherwise \ref gmx_simd_add_fi.
+ *
+ * \copydetails gmx_simd_add_fi
+ */
+# define gmx_simd_add_i gmx_simd_add_fi
+
+/*! \brief SIMD a-b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_sub_di,
+ * otherwise \ref gmx_simd_sub_fi.
+ *
+ * \copydetails gmx_simd_sub_fi
+ */
+# define gmx_simd_sub_i gmx_simd_sub_fi
+
+/*! \brief SIMD a*b for two \ref gmx_simd_int32_t.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_mul_di,
+ * otherwise \ref gmx_simd_mul_fi.
+ *
+ * \copydetails gmx_simd_mul_fi
+ */
+# define gmx_simd_mul_i gmx_simd_mul_fi
+
+/*! \}
+ * \name SIMD integer comparison, booleans, and selection on gmx_simd_int32_t
+ *
+ * These instructions are available if \ref GMX_SIMD_HAVE_INT32_ARITHMETICS is defined.
+ * \{
+ */
+
+/*! \brief Returns boolean describing whether a==b, for \ref gmx_simd_int32_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmpeq_di,
+ * otherwise \ref gmx_simd_cmpeq_fi.
+ *
+ * \copydetails gmx_simd_cmpeq_fi
+ */
+# define gmx_simd_cmpeq_i gmx_simd_cmpeq_fi
+
+/*! \brief Returns boolean describing whether a<b, for \ref gmx_simd_int32_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cmplt_di,
+ * otherwise \ref gmx_simd_cmplt_fi.
+ *
+ * \copydetails gmx_simd_cmplt_fi
+ */
+# define gmx_simd_cmplt_i gmx_simd_cmplt_fi
+
+/*! \brief For each element, the result boolean is true if both arguments are true
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_and_dib,
+ * otherwise \ref gmx_simd_and_fib.
+ *
+ * \copydetails gmx_simd_and_fib
+ */
+# define gmx_simd_and_ib gmx_simd_and_fib
+
+/*! \brief For each element, the result boolean is true if either argument is true.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_or_dib,
+ * otherwise \ref gmx_simd_or_fib.
+ *
+ * \copydetails gmx_simd_or_fib
+ */
+# define gmx_simd_or_ib gmx_simd_or_fib
+
+/*! \brief Return nonzero if any element in gmx_simd_ibool_t is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_anytrue_dib,
+ * otherwise \ref gmx_simd_anytrue_fib.
+ *
+ * \copydetails gmx_simd_anytrue_fib
+ */
+# define gmx_simd_anytrue_ib gmx_simd_anytrue_fib
+
+/*! \brief Selects elements from \ref gmx_simd_int32_t where boolean is true, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendzero_di,
+ * otherwise \ref gmx_simd_blendzero_fi.
+ *
+ * \copydetails gmx_simd_blendzero_fi
+ */
+# define gmx_simd_blendzero_i gmx_simd_blendzero_fi
+
+/*! \brief Selects elements from \ref gmx_simd_int32_t where boolean is false, otherwise 0.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendnotzero_di,
+ * otherwise \ref gmx_simd_blendnotzero_fi.
+ *
+ * \copydetails gmx_simd_blendnotzero_fi
+ */
+# define gmx_simd_blendnotzero_i gmx_simd_blendnotzero_fi
+
+/*! \brief Selects from 2nd int SIMD arg where boolean is true, otherwise 1st arg.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_blendv_di,
+ * otherwise \ref gmx_simd_blendv_fi.
+ *
+ * \copydetails gmx_simd_blendv_fi
+ */
+# define gmx_simd_blendv_i gmx_simd_blendv_fi
+
+/*! \}
+ * \name SIMD conversion operations
+ *
+ * These instructions are available when both types involved in the conversion
+ * are defined, e.g. \ref GMX_SIMD_HAVE_REAL and \ref GMX_SIMD_HAVE_INT32
+ * for real-to-integer conversion.
+ * \{
+ */
+
+/*! \brief Convert gmx_simd_real_t to gmx_simd_int32_t, round to nearest integer.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_d2i,
+ * otherwise \ref gmx_simd_cvt_f2i.
+ *
+ * \copydetails gmx_simd_cvt_f2i
+ */
+# define gmx_simd_cvt_r2i gmx_simd_cvt_f2i
+
+/*! \brief Convert gmx_simd_real_t to gmx_simd_int32_t, truncate towards zero
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvtt_d2i,
+ * otherwise \ref gmx_simd_cvtt_f2i.
+ *
+ * \copydetails gmx_simd_cvtt_f2i
+ */
+# define gmx_simd_cvtt_r2i gmx_simd_cvtt_f2i
+
+/*! \brief Convert gmx_simd_int32_t to gmx_simd_real_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_i2d,
+ * otherwise \ref gmx_simd_cvt_i2f.
+ *
+ * \copydetails gmx_simd_cvt_i2f
+ */
+# define gmx_simd_cvt_i2r gmx_simd_cvt_i2f
+
+/*! \brief Convert from gmx_simd_bool_t to gmx_simd_ibool_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_db2dib,
+ * otherwise \ref gmx_simd_cvt_fb2fib.
+ *
+ * \copydetails gmx_simd_cvt_fb2fib
+ */
+# define gmx_simd_cvt_b2ib gmx_simd_cvt_fb2fib
+
+/*! \brief Convert from gmx_simd_ibool_t to gmx_simd_bool_t
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_cvt_dib2db,
+ * otherwise \ref gmx_simd_cvt_fib2fb.
+ *
+ * \copydetails gmx_simd_cvt_fib2fb
+ */
+# define gmx_simd_cvt_ib2b gmx_simd_cvt_fib2fb
+
+
+/*! \}
+ * \name SIMD memory alignment operations
+ * \{
+ */
+
+/*! \brief Align real memory for SIMD usage.
+ *
+ * This routine will only align memory if \ref GMX_SIMD_HAVE_REAL is defined.
+ * Otherwise the original pointer will be returned.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_REAL_WIDTH float elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_align_d,
+ * otherwise \ref gmx_simd_align_f. For detailed documentation, see the
+ * precision-specific implementation routines.
+ */
+# define gmx_simd_align_r gmx_simd_align_f
+
+/*! \brief Align integer memory for SIMD usage.
+ *
+ * This routine will only align memory if \ref GMX_SIMD_HAVE_INT32 is defined.
+ * Otherwise the original pointer will be returned.
+ *
+ * Start by allocating an extra \ref GMX_SIMD_INT32_WIDTH elements of memory,
+ * and then call this function. The returned pointer will be greater or equal
+ * to the one you provided, and point to an address inside your provided memory
+ * that is aligned to the SIMD width.
+ *
+ * If GMX_DOUBLE is defined, this will be aliased to \ref gmx_simd_align_di,
+ * otherwise \ref gmx_simd_align_fi. For detailed documentation, see the
+ * precision-specific implementation routines.
+ */
+# define gmx_simd_align_i gmx_simd_align_fi
+
+/*! \} */
+
+/*! \name SIMD4 - constant width-four SIMD datatypes
+ *
+ * These operations are only meant to be used for a few coordinate
+ * manipulation and grid interpolation routines, so we only support a subset
+ * of operations for SIMD4. To avoid repeating all the documentation from
+ * the generic width SIMD routines, we only provide brief documentation for
+ * these operations. Follow the link to the implementation documentation or the
+ * reference to the corresponding generic SIMD routine. The format will be
+ * exactly the same, but they have SIMD replaced with SIMD4.
+ * \{
+ */
+
+/*! \brief SIMD real datatype guaranteed to be 4 elements wide, if available.
+ *
+ * All the SIMD4 datatypes and operations behave like their counterparts for
+ * the generic SIMD implementation, but they might be implemented with different
+ * registers, or not supported at all. It is important that you check the
+ * define \ref GMX_SIMD4_HAVE_REAL before using it.
+ *
+ * Just as the normal SIMD operations, all SIMD4 types and routines will
+ * be aliased to either single or double precision ones based on whether
+ * GMX_DOUBLE is defined.
+ *
+ * \note There is no support for integer or math operations in SIMD4.
+ */
+# define gmx_simd4_real_t gmx_simd4_float_t
+
+/*! \brief Boolean for \ref gmx_simd4_real_t comparison/selection */
+# define gmx_simd4_bool_t gmx_simd4_fbool_t
+
+/*! \brief Load aligned data to gmx_simd4_real_t.
+ *
+ * \copydetails gmx_simd4_load_f
+ */
+# define gmx_simd4_load_r gmx_simd4_load_f
+
+/*! \brief Load single element to gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_load1_f
+ */
+# define gmx_simd4_load1_r gmx_simd4_load1_f
+
+/*! \brief Set gmx_simd4_real_t from scalar value
+ *
+ * \copydetails gmx_simd4_set1_f
+ */
+# define gmx_simd4_set1_r gmx_simd4_set1_f
+
+/*! \brief store aligned data from gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_store_f
+ */
+# define gmx_simd4_store_r gmx_simd4_store_f
+
+/*! \brief Load unaligned data to gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_loadu_f
+ */
+# define gmx_simd4_loadu_r gmx_simd4_loadu_f
+
+/*! \brief Store unaligned data from gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_storeu_f
+ */
+# define gmx_simd4_storeu_r gmx_simd4_storeu_f
+
+/*! \brief Set all elements in gmx_simd4_real_t to 0.0
+ *
+ * \copydetails gmx_simd4_setzero_f
+ */
+# define gmx_simd4_setzero_r gmx_simd4_setzero_f
+
+/*! \brief Bitwise and for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_and_f
+ */
+# define gmx_simd4_and_r gmx_simd4_and_f
+
+/*! \brief Bitwise and-not for two gmx_simd4_real_t. 1st arg is complemented.
+ *
+ * \copydetails gmx_simd4_andnot_f
+ */
+# define gmx_simd4_andnot_r gmx_simd4_andnot_f
+
+/*! \brief Bitwise or for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_or_f
+ */
+# define gmx_simd4_or_r gmx_simd4_or_f
+
+/*! \brief Bitwise xor for two gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_xor_f
+ */
+# define gmx_simd4_xor_r gmx_simd4_xor_f
+
+/*! \brief a+b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_add_f
+ */
+# define gmx_simd4_add_r gmx_simd4_add_f
+
+/*! \brief a-b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_sub_f
+ */
+# define gmx_simd4_sub_r gmx_simd4_sub_f
+
+/*! \brief a*b for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_mul_f
+ */
+# define gmx_simd4_mul_r gmx_simd4_mul_f
+
+/*! \brief a*b+c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fmadd_f
+ */
+# define gmx_simd4_fmadd_r gmx_simd4_fmadd_f
+
+/*! \brief a*b-c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fmsub_f
+ */
+# define gmx_simd4_fmsub_r gmx_simd4_fmsub_f
+
+/*! \brief -a*b+c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fnmadd_f
+ */
+# define gmx_simd4_fnmadd_r gmx_simd4_fnmadd_f
+
+/*! \brief -a*b-c for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fnmsub_f
+ */
+# define gmx_simd4_fnmsub_r gmx_simd4_fnmsub_f
+
+/*! \brief 1/sqrt(x) approximate lookup for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_rsqrt_f
+ */
+# define gmx_simd4_rsqrt_r gmx_simd4_rsqrt_f
+
+/*! \brief fabs(x) for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fabs_f
+ */
+# define gmx_simd4_fabs_r gmx_simd4_fabs_f
+
+/*! \brief Change sign (-x) for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_fneg_f
+ */
+# define gmx_simd4_fneg_r gmx_simd4_fneg_f
+
+/*! \brief Select maximum of each pair of elements from args for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_max_f
+ */
+# define gmx_simd4_max_r gmx_simd4_max_f
+
+/*! \brief Select minimum of each pair of elements from args for \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_min_f
+ */
+# define gmx_simd4_min_r gmx_simd4_min_f
+
+/*! \brief Round \ref gmx_simd4_real_t to nearest integer, return \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_round_f
+ */
+# define gmx_simd4_round_r gmx_simd4_round_f
+
+/*! \brief Truncate \ref gmx_simd4_real_t towards zero, return \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_trunc_f
+ */
+# define gmx_simd4_trunc_r gmx_simd4_trunc_f
+
+/*! \brief Scalar product of first three elements of two \ref gmx_simd4_real_t *
+ *
+ * \copydetails gmx_simd4_dotproduct3_f
+ */
+# define gmx_simd4_dotproduct3_r gmx_simd4_dotproduct3_f
+
+/*! \brief Return booleans whether a==b for each element two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmpeq_f
+ */
+# define gmx_simd4_cmpeq_r gmx_simd4_cmpeq_f
+/*! \brief Return booleans whether a<b for each element two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmplt_f
+ */
+# define gmx_simd4_cmplt_r gmx_simd4_cmplt_f
+/*! \brief Return booleans whether a<=b for each element two \ref gmx_simd4_real_t
+ *
+ * \copydetails gmx_simd4_cmple_f
+ */
+# define gmx_simd4_cmple_r gmx_simd4_cmple_f
+
+/*! \brief Logical and for two \ref gmx_simd4_bool_t
+ *
+ * \copydetails gmx_simd4_and_fb
+ */
+# define gmx_simd4_and_b gmx_simd4_and_fb
+/*! \brief Logical or for two \ref gmx_simd4_bool_t
+ *
+ * \copydetails gmx_simd4_or_fb
+ */
+# define gmx_simd4_or_b gmx_simd4_or_fb
+
+/*! \brief Return nonzero if any element in \ref gmx_simd4_bool_t is true, otherwise 0
+ *
+ * \copydetails gmx_simd4_anytrue_fb
+ */
+# define gmx_simd4_anytrue_b gmx_simd4_anytrue_fb
+
+/*! \brief Selects the real SIMD4 elements where the boolean is true, otherwise 0.0
+ *
+ * \copydetails gmx_simd4_blendzero_f
+ */
+# define gmx_simd4_blendzero_r gmx_simd4_blendzero_f
+
+/*! \brief Selects the real SIMD4 elements where the boolean is false, otherwise 0.0
+ *
+ * \copydetails gmx_simd4_blendnotzero_f
+ */
+# define gmx_simd4_blendnotzero_r gmx_simd4_blendnotzero_f
+
+/*! \brief Selects from 2nd real SIMD4 arg where boolean is true, otherwise 1st arg
+ *
+ * \copydetails gmx_simd4_blendv_f
+ */
+# define gmx_simd4_blendv_r gmx_simd4_blendv_f
+
+/*! \brief Return sum of all elements in SIMD4 floating-point variable.
+ *
+ * \copydetails gmx_simd4_reduce_f
+ */
+# define gmx_simd4_reduce_r gmx_simd4_reduce_f
+
+/*! \brief Align real memory for SIMD4 usage.
+ *
+ * \copydetails gmx_simd4_align_f
+ */
+# define gmx_simd4_align_r gmx_simd4_align_f
+
+/*! \} */
+
+/*! \name SIMD predefined macros to describe high-level capabilities
+ * \{
+ */
+
+# if (defined GMX_SIMD_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_real_t is available.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_HAVE_DOUBLE, otherwise GMX_SIMD_HAVE_FLOAT.
+ */
+# define GMX_SIMD_HAVE_REAL
+/*! \brief Width of gmx_simd_real_t.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_DOUBLE_WIDTH, otherwise GMX_SIMD_FLOAT_WIDTH.
+ */
+# define GMX_SIMD_REAL_WIDTH GMX_SIMD_FLOAT_WIDTH
+# endif
+# if (defined GMX_SIMD_HAVE_FINT32) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_int32_t is available.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_HAVE_DINT32, otherwise GMX_SIMD_HAVE_FINT32.
+ */
+# define GMX_SIMD_HAVE_INT32
+/*! \brief Width of gmx_simd_int32_t.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_DINT32_WIDTH, otherwise GMX_SIMD_FINT32_WIDTH.
+ */
+# define GMX_SIMD_INT32_WIDTH GMX_SIMD_FINT32_WIDTH
+# endif
+# if (defined GMX_SIMD_HAVE_FINT32_EXTRACT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd_extract_i() is available.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_HAVE_DINT32_EXTRACT, otherwise GMX_SIMD_HAVE_FINT32_EXTRACT.
+ */
+# define GMX_SIMD_HAVE_INT32_EXTRACT
+# endif
+# if (defined GMX_SIMD_HAVE_FINT32_LOGICAL) || (defined DOXYGEN)
+/*! \brief Defined if logical ops are supported on gmx_simd_int32_t.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_HAVE_DINT32_LOGICAL, otherwise GMX_SIMD_HAVE_FINT32_LOGICAL.
+ */
+# define GMX_SIMD_HAVE_INT32_LOGICAL
+# endif
+# if (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) || (defined DOXYGEN)
+/*! \brief Defined if arithmetic ops are supported on gmx_simd_int32_t.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS, otherwise GMX_SIMD_HAVE_FINT32_ARITHMETICS.
+ */
+# define GMX_SIMD_HAVE_INT32_ARITHMETICS
+# endif
+# if (defined GMX_SIMD4_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief Defined if gmx_simd4_real_t is available.
+ *
+ * if GMX_DOUBLE is defined, this will be aliased to
+ * \ref GMX_SIMD4_HAVE_DOUBLE, otherwise GMX_SIMD4_HAVE_FLOAT.
+ */
+# define GMX_SIMD4_HAVE_REAL
+# endif
+
+/*! \} */
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_SIMD_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_SIMD_MATH_H_
+#define GMX_SIMD_SIMD_MATH_H_
+
+/*! \libinternal \file
+ *
+ * \brief Math functions for SIMD datatypes.
+ *
+ * \attention This file is generic for all SIMD architectures, so you cannot
+ * assume that any of the optional SIMD features (as defined in simd.h) are
+ * present. In particular, this means you cannot assume support for integers,
+ * logical operations (neither on floating-point nor integer values), shifts,
+ * and the architecture might only have SIMD for either float or double.
+ * Second, to keep this file clean and general, any additions to this file
+ * must work for all possible SIMD architectures in both single and double
+ * precision (if they support it), and you cannot make any assumptions about
+ * SIMD width.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
+ */
+
+#include <math.h>
+
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \name Implementation accuracy settings
+ * \{
+ */
+
+/*! \brief We accept lsb errors for 1/sqrt(x) and 1/x, so float target is 22 bits */
+#define GMX_SIMD_MATH_TARGET_SINGLE_BITS 22
+
+/*! \brief We accept "double" that has 2x single precision - 44 bits.
+ *
+ * This way two Newton-Raphson iterations will suffice in double precision.
+ */
+#define GMX_SIMD_MATH_TARGET_DOUBLE_BITS 44
+
+/*! \} */
+
+#ifdef GMX_SIMD_HAVE_FLOAT
+
+/*! \name Single precision SIMD math functions
+ *
+ * \note In most cases you should use the real-precision functions instead.
+ * \{
+ */
+
+/****************************************
+ * SINGLE PRECISION SIMD MATH FUNCTIONS *
+ ****************************************/
+
+/*! \brief SIMD float utility to sum a+b+c+d.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sum4_r.
+ *
+ * \param a term 1 (multiple values)
+ * \param b term 2 (multiple values)
+ * \param c term 3 (multiple values)
+ * \param d term 4 (multiple values)
+ * \return sum of terms 1-4 (multiple values)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
+ gmx_simd_float_t c, gmx_simd_float_t d)
+{
+ return gmx_simd_add_f(gmx_simd_add_f(a, b), gmx_simd_add_f(c, d));
+}
+
+/*! \brief Return -a if b is negative, SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
+ *
+ * \param a Values to set sign for
+ * \param b Values used to set sign
+ * \return a with its sign flipped if b is negative, otherwise a unchanged.
+ *
+ * This is equivalent to doing an xor operation on a with the sign bit of b,
+ * with the exception that negative zero is not considered to be negative
+ * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+#ifdef GMX_SIMD_HAVE_LOGICAL
+ return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(-0.0), b));
+#else
+ return gmx_simd_blendv_f(a, gmx_simd_fneg_f(a), gmx_simd_cmplt_f(b, gmx_simd_setzero_f()));
+#endif
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
+ *
+ * This is a low-level routine that should only be used by SIMD math routine
+ * that evaluates the inverse square root.
+ *
+ * \param lu Approximation of 1/sqrt(x), typically obtained from lookup.
+ * \param x The reference (starting) value x for which we want 1/sqrt(x).
+ * \return An improved approximation with roughly twice as many bits of accuracy.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
+{
+# ifdef GMX_SIMD_HAVE_FMA
+ return gmx_simd_fmadd_f(gmx_simd_fnmadd_f(x, gmx_simd_mul_f(lu, lu), gmx_simd_set1_f(1.0f)), gmx_simd_mul_f(lu, gmx_simd_set1_f(0.5f)), lu);
+# else
+ return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
+# endif
+}
+
+/*! \brief Calculate 1/sqrt(x) for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_invsqrt_r.
+ *
+ * \param x Argument that must be >0. This routine does not check arguments.
+ * \return 1/sqrt(x). Result is undefined if your argument was invalid.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_invsqrt_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    lu = gmx_simd_rsqrt_iter_f(lu, x); /* each Newton-Raphson step roughly doubles the accurate bits */
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    lu = gmx_simd_rsqrt_iter_f(lu, x); /* second iteration only if the lookup+1 iteration is still short */
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    lu = gmx_simd_rsqrt_iter_f(lu, x); /* third iteration for very low-accuracy hardware lookups */
+#endif
+    return lu;
+}
+
+/*! \brief Calculate 1/sqrt(x) for two SIMD floats.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_invsqrt_pair_r.
+ *
+ * \param x0 First set of arguments, x0 must be positive - no argument checking.
+ * \param x1 Second set of arguments, x1 must be positive - no argument checking.
+ * \param[out] out0 Result 1/sqrt(x0)
+ * \param[out] out1 Result 1/sqrt(x1)
+ *
+ * In particular for double precision we can sometimes calculate square root
+ * pairs slightly faster by using single precision until the very last step.
+ */
+static gmx_inline void
+gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
+ gmx_simd_float_t *out0, gmx_simd_float_t *out1)
+{
+ *out0 = gmx_simd_invsqrt_f(x0);
+ *out1 = gmx_simd_invsqrt_f(x1);
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
+ *
+ * This is a low-level routine that should only be used by SIMD math routine
+ * that evaluates the reciprocal.
+ *
+ * \param lu Approximation of 1/x, typically obtained from lookup.
+ * \param x The reference (starting) value x for which we want 1/x.
+ * \return An improved approximation with roughly twice as many bits of accuracy.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
+{
+ return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
+}
+
+/*! \brief Calculate 1/x for SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_inv_r.
+ *
+ * \param x Argument that must be nonzero. This routine does not check arguments.
+ * \return 1/x. Result is undefined if your argument was invalid.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_inv_f(gmx_simd_float_t x)
+{
+ gmx_simd_float_t lu = gmx_simd_rcp_f(x);
+#if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd_rcp_iter_f(lu, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd_rcp_iter_f(lu, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd_rcp_iter_f(lu, x);
+#endif
+ return lu;
+}
+
+/*! \brief Calculate sqrt(x) correctly for SIMD floats, including argument 0.0.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sqrt_r.
+ *
+ * \param x Argument that must be >=0.
+ * \return sqrt(x). If x=0, the result will correctly be set to 0.
+ * The result is undefined if the input value is negative.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sqrt_f(gmx_simd_float_t x)
+{
+    gmx_simd_fbool_t mask;
+    gmx_simd_float_t res;
+
+    mask = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());             /* true where x == 0 */
+    res  = gmx_simd_blendnotzero_f(gmx_simd_invsqrt_f(x), mask); /* zero out 1/sqrt(0)=Inf so the product below is 0, not Inf*0=NaN */
+    return gmx_simd_mul_f(res, x);                               /* sqrt(x) = x * 1/sqrt(x) */
+}
+
+/*! \brief SIMD float log(x). This is the natural logarithm.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_log_r.
+ *
+ * \param x Argument, should be >0.
+ * \result The natural logarithm of x. Undefined if argument is invalid.
+ */
+#ifndef gmx_simd_log_f
+static gmx_inline gmx_simd_float_t
+gmx_simd_log_f(gmx_simd_float_t x)
+{
+ const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
+ const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
+ const gmx_simd_float_t sqrt2 = gmx_simd_set1_f(sqrt(2.0f));
+ const gmx_simd_float_t corr = gmx_simd_set1_f(0.693147180559945286226764f);
+ const gmx_simd_float_t CL9 = gmx_simd_set1_f(0.2371599674224853515625f);
+ const gmx_simd_float_t CL7 = gmx_simd_set1_f(0.285279005765914916992188f);
+ const gmx_simd_float_t CL5 = gmx_simd_set1_f(0.400005519390106201171875f);
+ const gmx_simd_float_t CL3 = gmx_simd_set1_f(0.666666567325592041015625f);
+ const gmx_simd_float_t CL1 = gmx_simd_set1_f(2.0f);
+ gmx_simd_float_t fexp, x2, p;
+ gmx_simd_fbool_t mask;
+
+ fexp = gmx_simd_get_exponent_f(x);
+ x = gmx_simd_get_mantissa_f(x);
+
+ mask = gmx_simd_cmplt_f(sqrt2, x);
+ /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
+ fexp = gmx_simd_add_f(fexp, gmx_simd_blendzero_f(one, mask));
+ x = gmx_simd_mul_f(x, gmx_simd_blendv_f(one, half, mask));
+
+ x = gmx_simd_mul_f( gmx_simd_sub_f(x, one), gmx_simd_inv_f( gmx_simd_add_f(x, one) ) );
+ x2 = gmx_simd_mul_f(x, x);
+
+ p = gmx_simd_fmadd_f(CL9, x2, CL7);
+ p = gmx_simd_fmadd_f(p, x2, CL5);
+ p = gmx_simd_fmadd_f(p, x2, CL3);
+ p = gmx_simd_fmadd_f(p, x2, CL1);
+ p = gmx_simd_fmadd_f(p, x, gmx_simd_mul_f(corr, fexp));
+
+ return p;
+}
+#endif
+
+#ifndef gmx_simd_exp2_f
+/*! \brief SIMD float 2^x.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_exp2_r.
+ *
+ * \param x Argument.
+ * \result 2^x. Undefined if input argument caused overflow.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_exp2_f(gmx_simd_float_t x)
+{
+ /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
+ const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
+ const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.0001534581200287996416911311);
+ const gmx_simd_float_t CC5 = gmx_simd_set1_f(0.001339993121934088894618990);
+ const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.009618488957115180159497841);
+ const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.05550328776964726865751735);
+ const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.2402264689063408646490722);
+ const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.6931472057372680777553816);
+ const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
+
+ gmx_simd_float_t fexppart;
+ gmx_simd_float_t intpart;
+ gmx_simd_float_t p;
+ gmx_simd_fbool_t valuemask;
+
+ fexppart = gmx_simd_set_exponent_f(x);
+ intpart = gmx_simd_round_f(x);
+ valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(x), arglimit);
+ fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
+ x = gmx_simd_sub_f(x, intpart);
+
+ p = gmx_simd_fmadd_f(CC6, x, CC5);
+ p = gmx_simd_fmadd_f(p, x, CC4);
+ p = gmx_simd_fmadd_f(p, x, CC3);
+ p = gmx_simd_fmadd_f(p, x, CC2);
+ p = gmx_simd_fmadd_f(p, x, CC1);
+ p = gmx_simd_fmadd_f(p, x, one);
+ x = gmx_simd_mul_f(p, fexppart);
+ return x;
+}
+#endif
+
+#ifndef gmx_simd_exp_f
+/*! \brief SIMD float exp(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_exp_r.
+ *
+ * In addition to scaling the argument for 2^x this routine correctly does
+ * extended precision arithmetics to improve accuracy.
+ *
+ * \param x Argument.
+ * \result exp(x). Undefined if input argument caused overflow.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_exp_f(gmx_simd_float_t x)
+{
+ const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
+ /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
+ const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f);
+ const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(0.693145751953125f);
+ const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(1.428606765330187045e-06f);
+ const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.00136324646882712841033936f);
+ const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.00836596917361021041870117f);
+ const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.0416710823774337768554688f);
+ const gmx_simd_float_t CC1 = gmx_simd_set1_f(0.166665524244308471679688f);
+ const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.499999850988388061523438f);
+ const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
+ gmx_simd_float_t fexppart;
+ gmx_simd_float_t intpart;
+ gmx_simd_float_t y, p;
+ gmx_simd_fbool_t valuemask;
+
+ y = gmx_simd_mul_f(x, argscale);
+ fexppart = gmx_simd_set_exponent_f(y); /* rounds to nearest int internally */
+ intpart = gmx_simd_round_f(y); /* use same rounding algorithm here */
+ valuemask = gmx_simd_cmple_f(gmx_simd_fabs_f(y), arglimit);
+ fexppart = gmx_simd_blendzero_f(fexppart, valuemask);
+
+ /* Extended precision arithmetics */
+ x = gmx_simd_fnmadd_f(invargscale0, intpart, x);
+ x = gmx_simd_fnmadd_f(invargscale1, intpart, x);
+
+ p = gmx_simd_fmadd_f(CC4, x, CC3);
+ p = gmx_simd_fmadd_f(p, x, CC2);
+ p = gmx_simd_fmadd_f(p, x, CC1);
+ p = gmx_simd_fmadd_f(p, x, CC0);
+ p = gmx_simd_fmadd_f(gmx_simd_mul_f(x, x), p, x);
+ p = gmx_simd_add_f(p, one);
+ x = gmx_simd_mul_f(p, fexppart);
+ return x;
+}
+#endif
+
+/*! \brief SIMD float erf(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_erf_r.
+ *
+ * \param x The value to calculate erf(x) for.
+ * \result erf(x)
+ *
+ * This routine achieves very close to full precision, but we do not care about
+ * the last bit or the subnormal result range.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_erf_f(gmx_simd_float_t x)
+{
+ /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
+ const gmx_simd_float_t CA6 = gmx_simd_set1_f(7.853861353153693e-5f);
+ const gmx_simd_float_t CA5 = gmx_simd_set1_f(-8.010193625184903e-4f);
+ const gmx_simd_float_t CA4 = gmx_simd_set1_f(5.188327685732524e-3f);
+ const gmx_simd_float_t CA3 = gmx_simd_set1_f(-2.685381193529856e-2f);
+ const gmx_simd_float_t CA2 = gmx_simd_set1_f(1.128358514861418e-1f);
+ const gmx_simd_float_t CA1 = gmx_simd_set1_f(-3.761262582423300e-1f);
+ const gmx_simd_float_t CA0 = gmx_simd_set1_f(1.128379165726710f);
+ /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
+ const gmx_simd_float_t CB9 = gmx_simd_set1_f(-0.0018629930017603923f);
+ const gmx_simd_float_t CB8 = gmx_simd_set1_f(0.003909821287598495f);
+ const gmx_simd_float_t CB7 = gmx_simd_set1_f(-0.0052094582210355615f);
+ const gmx_simd_float_t CB6 = gmx_simd_set1_f(0.005685614362160572f);
+ const gmx_simd_float_t CB5 = gmx_simd_set1_f(-0.0025367682853477272f);
+ const gmx_simd_float_t CB4 = gmx_simd_set1_f(-0.010199799682318782f);
+ const gmx_simd_float_t CB3 = gmx_simd_set1_f(0.04369575504816542f);
+ const gmx_simd_float_t CB2 = gmx_simd_set1_f(-0.11884063474674492f);
+ const gmx_simd_float_t CB1 = gmx_simd_set1_f(0.2732120154030589f);
+ const gmx_simd_float_t CB0 = gmx_simd_set1_f(0.42758357702025784f);
+ /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
+ const gmx_simd_float_t CC10 = gmx_simd_set1_f(-0.0445555913112064f);
+ const gmx_simd_float_t CC9 = gmx_simd_set1_f(0.21376355144663348f);
+ const gmx_simd_float_t CC8 = gmx_simd_set1_f(-0.3473187200259257f);
+ const gmx_simd_float_t CC7 = gmx_simd_set1_f(0.016690861551248114f);
+ const gmx_simd_float_t CC6 = gmx_simd_set1_f(0.7560973182491192f);
+ const gmx_simd_float_t CC5 = gmx_simd_set1_f(-1.2137903600145787f);
+ const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.8411872321232948f);
+ const gmx_simd_float_t CC3 = gmx_simd_set1_f(-0.08670413896296343f);
+ const gmx_simd_float_t CC2 = gmx_simd_set1_f(-0.27124782687240334f);
+ const gmx_simd_float_t CC1 = gmx_simd_set1_f(-0.0007502488047806069f);
+ const gmx_simd_float_t CC0 = gmx_simd_set1_f(0.5642114853803148f);
+ const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
+ const gmx_simd_float_t two = gmx_simd_set1_f(2.0f);
+
+ gmx_simd_float_t x2, x4, y;
+ gmx_simd_float_t t, t2, w, w2;
+ gmx_simd_float_t pA0, pA1, pB0, pB1, pC0, pC1;
+ gmx_simd_float_t expmx2;
+ gmx_simd_float_t res_erf, res_erfc, res;
+ gmx_simd_fbool_t mask;
+
+ /* Calculate erf() */
+ x2 = gmx_simd_mul_f(x, x);
+ x4 = gmx_simd_mul_f(x2, x2);
+
+ pA0 = gmx_simd_fmadd_f(CA6, x4, CA4);
+ pA1 = gmx_simd_fmadd_f(CA5, x4, CA3);
+ pA0 = gmx_simd_fmadd_f(pA0, x4, CA2);
+ pA1 = gmx_simd_fmadd_f(pA1, x4, CA1);
+ pA0 = gmx_simd_mul_f(pA0, x4);
+ pA0 = gmx_simd_fmadd_f(pA1, x2, pA0);
+ /* Constant term must come last for precision reasons */
+ pA0 = gmx_simd_add_f(pA0, CA0);
+
+ res_erf = gmx_simd_mul_f(x, pA0);
+
+ /* Calculate erfc */
+ y = gmx_simd_fabs_f(x);
+ t = gmx_simd_inv_f(y);
+ w = gmx_simd_sub_f(t, one);
+ t2 = gmx_simd_mul_f(t, t);
+ w2 = gmx_simd_mul_f(w, w);
+
+ /* No need for a floating-point sieve here (as in erfc), since erf()
+ * will never return values that are extremely small for large args.
+ */
+ expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(y, y)));
+
+ pB1 = gmx_simd_fmadd_f(CB9, w2, CB7);
+ pB0 = gmx_simd_fmadd_f(CB8, w2, CB6);
+ pB1 = gmx_simd_fmadd_f(pB1, w2, CB5);
+ pB0 = gmx_simd_fmadd_f(pB0, w2, CB4);
+ pB1 = gmx_simd_fmadd_f(pB1, w2, CB3);
+ pB0 = gmx_simd_fmadd_f(pB0, w2, CB2);
+ pB1 = gmx_simd_fmadd_f(pB1, w2, CB1);
+ pB0 = gmx_simd_fmadd_f(pB0, w2, CB0);
+ pB0 = gmx_simd_fmadd_f(pB1, w, pB0);
+
+ pC0 = gmx_simd_fmadd_f(CC10, t2, CC8);
+ pC1 = gmx_simd_fmadd_f(CC9, t2, CC7);
+ pC0 = gmx_simd_fmadd_f(pC0, t2, CC6);
+ pC1 = gmx_simd_fmadd_f(pC1, t2, CC5);
+ pC0 = gmx_simd_fmadd_f(pC0, t2, CC4);
+ pC1 = gmx_simd_fmadd_f(pC1, t2, CC3);
+ pC0 = gmx_simd_fmadd_f(pC0, t2, CC2);
+ pC1 = gmx_simd_fmadd_f(pC1, t2, CC1);
+
+ pC0 = gmx_simd_fmadd_f(pC0, t2, CC0);
+ pC0 = gmx_simd_fmadd_f(pC1, t, pC0);
+ pC0 = gmx_simd_mul_f(pC0, t);
+
+ /* SELECT pB0 or pC0 for erfc() */
+ mask = gmx_simd_cmplt_f(two, y);
+ res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
+ res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
+
+ /* erfc(x<0) = 2-erfc(|x|) */
+ mask = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+ res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
+
+ /* Select erf() or erfc() */
+ mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
+ res = gmx_simd_blendv_f(gmx_simd_sub_f(one, res_erfc), res_erf, mask);
+
+ return res;
+}
+
+/*! \brief SIMD float erfc(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_erfc_r.
+ *
+ * \param x The value to calculate erfc(x) for.
+ * \result erfc(x)
+ *
+ * This routine achieves full precision (bar the last bit) over most of the
+ * input range, but for large arguments where the result is getting close
+ * to the minimum representable numbers we accept slightly larger errors
+ * (think results that are in the ballpark of 10^-30 for single precision,
+ * or 10^-200 for double) since that is not relevant for MD.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_erfc_f(gmx_simd_float_t x)
+{
+    /* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
+    const gmx_simd_float_t  CA6      = gmx_simd_set1_f(7.853861353153693e-5f);
+    const gmx_simd_float_t  CA5      = gmx_simd_set1_f(-8.010193625184903e-4f);
+    const gmx_simd_float_t  CA4      = gmx_simd_set1_f(5.188327685732524e-3f);
+    const gmx_simd_float_t  CA3      = gmx_simd_set1_f(-2.685381193529856e-2f);
+    const gmx_simd_float_t  CA2      = gmx_simd_set1_f(1.128358514861418e-1f);
+    const gmx_simd_float_t  CA1      = gmx_simd_set1_f(-3.761262582423300e-1f);
+    const gmx_simd_float_t  CA0      = gmx_simd_set1_f(1.128379165726710f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*P((1/(x-1))^2) in range [0.67,2] */
+    const gmx_simd_float_t  CB9      = gmx_simd_set1_f(-0.0018629930017603923f);
+    const gmx_simd_float_t  CB8      = gmx_simd_set1_f(0.003909821287598495f);
+    const gmx_simd_float_t  CB7      = gmx_simd_set1_f(-0.0052094582210355615f);
+    const gmx_simd_float_t  CB6      = gmx_simd_set1_f(0.005685614362160572f);
+    const gmx_simd_float_t  CB5      = gmx_simd_set1_f(-0.0025367682853477272f);
+    const gmx_simd_float_t  CB4      = gmx_simd_set1_f(-0.010199799682318782f);
+    const gmx_simd_float_t  CB3      = gmx_simd_set1_f(0.04369575504816542f);
+    const gmx_simd_float_t  CB2      = gmx_simd_set1_f(-0.11884063474674492f);
+    const gmx_simd_float_t  CB1      = gmx_simd_set1_f(0.2732120154030589f);
+    const gmx_simd_float_t  CB0      = gmx_simd_set1_f(0.42758357702025784f);
+    /* Coefficients for minimax approximation of erfc(x)=Exp(-x^2)*(1/x)*P((1/x)^2) in range [2,9.19] */
+    const gmx_simd_float_t  CC10     = gmx_simd_set1_f(-0.0445555913112064f);
+    const gmx_simd_float_t  CC9      = gmx_simd_set1_f(0.21376355144663348f);
+    const gmx_simd_float_t  CC8      = gmx_simd_set1_f(-0.3473187200259257f);
+    const gmx_simd_float_t  CC7      = gmx_simd_set1_f(0.016690861551248114f);
+    const gmx_simd_float_t  CC6      = gmx_simd_set1_f(0.7560973182491192f);
+    const gmx_simd_float_t  CC5      = gmx_simd_set1_f(-1.2137903600145787f);
+    const gmx_simd_float_t  CC4      = gmx_simd_set1_f(0.8411872321232948f);
+    const gmx_simd_float_t  CC3      = gmx_simd_set1_f(-0.08670413896296343f);
+    const gmx_simd_float_t  CC2      = gmx_simd_set1_f(-0.27124782687240334f);
+    const gmx_simd_float_t  CC1      = gmx_simd_set1_f(-0.0007502488047806069f);
+    const gmx_simd_float_t  CC0      = gmx_simd_set1_f(0.5642114853803148f);
+    /* Coefficients for expansion of exp(x) in [0,0.1] */
+    /* CD0 and CD1 are both 1.0, so no need to declare them separately */
+    const gmx_simd_float_t  CD2      = gmx_simd_set1_f(0.5000066608081202f);
+    const gmx_simd_float_t  CD3      = gmx_simd_set1_f(0.1664795422874624f);
+    const gmx_simd_float_t  CD4      = gmx_simd_set1_f(0.04379839977652482f);
+    const gmx_simd_float_t  one      = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t  two      = gmx_simd_set1_f(2.0f);
+
+    /* We need to use a small trick here, since we cannot assume all SIMD
+     * architectures support integers, and the flag we want (0xfffff000) would
+     * evaluate to NaN (i.e., it cannot be expressed as a floating-point num).
+     * Instead, we represent the flags 0xf0f0f000 and 0x0f0f0000 as valid
+     * fp numbers, and perform a logical or. Since the expression is constant,
+     * we can at least hope it is evaluated at compile-time.
+     */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    const gmx_simd_float_t  sieve    = gmx_simd_or_f(gmx_simd_set1_f(-5.965323564e+29f), gmx_simd_set1_f(7.05044434e-30f));
+#else
+    const int               isieve   = 0xFFFFF000;
+    float                   mem[GMX_SIMD_FLOAT_WIDTH*2]; /* float width: the store/load below are float-width ops */
+    float *                 pmem     = gmx_simd_align_f(mem);
+    union {
+        float f; int i;
+    } conv;
+    int                     i;
+#endif
+
+    gmx_simd_float_t        x2, x4, y;
+    gmx_simd_float_t        q, z, t, t2, w, w2;
+    gmx_simd_float_t        pA0, pA1, pB0, pB1, pC0, pC1;
+    gmx_simd_float_t        expmx2, corr;
+    gmx_simd_float_t        res_erf, res_erfc, res;
+    gmx_simd_fbool_t        mask;
+
+    /* Calculate erf() */
+    x2   = gmx_simd_mul_f(x, x);
+    x4   = gmx_simd_mul_f(x2, x2);
+
+    pA0  = gmx_simd_fmadd_f(CA6, x4, CA4);
+    pA1  = gmx_simd_fmadd_f(CA5, x4, CA3);
+    pA0  = gmx_simd_fmadd_f(pA0, x4, CA2);
+    pA1  = gmx_simd_fmadd_f(pA1, x4, CA1);
+    pA1  = gmx_simd_mul_f(pA1, x2);
+    pA0  = gmx_simd_fmadd_f(pA0, x4, pA1);
+    /* Constant term must come last for precision reasons */
+    pA0  = gmx_simd_add_f(pA0, CA0);
+
+    res_erf = gmx_simd_mul_f(x, pA0);
+
+    /* Calculate erfc */
+    y    = gmx_simd_fabs_f(x);
+    t    = gmx_simd_inv_f(y);
+    w    = gmx_simd_sub_f(t, one);
+    t2   = gmx_simd_mul_f(t, t);
+    w2   = gmx_simd_mul_f(w, w);
+    /*
+     * We cannot simply calculate exp(-y2) directly in single precision, since
+     * that will lose a couple of bits of precision due to the multiplication.
+     * Instead, we introduce y=z+w, where the last 12 bits of precision are in w.
+     * Then we get exp(-y2) = exp(-z2)*exp((z-y)*(z+y)).
+     *
+     * The only drawback with this is that it requires TWO separate exponential
+     * evaluations, which would be horrible performance-wise. However, the argument
+     * for the second exp() call is always small, so there we simply use a
+     * low-order minimax expansion on [0,0.1].
+     *
+     * However, this neat idea requires support for logical ops (and) on
+     * FP numbers, which some vendors decided isn't necessary in their SIMD
+     * instruction sets (Hi, IBM VSX!). In principle we could use some tricks
+     * in double, but we still need memory as a backup when that is not available,
+     * and this case is rare enough that we go directly there...
+     */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    z    = gmx_simd_and_f(y, sieve);
+#else
+    gmx_simd_store_f(pmem, y);
+    for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
+    {
+        conv.f  = pmem[i];
+        conv.i  = conv.i & isieve;
+        pmem[i] = conv.f;
+    }
+    z = gmx_simd_load_f(pmem);
+#endif
+    q    = gmx_simd_mul_f( gmx_simd_sub_f(z, y), gmx_simd_add_f(z, y) );
+    corr = gmx_simd_fmadd_f(CD4, q, CD3);
+    corr = gmx_simd_fmadd_f(corr, q, CD2);
+    corr = gmx_simd_fmadd_f(corr, q, one);
+    corr = gmx_simd_fmadd_f(corr, q, one);
+
+    expmx2 = gmx_simd_exp_f( gmx_simd_fneg_f( gmx_simd_mul_f(z, z) ) );
+    expmx2 = gmx_simd_mul_f(expmx2, corr);
+
+    pB1  = gmx_simd_fmadd_f(CB9, w2, CB7);
+    pB0  = gmx_simd_fmadd_f(CB8, w2, CB6);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB5);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB4);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB3);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB2);
+    pB1  = gmx_simd_fmadd_f(pB1, w2, CB1);
+    pB0  = gmx_simd_fmadd_f(pB0, w2, CB0);
+    pB0  = gmx_simd_fmadd_f(pB1, w, pB0);
+
+    pC0  = gmx_simd_fmadd_f(CC10, t2, CC8);
+    pC1  = gmx_simd_fmadd_f(CC9, t2, CC7);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC6);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC5);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC4);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC3);
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC2);
+    pC1  = gmx_simd_fmadd_f(pC1, t2, CC1);
+
+    pC0  = gmx_simd_fmadd_f(pC0, t2, CC0);
+    pC0  = gmx_simd_fmadd_f(pC1, t, pC0);
+    pC0  = gmx_simd_mul_f(pC0, t);
+
+    /* SELECT pB0 or pC0 for erfc() */
+    mask     = gmx_simd_cmplt_f(two, y);
+    res_erfc = gmx_simd_blendv_f(pB0, pC0, mask);
+    res_erfc = gmx_simd_mul_f(res_erfc, expmx2);
+
+    /* erfc(x<0) = 2-erfc(|x|) */
+    mask     = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    res_erfc = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(two, res_erfc), mask);
+
+    /* Select erf() or erfc() */
+    mask = gmx_simd_cmplt_f(y, gmx_simd_set1_f(0.75f));
+    res  = gmx_simd_blendv_f(res_erfc, gmx_simd_sub_f(one, res_erf), mask);
+
+    return res;
+}
+
+/*! \brief SIMD float sin \& cos.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sincos_r.
+ *
+ * \param x The argument to evaluate sin/cos for
+ * \param[out] sinval Sin(x)
+ * \param[out] cosval Cos(x)
+ *
+ * This version achieves close to machine precision, but for very large
+ * magnitudes of the argument we inherently begin to lose accuracy due to the
+ * argument reduction, despite using extended precision arithmetics internally.
+ */
+static gmx_inline void
+gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
+{
+    /* Extended-precision representation of Pi/2 (argred0+argred1+argred2+argred3),
+     * used below to subtract y*Pi/2 from x while minimizing precision loss. */
+    const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
+    const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
+    const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
+    const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
+    const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
+    const gmx_simd_float_t  const_sin2      = gmx_simd_set1_f(-1.9515295891e-4f);
+    const gmx_simd_float_t  const_sin1      = gmx_simd_set1_f( 8.3321608736e-3f);
+    const gmx_simd_float_t  const_sin0      = gmx_simd_set1_f(-1.6666654611e-1f);
+    const gmx_simd_float_t  const_cos2      = gmx_simd_set1_f( 2.443315711809948e-5f);
+    const gmx_simd_float_t  const_cos1      = gmx_simd_set1_f(-1.388731625493765e-3f);
+    const gmx_simd_float_t  const_cos0      = gmx_simd_set1_f( 4.166664568298827e-2f);
+    const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t  one             = gmx_simd_set1_f(1.0f);
+    gmx_simd_float_t        ssign, csign;
+    gmx_simd_float_t        x2, y, z, psin, pcos, sss, ccc;
+    gmx_simd_fbool_t        mask;
+#if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    const gmx_simd_fint32_t ione            = gmx_simd_set1_fi(1);
+    const gmx_simd_fint32_t itwo            = gmx_simd_set1_fi(2);
+    gmx_simd_fint32_t       iy;
+
+    /* Integer path: quadrant index from the low bits of round(x*2/Pi) */
+    z       = gmx_simd_mul_f(x, two_over_pi);
+    iy      = gmx_simd_cvt_f2i(z);
+    y       = gmx_simd_round_f(z);
+
+    mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), gmx_simd_setzero_fi()));
+    ssign   = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, itwo), itwo)));
+    csign   = gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(gmx_simd_add_fi(iy, ione), itwo), itwo)));
+#else
+    const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
+    const gmx_simd_float_t  minusquarter    = gmx_simd_set1_f(-0.25f);
+    gmx_simd_float_t        q;
+    gmx_simd_fbool_t        m1, m2, m3;
+
+    /* The most obvious way to find the arguments quadrant in the unit circle
+     * to calculate the sign is to use integer arithmetic, but that is not
+     * present in all SIMD implementations. As an alternative, we have devised a
+     * pure floating-point algorithm that uses truncation for argument reduction
+     * so that we get a new value 0<=q<1 over the unit circle, and then
+     * do floating-point comparisons with fractions. This is likely to be
+     * slightly slower (~10%) due to the longer latencies of floating-point, so
+     * we only use it when integer SIMD arithmetic is not present.
+     */
+    ssign   = x;
+    x       = gmx_simd_fabs_f(x);
+    /* It is critical that half-way cases are rounded down */
+    z       = gmx_simd_fmadd_f(x, two_over_pi, half);
+    y       = gmx_simd_trunc_f(z);
+    q       = gmx_simd_mul_f(z, quarter);
+    q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
+    /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
+     * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
+     * This removes the 2*Pi periodicity without using any integer arithmetic.
+     * First check if y had the value 2 or 3, set csign if true.
+     */
+    q       = gmx_simd_sub_f(q, half);
+    /* If we have logical operations we can work directly on the signbit, which
+     * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
+     * Thus, if you are altering defines to debug alternative code paths, the
+     * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
+     * active or inactive - you will get errors if only one is used.
+     */
+#    ifdef GMX_SIMD_HAVE_LOGICAL
+    ssign   = gmx_simd_and_f(ssign, gmx_simd_set1_f(-0.0f));
+    csign   = gmx_simd_andnot_f(q, gmx_simd_set1_f(-0.0f));
+    ssign   = gmx_simd_xor_f(ssign, csign);
+#    else
+    csign   = gmx_simd_xor_sign_f(gmx_simd_set1_f(-1.0f), q);
+    // ALT: csign = gmx_simd_fneg_f(gmx_simd_copysign(gmx_simd_set1_f(1.0),q));
+
+    ssign   = gmx_simd_xor_sign_f(ssign, csign);    /* swap ssign if csign was set. */
+#    endif
+    /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
+    m1      = gmx_simd_cmplt_f(q, minusquarter);
+    m2      = gmx_simd_cmple_f(gmx_simd_setzero_f(), q);
+    m3      = gmx_simd_cmplt_f(q, quarter);
+    m2      = gmx_simd_and_fb(m2, m3);
+    mask    = gmx_simd_or_fb(m1, m2);
+    /* where mask is FALSE, set sign. */
+    csign   = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask));
+#endif
+    /* Extended-precision argument reduction: x -= y*Pi/2, one term at a time */
+    x       = gmx_simd_fnmadd_f(y, argred0, x);
+    x       = gmx_simd_fnmadd_f(y, argred1, x);
+    x       = gmx_simd_fnmadd_f(y, argred2, x);
+    x       = gmx_simd_fnmadd_f(y, argred3, x);
+    x2      = gmx_simd_mul_f(x, x);
+
+    /* Polynomial approximations of sin/cos on the reduced interval */
+    psin    = gmx_simd_fmadd_f(const_sin2, x2, const_sin1);
+    psin    = gmx_simd_fmadd_f(psin, x2, const_sin0);
+    psin    = gmx_simd_fmadd_f(psin, gmx_simd_mul_f(x, x2), x);
+    pcos    = gmx_simd_fmadd_f(const_cos2, x2, const_cos1);
+    pcos    = gmx_simd_fmadd_f(pcos, x2, const_cos0);
+    pcos    = gmx_simd_fmsub_f(pcos, x2, half);
+    pcos    = gmx_simd_fmadd_f(pcos, x2, one);
+
+    /* Swap sin/cos result depending on quadrant, then apply the signs */
+    sss     = gmx_simd_blendv_f(pcos, psin, mask);
+    ccc     = gmx_simd_blendv_f(psin, pcos, mask);
+    /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    *sinval = gmx_simd_xor_f(sss, ssign);
+    *cosval = gmx_simd_xor_f(ccc, csign);
+#else
+    *sinval = gmx_simd_xor_sign_f(sss, ssign);
+    *cosval = gmx_simd_xor_sign_f(ccc, csign);
+#endif
+}
+
+/*! \brief SIMD float sin(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_sin_r.
+ *
+ * \param x The argument to evaluate sin for
+ * \result Sin(x)
+ *
+ * \attention Do NOT call both sin & cos if you need both results, since each of them
+ * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_sin_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t s, c;
+    /* Thin wrapper: compute both sin and cos, discard the cosine */
+    gmx_simd_sincos_f(x, &s, &c);
+    return s;
+}
+
+/*! \brief SIMD float cos(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_cos_r.
+ *
+ * \param x The argument to evaluate cos for
+ * \result Cos(x)
+ *
+ * \attention Do NOT call both sin & cos if you need both results, since each of them
+ * will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_cos_f(gmx_simd_float_t x)
+{
+    gmx_simd_float_t s, c;
+    /* Thin wrapper: compute both sin and cos, discard the sine */
+    gmx_simd_sincos_f(x, &s, &c);
+    return c;
+}
+
+/*! \brief SIMD float tan(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_tan_r.
+ *
+ * \param x The argument to evaluate tan for
+ * \result Tan(x)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_tan_f(gmx_simd_float_t x)
+{
+    /* Extended-precision representation of Pi/2 for the argument reduction */
+    const gmx_simd_float_t  argred0         = gmx_simd_set1_f(1.5703125);
+    const gmx_simd_float_t  argred1         = gmx_simd_set1_f(4.83751296997070312500e-04f);
+    const gmx_simd_float_t  argred2         = gmx_simd_set1_f(7.54953362047672271729e-08f);
+    const gmx_simd_float_t  argred3         = gmx_simd_set1_f(2.56334406825708960298e-12f);
+    const gmx_simd_float_t  two_over_pi     = gmx_simd_set1_f(2.0f/M_PI);
+    const gmx_simd_float_t  CT6             = gmx_simd_set1_f(0.009498288995810566122993911);
+    const gmx_simd_float_t  CT5             = gmx_simd_set1_f(0.002895755790837379295226923);
+    const gmx_simd_float_t  CT4             = gmx_simd_set1_f(0.02460087336161924491836265);
+    const gmx_simd_float_t  CT3             = gmx_simd_set1_f(0.05334912882656359828045988);
+    const gmx_simd_float_t  CT2             = gmx_simd_set1_f(0.1333989091464957704418495);
+    const gmx_simd_float_t  CT1             = gmx_simd_set1_f(0.3333307599244198227797507);
+
+    gmx_simd_float_t        x2, p, y, z;
+    gmx_simd_fbool_t        mask;
+
+#if (defined GMX_SIMD_HAVE_FINT32) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+    gmx_simd_fint32_t  iy;
+    gmx_simd_fint32_t  ione = gmx_simd_set1_fi(1);
+
+    /* Integer path: mask marks odd quadrants of round(x*2/Pi) */
+    z       = gmx_simd_mul_f(x, two_over_pi);
+    iy      = gmx_simd_cvt_f2i(z);
+    y       = gmx_simd_round_f(z);
+    mask    = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione));
+
+    x       = gmx_simd_fnmadd_f(y, argred0, x);
+    x       = gmx_simd_fnmadd_f(y, argred1, x);
+    x       = gmx_simd_fnmadd_f(y, argred2, x);
+    x       = gmx_simd_fnmadd_f(y, argred3, x);
+    /* Negate the reduced argument in odd quadrants (sign folded via xor) */
+    x       = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(-0.0f), mask), x);
+#else
+    const gmx_simd_float_t  quarter         = gmx_simd_set1_f(0.25f);
+    const gmx_simd_float_t  half            = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t  threequarter    = gmx_simd_set1_f(0.75f);
+    gmx_simd_float_t        w, q;
+    gmx_simd_fbool_t        m1, m2, m3;
+
+    /* Floating-point-only quadrant detection; see gmx_simd_sincos_f for rationale */
+    w       = gmx_simd_fabs_f(x);
+    z       = gmx_simd_fmadd_f(w, two_over_pi, half);
+    y       = gmx_simd_trunc_f(z);
+    q       = gmx_simd_mul_f(z, quarter);
+    q       = gmx_simd_sub_f(q, gmx_simd_trunc_f(q));
+    m1      = gmx_simd_cmple_f(quarter, q);
+    m2      = gmx_simd_cmplt_f(q, half);
+    m3      = gmx_simd_cmple_f(threequarter, q);
+    m1      = gmx_simd_and_fb(m1, m2);
+    mask    = gmx_simd_or_fb(m1, m3);
+    w       = gmx_simd_fnmadd_f(y, argred0, w);
+    w       = gmx_simd_fnmadd_f(y, argred1, w);
+    w       = gmx_simd_fnmadd_f(y, argred2, w);
+    w       = gmx_simd_fnmadd_f(y, argred3, w);
+
+    w       = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask);
+    x       = gmx_simd_xor_sign_f(w, x);
+#endif
+    /* Odd polynomial approximation of tan on the reduced interval */
+    x2      = gmx_simd_mul_f(x, x);
+    p       = gmx_simd_fmadd_f(CT6, x2, CT5);
+    p       = gmx_simd_fmadd_f(p, x2, CT4);
+    p       = gmx_simd_fmadd_f(p, x2, CT3);
+    p       = gmx_simd_fmadd_f(p, x2, CT2);
+    p       = gmx_simd_fmadd_f(p, x2, CT1);
+    p       = gmx_simd_fmadd_f(x2, gmx_simd_mul_f(p, x), x);
+
+    /* In the masked (odd) quadrants tan(x) is the reciprocal of the reduced result */
+    p       = gmx_simd_blendv_f( p, gmx_simd_inv_f(p), mask);
+    return p;
+}
+
+/*! \brief SIMD float asin(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_asin_r.
+ *
+ * \param x The argument to evaluate asin for
+ * \result Asin(x)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_asin_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t limitlow  = gmx_simd_set1_f(1e-4f);
+    const gmx_simd_float_t half      = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t halfpi    = gmx_simd_set1_f((float)M_PI/2.0f);
+    const gmx_simd_float_t CC5       = gmx_simd_set1_f(4.2163199048E-2f);
+    const gmx_simd_float_t CC4       = gmx_simd_set1_f(2.4181311049E-2f);
+    const gmx_simd_float_t CC3       = gmx_simd_set1_f(4.5470025998E-2f);
+    const gmx_simd_float_t CC2       = gmx_simd_set1_f(7.4953002686E-2f);
+    const gmx_simd_float_t CC1       = gmx_simd_set1_f(1.6666752422E-1f);
+    gmx_simd_float_t       xabs;
+    gmx_simd_float_t       z, z1, z2, q, q1, q2;
+    gmx_simd_float_t       pA, pB;
+    gmx_simd_fbool_t       mask;
+
+    /* For |x|>0.5 the series converges poorly; switch to the transformed
+     * argument z1=(1-|x|)/2 and its square root instead. */
+    xabs  = gmx_simd_fabs_f(x);
+    mask  = gmx_simd_cmplt_f(half, xabs);
+    z1    = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
+    /* sqrt(z1) computed as z1*invsqrt(z1) */
+    q1    = gmx_simd_mul_f(z1, gmx_simd_invsqrt_f(z1));
+    /* Force q1 to 0 where |x|==1 (z1==0), avoiding 0*Inf = NaN */
+    q1    = gmx_simd_blendnotzero_f(q1, gmx_simd_cmpeq_f(xabs, one));
+    q2    = xabs;
+    z2    = gmx_simd_mul_f(q2, q2);
+    z     = gmx_simd_blendv_f(z2, z1, mask);
+    q     = gmx_simd_blendv_f(q2, q1, mask);
+
+    /* Polynomial expansion, evaluated as two interleaved even/odd parts */
+    z2    = gmx_simd_mul_f(z, z);
+    pA    = gmx_simd_fmadd_f(CC5, z2, CC3);
+    pB    = gmx_simd_fmadd_f(CC4, z2, CC2);
+    pA    = gmx_simd_fmadd_f(pA, z2, CC1);
+    pA    = gmx_simd_mul_f(pA, z);
+    z     = gmx_simd_fmadd_f(pB, z2, pA);
+    z     = gmx_simd_fmadd_f(z, q, q);
+    /* Large-argument branch: asin(|x|) = Pi/2 - 2*asin(sqrt((1-|x|)/2)) */
+    q2    = gmx_simd_sub_f(halfpi, z);
+    q2    = gmx_simd_sub_f(q2, z);
+    z     = gmx_simd_blendv_f(z, q2, mask);
+
+    /* For very small |x| simply return the argument (asin(x) ~= x) */
+    mask  = gmx_simd_cmplt_f(limitlow, xabs);
+    z     = gmx_simd_blendv_f( xabs, z, mask );
+    z     = gmx_simd_xor_sign_f(z, x);
+
+    return z;
+}
+
+/*! \brief SIMD float acos(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_acos_r.
+ *
+ * \param x The argument to evaluate acos for
+ * \result Acos(x)
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_acos_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t one       = gmx_simd_set1_f(1.0f);
+    const gmx_simd_float_t half      = gmx_simd_set1_f(0.5f);
+    const gmx_simd_float_t pi        = gmx_simd_set1_f((float)M_PI);
+    const gmx_simd_float_t halfpi    = gmx_simd_set1_f((float)M_PI/2.0f);
+    gmx_simd_float_t       xabs;
+    gmx_simd_float_t       z, z1, z2, z3;
+    gmx_simd_fbool_t       mask1, mask2;
+
+    xabs  = gmx_simd_cmplt_f(half, xabs) ? xabs : xabs; /* placeholder */
+    xabs  = gmx_simd_fabs_f(x);
+    mask1 = gmx_simd_cmplt_f(half, xabs);
+    mask2 = gmx_simd_cmplt_f(gmx_simd_setzero_f(), x);
+
+    /* For |x|>0.5 transform to sqrt((1-|x|)/2); sqrt computed as z*invsqrt(z) */
+    z     = gmx_simd_mul_f(half, gmx_simd_sub_f(one, xabs));
+    z     = gmx_simd_mul_f(z, gmx_simd_invsqrt_f(z));
+    /* Force 0 where |x|==1 (z==0), avoiding 0*Inf = NaN */
+    z     = gmx_simd_blendnotzero_f(z, gmx_simd_cmpeq_f(xabs, one));
+    z     = gmx_simd_blendv_f(x, z, mask1);
+    z     = gmx_simd_asin_f(z);
+
+    /* Combine the asin result into acos per quadrant/magnitude case */
+    z2    = gmx_simd_add_f(z, z);
+    z1    = gmx_simd_sub_f(pi, z2);
+    z3    = gmx_simd_sub_f(halfpi, z);
+    z     = gmx_simd_blendv_f(z1, z2, mask2);
+    z     = gmx_simd_blendv_f(z3, z, mask1);
+
+    return z;
+}
+
+/*! \brief SIMD float atan(x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_atan_r.
+ *
+ * \param x The argument to evaluate atan for
+ * \result Atan(x), same argument/value range as standard math library.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_atan_f(gmx_simd_float_t x)
+{
+    const gmx_simd_float_t halfpi    = gmx_simd_set1_f(M_PI/2);
+    const gmx_simd_float_t CA17      = gmx_simd_set1_f(0.002823638962581753730774f);
+    const gmx_simd_float_t CA15      = gmx_simd_set1_f(-0.01595690287649631500244f);
+    const gmx_simd_float_t CA13      = gmx_simd_set1_f(0.04250498861074447631836f);
+    const gmx_simd_float_t CA11      = gmx_simd_set1_f(-0.07489009201526641845703f);
+    const gmx_simd_float_t CA9       = gmx_simd_set1_f(0.1063479334115982055664f);
+    const gmx_simd_float_t CA7       = gmx_simd_set1_f(-0.1420273631811141967773f);
+    const gmx_simd_float_t CA5       = gmx_simd_set1_f(0.1999269574880599975585f);
+    const gmx_simd_float_t CA3       = gmx_simd_set1_f(-0.3333310186862945556640f);
+    gmx_simd_float_t       x2, x3, x4, pA, pB;
+    gmx_simd_fbool_t       mask, mask2;
+
+    /* Work on |x|; remember the original sign and restore it at the end */
+    mask  = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    x     = gmx_simd_fabs_f(x);
+    /* For x>1 reduce via atan(x) = Pi/2 - atan(1/x) */
+    mask2 = gmx_simd_cmplt_f(gmx_simd_set1_f(1.0f), x);
+    x     = gmx_simd_blendv_f(x, gmx_simd_inv_f(x), mask2);
+
+    /* Odd polynomial, evaluated as two interleaved halves in powers of x^4 */
+    x2    = gmx_simd_mul_f(x, x);
+    x3    = gmx_simd_mul_f(x2, x);
+    x4    = gmx_simd_mul_f(x2, x2);
+    pA    = gmx_simd_fmadd_f(CA17, x4, CA13);
+    pB    = gmx_simd_fmadd_f(CA15, x4, CA11);
+    pA    = gmx_simd_fmadd_f(pA, x4, CA9);
+    pB    = gmx_simd_fmadd_f(pB, x4, CA7);
+    pA    = gmx_simd_fmadd_f(pA, x4, CA5);
+    pB    = gmx_simd_fmadd_f(pB, x4, CA3);
+    pA    = gmx_simd_fmadd_f(pA, x2, pB);
+    pA    = gmx_simd_fmadd_f(pA, x3, x);
+
+    /* Undo the range reduction and the sign handling */
+    pA    = gmx_simd_blendv_f(pA, gmx_simd_sub_f(halfpi, pA), mask2);
+    pA    = gmx_simd_blendv_f(pA, gmx_simd_fneg_f(pA), mask);
+
+    return pA;
+}
+
+/*! \brief SIMD float atan2(y,x).
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_atan2_r.
+ *
+ * \param y Y component of vector, any quadrant
+ * \param x X component of vector, any quadrant
+ * \result Atan(y,x), same argument/value range as standard math library.
+ *
+ * \note This routine should provide correct results for all finite
+ * non-zero or positive-zero arguments. However, negative zero arguments will
+ * be treated as positive zero, which means the return value will deviate from
+ * the standard math library atan2(y,x) for those cases. That should not be
+ * of any concern in Gromacs, and in particular it will not affect calculations
+ * of angles from vectors.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
+{
+    const gmx_simd_float_t pi          = gmx_simd_set1_f(M_PI);
+    const gmx_simd_float_t halfpi      = gmx_simd_set1_f(M_PI/2.0);
+    gmx_simd_float_t       xinv, p, aoffset;
+    gmx_simd_fbool_t       mask_x0, mask_y0, mask_xlt0, mask_ylt0;
+
+    mask_x0   = gmx_simd_cmpeq_f(x, gmx_simd_setzero_f());
+    mask_y0   = gmx_simd_cmpeq_f(y, gmx_simd_setzero_f());
+    mask_xlt0 = gmx_simd_cmplt_f(x, gmx_simd_setzero_f());
+    mask_ylt0 = gmx_simd_cmplt_f(y, gmx_simd_setzero_f());
+
+    /* Quadrant offset: +-Pi/2 on the y axis, +-Pi in the left half-plane,
+     * and 0 when both arguments are zero. */
+    aoffset   = gmx_simd_blendzero_f(halfpi, mask_x0);
+    aoffset   = gmx_simd_blendnotzero_f(aoffset, mask_y0);
+
+    aoffset   = gmx_simd_blendv_f(aoffset, pi, mask_xlt0);
+    aoffset   = gmx_simd_blendv_f(aoffset, gmx_simd_fneg_f(aoffset), mask_ylt0);
+
+    /* Avoid division by zero: force xinv to 0 where x==0 */
+    xinv      = gmx_simd_blendnotzero_f(gmx_simd_inv_f(x), mask_x0);
+    p         = gmx_simd_mul_f(y, xinv);
+    p         = gmx_simd_atan_f(p);
+    p         = gmx_simd_add_f(p, aoffset);
+
+    return p;
+}
+
+/*! \brief Calculate the force correction due to PME analytically in SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_pmecorrF_r.
+ *
+ * \param z2 \f$(r \beta)^2\f$ - see below for details.
+ * \result Correction factor to coulomb force - see below for details.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be \f$ \mbox{erfc}(\beta r)/r\f$, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approximate accurately for intermediate to large arguments, and
+ * this happens already in ranges of \f$(\beta r)\f$ that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simplify things slightly by noting that the PME part is really
+ * a correction to the normal Coulomb force since \f$\mbox{erfc}(z)=1-\mbox{erf}(z)\f$, i.e.
+ * \f[
+ * V = \frac{1}{r} - \frac{\mbox{erf}(\beta r)}{r}
+ * \f]
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument \f$\beta r\f$ will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved \f$\frac{\mbox{erf}(z)}{z}\f$ is
+ * in this range!
+ *
+ * We approximate \f$f(z)=\mbox{erf}(z)/z\f$ with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate \f$f(z)/z\f$ and
+ * then only use even powers. This is another minor optimization, since
+ * we actually \a want \f$f(z)/z\f$, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate \f$r^2\f$.
+ * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=(\beta r)^2\f$.
+ * 3. Evaluate this routine with \f$z^2\f$ as the argument.
+ * 4. The return value is the expression:
+ *
+ * \f[
+ *   \frac{2 \exp(-z^2)}{\sqrt{\pi} z^2}-\frac{\mbox{erf}(z)}{z^3}
+ * \f]
+ *
+ * 5. Multiply the entire expression by \f$\beta^3\f$. This will get you
+ *
+ *  \f[
+ *    \frac{2 \beta^3 \exp(-z^2)}{\sqrt{\pi} z^2} - \frac{\beta^3 \mbox{erf}(z)}{z^3}
+ *  \f]
+ *
+ *    or, switching back to \f$r\f$ (since \f$z=r \beta\f$):
+ *
+ *   \f[
+ *      \frac{2 \beta \exp(-r^2 \beta^2)}{\sqrt{\pi} r^2} - \frac{\mbox{erf}(r \beta)}{r^3}
+ *   \f]
+ *
+ *    With a bit of math exercise you should be able to confirm that
+ *    this is exactly
+ *
+ *   \f[
+ *    \frac{\frac{d}{dr}\left( \frac{\mbox{erf}(\beta r)}{r} \right)}{r}
+ *   \f]
+ *
+ * 6. Add the result to \f$r^{-3}\f$, multiply by the product of the charges,
+ *    and you have your force (divided by \f$r\f$). A final multiplication
+ *    with the vector connecting the two particles and you have your
+ *    vectorial force to add to the particles.
+ *
+ * This approximation achieves an accuracy slightly lower than 1e-6; when
+ * added to \f$1/r\f$ the error will be insignificant.
+ *
+ */
+static gmx_simd_float_t
+gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
+{
+    /* Numerator polynomial coefficients of the rational approximation */
+    const gmx_simd_float_t  FN6      = gmx_simd_set1_f(-1.7357322914161492954e-8f);
+    const gmx_simd_float_t  FN5      = gmx_simd_set1_f(1.4703624142580877519e-6f);
+    const gmx_simd_float_t  FN4      = gmx_simd_set1_f(-0.000053401640219807709149f);
+    const gmx_simd_float_t  FN3      = gmx_simd_set1_f(0.0010054721316683106153f);
+    const gmx_simd_float_t  FN2      = gmx_simd_set1_f(-0.019278317264888380590f);
+    const gmx_simd_float_t  FN1      = gmx_simd_set1_f(0.069670166153766424023f);
+    const gmx_simd_float_t  FN0      = gmx_simd_set1_f(-0.75225204789749321333f);
+
+    /* Denominator polynomial coefficients of the rational approximation */
+    const gmx_simd_float_t  FD4      = gmx_simd_set1_f(0.0011193462567257629232f);
+    const gmx_simd_float_t  FD3      = gmx_simd_set1_f(0.014866955030185295499f);
+    const gmx_simd_float_t  FD2      = gmx_simd_set1_f(0.11583842382862377919f);
+    const gmx_simd_float_t  FD1      = gmx_simd_set1_f(0.50736591960530292870f);
+    const gmx_simd_float_t  FD0      = gmx_simd_set1_f(1.0f);
+
+    gmx_simd_float_t        z4;
+    gmx_simd_float_t        polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4             = gmx_simd_mul_f(z2, z2);
+
+    /* Both polynomials are evaluated as interleaved even/odd halves in z^4 */
+    polyFD0        = gmx_simd_fmadd_f(FD4, z4, FD2);
+    polyFD1        = gmx_simd_fmadd_f(FD3, z4, FD1);
+    polyFD0        = gmx_simd_fmadd_f(polyFD0, z4, FD0);
+    polyFD0        = gmx_simd_fmadd_f(polyFD1, z2, polyFD0);
+
+    polyFD0        = gmx_simd_inv_f(polyFD0);
+
+    polyFN0        = gmx_simd_fmadd_f(FN6, z4, FN4);
+    polyFN1        = gmx_simd_fmadd_f(FN5, z4, FN3);
+    polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN2);
+    polyFN1        = gmx_simd_fmadd_f(polyFN1, z4, FN1);
+    polyFN0        = gmx_simd_fmadd_f(polyFN0, z4, FN0);
+    polyFN0        = gmx_simd_fmadd_f(polyFN1, z2, polyFN0);
+
+    return gmx_simd_mul_f(polyFN0, polyFD0);
+}
+
+
+
+/*! \brief Calculate the potential correction due to PME analytically in SIMD float.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_pmecorrV_r.
+ *
+ * \param z2 \f$(r \beta)^2\f$ - see below for details.
+ * \result Correction factor to coulomb potential - see below for details.
+ *
+ * See \ref gmx_simd_pmecorrF_f for details about the approximation.
+ *
+ * This routine calculates \f$\mbox{erf}(z)/z\f$, although you should provide \f$z^2\f$
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate \f$r^2\f$.
+ * 2. Multiply by \f$\beta^2\f$, so you get \f$z^2=\beta^2*r^2\f$.
+ * 3. Evaluate this routine with \f$z^2\f$ as the argument.
+ * 4. The return value is the expression:
+ *
+ *  \f[
+ *   \frac{\mbox{erf}(z)}{z}
+ *  \f]
+ *
+ * 5. Multiply the entire expression by \f$\beta\f$ and switch back to \f$r\f$ (since \f$z=r \beta\f$):
+ *
+ *  \f[
+ *    \frac{\mbox{erf}(r \beta)}{r}
+ *  \f]
+ *
+ * 6. Subtract the result from \f$1/r\f$, multiply by the product of the charges,
+ *    and you have your potential.
+ *
+ * This approximation achieves an accuracy slightly lower than 1e-6; when
+ * added to \f$1/r\f$ the error will be insignificant.
+ */
+static gmx_simd_float_t
+gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
+{
+    /* Numerator polynomial coefficients of the rational approximation */
+    const gmx_simd_float_t  VN6      = gmx_simd_set1_f(1.9296833005951166339e-8f);
+    const gmx_simd_float_t  VN5      = gmx_simd_set1_f(-1.4213390571557850962e-6f);
+    const gmx_simd_float_t  VN4      = gmx_simd_set1_f(0.000041603292906656984871f);
+    const gmx_simd_float_t  VN3      = gmx_simd_set1_f(-0.00013134036773265025626f);
+    const gmx_simd_float_t  VN2      = gmx_simd_set1_f(0.038657983986041781264f);
+    const gmx_simd_float_t  VN1      = gmx_simd_set1_f(0.11285044772717598220f);
+    const gmx_simd_float_t  VN0      = gmx_simd_set1_f(1.1283802385263030286f);
+
+    /* Denominator polynomial coefficients of the rational approximation */
+    const gmx_simd_float_t  VD3      = gmx_simd_set1_f(0.0066752224023576045451f);
+    const gmx_simd_float_t  VD2      = gmx_simd_set1_f(0.078647795836373922256f);
+    const gmx_simd_float_t  VD1      = gmx_simd_set1_f(0.43336185284710920150f);
+    const gmx_simd_float_t  VD0      = gmx_simd_set1_f(1.0f);
+
+    gmx_simd_float_t        z4;
+    gmx_simd_float_t        polyVN0, polyVN1, polyVD0, polyVD1;
+
+    z4             = gmx_simd_mul_f(z2, z2);
+
+    /* Both polynomials are evaluated as interleaved even/odd halves in z^4 */
+    polyVD1        = gmx_simd_fmadd_f(VD3, z4, VD1);
+    polyVD0        = gmx_simd_fmadd_f(VD2, z4, VD0);
+    polyVD0        = gmx_simd_fmadd_f(polyVD1, z2, polyVD0);
+
+    polyVD0        = gmx_simd_inv_f(polyVD0);
+
+    polyVN0        = gmx_simd_fmadd_f(VN6, z4, VN4);
+    polyVN1        = gmx_simd_fmadd_f(VN5, z4, VN3);
+    polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN2);
+    polyVN1        = gmx_simd_fmadd_f(polyVN1, z4, VN1);
+    polyVN0        = gmx_simd_fmadd_f(polyVN0, z4, VN0);
+    polyVN0        = gmx_simd_fmadd_f(polyVN1, z2, polyVN0);
+
+    return gmx_simd_mul_f(polyVN0, polyVD0);
+}
+#endif
+
+/*! \} */
+
+#ifdef GMX_SIMD_HAVE_DOUBLE
+
+/*! \name Double precision SIMD math functions
+ *
+ * \note In most cases you should use the real-precision functions instead.
+ * \{
+ */
+
+/****************************************
+ * DOUBLE PRECISION SIMD MATH FUNCTIONS *
+ ****************************************/
+
+/*! \brief SIMD utility function to sum a+b+c+d for SIMD doubles.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
+                gmx_simd_double_t c, gmx_simd_double_t d)
+{
+    /* Pairwise sums keep the two inner additions independent */
+    return gmx_simd_add_d(gmx_simd_add_d(a, b), gmx_simd_add_d(c, d));
+}
+
+/*! \brief Return -a if b is negative, SIMD double.
+ *
+ * You should normally call the real-precision routine \ref gmx_simd_xor_sign_r.
+ *
+ * \param a Values to set sign for
+ * \param b Values used to set sign
+ * \return if b is negative, the sign of a will be changed.
+ *
+ * This is equivalent to doing an xor operation on a with the sign bit of b,
+ * with the exception that negative zero is not considered to be negative
+ * on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
+{
+#ifdef GMX_SIMD_HAVE_LOGICAL
+    /* Fast path: xor a with the sign bit of b */
+    return gmx_simd_xor_d(a, gmx_simd_and_d(gmx_simd_set1_d(-0.0), b));
+#else
+    /* Fallback without FP logical ops: blend between a and -a on b<0 */
+    return gmx_simd_blendv_d(a, gmx_simd_fneg_d(a), gmx_simd_cmplt_d(b, gmx_simd_setzero_d()));
+#endif
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
+{
+    /* Both branches compute the Newton step lu' = 0.5*lu*(3 - x*lu*lu),
+     * just arranged differently depending on FMA availability. */
+#ifdef GMX_SIMD_HAVE_FMA
+    return gmx_simd_fmadd_d(gmx_simd_fnmadd_d(x, gmx_simd_mul_d(lu, lu), gmx_simd_set1_d(1.0)), gmx_simd_mul_d(lu, gmx_simd_set1_d(0.5)), lu);
+#else
+    return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
+#endif
+}
+
+
+/*! \brief Calculate 1/sqrt(x) for SIMD double
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_invsqrt_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
+    /* Each Newton-Raphson iteration doubles the accurate bits; the number of
+     * iterations compiled in depends on the hardware lookup accuracy. */
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rsqrt_iter_d(lu, x);
+#endif
+    return lu;
+}
+
+/*! \brief Calculate 1/sqrt(x) for two SIMD doubles.
+ *
+ * \copydetails gmx_simd_invsqrt_pair_f
+ */
+static gmx_inline void
+gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
+                        gmx_simd_double_t *out0, gmx_simd_double_t *out1)
+{
+#if (defined GMX_SIMD_HAVE_FLOAT) && (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH) && (GMX_SIMD_RSQRT_BITS < 22)
+    /* Mixed-precision path: pack both doubles into one float vector, iterate
+     * cheaply in single precision, then finish the last step(s) in double. */
+    gmx_simd_float_t  xf  = gmx_simd_cvt_dd2f(x0, x1);
+    gmx_simd_float_t  luf = gmx_simd_rsqrt_f(xf);
+    gmx_simd_double_t lu0, lu1;
+    /* Intermediate target is single - mantissa+1 bits */
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+    luf = gmx_simd_rsqrt_iter_f(luf, xf);
+#endif
+    gmx_simd_cvt_f2dd(luf, &lu0, &lu1);
+    /* Last iteration(s) performed in double - if we had 22 bits, this gets us to 44 (~1e-15) */
+#if (GMX_SIMD_MATH_TARGET_SINGLE_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
+    lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
+#endif
+#if (GMX_SIMD_MATH_TARGET_SINGLE_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu0 = gmx_simd_rsqrt_iter_d(lu0, x0);
+    lu1 = gmx_simd_rsqrt_iter_d(lu1, x1);
+#endif
+    *out0 = lu0;
+    *out1 = lu1;
+#else
+    /* No suitable float path: perform two independent double evaluations */
+    *out0 = gmx_simd_invsqrt_d(x0);
+    *out1 = gmx_simd_invsqrt_d(x1);
+#endif
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
+ *
+ * \copydetails gmx_simd_rcp_iter_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
+{
+    /* Newton step for the reciprocal: lu' = lu*(2 - x*lu) */
+    return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
+}
+
+/*! \brief Calculate 1/x for SIMD double.
+ *
+ * \copydetails gmx_simd_inv_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_inv_d(gmx_simd_double_t x)
+{
+    gmx_simd_double_t lu = gmx_simd_rcp_d(x);
+    /* Each Newton-Raphson iteration doubles the accurate bits; the number of
+     * iterations compiled in depends on the hardware lookup accuracy. */
+#if (GMX_SIMD_RCP_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rcp_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rcp_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rcp_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RCP_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+    lu = gmx_simd_rcp_iter_d(lu, x);
+#endif
+    return lu;
+}
+
+/*! \brief Calculate sqrt(x) correctly for SIMD doubles, including argument 0.0.
+ *
+ * \copydetails gmx_simd_sqrt_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sqrt_d(gmx_simd_double_t x)
+{
+    gmx_simd_dbool_t   mask;
+    gmx_simd_double_t  res;
+
+    /* sqrt(x) = x*invsqrt(x); zero the invsqrt result where x==0 so that
+     * the final product is 0 rather than 0*Inf = NaN. */
+    mask = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
+    res  = gmx_simd_blendnotzero_d(gmx_simd_invsqrt_d(x), mask);
+    return gmx_simd_mul_d(res, x);
+}
+
+/*! \brief SIMD double log(x). This is the natural logarithm.
+ *
+ * \copydetails gmx_simd_log_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_log_d(gmx_simd_double_t x)
+{
+    const gmx_simd_double_t  half       = gmx_simd_set1_d(0.5);
+    const gmx_simd_double_t  one        = gmx_simd_set1_d(1.0);
+    const gmx_simd_double_t  sqrt2      = gmx_simd_set1_d(sqrt(2.0));
+    /* corr = ln(2), multiplies the binary exponent below */
+    const gmx_simd_double_t  corr       = gmx_simd_set1_d(0.693147180559945286226764);
+    const gmx_simd_double_t  CL15       = gmx_simd_set1_d(0.148197055177935105296783);
+    const gmx_simd_double_t  CL13       = gmx_simd_set1_d(0.153108178020442575739679);
+    const gmx_simd_double_t  CL11       = gmx_simd_set1_d(0.181837339521549679055568);
+    const gmx_simd_double_t  CL9        = gmx_simd_set1_d(0.22222194152736701733275);
+    const gmx_simd_double_t  CL7        = gmx_simd_set1_d(0.285714288030134544449368);
+    const gmx_simd_double_t  CL5        = gmx_simd_set1_d(0.399999999989941956712869);
+    const gmx_simd_double_t  CL3        = gmx_simd_set1_d(0.666666666666685503450651);
+    const gmx_simd_double_t  CL1        = gmx_simd_set1_d(2.0);
+    gmx_simd_double_t        fexp, x2, p;
+    gmx_simd_dbool_t         mask;
+
+    /* Split x into exponent and mantissa: log(x) = log(mantissa) + fexp*ln(2) */
+    fexp   = gmx_simd_get_exponent_d(x);
+    x      = gmx_simd_get_mantissa_d(x);
+
+    mask   = gmx_simd_cmplt_d(sqrt2, x);
+    /* Adjust to non-IEEE format for x>sqrt(2): exponent += 1, mantissa *= 0.5 */
+    fexp   = gmx_simd_add_d(fexp, gmx_simd_blendzero_d(one, mask));
+    x      = gmx_simd_mul_d(x, gmx_simd_blendv_d(one, half, mask));
+
+    /* Series in t = (x-1)/(x+1); only odd powers contribute */
+    x      = gmx_simd_mul_d( gmx_simd_sub_d(x, one), gmx_simd_inv_d( gmx_simd_add_d(x, one) ) );
+    x2     = gmx_simd_mul_d(x, x);
+
+    p      = gmx_simd_fmadd_d(CL15, x2, CL13);
+    p      = gmx_simd_fmadd_d(p, x2, CL11);
+    p      = gmx_simd_fmadd_d(p, x2, CL9);
+    p      = gmx_simd_fmadd_d(p, x2, CL7);
+    p      = gmx_simd_fmadd_d(p, x2, CL5);
+    p      = gmx_simd_fmadd_d(p, x2, CL3);
+    p      = gmx_simd_fmadd_d(p, x2, CL1);
+    p      = gmx_simd_fmadd_d(p, x, gmx_simd_mul_d(corr, fexp));
+
+    return p;
+}
+
+/*! \brief SIMD double 2^x.
+ *
+ * \copydetails gmx_simd_exp2_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_exp2_d(gmx_simd_double_t x)
+{
+ /* Largest magnitude for which the result exponent is representable;
+  * lanes with |x| beyond this produce 0.0 (fexppart zeroed below).
+  */
+ const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
+ const gmx_simd_double_t CE11 = gmx_simd_set1_d(4.435280790452730022081181e-10);
+ const gmx_simd_double_t CE10 = gmx_simd_set1_d(7.074105630863314448024247e-09);
+ const gmx_simd_double_t CE9 = gmx_simd_set1_d(1.017819803432096698472621e-07);
+ const gmx_simd_double_t CE8 = gmx_simd_set1_d(1.321543308956718799557863e-06);
+ const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.00001525273348995851746990884);
+ const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.0001540353046251466849082632);
+ const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.001333355814678995257307880);
+ const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.009618129107588335039176502);
+ const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.05550410866481992147457793);
+ const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.2402265069591015620470894);
+ const gmx_simd_double_t CE1 = gmx_simd_set1_d(0.6931471805599453304615075);
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ gmx_simd_double_t fexppart;
+ gmx_simd_double_t intpart;
+ gmx_simd_double_t p;
+ gmx_simd_dbool_t valuemask;
+
+ /* Split x into an integer part (handled exactly by building 2^intpart as
+  * an IEEE exponent) and a fractional remainder handled by the polynomial.
+  */
+ fexppart = gmx_simd_set_exponent_d(x); /* rounds to nearest int internally */
+ intpart = gmx_simd_round_d(x); /* use same rounding mode here */
+ valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(x), arglimit);
+ fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
+ x = gmx_simd_sub_d(x, intpart);
+
+ /* Horner evaluation of 2^x for the reduced argument */
+ p = gmx_simd_fmadd_d(CE11, x, CE10);
+ p = gmx_simd_fmadd_d(p, x, CE9);
+ p = gmx_simd_fmadd_d(p, x, CE8);
+ p = gmx_simd_fmadd_d(p, x, CE7);
+ p = gmx_simd_fmadd_d(p, x, CE6);
+ p = gmx_simd_fmadd_d(p, x, CE5);
+ p = gmx_simd_fmadd_d(p, x, CE4);
+ p = gmx_simd_fmadd_d(p, x, CE3);
+ p = gmx_simd_fmadd_d(p, x, CE2);
+ p = gmx_simd_fmadd_d(p, x, CE1);
+ p = gmx_simd_fmadd_d(p, x, one);
+ /* Scale the polynomial result by 2^intpart */
+ x = gmx_simd_mul_d(p, fexppart);
+ return x;
+}
+
+/*! \brief SIMD double exp(x).
+ *
+ * \copydetails gmx_simd_exp_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_exp_d(gmx_simd_double_t x)
+{
+ /* argscale is 1/ln(2): exp(x) = 2^(x/ln(2)) */
+ const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
+ const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
+ /* ln(2) split into a high and a low part so the argument reduction below
+  * can be done in two exact/low-error steps (extended precision).
+  */
+ const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(0.69314718055966295651160180568695068359375);
+ const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(2.8235290563031577122588448175013436025525412068e-13);
+ const gmx_simd_double_t CE12 = gmx_simd_set1_d(2.078375306791423699350304e-09);
+ const gmx_simd_double_t CE11 = gmx_simd_set1_d(2.518173854179933105218635e-08);
+ const gmx_simd_double_t CE10 = gmx_simd_set1_d(2.755842049600488770111608e-07);
+ const gmx_simd_double_t CE9 = gmx_simd_set1_d(2.755691815216689746619849e-06);
+ const gmx_simd_double_t CE8 = gmx_simd_set1_d(2.480158383706245033920920e-05);
+ const gmx_simd_double_t CE7 = gmx_simd_set1_d(0.0001984127043518048611841321);
+ const gmx_simd_double_t CE6 = gmx_simd_set1_d(0.001388888889360258341755930);
+ const gmx_simd_double_t CE5 = gmx_simd_set1_d(0.008333333332907368102819109);
+ const gmx_simd_double_t CE4 = gmx_simd_set1_d(0.04166666666663836745814631);
+ const gmx_simd_double_t CE3 = gmx_simd_set1_d(0.1666666666666796929434570);
+ const gmx_simd_double_t CE2 = gmx_simd_set1_d(0.5);
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ gmx_simd_double_t fexppart;
+ gmx_simd_double_t intpart;
+ gmx_simd_double_t y, p;
+ gmx_simd_dbool_t valuemask;
+
+ /* Rescale to base 2; build 2^intpart directly as an IEEE exponent and
+  * zero it where |y| exceeds the representable exponent range.
+  */
+ y = gmx_simd_mul_d(x, argscale);
+ fexppart = gmx_simd_set_exponent_d(y); /* rounds to nearest int internally */
+ intpart = gmx_simd_round_d(y); /* use same rounding mode here */
+ valuemask = gmx_simd_cmple_d(gmx_simd_fabs_d(y), arglimit);
+ fexppart = gmx_simd_blendzero_d(fexppart, valuemask);
+
+ /* Extended precision arithmetics */
+ x = gmx_simd_fnmadd_d(invargscale0, intpart, x);
+ x = gmx_simd_fnmadd_d(invargscale1, intpart, x);
+
+ /* Horner evaluation of exp() for the reduced argument */
+ p = gmx_simd_fmadd_d(CE12, x, CE11);
+ p = gmx_simd_fmadd_d(p, x, CE10);
+ p = gmx_simd_fmadd_d(p, x, CE9);
+ p = gmx_simd_fmadd_d(p, x, CE8);
+ p = gmx_simd_fmadd_d(p, x, CE7);
+ p = gmx_simd_fmadd_d(p, x, CE6);
+ p = gmx_simd_fmadd_d(p, x, CE5);
+ p = gmx_simd_fmadd_d(p, x, CE4);
+ p = gmx_simd_fmadd_d(p, x, CE3);
+ p = gmx_simd_fmadd_d(p, x, CE2);
+ /* The 1+x terms are added last for accuracy */
+ p = gmx_simd_fmadd_d(p, gmx_simd_mul_d(x, x), gmx_simd_add_d(x, one));
+ x = gmx_simd_mul_d(p, fexppart);
+ return x;
+}
+
+/*! \brief SIMD double erf(x).
+ *
+ * \copydetails gmx_simd_erf_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_erf_d(gmx_simd_double_t x)
+{
+ /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
+ const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
+ const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
+ const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
+ const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
+ const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
+
+ const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
+ const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
+ const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
+ const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
+ const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
+ /* CAQ0 == 1.0 */
+ const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
+
+ /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
+ const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
+ const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
+ const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
+ const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
+ const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
+ const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
+ const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
+ const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
+ const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
+ const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
+ const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
+ const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
+ const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
+ const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
+ /* CBQ0 == 1.0 */
+
+ /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
+ const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
+ const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
+ const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
+ const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
+ const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
+ const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
+ const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
+
+ const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
+ const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
+ const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
+ const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
+ const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
+ const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
+ /* CCQ0 == 1.0 */
+ const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
+
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
+
+ gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
+ gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
+ gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
+ gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
+ gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
+ gmx_simd_double_t expmx2;
+ gmx_simd_dbool_t mask;
+
+ /* All three range approximations are evaluated for every lane; the final
+  * blends select the appropriate one per lane. Each numerator/denominator
+  * polynomial is evaluated as two independent chains (Poly*0/Poly*1) that
+  * are merged at the end.
+  */
+ /* Calculate erf() */
+ xabs = gmx_simd_fabs_d(x);
+ x2 = gmx_simd_mul_d(x, x);
+ x4 = gmx_simd_mul_d(x2, x2);
+
+ PolyAP0 = gmx_simd_mul_d(CAP4, x4);
+ PolyAP1 = gmx_simd_mul_d(CAP3, x4);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
+ PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
+ PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
+ PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
+
+ PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
+ PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
+ PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
+ PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
+ PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
+ PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
+ PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
+
+ /* erf(x) = x*(CAoffset + P(x^2)/Q(x^2)) */
+ res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
+ res_erf = gmx_simd_add_d(CAoffset, res_erf);
+ res_erf = gmx_simd_mul_d(x, res_erf);
+
+ /* Calculate erfc() in range [1,4.5] */
+ t = gmx_simd_sub_d(xabs, one);
+ t2 = gmx_simd_mul_d(t, t);
+
+ PolyBP0 = gmx_simd_mul_d(CBP6, t2);
+ PolyBP1 = gmx_simd_mul_d(CBP5, t2);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
+ PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
+ PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
+ PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
+ PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
+ PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
+ PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
+
+ PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
+ PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+ PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+ PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
+
+ res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
+
+ res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
+
+ /* Calculate erfc() in range [4.5,inf] */
+ w = gmx_simd_inv_d(xabs);
+ w2 = gmx_simd_mul_d(w, w);
+
+ PolyCP0 = gmx_simd_mul_d(CCP6, w2);
+ PolyCP1 = gmx_simd_mul_d(CCP5, w2);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
+ PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
+ PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
+ PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
+ PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
+ PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
+ PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
+
+ PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
+ PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
+ PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
+ PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
+ PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
+ PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
+ PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
+ PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
+
+ expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
+
+ res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
+ res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
+ res_erfcC = gmx_simd_mul_d(res_erfcC, w);
+
+ /* Pick the [1,4.5] or [4.5,inf] erfc form per lane */
+ mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
+ res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
+
+ res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
+
+ /* erfc(x<0) = 2-erfc(|x|) */
+ mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
+ res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
+
+ /* Select erf() or erfc() */
+ mask = gmx_simd_cmplt_d(xabs, one);
+ res = gmx_simd_blendv_d(gmx_simd_sub_d(one, res_erfc), res_erf, mask);
+
+ return res;
+}
+
+/*! \brief SIMD double erfc(x).
+ *
+ * \copydetails gmx_simd_erfc_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_erfc_d(gmx_simd_double_t x)
+{
+ /* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
+ const gmx_simd_double_t CAP4 = gmx_simd_set1_d(-0.431780540597889301512e-4);
+ const gmx_simd_double_t CAP3 = gmx_simd_set1_d(-0.00578562306260059236059);
+ const gmx_simd_double_t CAP2 = gmx_simd_set1_d(-0.028593586920219752446);
+ const gmx_simd_double_t CAP1 = gmx_simd_set1_d(-0.315924962948621698209);
+ const gmx_simd_double_t CAP0 = gmx_simd_set1_d(0.14952975608477029151);
+
+ const gmx_simd_double_t CAQ5 = gmx_simd_set1_d(-0.374089300177174709737e-5);
+ const gmx_simd_double_t CAQ4 = gmx_simd_set1_d(0.00015126584532155383535);
+ const gmx_simd_double_t CAQ3 = gmx_simd_set1_d(0.00536692680669480725423);
+ const gmx_simd_double_t CAQ2 = gmx_simd_set1_d(0.0668686825594046122636);
+ const gmx_simd_double_t CAQ1 = gmx_simd_set1_d(0.402604990869284362773);
+ /* CAQ0 == 1.0 */
+ const gmx_simd_double_t CAoffset = gmx_simd_set1_d(0.9788494110107421875);
+
+ /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)*x*(P(x-1)/Q(x-1)) in range [1.0,4.5] */
+ const gmx_simd_double_t CBP6 = gmx_simd_set1_d(2.49650423685462752497647637088e-10);
+ const gmx_simd_double_t CBP5 = gmx_simd_set1_d(0.00119770193298159629350136085658);
+ const gmx_simd_double_t CBP4 = gmx_simd_set1_d(0.0164944422378370965881008942733);
+ const gmx_simd_double_t CBP3 = gmx_simd_set1_d(0.0984581468691775932063932439252);
+ const gmx_simd_double_t CBP2 = gmx_simd_set1_d(0.317364595806937763843589437418);
+ const gmx_simd_double_t CBP1 = gmx_simd_set1_d(0.554167062641455850932670067075);
+ const gmx_simd_double_t CBP0 = gmx_simd_set1_d(0.427583576155807163756925301060);
+ const gmx_simd_double_t CBQ7 = gmx_simd_set1_d(0.00212288829699830145976198384930);
+ const gmx_simd_double_t CBQ6 = gmx_simd_set1_d(0.0334810979522685300554606393425);
+ const gmx_simd_double_t CBQ5 = gmx_simd_set1_d(0.2361713785181450957579508850717);
+ const gmx_simd_double_t CBQ4 = gmx_simd_set1_d(0.955364736493055670530981883072);
+ const gmx_simd_double_t CBQ3 = gmx_simd_set1_d(2.36815675631420037315349279199);
+ const gmx_simd_double_t CBQ2 = gmx_simd_set1_d(3.55261649184083035537184223542);
+ const gmx_simd_double_t CBQ1 = gmx_simd_set1_d(2.93501136050160872574376997993);
+ /* CBQ0 == 1.0 */
+
+ /* Coefficients for minimax approximation of erfc(x)=exp(-x^2)/x*(P(1/x)/Q(1/x)) in range [4.5,inf] */
+ const gmx_simd_double_t CCP6 = gmx_simd_set1_d(-2.8175401114513378771);
+ const gmx_simd_double_t CCP5 = gmx_simd_set1_d(-3.22729451764143718517);
+ const gmx_simd_double_t CCP4 = gmx_simd_set1_d(-2.5518551727311523996);
+ const gmx_simd_double_t CCP3 = gmx_simd_set1_d(-0.687717681153649930619);
+ const gmx_simd_double_t CCP2 = gmx_simd_set1_d(-0.212652252872804219852);
+ const gmx_simd_double_t CCP1 = gmx_simd_set1_d(0.0175389834052493308818);
+ const gmx_simd_double_t CCP0 = gmx_simd_set1_d(0.00628057170626964891937);
+
+ const gmx_simd_double_t CCQ6 = gmx_simd_set1_d(5.48409182238641741584);
+ const gmx_simd_double_t CCQ5 = gmx_simd_set1_d(13.5064170191802889145);
+ const gmx_simd_double_t CCQ4 = gmx_simd_set1_d(22.9367376522880577224);
+ const gmx_simd_double_t CCQ3 = gmx_simd_set1_d(15.930646027911794143);
+ const gmx_simd_double_t CCQ2 = gmx_simd_set1_d(11.0567237927800161565);
+ const gmx_simd_double_t CCQ1 = gmx_simd_set1_d(2.79257750980575282228);
+ /* CCQ0 == 1.0 */
+ const gmx_simd_double_t CCoffset = gmx_simd_set1_d(0.5579090118408203125);
+
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ const gmx_simd_double_t two = gmx_simd_set1_d(2.0);
+
+ gmx_simd_double_t xabs, x2, x4, t, t2, w, w2;
+ gmx_simd_double_t PolyAP0, PolyAP1, PolyAQ0, PolyAQ1;
+ gmx_simd_double_t PolyBP0, PolyBP1, PolyBQ0, PolyBQ1;
+ gmx_simd_double_t PolyCP0, PolyCP1, PolyCQ0, PolyCQ1;
+ gmx_simd_double_t res_erf, res_erfcB, res_erfcC, res_erfc, res;
+ gmx_simd_double_t expmx2;
+ gmx_simd_dbool_t mask;
+
+ /* Same structure as gmx_simd_erf_d(): all range approximations evaluated
+  * for every lane, with the final blend inverted to return erfc instead.
+  * Each polynomial is computed as two independent chains merged at the end.
+  */
+ /* Calculate erf() */
+ xabs = gmx_simd_fabs_d(x);
+ x2 = gmx_simd_mul_d(x, x);
+ x4 = gmx_simd_mul_d(x2, x2);
+
+ PolyAP0 = gmx_simd_mul_d(CAP4, x4);
+ PolyAP1 = gmx_simd_mul_d(CAP3, x4);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, CAP2);
+ PolyAP1 = gmx_simd_add_d(PolyAP1, CAP1);
+ PolyAP0 = gmx_simd_mul_d(PolyAP0, x4);
+ PolyAP1 = gmx_simd_mul_d(PolyAP1, x2);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, CAP0);
+ PolyAP0 = gmx_simd_add_d(PolyAP0, PolyAP1);
+
+ PolyAQ1 = gmx_simd_mul_d(CAQ5, x4);
+ PolyAQ0 = gmx_simd_mul_d(CAQ4, x4);
+ PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ3);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, CAQ2);
+ PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x4);
+ PolyAQ0 = gmx_simd_mul_d(PolyAQ0, x4);
+ PolyAQ1 = gmx_simd_add_d(PolyAQ1, CAQ1);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, one);
+ PolyAQ1 = gmx_simd_mul_d(PolyAQ1, x2);
+ PolyAQ0 = gmx_simd_add_d(PolyAQ0, PolyAQ1);
+
+ /* erf(x) = x*(CAoffset + P(x^2)/Q(x^2)) */
+ res_erf = gmx_simd_mul_d(PolyAP0, gmx_simd_inv_d(PolyAQ0));
+ res_erf = gmx_simd_add_d(CAoffset, res_erf);
+ res_erf = gmx_simd_mul_d(x, res_erf);
+
+ /* Calculate erfc() in range [1,4.5] */
+ t = gmx_simd_sub_d(xabs, one);
+ t2 = gmx_simd_mul_d(t, t);
+
+ PolyBP0 = gmx_simd_mul_d(CBP6, t2);
+ PolyBP1 = gmx_simd_mul_d(CBP5, t2);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP4);
+ PolyBP1 = gmx_simd_add_d(PolyBP1, CBP3);
+ PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
+ PolyBP1 = gmx_simd_mul_d(PolyBP1, t2);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP2);
+ PolyBP1 = gmx_simd_add_d(PolyBP1, CBP1);
+ PolyBP0 = gmx_simd_mul_d(PolyBP0, t2);
+ PolyBP1 = gmx_simd_mul_d(PolyBP1, t);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, CBP0);
+ PolyBP0 = gmx_simd_add_d(PolyBP0, PolyBP1);
+
+ PolyBQ1 = gmx_simd_mul_d(CBQ7, t2);
+ PolyBQ0 = gmx_simd_mul_d(CBQ6, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ5);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ4);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+ PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ3);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, CBQ2);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t2);
+ PolyBQ0 = gmx_simd_mul_d(PolyBQ0, t2);
+ PolyBQ1 = gmx_simd_add_d(PolyBQ1, CBQ1);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, one);
+ PolyBQ1 = gmx_simd_mul_d(PolyBQ1, t);
+ PolyBQ0 = gmx_simd_add_d(PolyBQ0, PolyBQ1);
+
+ res_erfcB = gmx_simd_mul_d(PolyBP0, gmx_simd_inv_d(PolyBQ0));
+
+ res_erfcB = gmx_simd_mul_d(res_erfcB, xabs);
+
+ /* Calculate erfc() in range [4.5,inf] */
+ w = gmx_simd_inv_d(xabs);
+ w2 = gmx_simd_mul_d(w, w);
+
+ PolyCP0 = gmx_simd_mul_d(CCP6, w2);
+ PolyCP1 = gmx_simd_mul_d(CCP5, w2);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP4);
+ PolyCP1 = gmx_simd_add_d(PolyCP1, CCP3);
+ PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
+ PolyCP1 = gmx_simd_mul_d(PolyCP1, w2);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP2);
+ PolyCP1 = gmx_simd_add_d(PolyCP1, CCP1);
+ PolyCP0 = gmx_simd_mul_d(PolyCP0, w2);
+ PolyCP1 = gmx_simd_mul_d(PolyCP1, w);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, CCP0);
+ PolyCP0 = gmx_simd_add_d(PolyCP0, PolyCP1);
+
+ PolyCQ0 = gmx_simd_mul_d(CCQ6, w2);
+ PolyCQ1 = gmx_simd_mul_d(CCQ5, w2);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ4);
+ PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ3);
+ PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
+ PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w2);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, CCQ2);
+ PolyCQ1 = gmx_simd_add_d(PolyCQ1, CCQ1);
+ PolyCQ0 = gmx_simd_mul_d(PolyCQ0, w2);
+ PolyCQ1 = gmx_simd_mul_d(PolyCQ1, w);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, one);
+ PolyCQ0 = gmx_simd_add_d(PolyCQ0, PolyCQ1);
+
+ expmx2 = gmx_simd_exp_d( gmx_simd_fneg_d(x2) );
+
+ res_erfcC = gmx_simd_mul_d(PolyCP0, gmx_simd_inv_d(PolyCQ0));
+ res_erfcC = gmx_simd_add_d(res_erfcC, CCoffset);
+ res_erfcC = gmx_simd_mul_d(res_erfcC, w);
+
+ /* Pick the [1,4.5] or [4.5,inf] erfc form per lane */
+ mask = gmx_simd_cmplt_d(gmx_simd_set1_d(4.5), xabs);
+ res_erfc = gmx_simd_blendv_d(res_erfcB, res_erfcC, mask);
+
+ res_erfc = gmx_simd_mul_d(res_erfc, expmx2);
+
+ /* erfc(x<0) = 2-erfc(|x|) */
+ mask = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
+ res_erfc = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(two, res_erfc), mask);
+
+ /* Select erf() or erfc() */
+ mask = gmx_simd_cmplt_d(xabs, one);
+ res = gmx_simd_blendv_d(res_erfc, gmx_simd_sub_d(one, res_erf), mask);
+
+ return res;
+}
+
+/*! \brief SIMD double sin \& cos.
+ *
+ * \copydetails gmx_simd_sincos_f
+ */
+static gmx_inline void
+gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
+{
+ /* Constants to subtract Pi/4*x from y while minimizing precision loss */
+ const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
+ const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10);
+ const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18);
+ const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27);
+ const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
+ const gmx_simd_double_t const_sin5 = gmx_simd_set1_d( 1.58938307283228937328511e-10);
+ const gmx_simd_double_t const_sin4 = gmx_simd_set1_d(-2.50506943502539773349318e-08);
+ const gmx_simd_double_t const_sin3 = gmx_simd_set1_d( 2.75573131776846360512547e-06);
+ const gmx_simd_double_t const_sin2 = gmx_simd_set1_d(-0.000198412698278911770864914);
+ const gmx_simd_double_t const_sin1 = gmx_simd_set1_d( 0.0083333333333191845961746);
+ const gmx_simd_double_t const_sin0 = gmx_simd_set1_d(-0.166666666666666130709393);
+
+ const gmx_simd_double_t const_cos7 = gmx_simd_set1_d(-1.13615350239097429531523e-11);
+ const gmx_simd_double_t const_cos6 = gmx_simd_set1_d( 2.08757471207040055479366e-09);
+ const gmx_simd_double_t const_cos5 = gmx_simd_set1_d(-2.75573144028847567498567e-07);
+ const gmx_simd_double_t const_cos4 = gmx_simd_set1_d( 2.48015872890001867311915e-05);
+ const gmx_simd_double_t const_cos3 = gmx_simd_set1_d(-0.00138888888888714019282329);
+ const gmx_simd_double_t const_cos2 = gmx_simd_set1_d( 0.0416666666666665519592062);
+ const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ gmx_simd_double_t ssign, csign;
+ gmx_simd_double_t x2, y, z, psin, pcos, sss, ccc;
+ gmx_simd_dbool_t mask;
+#if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+ /* Fast path: integer arithmetic available. The quadrant index is taken
+  * from the low bits of round(x*2/pi): bit 0 selects sin vs cos polynomial,
+  * bit 1 (and bit 1 of index+1 for cos) selects the sign.
+  */
+ const gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
+ const gmx_simd_dint32_t itwo = gmx_simd_set1_di(2);
+ gmx_simd_dint32_t iy;
+
+ z = gmx_simd_mul_d(x, two_over_pi);
+ iy = gmx_simd_cvt_d2i(z);
+ y = gmx_simd_round_d(z);
+
+ mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), gmx_simd_setzero_di()));
+ ssign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, itwo), itwo)));
+ csign = gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(gmx_simd_add_di(iy, ione), itwo), itwo)));
+#else
+ const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
+ const gmx_simd_double_t minusquarter = gmx_simd_set1_d(-0.25);
+ gmx_simd_double_t q;
+ gmx_simd_dbool_t m1, m2, m3;
+
+ /* The most obvious way to find the arguments quadrant in the unit circle
+  * to calculate the sign is to use integer arithmetic, but that is not
+  * present in all SIMD implementations. As an alternative, we have devised a
+  * pure floating-point algorithm that uses truncation for argument reduction
+  * so that we get a new value 0<=q<1 over the unit circle, and then
+  * do floating-point comparisons with fractions. This is likely to be
+  * slightly slower (~10%) due to the longer latencies of floating-point, so
+  * we only use it when integer SIMD arithmetic is not present.
+  */
+ ssign = x;
+ x = gmx_simd_fabs_d(x);
+ /* It is critical that half-way cases are rounded down */
+ z = gmx_simd_fmadd_d(x, two_over_pi, half);
+ y = gmx_simd_trunc_d(z);
+ q = gmx_simd_mul_d(z, quarter);
+ q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
+ /* z now starts at 0.0 for x=-pi/4 (although neg. values cannot occur), and
+  * then increased by 1.0 as x increases by 2*Pi, when it resets to 0.0.
+  * This removes the 2*Pi periodicity without using any integer arithmetic.
+  * First check if y had the value 2 or 3, set csign if true.
+  */
+ q = gmx_simd_sub_d(q, half);
+ /* If we have logical operations we can work directly on the signbit, which
+  * saves instructions. Otherwise we need to represent signs as +1.0/-1.0.
+  * Thus, if you are altering defines to debug alternative code paths, the
+  * two GMX_SIMD_HAVE_LOGICAL sections in this routine must either both be
+  * active or inactive - you will get errors if only one is used.
+  */
+# ifdef GMX_SIMD_HAVE_LOGICAL
+ ssign = gmx_simd_and_d(ssign, gmx_simd_set1_d(-0.0));
+ csign = gmx_simd_andnot_d(q, gmx_simd_set1_d(-0.0));
+ ssign = gmx_simd_xor_d(ssign, csign);
+# else
+ csign = gmx_simd_xor_sign_d(gmx_simd_set1_d(-1.0), q);
+ ssign = gmx_simd_xor_sign_d(ssign, csign); /* swap ssign if csign was set. */
+# endif
+ /* Check if y had value 1 or 3 (remember we subtracted 0.5 from q) */
+ m1 = gmx_simd_cmplt_d(q, minusquarter);
+ m2 = gmx_simd_cmple_d(gmx_simd_setzero_d(), q);
+ m3 = gmx_simd_cmplt_d(q, quarter);
+ m2 = gmx_simd_and_db(m2, m3);
+ mask = gmx_simd_or_db(m1, m2);
+ /* where mask is FALSE, set sign. */
+ csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask));
+#endif
+ /* Four-step extended-precision reduction of x to [-pi/4, pi/4] */
+ x = gmx_simd_fnmadd_d(y, argred0, x);
+ x = gmx_simd_fnmadd_d(y, argred1, x);
+ x = gmx_simd_fnmadd_d(y, argred2, x);
+ x = gmx_simd_fnmadd_d(y, argred3, x);
+ x2 = gmx_simd_mul_d(x, x);
+
+ psin = gmx_simd_fmadd_d(const_sin5, x2, const_sin4);
+ psin = gmx_simd_fmadd_d(psin, x2, const_sin3);
+ psin = gmx_simd_fmadd_d(psin, x2, const_sin2);
+ psin = gmx_simd_fmadd_d(psin, x2, const_sin1);
+ psin = gmx_simd_fmadd_d(psin, x2, const_sin0);
+ psin = gmx_simd_fmadd_d(psin, gmx_simd_mul_d(x2, x), x);
+
+ pcos = gmx_simd_fmadd_d(const_cos7, x2, const_cos6);
+ pcos = gmx_simd_fmadd_d(pcos, x2, const_cos5);
+ pcos = gmx_simd_fmadd_d(pcos, x2, const_cos4);
+ pcos = gmx_simd_fmadd_d(pcos, x2, const_cos3);
+ pcos = gmx_simd_fmadd_d(pcos, x2, const_cos2);
+ pcos = gmx_simd_fmsub_d(pcos, x2, half);
+ pcos = gmx_simd_fmadd_d(pcos, x2, one);
+
+ /* Swap sin/cos polynomial results per lane depending on quadrant */
+ sss = gmx_simd_blendv_d(pcos, psin, mask);
+ ccc = gmx_simd_blendv_d(psin, pcos, mask);
+ /* See comment for GMX_SIMD_HAVE_LOGICAL section above. */
+#ifdef GMX_SIMD_HAVE_LOGICAL
+ *sinval = gmx_simd_xor_d(sss, ssign);
+ *cosval = gmx_simd_xor_d(ccc, csign);
+#else
+ *sinval = gmx_simd_xor_sign_d(sss, ssign);
+ *cosval = gmx_simd_xor_sign_d(ccc, csign);
+#endif
+}
+
+/*! \brief SIMD double sin(x).
+ *
+ * \copydetails gmx_simd_sin_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_sin_d(gmx_simd_double_t x)
+{
+ /* Delegate to the combined routine; the cosine result is discarded. */
+ gmx_simd_double_t sinval, cosval;
+
+ gmx_simd_sincos_d(x, &sinval, &cosval);
+ return sinval;
+}
+
+/*! \brief SIMD double cos(x).
+ *
+ * \copydetails gmx_simd_cos_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_cos_d(gmx_simd_double_t x)
+{
+ /* Delegate to the combined routine; the sine result is discarded. */
+ gmx_simd_double_t sinval, cosval;
+
+ gmx_simd_sincos_d(x, &sinval, &cosval);
+ return cosval;
+}
+
+/*! \brief SIMD double tan(x).
+ *
+ * \copydetails gmx_simd_tan_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_tan_d(gmx_simd_double_t x)
+{
+ /* Constants for four-step extended-precision reduction by pi/2 */
+ const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
+ const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10);
+ const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18);
+ const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27);
+ const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI);
+ const gmx_simd_double_t CT15 = gmx_simd_set1_d(1.01419718511083373224408e-05);
+ const gmx_simd_double_t CT14 = gmx_simd_set1_d(-2.59519791585924697698614e-05);
+ const gmx_simd_double_t CT13 = gmx_simd_set1_d(5.23388081915899855325186e-05);
+ const gmx_simd_double_t CT12 = gmx_simd_set1_d(-3.05033014433946488225616e-05);
+ const gmx_simd_double_t CT11 = gmx_simd_set1_d(7.14707504084242744267497e-05);
+ const gmx_simd_double_t CT10 = gmx_simd_set1_d(8.09674518280159187045078e-05);
+ const gmx_simd_double_t CT9 = gmx_simd_set1_d(0.000244884931879331847054404);
+ const gmx_simd_double_t CT8 = gmx_simd_set1_d(0.000588505168743587154904506);
+ const gmx_simd_double_t CT7 = gmx_simd_set1_d(0.00145612788922812427978848);
+ const gmx_simd_double_t CT6 = gmx_simd_set1_d(0.00359208743836906619142924);
+ const gmx_simd_double_t CT5 = gmx_simd_set1_d(0.00886323944362401618113356);
+ const gmx_simd_double_t CT4 = gmx_simd_set1_d(0.0218694882853846389592078);
+ const gmx_simd_double_t CT3 = gmx_simd_set1_d(0.0539682539781298417636002);
+ const gmx_simd_double_t CT2 = gmx_simd_set1_d(0.133333333333125941821962);
+ const gmx_simd_double_t CT1 = gmx_simd_set1_d(0.333333333333334980164153);
+
+ gmx_simd_double_t x2, p, y, z;
+ gmx_simd_dbool_t mask;
+
+#if (defined GMX_SIMD_HAVE_DINT32) && (defined GMX_SIMD_HAVE_DINT32_ARITHMETICS) && (defined GMX_SIMD_HAVE_LOGICAL)
+ /* Integer path: odd quadrant index (bit 0 of round(x*2/pi)) means we
+  * must return -1/tan of the reduced argument; the sign flip on x is
+  * applied here, the reciprocal in the final blend below.
+  */
+ gmx_simd_dint32_t iy;
+ gmx_simd_dint32_t ione = gmx_simd_set1_di(1);
+
+ z = gmx_simd_mul_d(x, two_over_pi);
+ iy = gmx_simd_cvt_d2i(z);
+ y = gmx_simd_round_d(z);
+ mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione));
+
+ x = gmx_simd_fnmadd_d(y, argred0, x);
+ x = gmx_simd_fnmadd_d(y, argred1, x);
+ x = gmx_simd_fnmadd_d(y, argred2, x);
+ x = gmx_simd_fnmadd_d(y, argred3, x);
+ x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(-0.0), mask), x);
+#else
+ /* Pure floating-point quadrant determination; same idea as the
+  * corresponding fallback in gmx_simd_sincos_d().
+  */
+ const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25);
+ const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
+ const gmx_simd_double_t threequarter = gmx_simd_set1_d(0.75);
+ gmx_simd_double_t w, q;
+ gmx_simd_dbool_t m1, m2, m3;
+
+ w = gmx_simd_fabs_d(x);
+ z = gmx_simd_fmadd_d(w, two_over_pi, half);
+ y = gmx_simd_trunc_d(z);
+ q = gmx_simd_mul_d(z, quarter);
+ q = gmx_simd_sub_d(q, gmx_simd_trunc_d(q));
+ m1 = gmx_simd_cmple_d(quarter, q);
+ m2 = gmx_simd_cmplt_d(q, half);
+ m3 = gmx_simd_cmple_d(threequarter, q);
+ m1 = gmx_simd_and_db(m1, m2);
+ mask = gmx_simd_or_db(m1, m3);
+ w = gmx_simd_fnmadd_d(y, argred0, w);
+ w = gmx_simd_fnmadd_d(y, argred1, w);
+ w = gmx_simd_fnmadd_d(y, argred2, w);
+ w = gmx_simd_fnmadd_d(y, argred3, w);
+
+ w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask);
+ x = gmx_simd_xor_sign_d(w, x);
+#endif
+ /* Horner evaluation of tan() for the reduced argument */
+ x2 = gmx_simd_mul_d(x, x);
+ p = gmx_simd_fmadd_d(CT15, x2, CT14);
+ p = gmx_simd_fmadd_d(p, x2, CT13);
+ p = gmx_simd_fmadd_d(p, x2, CT12);
+ p = gmx_simd_fmadd_d(p, x2, CT11);
+ p = gmx_simd_fmadd_d(p, x2, CT10);
+ p = gmx_simd_fmadd_d(p, x2, CT9);
+ p = gmx_simd_fmadd_d(p, x2, CT8);
+ p = gmx_simd_fmadd_d(p, x2, CT7);
+ p = gmx_simd_fmadd_d(p, x2, CT6);
+ p = gmx_simd_fmadd_d(p, x2, CT5);
+ p = gmx_simd_fmadd_d(p, x2, CT4);
+ p = gmx_simd_fmadd_d(p, x2, CT3);
+ p = gmx_simd_fmadd_d(p, x2, CT2);
+ p = gmx_simd_fmadd_d(p, x2, CT1);
+ p = gmx_simd_fmadd_d(x2, gmx_simd_mul_d(p, x), x);
+
+ /* Odd quadrants: tan(x) = -1/tan(reduced x), sign already applied above */
+ p = gmx_simd_blendv_d( p, gmx_simd_inv_d(p), mask);
+ return p;
+}
+
+/*! \brief SIMD double asin(x).
+ *
+ * \copydetails gmx_simd_asin_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_asin_d(gmx_simd_double_t x)
+{
+ /* Same algorithm as cephes library */
+ const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.625);
+ const gmx_simd_double_t limit2 = gmx_simd_set1_d(1e-8);
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
+ /* Low-order bits of pi/4, so quarterpi+morebits represents pi/4 beyond double precision */
+ const gmx_simd_double_t morebits = gmx_simd_set1_d(6.123233995736765886130e-17);
+
+ /* P/Q: rational approximation used for |x| <= 0.625 (polynomial argument ww = x^2) */
+ const gmx_simd_double_t P5 = gmx_simd_set1_d(4.253011369004428248960e-3);
+ const gmx_simd_double_t P4 = gmx_simd_set1_d(-6.019598008014123785661e-1);
+ const gmx_simd_double_t P3 = gmx_simd_set1_d(5.444622390564711410273e0);
+ const gmx_simd_double_t P2 = gmx_simd_set1_d(-1.626247967210700244449e1);
+ const gmx_simd_double_t P1 = gmx_simd_set1_d(1.956261983317594739197e1);
+ const gmx_simd_double_t P0 = gmx_simd_set1_d(-8.198089802484824371615e0);
+
+ const gmx_simd_double_t Q4 = gmx_simd_set1_d(-1.474091372988853791896e1);
+ const gmx_simd_double_t Q3 = gmx_simd_set1_d(7.049610280856842141659e1);
+ const gmx_simd_double_t Q2 = gmx_simd_set1_d(-1.471791292232726029859e2);
+ const gmx_simd_double_t Q1 = gmx_simd_set1_d(1.395105614657485689735e2);
+ const gmx_simd_double_t Q0 = gmx_simd_set1_d(-4.918853881490881290097e1);
+
+ /* R/S: rational approximation used for |x| > 0.625 (polynomial argument zz = 1-|x|) */
+ const gmx_simd_double_t R4 = gmx_simd_set1_d(2.967721961301243206100e-3);
+ const gmx_simd_double_t R3 = gmx_simd_set1_d(-5.634242780008963776856e-1);
+ const gmx_simd_double_t R2 = gmx_simd_set1_d(6.968710824104713396794e0);
+ const gmx_simd_double_t R1 = gmx_simd_set1_d(-2.556901049652824852289e1);
+ const gmx_simd_double_t R0 = gmx_simd_set1_d(2.853665548261061424989e1);
+
+ const gmx_simd_double_t S3 = gmx_simd_set1_d(-2.194779531642920639778e1);
+ const gmx_simd_double_t S2 = gmx_simd_set1_d(1.470656354026814941758e2);
+ const gmx_simd_double_t S1 = gmx_simd_set1_d(-3.838770957603691357202e2);
+ const gmx_simd_double_t S0 = gmx_simd_set1_d(3.424398657913078477438e2);
+
+ gmx_simd_double_t xabs;
+ gmx_simd_double_t zz, ww, z, q, w, zz2, ww2;
+ gmx_simd_double_t PA, PB;
+ gmx_simd_double_t QA, QB;
+ gmx_simd_double_t RA, RB;
+ gmx_simd_double_t SA, SB;
+ gmx_simd_double_t nom, denom;
+ gmx_simd_dbool_t mask;
+
+ xabs = gmx_simd_fabs_d(x);
+
+ /* mask is true in lanes where |x| > 0.625: those lanes use the R/S branch below */
+ mask = gmx_simd_cmplt_d(limit1, xabs);
+
+ /* Both branch polynomials are always evaluated; results are blended by mask at the end */
+ zz = gmx_simd_sub_d(one, xabs);
+ ww = gmx_simd_mul_d(xabs, xabs);
+ zz2 = gmx_simd_mul_d(zz, zz);
+ ww2 = gmx_simd_mul_d(ww, ww);
+
+ /* R: even/odd coefficients evaluated in two interleaved chains over zz2, merged at the end */
+ RA = gmx_simd_mul_d(R4, zz2);
+ RB = gmx_simd_mul_d(R3, zz2);
+ RA = gmx_simd_add_d(RA, R2);
+ RB = gmx_simd_add_d(RB, R1);
+ RA = gmx_simd_mul_d(RA, zz2);
+ RB = gmx_simd_mul_d(RB, zz);
+ RA = gmx_simd_add_d(RA, R0);
+ RA = gmx_simd_add_d(RA, RB);
+
+ /* S, SA = zz2 (monic leading term, so the top coefficient is implicit 1) */
+ SB = gmx_simd_mul_d(S3, zz2);
+ SA = gmx_simd_add_d(zz2, S2);
+ SB = gmx_simd_add_d(SB, S1);
+ SA = gmx_simd_mul_d(SA, zz2);
+ SB = gmx_simd_mul_d(SB, zz);
+ SA = gmx_simd_add_d(SA, S0);
+ SA = gmx_simd_add_d(SA, SB);
+
+ /* P */
+ PA = gmx_simd_mul_d(P5, ww2);
+ PB = gmx_simd_mul_d(P4, ww2);
+ PA = gmx_simd_add_d(PA, P3);
+ PB = gmx_simd_add_d(PB, P2);
+ PA = gmx_simd_mul_d(PA, ww2);
+ PB = gmx_simd_mul_d(PB, ww2);
+ PA = gmx_simd_add_d(PA, P1);
+ PB = gmx_simd_add_d(PB, P0);
+ PA = gmx_simd_mul_d(PA, ww);
+ PA = gmx_simd_add_d(PA, PB);
+
+ /* Q, QA = ww2 (monic leading term) */
+ QB = gmx_simd_mul_d(Q4, ww2);
+ QA = gmx_simd_add_d(ww2, Q3);
+ QB = gmx_simd_add_d(QB, Q2);
+ QA = gmx_simd_mul_d(QA, ww2);
+ QB = gmx_simd_mul_d(QB, ww2);
+ QA = gmx_simd_add_d(QA, Q1);
+ QB = gmx_simd_add_d(QB, Q0);
+ QA = gmx_simd_mul_d(QA, ww);
+ QA = gmx_simd_add_d(QA, QB);
+
+ RA = gmx_simd_mul_d(RA, zz);
+ PA = gmx_simd_mul_d(PA, ww);
+
+ /* Select the numerator/denominator pair for each lane: R/S for |x|>0.625, P/Q otherwise */
+ nom = gmx_simd_blendv_d( PA, RA, mask );
+ denom = gmx_simd_blendv_d( QA, SA, mask );
+
+ q = gmx_simd_mul_d( nom, gmx_simd_inv_d(denom) );
+
+ /* Large-|x| branch: asin(|x|) = pi/2 - sqrt(2*(1-|x|))*(1+q), using the split pi/4 constants */
+ zz = gmx_simd_add_d(zz, zz);
+ zz = gmx_simd_sqrt_d(zz);
+ z = gmx_simd_sub_d(quarterpi, zz);
+ zz = gmx_simd_mul_d(zz, q);
+ zz = gmx_simd_sub_d(zz, morebits);
+ z = gmx_simd_sub_d(z, zz);
+ z = gmx_simd_add_d(z, quarterpi);
+
+ /* Small-|x| branch: asin(|x|) = |x| + |x|*q */
+ w = gmx_simd_mul_d(xabs, q);
+ w = gmx_simd_add_d(w, xabs);
+
+ z = gmx_simd_blendv_d( w, z, mask );
+
+ /* For very small arguments (|x| <= 1e-8) simply return |x| itself */
+ mask = gmx_simd_cmplt_d(limit2, xabs);
+ z = gmx_simd_blendv_d( xabs, z, mask );
+
+ /* Restore the sign of the input argument */
+ z = gmx_simd_xor_sign_d(z, x);
+
+ return z;
+}
+
+/*! \brief SIMD double acos(x).
+ *
+ * \copydetails gmx_simd_acos_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_acos_d(gmx_simd_double_t x)
+{
+ const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
+ const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
+ /* pi/4 split into a high part (quarterpi0) and its low-order bits (quarterpi1) */
+ const gmx_simd_double_t quarterpi0 = gmx_simd_set1_d(7.85398163397448309616e-1);
+ const gmx_simd_double_t quarterpi1 = gmx_simd_set1_d(6.123233995736765886130e-17);
+
+ gmx_simd_dbool_t mask1;
+ gmx_simd_double_t z, z1, z2;
+
+ /* mask1 is true in lanes where x > 0.5 (note: x, not |x|) */
+ mask1 = gmx_simd_cmplt_d(half, x);
+ /* For x > 0.5 use the identity acos(x) = 2*asin(sqrt((1-x)/2)) */
+ z1 = gmx_simd_mul_d(half, gmx_simd_sub_d(one, x));
+ z1 = gmx_simd_sqrt_d(z1);
+ z = gmx_simd_blendv_d( x, z1, mask1 );
+
+ z = gmx_simd_asin_d(z);
+
+ /* x > 0.5 branch: multiply the asin result by 2 */
+ z1 = gmx_simd_add_d(z, z);
+
+ /* x <= 0.5 branch: acos(x) = pi/2 - asin(x), built as (pi/4 - z) + lowbits + pi/4
+ * so the low-order bits of pi/2 are not lost. */
+ z2 = gmx_simd_sub_d(quarterpi0, z);
+ z2 = gmx_simd_add_d(z2, quarterpi1);
+ z2 = gmx_simd_add_d(z2, quarterpi0);
+
+ z = gmx_simd_blendv_d(z2, z1, mask1);
+
+ return z;
+}
+
+/*! \brief SIMD double atan(x).
+ *
+ * \copydetails gmx_simd_atan_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_atan_d(gmx_simd_double_t x)
+{
+ /* Same algorithm as cephes library */
+ const gmx_simd_double_t limit1 = gmx_simd_set1_d(0.66);
+ /* tan(3*pi/8) = 1 + sqrt(2); above this the pi/2 - atan(1/|x|) reduction is used */
+ const gmx_simd_double_t limit2 = gmx_simd_set1_d(2.41421356237309504880);
+ const gmx_simd_double_t quarterpi = gmx_simd_set1_d(M_PI/4.0);
+ const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
+ const gmx_simd_double_t mone = gmx_simd_set1_d(-1.0);
+ /* Low-order bits of pi/4 and pi/2 respectively, added as extended-precision corrections */
+ const gmx_simd_double_t morebits1 = gmx_simd_set1_d(0.5*6.123233995736765886130E-17);
+ const gmx_simd_double_t morebits2 = gmx_simd_set1_d(6.123233995736765886130E-17);
+
+ /* Rational approximation atan(z) ~ z + z^3 * P(z^2)/Q(z^2) on the reduced range */
+ const gmx_simd_double_t P4 = gmx_simd_set1_d(-8.750608600031904122785E-1);
+ const gmx_simd_double_t P3 = gmx_simd_set1_d(-1.615753718733365076637E1);
+ const gmx_simd_double_t P2 = gmx_simd_set1_d(-7.500855792314704667340E1);
+ const gmx_simd_double_t P1 = gmx_simd_set1_d(-1.228866684490136173410E2);
+ const gmx_simd_double_t P0 = gmx_simd_set1_d(-6.485021904942025371773E1);
+
+ const gmx_simd_double_t Q4 = gmx_simd_set1_d(2.485846490142306297962E1);
+ const gmx_simd_double_t Q3 = gmx_simd_set1_d(1.650270098316988542046E2);
+ const gmx_simd_double_t Q2 = gmx_simd_set1_d(4.328810604912902668951E2);
+ const gmx_simd_double_t Q1 = gmx_simd_set1_d(4.853903996359136964868E2);
+ const gmx_simd_double_t Q0 = gmx_simd_set1_d(1.945506571482613964425E2);
+
+ gmx_simd_double_t y, xabs, t1, t2;
+ gmx_simd_double_t z, z2;
+ gmx_simd_double_t P_A, P_B, Q_A, Q_B;
+ gmx_simd_dbool_t mask1, mask2;
+
+ xabs = gmx_simd_fabs_d(x);
+
+ mask1 = gmx_simd_cmplt_d(limit1, xabs);
+ mask2 = gmx_simd_cmplt_d(limit2, xabs);
+
+ /* t1 = (|x|-1)/(|x|+1): reduction atan(|x|) = pi/4 + atan(t1) for 0.66 < |x| <= tan(3pi/8) */
+ t1 = gmx_simd_mul_d(gmx_simd_add_d(xabs, mone), gmx_simd_inv_d(gmx_simd_sub_d(xabs, mone)));
+ /* t2 = -1/|x|: reduction atan(|x|) = pi/2 + atan(t2) for |x| > tan(3pi/8) */
+ t2 = gmx_simd_mul_d(mone, gmx_simd_inv_d(xabs));
+
+ /* Per-lane offset: 0, pi/4 or pi/2, plus the matching reduced argument */
+ y = gmx_simd_blendzero_d(quarterpi, mask1);
+ y = gmx_simd_blendv_d(y, halfpi, mask2);
+ xabs = gmx_simd_blendv_d(xabs, t1, mask1);
+ xabs = gmx_simd_blendv_d(xabs, t2, mask2);
+
+ z = gmx_simd_mul_d(xabs, xabs);
+ z2 = gmx_simd_mul_d(z, z);
+
+ /* P(z): even/odd coefficients evaluated in two interleaved chains over z2 */
+ P_A = gmx_simd_mul_d(P4, z2);
+ P_B = gmx_simd_mul_d(P3, z2);
+ P_A = gmx_simd_add_d(P_A, P2);
+ P_B = gmx_simd_add_d(P_B, P1);
+ P_A = gmx_simd_mul_d(P_A, z2);
+ P_B = gmx_simd_mul_d(P_B, z);
+ P_A = gmx_simd_add_d(P_A, P0);
+ P_A = gmx_simd_add_d(P_A, P_B);
+
+ /* Q_A = z2 (monic leading term) */
+ Q_B = gmx_simd_mul_d(Q4, z2);
+ Q_A = gmx_simd_add_d(z2, Q3);
+ Q_B = gmx_simd_add_d(Q_B, Q2);
+ Q_A = gmx_simd_mul_d(Q_A, z2);
+ Q_B = gmx_simd_mul_d(Q_B, z2);
+ Q_A = gmx_simd_add_d(Q_A, Q1);
+ Q_B = gmx_simd_add_d(Q_B, Q0);
+ Q_A = gmx_simd_mul_d(Q_A, z);
+ Q_A = gmx_simd_add_d(Q_A, Q_B);
+
+ /* atan(xabs) ~ xabs + xabs * z * P/Q on the reduced range */
+ z = gmx_simd_mul_d(z, P_A);
+ z = gmx_simd_mul_d(z, gmx_simd_inv_d(Q_A));
+ z = gmx_simd_mul_d(z, xabs);
+ z = gmx_simd_add_d(z, xabs);
+
+ /* Add the low-order-bit correction matching the offset chosen above (0, pi/4 or pi/2) */
+ t1 = gmx_simd_blendzero_d(morebits1, mask1);
+ t1 = gmx_simd_blendv_d(t1, morebits2, mask2);
+
+ z = gmx_simd_add_d(z, t1);
+ y = gmx_simd_add_d(y, z);
+
+ /* Restore the sign of the input argument */
+ y = gmx_simd_xor_sign_d(y, x);
+
+ return y;
+}
+
+/*! \brief SIMD double atan2(y,x).
+ *
+ * \copydetails gmx_simd_atan2_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
+{
+ const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
+ const gmx_simd_double_t halfpi = gmx_simd_set1_d(M_PI/2.0);
+ gmx_simd_double_t xinv, p, aoffset;
+ gmx_simd_dbool_t mask_x0, mask_y0, mask_xlt0, mask_ylt0;
+
+ mask_x0 = gmx_simd_cmpeq_d(x, gmx_simd_setzero_d());
+ mask_y0 = gmx_simd_cmpeq_d(y, gmx_simd_setzero_d());
+ mask_xlt0 = gmx_simd_cmplt_d(x, gmx_simd_setzero_d());
+ mask_ylt0 = gmx_simd_cmplt_d(y, gmx_simd_setzero_d());
+
+ /* Quadrant offset added to atan(y/x): pi/2 when x==0, but zeroed again when y==0,
+ * so atan2(0,0) yields 0 rather than pi/2. */
+ aoffset = gmx_simd_blendzero_d(halfpi, mask_x0);
+ aoffset = gmx_simd_blendnotzero_d(aoffset, mask_y0);
+
+ /* x<0 shifts the result by pi; y<0 mirrors the offset to the negative half-plane */
+ aoffset = gmx_simd_blendv_d(aoffset, pi, mask_xlt0);
+ aoffset = gmx_simd_blendv_d(aoffset, gmx_simd_fneg_d(aoffset), mask_ylt0);
+
+ /* Zero 1/x in lanes where x==0, so p = atan(0) = 0 there and only the offset remains */
+ xinv = gmx_simd_blendnotzero_d(gmx_simd_inv_d(x), mask_x0);
+ p = gmx_simd_mul_d(y, xinv);
+ p = gmx_simd_atan_d(p);
+ p = gmx_simd_add_d(p, aoffset);
+
+ return p;
+}
+
+
+/*! \brief Calculate the force correction due to PME analytically for SIMD double.
+ *
+ * \copydetails gmx_simd_pmecorrF_f
+ */
+static gmx_simd_double_t
+gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
+{
+ /* Numerator polynomial coefficients FN10..FN0 of the rational fit in z2 */
+ const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
+ const gmx_simd_double_t FN9 = gmx_simd_set1_d(1.1859116242260148027e-11);
+ const gmx_simd_double_t FN8 = gmx_simd_set1_d(-8.1490406329798423616e-10);
+ const gmx_simd_double_t FN7 = gmx_simd_set1_d(3.4404793543907847655e-8);
+ const gmx_simd_double_t FN6 = gmx_simd_set1_d(-9.9471420832602741006e-7);
+ const gmx_simd_double_t FN5 = gmx_simd_set1_d(0.000020740315999115847456);
+ const gmx_simd_double_t FN4 = gmx_simd_set1_d(-0.00031991745139313364005);
+ const gmx_simd_double_t FN3 = gmx_simd_set1_d(0.0035074449373659008203);
+ const gmx_simd_double_t FN2 = gmx_simd_set1_d(-0.031750380176100813405);
+ const gmx_simd_double_t FN1 = gmx_simd_set1_d(0.13884101728898463426);
+ const gmx_simd_double_t FN0 = gmx_simd_set1_d(-0.75225277815249618847);
+
+ /* Denominator polynomial coefficients FD5..FD0 (constant term normalized to 1) */
+ const gmx_simd_double_t FD5 = gmx_simd_set1_d(0.000016009278224355026701);
+ const gmx_simd_double_t FD4 = gmx_simd_set1_d(0.00051055686934806966046);
+ const gmx_simd_double_t FD3 = gmx_simd_set1_d(0.0081803507497974289008);
+ const gmx_simd_double_t FD2 = gmx_simd_set1_d(0.077181146026670287235);
+ const gmx_simd_double_t FD1 = gmx_simd_set1_d(0.41543303143712535988);
+ const gmx_simd_double_t FD0 = gmx_simd_set1_d(1.0);
+
+ gmx_simd_double_t z4;
+ gmx_simd_double_t polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = gmx_simd_mul_d(z2, z2);
+
+ /* Denominator: odd-index coefficients in polyFD1, even-index in polyFD0,
+ * each a Horner chain in z4, combined at the end. */
+ polyFD1 = gmx_simd_fmadd_d(FD5, z4, FD3);
+ polyFD1 = gmx_simd_fmadd_d(polyFD1, z4, FD1);
+ polyFD1 = gmx_simd_mul_d(polyFD1, z2);
+ polyFD0 = gmx_simd_fmadd_d(FD4, z4, FD2);
+ polyFD0 = gmx_simd_fmadd_d(polyFD0, z4, FD0);
+ polyFD0 = gmx_simd_add_d(polyFD0, polyFD1);
+
+ polyFD0 = gmx_simd_inv_d(polyFD0);
+
+ /* Numerator: same even/odd interleaved evaluation over z4 */
+ polyFN0 = gmx_simd_fmadd_d(FN10, z4, FN8);
+ polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN6);
+ polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN4);
+ polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN2);
+ polyFN0 = gmx_simd_fmadd_d(polyFN0, z4, FN0);
+ polyFN1 = gmx_simd_fmadd_d(FN9, z4, FN7);
+ polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN5);
+ polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN3);
+ polyFN1 = gmx_simd_fmadd_d(polyFN1, z4, FN1);
+ polyFN0 = gmx_simd_fmadd_d(polyFN1, z2, polyFN0);
+
+
+ /* Return the rational function FN(z2)/FD(z2) */
+ return gmx_simd_mul_d(polyFN0, polyFD0);
+}
+
+
+
+/*! \brief Calculate the potential correction due to PME analytically for SIMD double.
+ *
+ * \copydetails gmx_simd_pmecorrV_f
+ */
+static gmx_simd_double_t
+gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
+{
+ /* Numerator polynomial coefficients VN9..VN0 of the rational fit in z2 */
+ const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
+ const gmx_simd_double_t VN8 = gmx_simd_set1_d(1.2280156762674215741e-10);
+ const gmx_simd_double_t VN7 = gmx_simd_set1_d(-7.3562157912251309487e-9);
+ const gmx_simd_double_t VN6 = gmx_simd_set1_d(2.6215886208032517509e-7);
+ const gmx_simd_double_t VN5 = gmx_simd_set1_d(-4.9532491651265819499e-6);
+ const gmx_simd_double_t VN4 = gmx_simd_set1_d(0.00025907400778966060389);
+ const gmx_simd_double_t VN3 = gmx_simd_set1_d(0.0010585044856156469792);
+ const gmx_simd_double_t VN2 = gmx_simd_set1_d(0.045247661136833092885);
+ const gmx_simd_double_t VN1 = gmx_simd_set1_d(0.11643931522926034421);
+ const gmx_simd_double_t VN0 = gmx_simd_set1_d(1.1283791671726767970);
+
+ /* Denominator polynomial coefficients VD5..VD0 (constant term normalized to 1) */
+ const gmx_simd_double_t VD5 = gmx_simd_set1_d(0.000021784709867336150342);
+ const gmx_simd_double_t VD4 = gmx_simd_set1_d(0.00064293662010911388448);
+ const gmx_simd_double_t VD3 = gmx_simd_set1_d(0.0096311444822588683504);
+ const gmx_simd_double_t VD2 = gmx_simd_set1_d(0.085608012351550627051);
+ const gmx_simd_double_t VD1 = gmx_simd_set1_d(0.43652499166614811084);
+ const gmx_simd_double_t VD0 = gmx_simd_set1_d(1.0);
+
+ gmx_simd_double_t z4;
+ gmx_simd_double_t polyVN0, polyVN1, polyVD0, polyVD1;
+
+ z4 = gmx_simd_mul_d(z2, z2);
+
+ /* Denominator: odd-index coefficients in polyVD1, even-index in polyVD0,
+ * each a Horner chain in z4, combined with a final fmadd by z2. */
+ polyVD1 = gmx_simd_fmadd_d(VD5, z4, VD3);
+ polyVD0 = gmx_simd_fmadd_d(VD4, z4, VD2);
+ polyVD1 = gmx_simd_fmadd_d(polyVD1, z4, VD1);
+ polyVD0 = gmx_simd_fmadd_d(polyVD0, z4, VD0);
+ polyVD0 = gmx_simd_fmadd_d(polyVD1, z2, polyVD0);
+
+ polyVD0 = gmx_simd_inv_d(polyVD0);
+
+ /* Numerator: same even/odd interleaved evaluation over z4 */
+ polyVN1 = gmx_simd_fmadd_d(VN9, z4, VN7);
+ polyVN0 = gmx_simd_fmadd_d(VN8, z4, VN6);
+ polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN5);
+ polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN4);
+ polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN3);
+ polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN2);
+ polyVN1 = gmx_simd_fmadd_d(polyVN1, z4, VN1);
+ polyVN0 = gmx_simd_fmadd_d(polyVN0, z4, VN0);
+ polyVN0 = gmx_simd_fmadd_d(polyVN1, z2, polyVN0);
+
+ /* Return the rational function VN(z2)/VD(z2) */
+ return gmx_simd_mul_d(polyVN0, polyVD0);
+}
+
+/*! \} */
+
+#endif
+
+
+/*! \name SIMD4 math functions
+ *
+ * \note Only a subset of the math functions are implemented for SIMD4.
+ * \{
+ */
+
+
+#ifdef GMX_SIMD4_HAVE_FLOAT
+
+/*************************************************************************
+ * SINGLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
+ *************************************************************************/
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 floats.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
+ gmx_simd4_float_t c, gmx_simd4_float_t d)
+{
+ /* Pairwise grouping (a+b)+(c+d) shortens the dependency chain vs. sequential adds */
+ return gmx_simd4_add_f(gmx_simd4_add_f(a, b), gmx_simd4_add_f(c, d));
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 float.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
+{
+# ifdef GMX_SIMD_HAVE_FMA
+ /* lu' = lu + 0.5*lu*(1 - x*lu^2), expressed with fused multiply-add/subtract */
+ return gmx_simd4_fmadd_f(gmx_simd4_fnmadd_f(x, gmx_simd4_mul_f(lu, lu), gmx_simd4_set1_f(1.0f)), gmx_simd4_mul_f(lu, gmx_simd4_set1_f(0.5f)), lu);
+# else
+ /* lu' = 0.5*lu*(3 - x*lu^2), the classic non-FMA form of the same iteration */
+ return gmx_simd4_mul_f(gmx_simd4_set1_f(0.5f), gmx_simd4_mul_f(gmx_simd4_sub_f(gmx_simd4_set1_f(3.0f), gmx_simd4_mul_f(gmx_simd4_mul_f(lu, lu), x)), lu));
+# endif
+}
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 float.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
+{
+ /* Start from the hardware rsqrt estimate, then refine with N-R iterations.
+ * Each iteration roughly doubles the number of accurate bits, so the number
+ * of iterations is chosen by comparing GMX_SIMD_RSQRT_BITS (estimate accuracy)
+ * against the single-precision target. */
+ gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_f(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_f(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_SINGLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_f(lu, x);
+#endif
+ return lu;
+}
+
+#endif /* GMX_SIMD4_HAVE_FLOAT */
+
+
+
+#ifdef GMX_SIMD4_HAVE_DOUBLE
+/*************************************************************************
+ * DOUBLE PRECISION SIMD4 MATH FUNCTIONS - JUST A SMALL SUBSET SUPPORTED *
+ *************************************************************************/
+
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 doubles.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
+ gmx_simd4_double_t c, gmx_simd4_double_t d)
+{
+ /* Pairwise grouping (a+b)+(c+d) shortens the dependency chain vs. sequential adds */
+ return gmx_simd4_add_d(gmx_simd4_add_d(a, b), gmx_simd4_add_d(c, d));
+}
+
+/*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD4 double.
+ *
+ * \copydetails gmx_simd_rsqrt_iter_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
+{
+#ifdef GMX_SIMD_HAVE_FMA
+ /* lu' = lu + 0.5*lu*(1 - x*lu^2), expressed with fused multiply-add/subtract */
+ return gmx_simd4_fmadd_d(gmx_simd4_fnmadd_d(x, gmx_simd4_mul_d(lu, lu), gmx_simd4_set1_d(1.0)), gmx_simd4_mul_d(lu, gmx_simd4_set1_d(0.5)), lu);
+#else
+ /* lu' = 0.5*lu*(3 - x*lu^2), the classic non-FMA form of the same iteration */
+ return gmx_simd4_mul_d(gmx_simd4_set1_d(0.5), gmx_simd4_mul_d(gmx_simd4_sub_d(gmx_simd4_set1_d(3.0), gmx_simd4_mul_d(gmx_simd4_mul_d(lu, lu), x)), lu));
+#endif
+}
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 double.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
+{
+ /* Start from the hardware rsqrt estimate, then refine with N-R iterations.
+ * The double-precision target needs up to one more iteration than single,
+ * hence the extra *8 step at the end. */
+ gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
+#if (GMX_SIMD_RSQRT_BITS < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*2 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*4 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_d(lu, x);
+#endif
+#if (GMX_SIMD_RSQRT_BITS*8 < GMX_SIMD_MATH_TARGET_DOUBLE_BITS)
+ lu = gmx_simd4_rsqrt_iter_d(lu, x);
+#endif
+ return lu;
+}
+#endif /* GMX_SIMD4_HAVE_DOUBLE */
+
+/*! \} */
+
+
+/* Set defines based on default Gromacs precision */
+#ifdef GMX_DOUBLE
+/* Documentation in single branch below */
+# define gmx_simd_sum4_r gmx_simd_sum4_d
+# define gmx_simd_xor_sign_r gmx_simd_xor_sign_d
+# define gmx_simd_invsqrt_r gmx_simd_invsqrt_d
+# define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_d
+# define gmx_simd_sqrt_r gmx_simd_sqrt_d
+# define gmx_simd_inv_r gmx_simd_inv_d
+# define gmx_simd_log_r gmx_simd_log_d
+# define gmx_simd_exp2_r gmx_simd_exp2_d
+# define gmx_simd_exp_r gmx_simd_exp_d
+# define gmx_simd_erf_r gmx_simd_erf_d
+# define gmx_simd_erfc_r gmx_simd_erfc_d
+# define gmx_simd_sincos_r gmx_simd_sincos_d
+# define gmx_simd_sin_r gmx_simd_sin_d
+# define gmx_simd_cos_r gmx_simd_cos_d
+# define gmx_simd_tan_r gmx_simd_tan_d
+# define gmx_simd_asin_r gmx_simd_asin_d
+# define gmx_simd_acos_r gmx_simd_acos_d
+# define gmx_simd_atan_r gmx_simd_atan_d
+# define gmx_simd_atan2_r gmx_simd_atan2_d
+# define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_d
+# define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_d
+# define gmx_simd4_sum4_r gmx_simd4_sum4_d
+# define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_d
+
+#else /* GMX_DOUBLE */
+
+/*! \name Real-precision SIMD math functions
+ *
+ * These are the ones you should typically call in Gromacs.
+ * \{
+ */
+
+/*! \brief SIMD utility function to sum a+b+c+d for SIMD reals.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+# define gmx_simd_sum4_r gmx_simd_sum4_f
+
+/*! \brief Return -a if b is negative, SIMD real.
+ *
+ * \copydetails gmx_simd_xor_sign_f
+ */
+# define gmx_simd_xor_sign_r gmx_simd_xor_sign_f
+
+/*! \brief Calculate 1/sqrt(x) for SIMD real.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+# define gmx_simd_invsqrt_r gmx_simd_invsqrt_f
+
+/*! \brief Calculate 1/sqrt(x) for two SIMD reals.
+ *
+ * \copydetails gmx_simd_invsqrt_pair_f
+ */
+# define gmx_simd_invsqrt_pair_r gmx_simd_invsqrt_pair_f
+
+/*! \brief Calculate sqrt(x) correctly for SIMD real, including argument 0.0.
+ *
+ * \copydetails gmx_simd_sqrt_f
+ */
+# define gmx_simd_sqrt_r gmx_simd_sqrt_f
+
+/*! \brief Calculate 1/x for SIMD real.
+ *
+ * \copydetails gmx_simd_inv_f
+ */
+# define gmx_simd_inv_r gmx_simd_inv_f
+
+/*! \brief SIMD real log(x). This is the natural logarithm.
+ *
+ * \copydetails gmx_simd_log_f
+ */
+# define gmx_simd_log_r gmx_simd_log_f
+
+/*! \brief SIMD real 2^x.
+ *
+ * \copydetails gmx_simd_exp2_f
+ */
+# define gmx_simd_exp2_r gmx_simd_exp2_f
+
+/*! \brief SIMD real e^x.
+ *
+ * \copydetails gmx_simd_exp_f
+ */
+# define gmx_simd_exp_r gmx_simd_exp_f
+
+/*! \brief SIMD real erf(x).
+ *
+ * \copydetails gmx_simd_erf_f
+ */
+# define gmx_simd_erf_r gmx_simd_erf_f
+
+/*! \brief SIMD real erfc(x).
+ *
+ * \copydetails gmx_simd_erfc_f
+ */
+# define gmx_simd_erfc_r gmx_simd_erfc_f
+
+/*! \brief SIMD real sin \& cos.
+ *
+ * \copydetails gmx_simd_sincos_f
+ */
+# define gmx_simd_sincos_r gmx_simd_sincos_f
+
+/*! \brief SIMD real sin(x).
+ *
+ * \copydetails gmx_simd_sin_f
+ */
+# define gmx_simd_sin_r gmx_simd_sin_f
+
+/*! \brief SIMD real cos(x).
+ *
+ * \copydetails gmx_simd_cos_f
+ */
+# define gmx_simd_cos_r gmx_simd_cos_f
+
+/*! \brief SIMD real tan(x).
+ *
+ * \copydetails gmx_simd_tan_f
+ */
+# define gmx_simd_tan_r gmx_simd_tan_f
+
+/*! \brief SIMD real asin(x).
+ *
+ * \copydetails gmx_simd_asin_f
+ */
+# define gmx_simd_asin_r gmx_simd_asin_f
+
+/*! \brief SIMD real acos(x).
+ *
+ * \copydetails gmx_simd_acos_f
+ */
+# define gmx_simd_acos_r gmx_simd_acos_f
+
+/*! \brief SIMD real atan(x).
+ *
+ * \copydetails gmx_simd_atan_f
+ */
+# define gmx_simd_atan_r gmx_simd_atan_f
+
+/*! \brief SIMD real atan2(y,x).
+ *
+ * \copydetails gmx_simd_atan2_f
+ */
+# define gmx_simd_atan2_r gmx_simd_atan2_f
+
+/*! \brief SIMD Analytic PME force correction.
+ *
+ * \copydetails gmx_simd_pmecorrF_f
+ */
+# define gmx_simd_pmecorrF_r gmx_simd_pmecorrF_f
+
+/*! \brief SIMD Analytic PME potential correction.
+ *
+ * \copydetails gmx_simd_pmecorrV_f
+ */
+# define gmx_simd_pmecorrV_r gmx_simd_pmecorrV_f
+
+/*! \}
+ * \name SIMD4 math functions
+ * \{
+ */
+
+/*! \brief SIMD4 utility function to sum a+b+c+d for SIMD4 reals.
+ *
+ * \copydetails gmx_simd_sum4_f
+ */
+# define gmx_simd4_sum4_r gmx_simd4_sum4_f
+
+/*! \brief Calculate 1/sqrt(x) for SIMD4 real.
+ *
+ * \copydetails gmx_simd_invsqrt_f
+ */
+# define gmx_simd4_invsqrt_r gmx_simd4_invsqrt_f
+
+/*! \} */
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
+
+#endif /* GMX_SIMD_SIMD_MATH_H_ */
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2014, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Register the SIMD module unit tests: a single SimdUnitTests binary (named
+# simd-test) built from one source file per tested area (load/store bootstrap,
+# shared test base, SIMD and SIMD4 floating-point/vector/math/integer tests).
+gmx_add_unit_test(SimdUnitTests simd-test
+ bootstrap_loadstore.cpp
+ base.cpp
+ simd.cpp
+ simd_floatingpoint.cpp
+ simd_vector_operations.cpp
+ simd_math.cpp
+ simd_integer.cpp
+ simd4.cpp
+ simd4_floatingpoint.cpp
+ simd4_vector_operations.cpp
+ simd4_math.cpp)
+
+
+
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+
+#include "testutils/testoptions.h"
+#include "gromacs/options/options.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+namespace
+{
+
+/*! \cond */
+/*! \brief Command-line option to adjust the number of points used to test SIMD math functions. */
+GMX_TEST_OPTIONS(SimdBaseTestOptions, options)
+{
+ // Stores directly into the static SimdBaseTest::s_nPoints (default 10000).
+ options->addOption(::gmx::IntegerOption("npoints")
+ .store(&SimdBaseTest::s_nPoints)
+ .description("Number of points to test for SIMD math functions"));
+}
+/*! \endcond */
+
+} // namespace
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+// Default number of test points for SIMD math tests; overridable via the
+// "-npoints" command-line option.
+int SimdBaseTest::s_nPoints = 10000;
+
+/* Compare two equal-length vectors of reals element-wise, accepting each pair
+ * if either the absolute difference is strictly below absTol_ or the values
+ * have the same sign and differ by at most ulpTol_ units-in-the-last-place.
+ * Returns a GoogleTest assertion result with full diagnostics on failure. */
+::testing::AssertionResult
+SimdBaseTest::compareVectorRealUlp(const char * refExpr, const char * tstExpr,
+ const std::vector<real> &ref, const std::vector<real> &tst)
+{
+ std::vector<real> absDiff(tst.size());
+ std::vector<gmx_int64_t> ulpDiff(tst.size());
+ bool allOk;
+ size_t i;
+
+ /* Type-punning union: reinterpret the floating-point bit pattern as a
+ * same-width integer, so the ulp distance between two values of equal sign
+ * can be computed as a plain integer difference. */
+ union {
+#ifdef GMX_DOUBLE
+ double r; gmx_int64_t i;
+#else
+ float r; gmx_int32_t i;
+#endif
+ } conv0, conv1;
+
+ // Internal test of the test - make sure reference and test have the same length.
+ if (ref.size() != tst.size())
+ {
+ return ::testing::AssertionFailure()
+ << "Internal test error - unequal size vectors in compareVectorRealUlp" << std::endl;
+ }
+
+ for (i = 0, allOk = true; i < tst.size(); i++)
+ {
+ absDiff[i] = fabs(ref[i]-tst[i]);
+ conv0.r = ref[i];
+ conv1.r = tst[i];
+ ulpDiff[i] = llabs(conv0.i-conv1.i);
+
+ /* Use strict smaller-than for absolute tolerance check, so we disable it with absTol_=0 */
+ /* The ref*tst >= 0 guard rejects sign-differing pairs: the integer ulp
+ * distance is only meaningful for values on the same side of zero. */
+ allOk = allOk && ( ( absDiff[i] < absTol_ ) || ( ( ref[i]*tst[i] >= 0 ) && (ulpDiff[i] <= ulpTol_) ) );
+ }
+
+ if (allOk == true)
+ {
+ return ::testing::AssertionSuccess();
+ }
+ else
+ {
+ return ::testing::AssertionFailure()
+ << "Failing comparison between " << refExpr << " and " << tstExpr << std::endl
+ << "Requested abs tolerance: " << absTol_ << std::endl
+ << "Requested ulp tolerance: " << ulpTol_ << std::endl
+ << "(And values should not differ in sign unless within abs tolerance.)" << std::endl
+ << "Reference values: " << ::testing::PrintToString(ref) << std::endl
+ << "SIMD values: " << ::testing::PrintToString(tst) << std::endl
+ << "Abs. difference: " << ::testing::PrintToString(absDiff) << std::endl
+ << "Ulp difference: " << ::testing::PrintToString(ulpDiff) << std::endl;
+ }
+}
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_TESTS_BASE_H
+#define GMX_SIMD_TESTS_BASE_H
+
+/*! \internal \file
+ * \brief
+ * Declares common base class for testing SIMD and SIMD4.
+ *
+ * The base class contains the settings for absolute and ulp tolerances,
+ * as well as testing ranges used for both SIMD and SIMD4 tests, mainly
+ * to keep everything symmetric and clean. The class also defines a couple
+ * of generic tests that compare vectors of elements with arbitrary length for
+ * either exact or approximate matching (in terms of ulp). These are used in
+ * derived classes that convert either SIMD or SIMD4 values to
+ * std::vector<real> and then performs the comparison.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \brief Base class for SIMD test fixtures.
+ *
+ * This class contains settings that are common for SIMD and SIMD4 tests,
+ * and it is thus not used directly for any tests, but derived separately
+ * in simd.h and simd4.h.
+ */
+class SimdBaseTest : public ::testing::Test
+{
+ public:
+ /*! \brief Initialize new SIMD test fixture with default tolerances.
+ *
+ * The default absolute tolerance is set to 0, which means that we always
+ * check the ulp tolerance by default (passing the absolute tolerance
+ * test would otherwise mean we approve the test instantly).
+ * The default ulp tolerance is set to 10 units in single, and 255 units
+ * in double precision.
+ * Most SIMD math functions achieve 2-3 ulp accuracy in single, but by
+ * being a bit liberal we avoid tests failing on aggressive compilers.
+ *
+ * For double precision we only aim to achieve twice the accuracy of
+ * single. This way we can make do with a single extra iteration
+ * in some algorithms, in particular 1/sqrt(x).
+ *
+ * The range is used by derived classes to test math functions. The
+ * default test range will be [1,10], which is intentionally
+ * conservative so it works with (inverse) square root, division,
+ * exponentials, logarithms, and error functions.
+ */
+ SimdBaseTest()
+ {
+#ifdef GMX_DOUBLE
+ ulpTol_ = 255LL; // Aim for roughly twice the precision we have in single.
+#else
+ ulpTol_ = 10LL; // Be a bit liberal so compiler optimization doesn't bite us.
+#endif
+ absTol_ = 0;
+ range_ = std::pair<real, real>(1, 10);
+ }
+
+ /*! \brief Adjust ulp tolerance from the default 10 (float) or 255 (double). */
+ void setUlpTol(gmx_int64_t newTol) { ulpTol_ = newTol; }
+
+ /*! \brief Adjust the absolute tolerance from the default 0.
+ *
+ * If values are closer than the absolute tolerance, the test will pass
+ * no matter what their ulp difference is.
+ */
+ void setAbsTol(real newTol) { absTol_ = newTol; }
+
+ /*! \brief Change math function testing range from the default [1,10]. */
+ void setRange(real low, real high) { range_.first = low; range_.second = high; }
+
+ static int s_nPoints; //!< Number of test points to use, settable on command line.
+
+ /*! \brief Compare two std::vector<real> for approximate equality.
+ *
+ * This is an internal implementation routine that will be used by
+ * routines in derived child classes that first convert SIMD or SIMD4
+ * variables to std::vector<real>. Do not call it directly.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the vector test variable is within the class tolerances of the corresponding
+ * reference elements.
+ */
+ ::testing::AssertionResult
+ compareVectorRealUlp(const char * refExpr, const char * tstExpr,
+ const std::vector<real> &ref, const std::vector<real> &tst);
+
+ /*! \brief Compare std::vectors for exact equality.
+ *
+ * The template in this class makes it usable for testing both
+ * SIMD floating-point and integer variables, after conversion to
+ * vectors.
+ * This is an internal implementation routine that will be used by
+ * routines in derived child classes that first convert SIMD or SIMD4
+ * variables to std::vector<real>. Do not call it directly.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the vector test variable is within the class tolerances of the corresponding
+ * reference elements.
+ */
+ template <typename T> ::testing::AssertionResult
+ compareVectorEq(const char * refExpr, const char * tstExpr,
+ const std::vector<T> &ref, const std::vector<T> &tst)
+ {
+ if (ref == tst)
+ {
+ return ::testing::AssertionSuccess();
+ }
+ else
+ {
+ return ::testing::AssertionFailure()
+ << "Failing SIMD comparison between " << refExpr << " and " << tstExpr << std::endl
+ << "Ref. values: " << ::testing::PrintToString(ref) << std::endl
+ << "Test values: " << ::testing::PrintToString(tst) << std::endl;
+ }
+ }
+
+ protected:
+ gmx_int64_t ulpTol_; //!< Current tolerance in units-in-last-position.
+ real absTol_; //!< Current absolute tolerance.
+ std::pair<real, real> range_; //!< Range for math function tests.
+};
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+
+#endif // GMX_SIMD_TESTS_BASE_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/*! \internal \file
+ * \brief
+ * Separate test of SIMD load/store, before we use them in the SIMD test classes.
+ *
+ * Simple tests without using any classes/utilities, so we can use load/store
+ * functions inside our test utilities after this has passed.
+ *
+ * This file tests:
+ *
+ * - gmx_simd_align_r(),gmx_simd_align_i(),gmx_simd4_align_r(),
+ * - gmx_simd_load_r(),gmx_simd_store_r(),gmx_simd_loadu_r(),gmx_simd_storeu_r()
+ * - gmx_simd_load_i(),gmx_simd_store_i(), gmx_simd_loadu_i(),gmx_simd_storeu_i()
+ * - gmx_simd4_load_r(),gmx_simd4_store_r(), gmx_simd4_loadu_r(),gmx_simd4_storeu_r()
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+TEST(SimdBootstrapTest, gmxSimdAlign)
+{
+#ifdef GMX_SIMD_HAVE_REAL
+ real rdata[GMX_SIMD_REAL_WIDTH*2];
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ EXPECT_EQ(((size_t)gmx_simd_align_r(&rdata[i]) & (GMX_SIMD_REAL_WIDTH*sizeof(real)-1)), (size_t)0);
+ }
+#endif
+#ifdef GMX_SIMD_HAVE_INT32
+ int idata[GMX_SIMD_INT32_WIDTH*2];
+ for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ EXPECT_EQ(((size_t)gmx_simd_align_i(&idata[i]) & (GMX_SIMD_INT32_WIDTH*sizeof(int)-1)), (size_t)0);
+ }
+#endif
+}
+
+/*! \brief Generic routine to test load & store of SIMD, and check for side effects.
+ *
+ * The tests for load, store, unaligned load and unaligned store both for
+ * real and int are pretty much similar, so we use a template function with
+ * additional function pointers for the actual load/store calls. This would
+ * require more hacking to turn into a class, since the SIMD functionality uses
+ * macros rather than functions that can be overloaded.
+ */
+template <typename T, typename TSimd> void
+simdLoadStoreTester(TSimd simdLoadFn(T* mem), void simdStoreFn(T* mem, TSimd),
+ T * simdAlignFn(T *mem),
+ const int loadOffset, const int storeOffset, const int simdWidth)
+{
+ /* We want simdWidth elements before the data to check we are not polluting
+ * memory. Then we need 2*simdWidth storage to be able to extract an aligned
+ * pointer, another simdWidth elements so we can create (deliberately)
+ * offset un-aligned pointers, and finally simdWidth elements at the end
+ * to test we are not polluting memory there either. Sum=5*simdWidth!
+ */
+ std::vector<T> src(simdWidth*5);
+ std::vector<T> dst(simdWidth*5);
+ // Make sure we have memory to check both before and after the test pointers
+ T * pCopySrc = simdAlignFn(&src[0]) + simdWidth + loadOffset;
+ T * pCopyDst = simdAlignFn(&dst[0]) + simdWidth + storeOffset;
+ int i;
+
+ for (i = 0; i < simdWidth*5; i++)
+ {
+ src[i] = 1+i;
+ dst[i] = -1-i;
+ }
+
+ simdStoreFn(pCopyDst, simdLoadFn(pCopySrc));
+
+ for (i = 0; i < simdWidth; i++)
+ {
+ EXPECT_EQ(pCopySrc[i], pCopyDst[i]) << "SIMD load or store not moving data correctly for element " << i;
+ }
+
+ for (i = 0; i < simdWidth*5; i++)
+ {
+ EXPECT_EQ(src[i], (T)(1+i)) << "Side effect on source memory, i = " << i;
+ if (&dst[0]+i < pCopyDst || &dst[0]+i >= pCopyDst+simdWidth)
+ {
+ EXPECT_EQ(dst[i], (T)(-1-i)) << "Side effect on destination memory, i = " << i;
+ }
+ }
+}
+
+#ifdef GMX_SIMD_HAVE_REAL
+//! Wrapper for SIMD macro to load aligned floating-point data.
+gmx_simd_real_t wrapperSimdLoadR(real *m)
+{
+ return gmx_simd_load_r(m);
+}
+//! Wrapper for SIMD macro to store to aligned floating-point data.
+void wrapperSimdStoreR(real *m, gmx_simd_real_t s)
+{
+ gmx_simd_store_r(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimdLoadStoreR)
+{
+ simdLoadStoreTester(wrapperSimdLoadR, wrapperSimdStoreR, gmx_simd_align_r, 0, 0, GMX_SIMD_REAL_WIDTH);
+}
+
+# ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD macro to load unaligned floating-point data.
+gmx_simd_real_t WrapperSimdLoadUR(real *m)
+{
+ return gmx_simd_loadu_r(m);
+}
+
+TEST(SimdBootstrapTest, gmxSimdLoadUR)
+{
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ simdLoadStoreTester(WrapperSimdLoadUR, wrapperSimdStoreR, gmx_simd_align_r, i, 0, GMX_SIMD_REAL_WIDTH);
+ }
+}
+# endif
+
+# ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD macro to store to unaligned floating-point data.
+void WrapperSimdStoreUR(real *m, gmx_simd_real_t s)
+{
+ gmx_simd_storeu_r(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimdStoreUR)
+{
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ simdLoadStoreTester(wrapperSimdLoadR, WrapperSimdStoreUR, gmx_simd_align_r, 0, i, GMX_SIMD_REAL_WIDTH);
+ }
+}
+# endif
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32
+// Tests for gmx_simd_int32_t load & store operations
+
+//! Wrapper for SIMD macro to load aligned integer data.
+gmx_simd_int32_t wrapperSimdLoadI(int *m)
+{
+ return gmx_simd_load_i(m);
+}
+//! Wrapper for SIMD macro to store to aligned integer data.
+void wrapperSimdStoreI(int *m, gmx_simd_int32_t s)
+{
+ gmx_simd_store_i(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimdLoadStoreI)
+{
+ simdLoadStoreTester(wrapperSimdLoadI, wrapperSimdStoreI, gmx_simd_align_i, 0, 0, GMX_SIMD_INT32_WIDTH);
+}
+
+# ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD macro to load unaligned integer data.
+gmx_simd_int32_t wrapperSimdLoadUI(int *m)
+{
+ return gmx_simd_loadu_i(m);
+}
+
+TEST(SimdBootstrapTest, gmxSimdLoadUI)
+{
+ for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ simdLoadStoreTester(wrapperSimdLoadUI, wrapperSimdStoreI, gmx_simd_align_i, i, 0, GMX_SIMD_INT32_WIDTH);
+ }
+}
+# endif
+
+# ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD macro to store to unaligned integer data.
+void wrapperSimdStoreUI(int *m, gmx_simd_int32_t s)
+{
+ gmx_simd_storeu_i(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimdStoreUI)
+{
+ for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ simdLoadStoreTester(wrapperSimdLoadI, wrapperSimdStoreUI, gmx_simd_align_i, 0, i, GMX_SIMD_INT32_WIDTH);
+ }
+}
+# endif
+#endif
+
+#ifdef GMX_SIMD4_HAVE_REAL
+/* Tests for gmx_simd4_real_t load & store operations. Define wrapper functions
+ * for the SIMD instructions that are typically implemented as macros.
+ */
+
+/*! \brief Separate load/store tester function for SIMD4.
+ *
+ * Due to the way SIMD variables
+ * are implemented as deep internal data, some compilers treat them as
+ * float/double with special prefixes. Unfortunately, this means that some C++
+ * compilers think an 8-wide normal real SIMD and a 4-wide SIMD4 real type
+ * cannot be overloaded (e.g. with gcc using 256-bit AVX single precision).
+ */
+template <typename T, typename TSimd> void
+simd4LoadStoreTester(TSimd simd4LoadFn(T* mem), void simd4StoreFn(T* mem, TSimd),
+ T * simd4AlignFn(T *mem),
+ const int loadOffset, const int storeOffset)
+{
+ /* We want simdWidth elements before the data to check we are not polluting
+ * memory. Then we need 2*simdWidth storage to be able to extract an aligned
+ * pointer, another simdWidth elements so we can create (deliberately)
+ * offset un-aligned pointers, and finally simdWidth elements at the end
+ * to test we are not polluting memory there either. Sum=5*simdWidth!
+ */
+ T src[GMX_SIMD4_WIDTH*5];
+ T dst[GMX_SIMD4_WIDTH*5];
+ // Make sure we have memory to check both before and after the test pointers
+ T * pCopySrc = simd4AlignFn(src) + GMX_SIMD4_WIDTH + loadOffset;
+ T * pCopyDst = simd4AlignFn(dst) + GMX_SIMD4_WIDTH + storeOffset;
+ int i;
+
+ for (i = 0; i < GMX_SIMD4_WIDTH*5; i++)
+ {
+ src[i] = 1+i;
+ dst[i] = -1-i;
+ }
+
+ simd4StoreFn(pCopyDst, simd4LoadFn(pCopySrc));
+
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ EXPECT_EQ(pCopySrc[i], pCopyDst[i]) << "SIMD4 load or store not moving data correctly for element " << i;
+ }
+
+ for (i = 0; i < GMX_SIMD4_WIDTH*5; i++)
+ {
+ EXPECT_EQ(src[i], (T)(1+i)) << "Side effect on source memory, i = " << i;
+ if (dst+i < pCopyDst || dst+i >= pCopyDst+GMX_SIMD4_WIDTH)
+ {
+ EXPECT_EQ(dst[i], (T)(-1-i)) << "Side effect on destination memory, i = " << i;
+ }
+ }
+}
+
+//! Wrapper for SIMD4 macro to load aligned floating-point data.
+gmx_simd4_real_t wrapperSimd4LoadR(real *m)
+{
+ return gmx_simd4_load_r(m);
+}
+//! Wrapper for SIMD4 macro to store to aligned floating-point data.
+void wrapperSimd4StoreR(real *m, gmx_simd4_real_t s)
+{
+ gmx_simd4_store_r(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimd4LoadStoreR)
+{
+ simd4LoadStoreTester(wrapperSimd4LoadR, wrapperSimd4StoreR, gmx_simd4_align_r, 0, 0);
+}
+
+# ifdef GMX_SIMD_HAVE_LOADU
+//! Wrapper for SIMD4 macro to load unaligned floating-point data.
+gmx_simd4_real_t WrapperSimd4LoadUR(real *m)
+{
+ return gmx_simd4_loadu_r(m);
+}
+
+TEST(SimdBootstrapTest, gmxSimd4LoadUR)
+{
+ for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ simd4LoadStoreTester(WrapperSimd4LoadUR, wrapperSimd4StoreR, gmx_simd4_align_r, i, 0);
+ }
+}
+# endif
+
+# ifdef GMX_SIMD_HAVE_STOREU
+//! Wrapper for SIMD4 macro to store to unaligned floating-point data.
+void WrapperSimd4StoreUR(real *m, gmx_simd4_real_t s)
+{
+ gmx_simd4_storeu_r(m, s);
+}
+
+TEST(SimdBootstrapTest, gmxSimd4StoreUR)
+{
+ for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ simd4LoadStoreTester(wrapperSimd4LoadR, WrapperSimd4StoreUR, gmx_simd4_align_r, 0, i);
+ }
+}
+# endif
+#endif
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/* Unfortunately we cannot keep static SIMD constants in the test fixture class.
+ * The problem is that SIMD memory needs to be aligned, and in particular
+ * this applies to automatic storage of variables in classes. For SSE registers
+ * this means 16-byte alignment (which seems to work), but AVX requires 32-bit
+ * alignment. At least both gcc-4.7.3 and Apple clang-5.0 (OS X 10.9) fail to
+ * align these variables when they are stored as data in a class.
+ *
+ * In theory we could set some of these on-the-fly e.g. with setSimdRealFrom3R()
+ * instead (although that would mean repeating code between tests), but many of
+ * the constants depend on the current precision not to mention they
+ * occasionally have many digits that need to be exactly right, and keeping
+ * them in a single place makes sure they are consistent.
+ */
+#ifdef GMX_SIMD_HAVE_REAL
+const gmx_simd_real_t rSimd_1_2_3 = setSimdRealFrom3R(1, 2, 3);
+const gmx_simd_real_t rSimd_4_5_6 = setSimdRealFrom3R(4, 5, 6);
+const gmx_simd_real_t rSimd_7_8_9 = setSimdRealFrom3R(7, 8, 9);
+const gmx_simd_real_t rSimd_5_7_9 = setSimdRealFrom3R(5, 7, 9);
+const gmx_simd_real_t rSimd_m1_m2_m3 = setSimdRealFrom3R(-1, -2, -3);
+const gmx_simd_real_t rSimd_3_1_4 = setSimdRealFrom3R(3, 1, 4);
+const gmx_simd_real_t rSimd_m3_m1_m4 = setSimdRealFrom3R(-3, -1, -4);
+const gmx_simd_real_t rSimd_2p25 = setSimdRealFrom1R(2.25);
+const gmx_simd_real_t rSimd_3p75 = setSimdRealFrom1R(3.75);
+const gmx_simd_real_t rSimd_m2p25 = setSimdRealFrom1R(-2.25);
+const gmx_simd_real_t rSimd_m3p75 = setSimdRealFrom1R(-3.75);
+const gmx_simd_real_t rSimd_Exp = setSimdRealFrom3R( 1.4055235171027452623914516e+18,
+ 5.3057102734253445623914516e-13,
+ -2.1057102745623934534514516e+16);
+# if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+const gmx_simd_real_t rSimd_ExpDouble = setSimdRealFrom3R( 6.287393598732017379054414e+176,
+ 8.794495252903116023030553e-140,
+ -3.637060701570496477655022e+202);
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd_real_t rSimd_Bits1 = setSimdRealFrom1R(-1.07730874267432137e+236);
+const gmx_simd_real_t rSimd_Bits2 = setSimdRealFrom1R(-9.25596313493178307e+061);
+const gmx_simd_real_t rSimd_Bits3 = setSimdRealFrom1R(-8.57750588235293981e+003);
+const gmx_simd_real_t rSimd_Bits4 = setSimdRealFrom1R( 1.22416778341839096e-250);
+const gmx_simd_real_t rSimd_Bits5 = setSimdRealFrom1R(-1.15711777004554095e+294);
+const gmx_simd_real_t rSimd_Bits6 = setSimdRealFrom1R( 1.53063836115600621e-018);
+# else
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd_real_t rSimd_Bits1 = setSimdRealFrom1R(-5.9654142337e+29);
+const gmx_simd_real_t rSimd_Bits2 = setSimdRealFrom1R(-1.0737417600e+08);
+const gmx_simd_real_t rSimd_Bits3 = setSimdRealFrom1R(-6.0235290527e+00);
+const gmx_simd_real_t rSimd_Bits4 = setSimdRealFrom1R( 1.0788832913e-31);
+const gmx_simd_real_t rSimd_Bits5 = setSimdRealFrom1R(-1.0508719529e+37);
+const gmx_simd_real_t rSimd_Bits6 = setSimdRealFrom1R( 1.1488970369e-02);
+# endif
+#endif // GMX_SIMD_HAVE_REAL
+#ifdef GMX_SIMD_HAVE_INT32
+const gmx_simd_int32_t iSimd_1_2_3 = setSimdIntFrom3I(1, 2, 3);
+const gmx_simd_int32_t iSimd_4_5_6 = setSimdIntFrom3I(4, 5, 6);
+const gmx_simd_int32_t iSimd_7_8_9 = setSimdIntFrom3I(7, 8, 9);
+const gmx_simd_int32_t iSimd_5_7_9 = setSimdIntFrom3I(5, 7, 9);
+const gmx_simd_int32_t iSimd_1M_2M_3M = setSimdIntFrom3I(1000000, 2000000, 3000000);
+const gmx_simd_int32_t iSimd_4M_5M_6M = setSimdIntFrom3I(4000000, 5000000, 6000000);
+const gmx_simd_int32_t iSimd_5M_7M_9M = setSimdIntFrom3I(5000000, 7000000, 9000000);
+const gmx_simd_int32_t iSimd_0xF0F0F0F0 = setSimdIntFrom1I(0xF0F0F0F0);
+const gmx_simd_int32_t iSimd_0xCCCCCCCC = setSimdIntFrom1I(0xCCCCCCCC);
+#endif // GMX_SIMD_HAVE_INT32
+
+#ifdef GMX_SIMD_HAVE_REAL
+::std::vector<real>
+simdReal2Vector(const gmx_simd_real_t simd)
+{
+ real mem[GMX_SIMD_REAL_WIDTH*2];
+ real * p = gmx_simd_align_r(mem);
+
+ gmx_simd_store_r(p, simd);
+ std::vector<real> v(p, p+GMX_SIMD_REAL_WIDTH);
+
+ return v;
+}
+
+gmx_simd_real_t
+vector2SimdReal(const std::vector<real> &v)
+{
+ real mem[GMX_SIMD_REAL_WIDTH*2];
+ real * p = gmx_simd_align_r(mem);
+
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ p[i] = v[i % v.size()]; // repeat vector contents to fill simd width
+ }
+ return gmx_simd_load_r(p);
+}
+
+gmx_simd_real_t
+setSimdRealFrom3R(real r0, real r1, real r2)
+{
+ std::vector<real> v(3);
+ v[0] = r0;
+ v[1] = r1;
+ v[2] = r2;
+ return vector2SimdReal(v);
+}
+
+gmx_simd_real_t
+setSimdRealFrom1R(real value)
+{
+ std::vector<real> v(GMX_SIMD_REAL_WIDTH);
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ v[i] = value;
+ }
+ return vector2SimdReal(v);
+}
+
+testing::AssertionResult
+SimdTest::compareSimdRealUlp(const char * refExpr, const char * tstExpr,
+ const gmx_simd_real_t ref, const gmx_simd_real_t tst)
+{
+ return compareVectorRealUlp(refExpr, tstExpr, simdReal2Vector(ref), simdReal2Vector(tst));
+}
+
+testing::AssertionResult
+SimdTest::compareSimdRealEq(const char * refExpr, const char * tstExpr,
+ const gmx_simd_real_t ref, const gmx_simd_real_t tst)
+{
+ return compareVectorEq(refExpr, tstExpr, simdReal2Vector(ref), simdReal2Vector(tst));
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+#ifdef GMX_SIMD_HAVE_INT32
+std::vector<int>
+simdInt2Vector(const gmx_simd_int32_t simd)
+{
+ int mem[GMX_SIMD_INT32_WIDTH*2];
+ int * p = gmx_simd_align_i(mem);
+
+ gmx_simd_store_i(p, simd);
+ std::vector<int> v(p, p+GMX_SIMD_INT32_WIDTH);
+
+ return v;
+}
+
+gmx_simd_int32_t
+vector2SimdInt(const std::vector<int> &v)
+{
+ int mem[GMX_SIMD_INT32_WIDTH*2];
+ int * p = gmx_simd_align_i(mem);
+
+ for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ p[i] = v[i % v.size()]; // repeat vector contents to fill simd width
+ }
+ return gmx_simd_load_i(p);
+}
+
+gmx_simd_int32_t
+setSimdIntFrom3I(int i0, int i1, int i2)
+{
+ std::vector<int> v(3);
+ v[0] = i0;
+ v[1] = i1;
+ v[2] = i2;
+ return vector2SimdInt(v);
+}
+
+gmx_simd_int32_t
+setSimdIntFrom1I(int value)
+{
+ std::vector<int> v(GMX_SIMD_INT32_WIDTH);
+ for (int i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ v[i] = value;
+ }
+ return vector2SimdInt(v);
+}
+
+::testing::AssertionResult
+SimdTest::compareSimdInt32(const char * refExpr, const char * tstExpr,
+ const gmx_simd_int32_t ref, const gmx_simd_int32_t tst)
+{
+ return compareVectorEq(refExpr, tstExpr, simdInt2Vector(ref), simdInt2Vector(tst));
+}
+
+#endif // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_TESTS_SIMD_H
+#define GMX_SIMD_TESTS_SIMD_H
+
+/*! \internal \file
+ * \brief
+ * Declares fixture for testing of normal SIMD (not SIMD4) functionality.
+ *
+ * The SIMD tests are both simple and complicated. The actual testing logic
+ * is \a very straightforward since we just need to test single values against
+ * the math library, and for some math functions we need to do it in a loop.
+ * This could have been achieved in minutes with the default Google Test tools,
+ * if it wasn't for the problem that we cannot access or compare SIMD contents
+ * directly without using lots of other SIMD functionality. For this reason
+ * we have separated the basic testing of load/store operations into a separate
+ * bootstrapping test. Once this works, we use a set of utility routines to
+ * convert SIMD contents to/from std::vector<> and perform the rest of the tests,
+ * which then can be farmed out to the base class SimdBaseTest that is common
+ * to SIMD and SIMD4.
+ *
+ * Another complication is that the width of the SIMD implementation will
+ * depend on the hardware and precision. For some simple operations it is
+ * sufficient to set all SIMD elements to the same value, and check that the
+ * result is present in all elements. However, for a few more complex
+ * instructions that might rely on shuffling under-the-hood it is important
+ * that we can test operations with different elements. We achieve this by
+ * having test code that can initialize a SIMD variable from an std::vector
+ * of arbitrary length; the vector is simply repeated to fill all elements in
+ * the SIMD variable. We also have similar routines to compare a SIMD result
+ * with values in a vector, which returns true iff all elements match.
+ *
+ * This way we can write simple tests that use different values for all SIMD
+ * elements. Personally I like using vectors of length 3, since this means
+ * there are no simple repeated patterns in low/high halves of SIMD variables
+ * that are 2,4,8,or 16 elements wide, and we still don't have to care about
+ * the exact SIMD width of the underlying implementation.
+ *
+ * Note that this utility uses a few SIMD load/store instructions internally -
+ * those have been tested separately in the bootstrap_loadstore.cpp file.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+
+#include "base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/* Unfortunately we cannot keep static SIMD constants in the test fixture class.
+ * The problem is that SIMD memory needs to be aligned, and in particular
+ * this applies to automatic storage of variables in classes. For SSE registers
+ * this means 16-byte alignment (which seems to work), but AVX requires 32-byte
+ * alignment. At least both gcc-4.7.3 and Apple clang-5.0 (OS X 10.9) fail to
+ * align these variables when they are stored as data in a class.
+ *
+ * In theory we could set some of these on-the-fly e.g. with setSimdFrom3R()
+ * instead (although that would mean repeating code between tests), but many of
+ * the constants depend on the current precision not to mention they
+ * occasionally have many digits that need to be exactly right, and keeping
+ * them in a single place makes sure they are consistent.
+ */
+#ifdef GMX_SIMD_HAVE_REAL
+extern const gmx_simd_real_t rSimd_1_2_3; //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_4_5_6; //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_7_8_9; //!< Generic (different) fp values.
+extern const gmx_simd_real_t rSimd_5_7_9; //!< rSimd_1_2_3 + rSimd_4_5_6.
+extern const gmx_simd_real_t rSimd_m1_m2_m3; //!< Generic negative floating-point values.
+extern const gmx_simd_real_t rSimd_3_1_4; //!< Used to test min/max.
+extern const gmx_simd_real_t rSimd_m3_m1_m4; //!< negative rSimd_3_1_4.
+extern const gmx_simd_real_t rSimd_2p25; //!< Value that rounds down.
+extern const gmx_simd_real_t rSimd_3p75; //!< Value that rounds up.
+extern const gmx_simd_real_t rSimd_m2p25; //!< Negative value that rounds up.
+extern const gmx_simd_real_t rSimd_m3p75; //!< Negative value that rounds down.
+//! Three large floating-point values whose exponents are >32.
+extern const gmx_simd_real_t rSimd_Exp;
+# if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+extern const gmx_simd_real_t rSimd_ExpDouble;
+# endif
+// Magic FP numbers corresponding to specific bit patterns
+extern const gmx_simd_real_t rSimd_Bits1; //!< Pattern F0 repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits2; //!< Pattern CC repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits3; //!< Pattern C0 repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits4; //!< Pattern 0C repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits5; //!< Pattern FC repeated to fill single/double.
+extern const gmx_simd_real_t rSimd_Bits6; //!< Pattern 3C repeated to fill single/double.
+#endif // GMX_SIMD_HAVE_REAL
+#ifdef GMX_SIMD_HAVE_INT32
+extern const gmx_simd_int32_t iSimd_1_2_3; //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_4_5_6; //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_7_8_9; //!< Three generic ints.
+extern const gmx_simd_int32_t iSimd_5_7_9; //!< iSimd_1_2_3 + iSimd_4_5_6.
+extern const gmx_simd_int32_t iSimd_1M_2M_3M; //!< Term1 for 32bit add/sub.
+extern const gmx_simd_int32_t iSimd_4M_5M_6M; //!< Term2 for 32bit add/sub.
+extern const gmx_simd_int32_t iSimd_5M_7M_9M; //!< iSimd_1M_2M_3M + iSimd_4M_5M_6M.
+extern const gmx_simd_int32_t iSimd_0xF0F0F0F0; //!< Bitpattern to test integer logical operations.
+extern const gmx_simd_int32_t iSimd_0xCCCCCCCC; //!< Bitpattern to test integer logical operations.
+#endif // GMX_SIMD_HAVE_INT32
+
+
+/*! \brief Test fixture for SIMD tests.
+ *
+ * This is a very simple test fixture that basically just takes the common
+ * SIMD/SIMD4 functionality from SimdBaseTest and creates wrapper routines
+ * specific for normal SIMD functionality.
+ */
+class SimdTest : public SimdBaseTest
+{
+ public:
+#ifdef GMX_SIMD_HAVE_REAL
+ /*! \brief Compare two real SIMD variables for approximate equality.
+ *
+ * This is an internal implementation routine. You should always use
+ * GMX_EXPECT_SIMD_REAL_NEAR() instead.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the SIMD test variable is within the class tolerances of the corresponding
+ * reference element.
+ */
+ ::testing::AssertionResult
+ compareSimdRealUlp(const char * refExpr, const char * tstExpr,
+ const gmx_simd_real_t ref, const gmx_simd_real_t tst);
+
+ /*! \brief Compare two real SIMD variables for exact equality.
+ *
+ * This is an internal implementation routine. YOu should always use
+ * GMX_EXPECT_SIMD_REAL_NEAR() instead.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the SIMD test variable is within the class tolerances of the corresponding
+ * reference element.
+ */
+ ::testing::AssertionResult
+ compareSimdRealEq(const char * refExpr, const char * tstExpr,
+ const gmx_simd_real_t ref, const gmx_simd_real_t tst);
+
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32
+ /*! \brief Compare two 32-bit integer SIMD variables.
+ *
+ * This is an internal implementation routine. YOu should always use
+ * GMX_EXPECT_SIMD_INT_EQ() instead.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro, while the SIMD and
+ * tolerance arguments are used to decide if the values are approximately equal.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the SIMD variable tst is identical to the corresponding reference element.
+ */
+ ::testing::AssertionResult
+ compareSimdInt32(const char * refExpr, const char * tstExpr,
+ const gmx_simd_int32_t ref, const gmx_simd_int32_t tst);
+#endif
+};
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Convert SIMD real to std::vector<real>.
+ *
+ * The returned vector will have the same length as the SIMD width.
+ */
+std::vector<real> simdReal2Vector(const gmx_simd_real_t simd);
+
+/*! \brief Return floating-point SIMD value from std::vector<real>.
+ *
+ * If the vector is longer than SIMD width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD register.
+ */
+gmx_simd_real_t vector2SimdReal(const std::vector<real> &v);
+
+/*! \brief Set SIMD register contents from three real values.
+ *
+ * Our reason for using three values is that 3 is not a factor in any known
+ * SIMD width, so this way there will not be any simple repeated patterns e.g.
+ * between the low/high 64/128/256 bits in the SIMD register, which could hide bugs.
+ */
+gmx_simd_real_t setSimdRealFrom3R(real r0, real r1, real r2);
+
+/*! \brief Set SIMD register contents from single real value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd_set1_r(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd_real_t setSimdRealFrom1R(real value);
+
+/*! \brief Test if a SIMD real is bitwise identical to reference SIMD value. */
+#define GMX_EXPECT_SIMD_REAL_EQ(ref, tst) EXPECT_PRED_FORMAT2(compareSimdRealEq, ref, tst)
+
+/*! \brief Test if a SIMD real is within tolerance of reference SIMD value. */
+#define GMX_EXPECT_SIMD_REAL_NEAR(ref, tst) EXPECT_PRED_FORMAT2(compareSimdRealUlp, ref, tst)
+
+#endif // GMX_SIMD_HAVE_REAL
+
+#ifdef GMX_SIMD_HAVE_INT32
+/*! \brief Convert SIMD integer to std::vector<int>.
+ *
+ * The returned vector will have the same length as the SIMD width.
+ */
+std::vector<int> simdInt2Vector(const gmx_simd_int32_t simd);
+
+/*! \brief Return 32-bit integer SIMD value from std::vector<int>.
+ *
+ * If the vector is longer than SIMD width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD register.
+ */
+gmx_simd_int32_t vector2SimdInt(const std::vector<int> &v);
+
+/*! \brief Set SIMD register contents from three int values.
+ *
+ * Our reason for using three values is that 3 is not a factor in any known
+ * SIMD width, so this way there will not be any simple repeated patterns e.g.
+ * between the low/high 64/128/256 bits in the SIMD register, which could hide bugs.
+ */
+gmx_simd_int32_t setSimdIntFrom3I(int i0, int i1, int i2);
+
+/*! \brief Set SIMD register contents from single integer value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd_set1_i(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd_int32_t setSimdIntFrom1I(int value);
+
+/*! \brief Macro that checks SIMD integer expression against SIMD or reference int.
+ *
+ * If the reference argument is a scalar integer it will be expanded into
+ * the width of the SIMD register and tested against all elements.
+ */
+#define GMX_EXPECT_SIMD_INT_EQ(ref, tst) EXPECT_PRED_FORMAT2(compareSimdInt32, ref, tst)
+
+#endif // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+
+#endif // GMX_SIMD_TESTS_SIMD_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+const gmx_simd4_real_t rSimd4_1_2_3 = setSimd4RealFrom3R(1, 2, 3);
+const gmx_simd4_real_t rSimd4_4_5_6 = setSimd4RealFrom3R(4, 5, 6);
+const gmx_simd4_real_t rSimd4_7_8_9 = setSimd4RealFrom3R(7, 8, 9);
+const gmx_simd4_real_t rSimd4_5_7_9 = setSimd4RealFrom3R(5, 7, 9);
+const gmx_simd4_real_t rSimd4_m1_m2_m3 = setSimd4RealFrom3R(-1, -2, -3);
+const gmx_simd4_real_t rSimd4_3_1_4 = setSimd4RealFrom3R(3, 1, 4);
+const gmx_simd4_real_t rSimd4_m3_m1_m4 = setSimd4RealFrom3R(-3, -1, -4);
+const gmx_simd4_real_t rSimd4_2p25 = setSimd4RealFrom1R(2.25);
+const gmx_simd4_real_t rSimd4_3p75 = setSimd4RealFrom1R(3.75);
+const gmx_simd4_real_t rSimd4_m2p25 = setSimd4RealFrom1R(-2.25);
+const gmx_simd4_real_t rSimd4_m3p75 = setSimd4RealFrom1R(-3.75);
+const gmx_simd4_real_t rSimd4_Exp = setSimd4RealFrom3R( 1.4055235171027452623914516e+18,
+ 5.3057102734253445623914516e-13,
+ -2.1057102745623934534514516e+16);
+# if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+// NOTE: must be named rSimd4_ExpDouble to match the extern declaration in simd4.h
+const gmx_simd4_real_t rSimd4_ExpDouble = setSimd4RealFrom3R( 6.287393598732017379054414e+176,
+ 8.794495252903116023030553e-140,
+ -3.637060701570496477655022e+202);
+// Magic FP numbers corresponding to specific bit patterns
+const gmx_simd4_real_t rSimd4_Bits1 = setSimd4RealFrom1R(-1.07730874267432137e+236);
+const gmx_simd4_real_t rSimd4_Bits2 = setSimd4RealFrom1R(-9.25596313493178307e+061);
+const gmx_simd4_real_t rSimd4_Bits3 = setSimd4RealFrom1R(-8.57750588235293981e+003);
+const gmx_simd4_real_t rSimd4_Bits4 = setSimd4RealFrom1R( 1.22416778341839096e-250);
+const gmx_simd4_real_t rSimd4_Bits5 = setSimd4RealFrom1R(-1.15711777004554095e+294);
+const gmx_simd4_real_t rSimd4_Bits6 = setSimd4RealFrom1R( 1.53063836115600621e-018);
+# else
+// Single-precision magic FP numbers for the same F0/CC/C0/0C/FC/3C bit patterns
+const gmx_simd4_real_t rSimd4_Bits1 = setSimd4RealFrom1R(-5.9654142337e+29);
+const gmx_simd4_real_t rSimd4_Bits2 = setSimd4RealFrom1R(-1.0737417600e+08);
+const gmx_simd4_real_t rSimd4_Bits3 = setSimd4RealFrom1R(-6.0235290527e+00);
+const gmx_simd4_real_t rSimd4_Bits4 = setSimd4RealFrom1R( 1.0788832913e-31);
+const gmx_simd4_real_t rSimd4_Bits5 = setSimd4RealFrom1R(-1.0508719529e+37);
+const gmx_simd4_real_t rSimd4_Bits6 = setSimd4RealFrom1R( 1.1488970369e-02);
+# endif
+
+::std::vector<real>
+simd4Real2Vector(const gmx_simd4_real_t simd4)
+{
+ // Double-width scratch buffer so gmx_simd4_align_r() can return an aligned pointer
+ real mem[GMX_SIMD4_WIDTH*2];
+ real * p = gmx_simd4_align_r(mem);
+
+ gmx_simd4_store_r(p, simd4);
+ std::vector<real> v(p, p+GMX_SIMD4_WIDTH);
+
+ return v;
+}
+
+gmx_simd4_real_t
+vector2Simd4Real(const std::vector<real> &v)
+{
+ // Double-width scratch buffer so gmx_simd4_align_r() can return an aligned pointer
+ real mem[GMX_SIMD4_WIDTH*2];
+ real * p = gmx_simd4_align_r(mem);
+
+ for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ p[i] = v[i % v.size()]; // repeat vector contents to fill simd width
+ }
+ return gmx_simd4_load_r(p);
+}
+
+gmx_simd4_real_t
+setSimd4RealFrom3R(real r0, real r1, real r2)
+{
+ // The three values are repeated by vector2Simd4Real() to fill the SIMD4 width
+ std::vector<real> v(3);
+ v[0] = r0;
+ v[1] = r1;
+ v[2] = r2;
+ return vector2Simd4Real(v);
+}
+
+gmx_simd4_real_t
+setSimd4RealFrom1R(real value)
+{
+ // Fill all elements with the same value using only load/store operations
+ std::vector<real> v(GMX_SIMD4_WIDTH);
+ for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ v[i] = value;
+ }
+ return vector2Simd4Real(v);
+}
+
+testing::AssertionResult
+Simd4Test::compareSimd4RealUlp(const char * refExpr, const char * tstExpr,
+ const gmx_simd4_real_t ref, const gmx_simd4_real_t tst)
+{
+ // Delegate the per-element ulp comparison to the common base-class helper
+ return compareVectorRealUlp(refExpr, tstExpr, simd4Real2Vector(ref), simd4Real2Vector(tst));
+}
+
+testing::AssertionResult
+Simd4Test::compareSimd4RealEq(const char * refExpr, const char * tstExpr,
+ const gmx_simd4_real_t ref, const gmx_simd4_real_t tst)
+{
+ // Delegate the per-element exact comparison to the common base-class helper
+ return compareVectorEq(refExpr, tstExpr, simd4Real2Vector(ref), simd4Real2Vector(tst));
+}
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_SIMD_TESTS_SIMD4_H
+#define GMX_SIMD_TESTS_SIMD4_H
+
+/*! \internal \file
+ * \brief
+ * Declares fixture for testing of SIMD4 functionality.
+ *
+ * This files specializes the common base test utilities to be used
+ * for SIMD4 variables. For detailed documentation, check out the normal
+ * SIMD test classes and files.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ * \ingroup module_simd
+ */
+
+#include <vector>
+#include <gtest/gtest.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/tests/base.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+extern const gmx_simd4_real_t rSimd4_1_2_3; //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_4_5_6; //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_7_8_9; //!< Generic (different) fp values.
+extern const gmx_simd4_real_t rSimd4_5_7_9; //!< rSimd4_1_2_3 + rSimd4_4_5_6.
+extern const gmx_simd4_real_t rSimd4_m1_m2_m3; //!< Generic negative fp values.
+extern const gmx_simd4_real_t rSimd4_3_1_4; //!< Used to test min/max.
+extern const gmx_simd4_real_t rSimd4_m3_m1_m4; //!< negative rSimd4_3_1_4.
+extern const gmx_simd4_real_t rSimd4_2p25; //!< Value that rounds down.
+extern const gmx_simd4_real_t rSimd4_3p75; //!< Value that rounds up.
+extern const gmx_simd4_real_t rSimd4_m2p25; //!< Negative value that rounds up.
+extern const gmx_simd4_real_t rSimd4_m3p75; //!< Negative value that rounds down.
+//! Three large floating-point values whose exponents are >32.
+extern const gmx_simd4_real_t rSimd4_Exp;
+# if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+// Make sure we also test exponents outside single precision when we use double
+extern const gmx_simd4_real_t rSimd4_ExpDouble;
+# endif
+extern const gmx_simd4_real_t rSimd4_Bits1; //!< Pattern F0 repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits2; //!< Pattern CC repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits3; //!< Pattern C0 repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits4; //!< Pattern 0C repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits5; //!< Pattern FC repeated to fill single/double.
+extern const gmx_simd4_real_t rSimd4_Bits6; //!< Pattern 3C repeated to fill single/double.
+
+/*! \brief Test fixture for SIMD4 tests - contains test settings.
+ *
+ * This is a very simple test fixture that basically just takes the common
+ * SIMD/SIMD4 functionality from SimdBaseTest and creates wrapper routines
+ * specific for SIMD4 functionality.
+ */
+class Simd4Test : public SimdBaseTest
+{
+ public:
+ /*! \brief Compare two real SIMD4 variables for approximate equality.
+ *
+ * This is an internal implementation routine. You should always use
+ * GMX_EXPECT_SIMD4_REAL_NEAR() instead.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the SIMD4 test variable is within the class tolerances of the corresponding
+ * reference element.
+ */
+ ::testing::AssertionResult
+ compareSimd4RealUlp(const char * refExpr, const char * tstExpr,
+ const gmx_simd4_real_t ref, const gmx_simd4_real_t tst);
+
+ /*! \brief Compare two real SIMD4 variables for exact equality.
+ *
+ * This is an internal implementation routine. You should always use
+ * GMX_EXPECT_SIMD4_REAL_EQ() instead.
+ *
+ * This routine is designed according to the Google test specs, so the char
+ * strings will describe the arguments to the macro.
+ *
+ * The comparison is applied to each element, and it returns true if each element
+ * in the SIMD4 test variable is bitwise identical to the corresponding
+ * reference element.
+ */
+ ::testing::AssertionResult
+ compareSimd4RealEq(const char * refExpr, const char * tstExpr,
+ const gmx_simd4_real_t ref, const gmx_simd4_real_t tst);
+};
+
+/*! \brief Convert SIMD4 real to std::vector<real>.
+ *
+ * The returned vector will have the same length as the SIMD4 width.
+ */
+std::vector<real> simd4Real2Vector(const gmx_simd4_real_t simd4);
+
+/*! \brief Return floating-point SIMD4 value from std::vector<real>.
+ *
+ * If the vector is longer than SIMD4 width, only the first elements will be used.
+ * If it is shorter, the contents will be repeated to fill the SIMD4 register.
+ */
+gmx_simd4_real_t vector2Simd4Real(const std::vector<real> &v);
+
+/*! \brief Set SIMD4 register contents from three real values.
+ *
+ * It might seem stupid to use three values when we know that the SIMD4 width
+ * is 4, but it simplifies the test organization when the SIMD and SIMD4 tests
+ * are completely symmetric.
+ */
+gmx_simd4_real_t setSimd4RealFrom3R(real r0, real r1, real r2);
+
+/*! \brief Set SIMD4 register contents from single real value.
+ *
+ * All elements are set from the given value. This is effectively the same
+ * operation as gmx_simd4_set1_r(), but is implemented using only load/store
+ * operations that have been tested separately in the bootstrapping tests.
+ */
+gmx_simd4_real_t setSimd4RealFrom1R(real value);
+
+/*! \brief Test if a SIMD4 real is bitwise identical to reference SIMD4 value. */
+#define GMX_EXPECT_SIMD4_REAL_EQ(ref, tst) EXPECT_PRED_FORMAT2(compareSimd4RealEq, ref, tst)
+
+/*! \brief Test if a SIMD4 real is within tolerance of reference SIMD4 value. */
+#define GMX_EXPECT_SIMD4_REAL_NEAR(ref, tst) EXPECT_PRED_FORMAT2(compareSimd4RealUlp, ref, tst)
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+
+#endif // GMX_SIMD_TESTS_SIMD4_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/math/utilities.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+/*! \brief Test fixture for SIMD4 floating-point operations (identical to the SIMD4 \ref Simd4Test) */
+typedef Simd4Test Simd4FloatingpointTest;
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4SetZeroR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(0.0), gmx_simd4_setzero_r());
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4Set1R)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(1.0), gmx_simd4_set1_r(1.0));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4Load1R)
+{
+ real r = 2.0;
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(r), gmx_simd4_load1_r(&r));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4AddR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_5_7_9, gmx_simd4_add_r(rSimd4_1_2_3, rSimd4_4_5_6)); // 1+4=5, 2+5=7, 3+6=9
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4SubR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_4_5_6, gmx_simd4_sub_r(rSimd4_5_7_9, rSimd4_1_2_3)); // 5-1=4, 7-2=5, 9-3=6
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4MulR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(4, 10, 18), gmx_simd4_mul_r(rSimd4_1_2_3, rSimd4_4_5_6));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FmaddR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(11, 18, 27), gmx_simd4_fmadd_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // 1*4+7, etc.
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FmsubR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, 2, 9), gmx_simd4_fmsub_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // 1*4-7, etc.
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnmaddR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, -2, -9), gmx_simd4_fnmadd_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // -1*4+7, etc.
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnmsubR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-11, -18, -27), gmx_simd4_fnmsub_r(rSimd4_1_2_3, rSimd4_4_5_6, rSimd4_7_8_9)); // -1*4-7, etc.
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FabsR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_fabs_r(rSimd4_1_2_3)); // fabs(x)=x
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_fabs_r(rSimd4_m1_m2_m3)); // fabs(-x)=x
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4FnegR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_m1_m2_m3, gmx_simd4_fneg_r(rSimd4_1_2_3)); // fneg(x)=-x
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_fneg_r(rSimd4_m1_m2_m3)); // fneg(-x)=x
+}
+
+#ifdef GMX_SIMD4_HAVE_LOGICAL
+TEST_F(Simd4FloatingpointTest, gmxSimd4AndR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits3, gmx_simd4_and_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 & Bits2 = Bits3
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4AndnotR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits4, gmx_simd4_andnot_r(rSimd4_Bits1, rSimd4_Bits2)); // (~Bits1) & Bits2 = Bits4
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4OrR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits5, gmx_simd4_or_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 | Bits2 = Bits5
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4XorR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_Bits6, gmx_simd4_xor_r(rSimd4_Bits1, rSimd4_Bits2)); // Bits1 ^ Bits2 = Bits6
+}
+#endif
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4MaxR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, 2, 4), gmx_simd4_max_r(rSimd4_1_2_3, rSimd4_3_1_4));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(3, 2, 4), gmx_simd4_max_r(rSimd4_3_1_4, rSimd4_1_2_3));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-1, -1, -3), gmx_simd4_max_r(rSimd4_m1_m2_m3, rSimd4_m3_m1_m4));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-1, -1, -3), gmx_simd4_max_r(rSimd4_m3_m1_m4, rSimd4_m1_m2_m3));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4MinR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 1, 3), gmx_simd4_min_r(rSimd4_1_2_3, rSimd4_3_1_4));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 1, 3), gmx_simd4_min_r(rSimd4_3_1_4, rSimd4_1_2_3));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, -2, -4), gmx_simd4_min_r(rSimd4_m1_m2_m3, rSimd4_m3_m1_m4));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(-3, -2, -4), gmx_simd4_min_r(rSimd4_m3_m1_m4, rSimd4_m1_m2_m3));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4RoundR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(2), gmx_simd4_round_r(gmx_simd4_set1_r(2.25)));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(4), gmx_simd4_round_r(gmx_simd4_set1_r(3.75)));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-2), gmx_simd4_round_r(gmx_simd4_set1_r(-2.25)));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-4), gmx_simd4_round_r(gmx_simd4_set1_r(-3.75)));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4TruncR)
+{
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(2), gmx_simd4_trunc_r(rSimd4_2p25));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(3), gmx_simd4_trunc_r(rSimd4_3p75));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-2), gmx_simd4_trunc_r(rSimd4_m2p25));
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom1R(-3), gmx_simd4_trunc_r(rSimd4_m3p75));
+}
+
+/* We do extensive 1/sqrt(x) and 1/x accuracy testing in the tests for
+ * the SIMD math functions, so we just make sure the lookup instructions
+ * appear to work for a few values here.
+ */
+TEST_F(Simd4FloatingpointTest, gmxSimd4RsqrtR)
+{
+ gmx_simd4_real_t x = setSimd4RealFrom3R(4.0, M_PI, 1234567890.0);
+ gmx_simd4_real_t ref = setSimd4RealFrom3R(0.5, 1.0/sqrt(M_PI), 1.0/sqrt(1234567890.0));
+
+ // The allowed Ulp deviation is 2 to the power of the number of mantissa
+ // digits, minus the number of bits provided by the table lookup
+ setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RSQRT_BITS));
+ GMX_EXPECT_SIMD4_REAL_NEAR(ref, gmx_simd4_rsqrt_r(x));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpEqAndBlendZeroR)
+{
+ gmx_simd4_bool_t eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(0, 0, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, eq));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BlendNotZeroR)
+{
+ gmx_simd4_bool_t eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 0), gmx_simd4_blendnotzero_r(rSimd4_1_2_3, eq));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpLER)
+{
+ gmx_simd4_bool_t le = gmx_simd4_cmple_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(rSimd4_1_2_3, gmx_simd4_blendzero_r(rSimd4_1_2_3, le));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolCmpLTR)
+{
+ gmx_simd4_bool_t lt = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 0), gmx_simd4_blendzero_r(rSimd4_1_2_3, lt));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolAndB)
+{
+ gmx_simd4_bool_t eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ gmx_simd4_bool_t le = gmx_simd4_cmple_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(0, 0, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, gmx_simd4_and_b(eq, le)));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BoolOrB)
+{
+ gmx_simd4_bool_t eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ gmx_simd4_bool_t lt = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(1, 2, 3), gmx_simd4_blendzero_r(rSimd4_1_2_3, gmx_simd4_or_b(eq, lt)));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4AnytrueB)
+{
+ gmx_simd4_bool_t eq;
+
+ /* this test is a bit tricky since we don't know the simd width.
+ * We cannot check for truth values for "any" element beyond the first,
+ * since that part of the data will not be used if simd width is 1.
+ */
+ eq = gmx_simd4_cmpeq_r(rSimd4_5_7_9, setSimd4RealFrom3R(5, 0, 0));
+ EXPECT_NE(0, gmx_simd4_anytrue_b(eq));
+
+ eq = gmx_simd4_cmpeq_r(rSimd4_1_2_3, rSimd4_4_5_6);
+ EXPECT_EQ(0, gmx_simd4_anytrue_b(eq));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4BlendvR)
+{
+ gmx_simd4_bool_t lt = gmx_simd4_cmplt_r(rSimd4_5_7_9, rSimd4_7_8_9);
+ GMX_EXPECT_SIMD4_REAL_EQ(setSimd4RealFrom3R(4, 5, 3), gmx_simd4_blendv_r(rSimd4_1_2_3, rSimd4_4_5_6, lt));
+}
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4ReduceR)
+{
+ // The horizontal sum of the SIMD variable depends on the width, so
+ // simply store it an extra time and calculate what the sum should be
+ std::vector<real> v = simd4Real2Vector(rSimd4_1_2_3);
+ real sum = 0.0;
+
+ for (int i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ sum += v[i];
+ }
+
+ EXPECT_EQ(sum, gmx_simd4_reduce_r(rSimd4_1_2_3));
+}
+
+
+TEST_F(Simd4FloatingpointTest, gmxSimd4Dotproduct3R)
+{
+ gmx_simd4_real_t v1 = setSimd4RealFrom3R(1, 4, 5);
+ gmx_simd4_real_t v2 = setSimd4RealFrom3R(3, 8, 2);
+# ifdef GMX_DOUBLE
+ EXPECT_DOUBLE_EQ(45.0, gmx_simd4_dotproduct3_r(v1, v2));
+# else
+ EXPECT_FLOAT_EQ(45.0, gmx_simd4_dotproduct3_r(v1, v2));
+# endif
+}
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <vector>
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+class Simd4MathTest : public Simd4Test
+{
+ public:
+ ::testing::AssertionResult
+ compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
+ real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x));
+};
+
+/*! \brief Test approximate equality of SIMD4 vs reference version of a function.
+ *
+ * This macro takes vanilla C and SIMD flavors of a function and tests it with
+ * the number of points, range, and tolerances specified by the test fixture class.
+ */
+#define GMX_EXPECT_SIMD4_FUNC_NEAR(refFunc, tstFunc) \
+ EXPECT_PRED_FORMAT2(compareSimd4MathFunction, refFunc, tstFunc)
+
+
+/*! \brief Implementation routine to compare SIMD4 vs reference functions.
+ *
+ * \param refFuncExpr Description of reference function expression
+ * \param simd4FuncExpr Description of SIMD function expression
+ * \param refFunc Reference math function pointer
+ * \param simd4Func SIMD math function pointer
+ *
+ * The function will be tested with the range and tolerances specified in
+ * the SimdBaseTest class. You should never call this function directly,
+ * but use the macro GMX_EXPECT_SIMD4_FUNC_NEAR(refFunc,tstFunc) instead.
+ */
+::testing::AssertionResult
+Simd4MathTest::compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
+ real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x))
+{
+ std::vector<real> vx(GMX_SIMD4_WIDTH);
+ std::vector<real> vref(GMX_SIMD4_WIDTH);
+ std::vector<real> vtst(GMX_SIMD4_WIDTH);
+ real dx;
+ gmx_int64_t ulpDiff, maxUlpDiff;
+ real maxUlpDiffPos;
+ real refValMaxUlpDiff, simdValMaxUlpDiff;
+ bool eq, signOk;
+ int i, iter;
+ int niter = s_nPoints/GMX_SIMD4_WIDTH;
+ int npoints = niter*GMX_SIMD4_WIDTH;
+# ifdef GMX_DOUBLE
+ union {
+ double r; gmx_int64_t i;
+ } conv0, conv1;
+# else
+ union {
+ float r; gmx_int32_t i;
+ } conv0, conv1;
+# endif
+
+ maxUlpDiff = 0;
+ dx = (range_.second-range_.first)/npoints;
+
+ for (iter = 0; iter < niter; iter++)
+ {
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ vx[i] = range_.first+dx*(iter*GMX_SIMD4_WIDTH+i);
+ vref[i] = refFunc(vx[i]);
+ }
+ vtst = simd4Real2Vector(simd4Func(vector2Simd4Real(vx)));
+
+ for (i = 0, eq = true, signOk = true; i < GMX_SIMD4_WIDTH && eq == true; i++)
+ {
+ eq = eq && ( fabs(vref[i]-vtst[i]) < absTol_ );
+ signOk = signOk && ( vref[i]*vtst[i] >= 0 );
+ }
+ if (eq == true)
+ {
+ // Go to next point if everything within absolute tolerance
+ continue;
+ }
+ else if (signOk == false)
+ {
+ return ::testing::AssertionFailure()
+ << "Failing SIMD4 math function comparison due to sign differences." << std::endl
+ << "Reference function: " << refFuncExpr << std::endl
+ << "Simd function: " << simd4FuncExpr << std::endl
+ << "Test range is ( " << range_.first << " , " << range_.second << " ) " << std::endl
+ << "First sign difference around x=" << std::setprecision(20) << ::testing::PrintToString(vx) << std::endl
+ << "Ref values: " << std::setprecision(20) << ::testing::PrintToString(vref) << std::endl
+ << "SIMD4 values: " << std::setprecision(20) << ::testing::PrintToString(vtst) << std::endl;
+ }
+ /* We replicate the trivial ulp differences comparison here rather than
+ * calling the lower-level routine for comparing them, since this enables
+ * us to run through the entire test range and report the largest deviation
+ * without lots of extra glue routines.
+ */
+ for (i = 0; i < GMX_SIMD4_WIDTH; i++)
+ {
+ conv0.r = vref[i];
+ conv1.r = vtst[i];
+ ulpDiff = llabs(conv0.i-conv1.i);
+ if (ulpDiff > maxUlpDiff)
+ {
+ maxUlpDiff = ulpDiff;
+ maxUlpDiffPos = vx[i];
+ refValMaxUlpDiff = vref[i];
+ simdValMaxUlpDiff = vtst[i];
+ }
+ }
+ }
+
+ if (maxUlpDiff <= ulpTol_)
+ {
+ return ::testing::AssertionSuccess();
+ }
+ else
+ {
+ return ::testing::AssertionFailure()
+ << "Failing SIMD4 math function ulp comparison between " << refFuncExpr << " and " << simd4FuncExpr << std::endl
+ << "Requested ulp tolerance: " << ulpTol_ << std::endl
+ << "Requested abs tolerance: " << absTol_ << std::endl
+ << "Largest Ulp difference occurs for x=" << std::setprecision(20) << maxUlpDiffPos << std::endl
+ << "Ref values: " << std::setprecision(20) << refValMaxUlpDiff << std::endl
+ << "SIMD4 values: " << std::setprecision(20) << simdValMaxUlpDiff << std::endl
+ << "Ulp diff.: " << std::setprecision(20) << maxUlpDiff << std::endl;
+ }
+}
+
+/*! \} */
+/*! \endcond */
+
+// Actual math function tests below
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/*! \brief Function wrapper to evaluate reference 1/sqrt(x) */
+static real
+ref_invsqrt(real x)
+{
+ return 1.0/sqrt(x);
+}
+
+TEST_F(Simd4MathTest, gmxSimd4InvsqrtR)
+{
+ setRange(1e-10, 1e10);
+ GMX_EXPECT_SIMD4_FUNC_NEAR(ref_invsqrt, gmx_simd4_invsqrt_r);
+}
+
+} // namespace
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/vector_operations.h"
+
+#include "simd4.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD4_HAVE_REAL
+
+/*! \brief Test fixture for SIMD4 vector operations (identical to the SIMD4 \ref Simd4Test) */
+typedef Simd4Test Simd4VectorOperationsTest;
+
+TEST_F(Simd4VectorOperationsTest, gmxSimd4CalcRsqR)
+{
+ gmx_simd4_real_t simdX = setSimd4RealFrom3R(1, 2, 3);
+ gmx_simd4_real_t simdY = setSimd4RealFrom3R(3, 0, 5);
+ gmx_simd4_real_t simdZ = setSimd4RealFrom3R(4, 1, 8);
+ gmx_simd4_real_t simdR2 = setSimd4RealFrom3R(26, 5, 98);
+
+ setUlpTol(2);
+ GMX_EXPECT_SIMD4_REAL_NEAR(simdR2, gmx_simd4_calc_rsq_r(simdX, simdY, simdZ));
+}
+
+#endif // GMX_SIMD4_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/math/utilities.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+/*! \brief Test fixture for floating-point tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdFloatingpointTest;
+
+TEST_F(SimdFloatingpointTest, gmxSimdSetZeroR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.0), gmx_simd_setzero_r());
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSet1R)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(1.0), gmx_simd_set1_r(1.0));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdLoad1R)
+{
+ real r = 2.0;
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(r), gmx_simd_load1_r(&r));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAddR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_5_7_9,
+ gmx_simd_add_r(rSimd_1_2_3, rSimd_4_5_6)); // 1+4=5, 2+5=7, 3+6=9
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSubR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_4_5_6,
+ gmx_simd_sub_r(rSimd_5_7_9, rSimd_1_2_3)); // 5-1=4, 7-2=5, 9-3=6
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdMulR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, 10, 18),
+ gmx_simd_mul_r(rSimd_1_2_3, rSimd_4_5_6));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFmaddR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(11, 18, 27),
+ gmx_simd_fmadd_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // 1*4+7, etc.
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFmsubR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, 2, 9),
+ gmx_simd_fmsub_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // 1*4-7, etc.
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnmaddR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, -2, -9),
+ gmx_simd_fnmadd_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // -1*4+7, etc.
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnmsubR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-11, -18, -27),
+ gmx_simd_fnmsub_r(rSimd_1_2_3, rSimd_4_5_6, rSimd_7_8_9)); // -1*4-7, etc.
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFabsR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_fabs_r(rSimd_1_2_3)); // fabs(x)=x
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_fabs_r(rSimd_m1_m2_m3)); // fabs(-x)=x
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFnegR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_m1_m2_m3, gmx_simd_fneg_r(rSimd_1_2_3)); // fneg(x)=-x
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_fneg_r(rSimd_m1_m2_m3)); // fneg(-x)=x
+}
+
+#ifdef GMX_SIMD_HAVE_LOGICAL
+TEST_F(SimdFloatingpointTest, gmxSimdAndR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits3, gmx_simd_and_r(rSimd_Bits1, rSimd_Bits2)); // Bits1 & Bits2 = Bits3
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAndnotR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits4, gmx_simd_andnot_r(rSimd_Bits1, rSimd_Bits2)); // (~Bits1) & Bits2 = Bits4
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdOrR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits5, gmx_simd_or_r(rSimd_Bits1, rSimd_Bits2)); // Bits1 | Bits2 = Bits5
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdXorR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_Bits6, gmx_simd_xor_r(rSimd_Bits1, rSimd_Bits2)); // Bits1 ^ Bits2 = Bits6
+}
+#endif
+
+TEST_F(SimdFloatingpointTest, gmxSimdMaxR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, 2, 4), gmx_simd_max_r(rSimd_1_2_3, rSimd_3_1_4));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(3, 2, 4), gmx_simd_max_r(rSimd_3_1_4, rSimd_1_2_3));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-1, -1, -3), gmx_simd_max_r(rSimd_m1_m2_m3, rSimd_m3_m1_m4));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-1, -1, -3), gmx_simd_max_r(rSimd_m3_m1_m4, rSimd_m1_m2_m3));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdMinR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 1, 3), gmx_simd_min_r(rSimd_1_2_3, rSimd_3_1_4));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 1, 3), gmx_simd_min_r(rSimd_3_1_4, rSimd_1_2_3));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, -2, -4), gmx_simd_min_r(rSimd_m1_m2_m3, rSimd_m3_m1_m4));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-3, -2, -4), gmx_simd_min_r(rSimd_m3_m1_m4, rSimd_m1_m2_m3));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdRoundR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2), gmx_simd_round_r(gmx_simd_set1_r(2.25)));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(4), gmx_simd_round_r(gmx_simd_set1_r(3.75)));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2), gmx_simd_round_r(gmx_simd_set1_r(-2.25)));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-4), gmx_simd_round_r(gmx_simd_set1_r(-3.75)));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdTruncR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2), gmx_simd_trunc_r(rSimd_2p25));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(3), gmx_simd_trunc_r(rSimd_3p75));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2), gmx_simd_trunc_r(rSimd_m2p25));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-3), gmx_simd_trunc_r(rSimd_m3p75));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdFractionR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.25), gmx_simd_fraction_r(rSimd_2p25)); // fract(2.25)=0.25
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(0.75), gmx_simd_fraction_r(rSimd_3p75)); // fract(3.75)=0.75
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-0.25), gmx_simd_fraction_r(rSimd_m2p25)); // fract(-2.25)=-0.25
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-0.75), gmx_simd_fraction_r(rSimd_m3p75)); // fract(-3.75)=-0.75
+}
+
+// We explicitly test the exponent/mantissa routines with double precision data,
+// since these usually rely on direct manipulation and shift of the SIMD registers,
+// where it is easy to make mistakes with single vs double precision.
+
+TEST_F(SimdFloatingpointTest, gmxSimdGetExponentR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(60.0, -41.0, 54.0), gmx_simd_get_exponent_r(rSimd_Exp));
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(587.0, -462.0, 672.0), gmx_simd_get_exponent_r(rSimd_ExpDouble));
+#endif
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdGetMantissaR)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.219097320577810839026256,
+ 1.166738027848349235071623,
+ 1.168904015004464724825084), gmx_simd_get_mantissa_r(rSimd_Exp));
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.241261238952345623563251,
+ 1.047294723759123852359232,
+ 1.856066204750275957395734), gmx_simd_get_mantissa_r(rSimd_ExpDouble));
+#endif
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdSetExponentR)
+{
+ gmx_simd_real_t x0 = setSimdRealFrom3R(0.5, 11.5, 99.5);
+ gmx_simd_real_t x1 = setSimdRealFrom3R(-0.5, -11.5, -99.5);
+
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(pow(2.0, 60.0), pow(2.0, -41.0), pow(2.0, 54.0)),
+ gmx_simd_set_exponent_r(setSimdRealFrom3R(60.0, -41.0, 54.0)));
+#if (defined GMX_SIMD_HAVE_DOUBLE) && (defined GMX_DOUBLE)
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(pow(2.0, 587.0), pow(2.0, -462.0), pow(2.0, 672.0)),
+ gmx_simd_set_exponent_r(setSimdRealFrom3R(587.0, -462.0, 672.0)));
+#endif
+ /* Rounding mode in gmx_simd_set_exponent_r() must be consistent with gmx_simd_round_r() */
+ GMX_EXPECT_SIMD_REAL_EQ(gmx_simd_set_exponent_r(gmx_simd_round_r(x0)), gmx_simd_set_exponent_r(x0));
+ GMX_EXPECT_SIMD_REAL_EQ(gmx_simd_set_exponent_r(gmx_simd_round_r(x1)), gmx_simd_set_exponent_r(x1));
+}
+
+/*
+ * We do extensive 1/sqrt(x) and 1/x accuracy testing in the math module, so
+ * we just make sure the lookup instructions appear to work here
+ */
+
+TEST_F(SimdFloatingpointTest, gmxSimdRsqrtR)
+{
+ gmx_simd_real_t x = setSimdRealFrom3R(4.0, M_PI, 1234567890.0);
+ gmx_simd_real_t ref = setSimdRealFrom3R(0.5, 1.0/sqrt(M_PI), 1.0/sqrt(1234567890.0));
+
+ /* Set the allowed ulp error as 2 to the power of the number of bits in
+ * the mantissa that do not have to be correct after the table lookup.
+ */
+ setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RSQRT_BITS));
+
+ GMX_EXPECT_SIMD_REAL_NEAR(ref, gmx_simd_rsqrt_r(x));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdRcpR)
+{
+ gmx_simd_real_t x = setSimdRealFrom3R(4.0, M_PI, 1234567890.0);
+ gmx_simd_real_t ref = setSimdRealFrom3R(0.25, 1.0/M_PI, 1.0/1234567890.0);
+
+ /* Set the allowed ulp error as 2 to the power of the number of bits in
+ * the mantissa that do not have to be correct after the table lookup.
+ */
+ setUlpTol(1LL << (std::numeric_limits<real>::digits-GMX_SIMD_RCP_BITS));
+
+ GMX_EXPECT_SIMD_REAL_NEAR(ref, gmx_simd_rcp_r(x));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpEqAndBlendZeroR)
+{
+ gmx_simd_bool_t eq = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(0, 0, 3), gmx_simd_blendzero_r(rSimd_1_2_3, eq));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBlendNotZeroR)
+{
+ gmx_simd_bool_t eq = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 0), gmx_simd_blendnotzero_r(rSimd_1_2_3, eq));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpLER)
+{
+ gmx_simd_bool_t le = gmx_simd_cmple_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(rSimd_1_2_3, gmx_simd_blendzero_r(rSimd_1_2_3, le));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolCmpLTR)
+{
+ gmx_simd_bool_t lt = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 0), gmx_simd_blendzero_r(rSimd_1_2_3, lt));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolAndB)
+{
+ gmx_simd_bool_t eq = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9);
+ gmx_simd_bool_t le = gmx_simd_cmple_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(0, 0, 3), gmx_simd_blendzero_r(rSimd_1_2_3, gmx_simd_and_b(eq, le)));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBoolOrB)
+{
+ gmx_simd_bool_t eq = gmx_simd_cmpeq_r(rSimd_5_7_9, rSimd_7_8_9);
+ gmx_simd_bool_t lt = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1, 2, 3), gmx_simd_blendzero_r(rSimd_1_2_3, gmx_simd_or_b(eq, lt)));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdAnytrueB)
+{
+ gmx_simd_bool_t eq;
+
+ /* this test is a bit tricky since we don't know the simd width.
+ * We cannot check for truth values for "any" element beyond the first,
+ * since that part of the data will not be used if simd width is 1.
+ */
+ eq = gmx_simd_cmpeq_r(rSimd_5_7_9, setSimdRealFrom3R(5, 0, 0));
+ EXPECT_NE(0, gmx_simd_anytrue_b(eq));
+
+ eq = gmx_simd_cmpeq_r(rSimd_1_2_3, rSimd_4_5_6);
+ EXPECT_EQ(0, gmx_simd_anytrue_b(eq));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdBlendvR)
+{
+ gmx_simd_bool_t lt = gmx_simd_cmplt_r(rSimd_5_7_9, rSimd_7_8_9);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, 5, 3), gmx_simd_blendv_r(rSimd_1_2_3, rSimd_4_5_6, lt));
+}
+
+TEST_F(SimdFloatingpointTest, gmxSimdReduceR)
+{
+ // The horizontal sum of the SIMD variable depends on the width, so
+ // simply store it an extra time and calculate what the sum should be
+ std::vector<real> v = simdReal2Vector(rSimd_4_5_6);
+ real sum = 0.0;
+
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ sum += v[i];
+ }
+
+ EXPECT_EQ(sum, gmx_simd_reduce_r(rSimd_4_5_6));
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "simd.h"
+
+/* Some notes on the setup of these tests:
+ *
+ * It might seem strange to mix different instructions for "setting" SIMD
+ * registers, but the difference is that the routines like setSimdIntFrom1I()
+ * only use the load/store operations that we already test separately in
+ * bootstrap_loadstore.cpp. Since these are "known good" if the bootstrap
+ * tests pass, we use them to test the normal SIMD implementation instructions
+ * that all have gmx_simd_ prefixes.
+ */
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_INT32
+
+/*! \brief Test fixture for integer tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdIntegerTest;
+
+TEST_F(SimdIntegerTest, gmxSimdSetZeroI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0), gmx_simd_setzero_i());
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSet1I)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(1), gmx_simd_set1_i(1));
+}
+
+#ifdef GMX_SIMD_HAVE_FINT32_ARITHMETICS
+TEST_F(SimdIntegerTest, gmxSimdAddI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(iSimd_5_7_9, gmx_simd_add_i(iSimd_1_2_3, iSimd_4_5_6) ); // short add
+ GMX_EXPECT_SIMD_INT_EQ(iSimd_5M_7M_9M, gmx_simd_add_i(iSimd_1M_2M_3M, iSimd_4M_5M_6M)); // 32 bit add
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSubI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(iSimd_1_2_3, gmx_simd_sub_i(iSimd_5_7_9, iSimd_4_5_6) ); // short sub
+ GMX_EXPECT_SIMD_INT_EQ(iSimd_1M_2M_3M, gmx_simd_sub_i(iSimd_5M_7M_9M, iSimd_4M_5M_6M)); // 32 bit sub
+}
+
+TEST_F(SimdIntegerTest, gmxSimdMulI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(4, 10, 18), gmx_simd_mul_i(iSimd_1_2_3, iSimd_4_5_6)); // 1*4=4, 2*5=10, 3*6=18 (short mul)
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(268435456), gmx_simd_mul_i(gmx_simd_set1_i(16384), gmx_simd_set1_i(16384))); // 16384*16384 = 268435456 (long mul)
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_FINT32_LOGICAL
+TEST_F(SimdIntegerTest, gmxSimdSlliI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4194304), gmx_simd_slli_i(gmx_simd_set1_i(2), 21)); // 2 << 21 = 4194304
+}
+
+TEST_F(SimdIntegerTest, gmxSimdSrliI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4), gmx_simd_srli_i(gmx_simd_set1_i(4194304), 20)); // 4194304 >> 20 = 4
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAndI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0xC0C0C0C0), gmx_simd_and_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAndnotI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0x0C0C0C0C), gmx_simd_andnot_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdOrI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0xFCFCFCFC), gmx_simd_or_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdXorI)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(0x3C3C3C3C), gmx_simd_xor_i(iSimd_0xF0F0F0F0, iSimd_0xCCCCCCCC));
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_INT32_EXTRACT
+TEST_F(SimdIntegerTest, gmxSimdExtractI)
+{
+ int idata[GMX_SIMD_INT32_WIDTH*2];
+ int * p = gmx_simd_align_i(idata);
+ gmx_simd_int32_t simd;
+ int i, extracted_int;
+
+ for (i = 0; i < GMX_SIMD_INT32_WIDTH; i++)
+ {
+ p[i] = i+1;
+ }
+ simd = gmx_simd_load_i(p);
+
+ /* We cannot do a loop here, since
+ * - C++ gets confused about signed/unsigned if SSE macros are used in EXPECT_EQ()
+ * - Extract macros can only take immediates (not variables) on some archs,
+ * and some compilers are not smart enough to expand the for loop.
+ *
+ * To solve this we use a few values manually instead of a for-loop.
+ */
+ extracted_int = gmx_simd_extract_i(simd, 0);
+ EXPECT_EQ(1, extracted_int);
+ if (GMX_SIMD_INT32_WIDTH >= 2)
+ {
+ extracted_int = gmx_simd_extract_i(simd, 1);
+ EXPECT_EQ(2, extracted_int);
+ }
+ if (GMX_SIMD_INT32_WIDTH >= 4)
+ {
+ extracted_int = gmx_simd_extract_i(simd, 3);
+ EXPECT_EQ(4, extracted_int);
+ }
+ if (GMX_SIMD_INT32_WIDTH >= 6)
+ {
+ extracted_int = gmx_simd_extract_i(simd, 5);
+ EXPECT_EQ(6, extracted_int);
+ }
+ if (GMX_SIMD_INT32_WIDTH >= 8)
+ {
+ extracted_int = gmx_simd_extract_i(simd, 7);
+ EXPECT_EQ(8, extracted_int);
+ }
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_REAL
+TEST_F(SimdIntegerTest, gmxSimdCvtR2I)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(4), gmx_simd_cvt_r2i(rSimd_3p75));
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(-4), gmx_simd_cvt_r2i(rSimd_m3p75));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvttR2I)
+{
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(3), gmx_simd_cvtt_r2i(rSimd_3p75));
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom1I(-3), gmx_simd_cvtt_r2i(rSimd_m3p75));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvtI2R)
+{
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(2.0), gmx_simd_cvt_i2r(gmx_simd_set1_i(2)));
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom1R(-2.0), gmx_simd_cvt_i2r(gmx_simd_set1_i(-2)));
+}
+#endif
+
+#ifdef GMX_SIMD_HAVE_FINT32_ARITHMETICS
+TEST_F(SimdIntegerTest, gmxSimdBoolCmpEqAndBlendZeroI)
+{
+ gmx_simd_ibool_t eq = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(0, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, eq));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBlendNotZeroI)
+{
+ gmx_simd_ibool_t eq = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 2, 0), gmx_simd_blendnotzero_i(iSimd_1_2_3, eq));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolCmpLTI)
+{
+ gmx_simd_ibool_t lt = gmx_simd_cmplt_i(iSimd_5_7_9, iSimd_7_8_9);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 2, 0), gmx_simd_blendzero_i(iSimd_1_2_3, lt));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolAndIB)
+{
+ gmx_simd_ibool_t eq1 = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9);
+ gmx_simd_ibool_t eq2 = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_5_7_9);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(0, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, gmx_simd_and_ib(eq1, eq2)));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBoolOrIB)
+{
+ gmx_simd_ibool_t eq1 = gmx_simd_cmpeq_i(iSimd_5_7_9, iSimd_7_8_9);
+ gmx_simd_ibool_t eq2 = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0));
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 0, 3), gmx_simd_blendzero_i(iSimd_1_2_3, gmx_simd_or_ib(eq1, eq2)));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdAnytrueIB)
+{
+ gmx_simd_ibool_t eq;
+
+ /* See comment in floatingpoint.cpp. We should only check the first element here,
+ * since the SIMD width could be 1 as a special case.
+ */
+ eq = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0));
+ EXPECT_NE(0, gmx_simd_anytrue_ib(eq));
+
+ eq = gmx_simd_cmpeq_i(iSimd_1_2_3, iSimd_4_5_6);
+ EXPECT_EQ(0, gmx_simd_anytrue_ib(eq));
+}
+
+TEST_F(SimdIntegerTest, gmxSimdBlendvI)
+{
+ gmx_simd_ibool_t lt = gmx_simd_cmplt_i(iSimd_5_7_9, iSimd_7_8_9);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(4, 5, 3), gmx_simd_blendv_i(iSimd_1_2_3, iSimd_4_5_6, lt));
+}
+#endif
+
+#if (defined GMX_SIMD_HAVE_REAL) && (defined GMX_SIMD_HAVE_FINT32_ARITHMETICS)
+TEST_F(SimdIntegerTest, gmxSimdCvtB2IB)
+{
+ gmx_simd_bool_t eq = gmx_simd_cmpeq_r(rSimd_5_7_9, setSimdRealFrom3R(5, 0, 0)); // eq should be T,F,F
+ gmx_simd_ibool_t eqi = gmx_simd_cvt_b2ib(eq);
+ GMX_EXPECT_SIMD_INT_EQ(setSimdIntFrom3I(1, 0, 0), gmx_simd_blendzero_i(iSimd_1_2_3, eqi));
+
+}
+
+TEST_F(SimdIntegerTest, gmxSimdCvtIB2B)
+{
+ gmx_simd_ibool_t eqi = gmx_simd_cmpeq_i(iSimd_5_7_9, setSimdIntFrom3I(5, 0, 0)); // eq should be T,F,F
+ gmx_simd_bool_t eq = gmx_simd_cvt_ib2b(eqi);
+ GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(1.0, 0, 0), gmx_simd_blendzero_r(rSimd_1_2_3, eq));
+}
+#endif
+
+#endif // GMX_SIMD_HAVE_INT32
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <vector>
+#include "gromacs/math/utilities.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/options/basicoptions.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+class SimdMathTest : public SimdTest
+{
+ public:
+ ::testing::AssertionResult
+ compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
+ real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x));
+};
+
+/*! \brief Test approximate equality of SIMD vs reference version of a function.
+ *
+ * This macro takes vanilla C and SIMD flavors of a function and tests it with
+ * the number of points, range, and tolerances specified by the test fixture class.
+ */
+#define GMX_EXPECT_SIMD_FUNC_NEAR(refFunc, tstFunc) \
+ EXPECT_PRED_FORMAT2(compareSimdMathFunction, refFunc, tstFunc)
+
+/*! \brief Implementation routine to compare SIMD vs reference functions.
+ *
+ * \param refFuncExpr  Description of reference function expression
+ * \param simdFuncExpr Description of SIMD function expression
+ * \param refFunc      Reference math function pointer
+ * \param simdFunc     SIMD math function pointer
+ *
+ * The function will be tested with the range and tolerances specified in
+ * the SimdBaseTest class. You should never call this function directly,
+ * but use the macro GMX_EXPECT_SIMD_FUNC_NEAR(refFunc,tstFunc) instead.
+ */
+::testing::AssertionResult
+SimdMathTest::compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
+                                      real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x))
+{
+    std::vector<real> vx(GMX_SIMD_REAL_WIDTH);
+    std::vector<real> vref(GMX_SIMD_REAL_WIDTH);
+    std::vector<real> vtst(GMX_SIMD_REAL_WIDTH);
+    real dx, absDiff;
+    gmx_int64_t ulpDiff, maxUlpDiff;
+    real maxUlpDiffPos;
+    real refValMaxUlpDiff, simdValMaxUlpDiff;
+    bool absOk, signOk;
+    int i, iter;
+    int niter = s_nPoints/GMX_SIMD_REAL_WIDTH;
+    int npoints = niter*GMX_SIMD_REAL_WIDTH;
+    /* Reinterpret the floating-point bit pattern as a same-width integer so
+     * that ulp differences can be computed with integer subtraction. */
+#    ifdef GMX_DOUBLE
+    union {
+        double r; gmx_int64_t i;
+    } conv0, conv1;
+#    else
+    union {
+        float r; gmx_int32_t i;
+    } conv0, conv1;
+#    endif
+
+    maxUlpDiff = 0;
+    dx = (range_.second-range_.first)/npoints;
+
+    for (iter = 0; iter < niter; iter++)
+    {
+        /* Evaluate the reference function point-by-point for one SIMD chunk */
+        for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+        {
+            vx[i] = range_.first+dx*(iter*GMX_SIMD_REAL_WIDTH+i);
+            vref[i] = refFunc(vx[i]);
+        }
+        vtst = simdReal2Vector(simdFunc(vector2SimdReal(vx)));
+
+        for (i = 0, signOk = true, absOk = true; i < GMX_SIMD_REAL_WIDTH; i++)
+        {
+            absDiff = fabs(vref[i]-vtst[i]);
+            absOk = absOk && ( absDiff < absTol_ );
+            signOk = signOk && ( vref[i]*vtst[i] >= 0 );
+
+            if (absDiff >= absTol_)
+            {
+                /* We replicate the trivial ulp differences comparison here rather than
+                 * calling the lower-level routine for comparing them, since this enables
+                 * us to run through the entire test range and report the largest deviation
+                 * without lots of extra glue routines.
+                 */
+                conv0.r = vref[i];
+                conv1.r = vtst[i];
+                ulpDiff = llabs(conv0.i-conv1.i);
+                if (ulpDiff > maxUlpDiff)
+                {
+                    maxUlpDiff = ulpDiff;
+                    maxUlpDiffPos = vx[i];
+                    refValMaxUlpDiff = vref[i];
+                    simdValMaxUlpDiff = vtst[i];
+                }
+            }
+        }
+        /* NOTE(review): a sign failure is only reported when the absolute
+         * check also failed for this chunk, i.e. sign flips within absTol_
+         * (around zero crossings) are tolerated - confirm this is intentional. */
+        if ( (absOk == false) && (signOk == false) )
+        {
+            return ::testing::AssertionFailure()
+                   << "Failing SIMD math function comparison due to sign differences." << std::endl
+                   << "Reference function: " << refFuncExpr << std::endl
+                   << "Simd function: " << simdFuncExpr << std::endl
+                   << "Test range is ( " << range_.first << " , " << range_.second << " ) " << std::endl
+                   << "First sign difference around x=" << std::setprecision(20) << ::testing::PrintToString(vx) << std::endl
+                   << "Ref values: " << std::setprecision(20) << ::testing::PrintToString(vref) << std::endl
+                   << "SIMD values: " << std::setprecision(20) << ::testing::PrintToString(vtst) << std::endl;
+        }
+    }
+
+    if (maxUlpDiff <= ulpTol_)
+    {
+        return ::testing::AssertionSuccess();
+    }
+    else
+    {
+        return ::testing::AssertionFailure()
+               << "Failing SIMD math function ulp comparison between " << refFuncExpr << " and " << simdFuncExpr << std::endl
+               << "Requested ulp tolerance: " << ulpTol_ << std::endl
+               << "Requested abs tolerance: " << absTol_ << std::endl
+               << "Largest Ulp difference occurs for x=" << std::setprecision(20) << maxUlpDiffPos << std::endl
+               << "Ref values: " << std::setprecision(20) << refValMaxUlpDiff << std::endl
+               << "SIMD values: " << std::setprecision(20) << simdValMaxUlpDiff << std::endl
+               << "Ulp diff.: " << std::setprecision(20) << maxUlpDiff << std::endl;
+    }
+}
+
+/*! \} */
+/*! \endcond */
+
+
+// Actual math function tests below
+
+
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+/* The expected values show that the result carries the magnitude of the first
+ * argument with its sign bit xor:ed with the sign bit of the second argument
+ * (sign of 0 counts as positive). */
+TEST_F(SimdMathTest, gmxSimdXorSignR)
+{
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(-4, 5, 6), gmx_simd_xor_sign_r(setSimdRealFrom3R(4, 5, 6), setSimdRealFrom3R(-5, 2, 0)));
+    GMX_EXPECT_SIMD_REAL_EQ(setSimdRealFrom3R(4, -5, -6), gmx_simd_xor_sign_r(setSimdRealFrom3R(-4, -5, -6), setSimdRealFrom3R(-5, 2, 0)));
+}
+
+/*! \brief Function wrapper to evaluate reference 1/sqrt(x) */
+static real
+ref_invsqrt(real x)
+{
+    return 1.0/sqrt(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdInvsqrtR)
+{
+    setRange(1e-10, 1e10);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, gmx_simd_invsqrt_r);
+}
+
+/*! \brief Function wrapper to return first result when testing \ref gmx_simd_invsqrt_pair_r */
+gmx_simd_real_t
+tst_invsqrt_pair0(gmx_simd_real_t x)
+{
+    gmx_simd_real_t r0, r1;
+    gmx_simd_invsqrt_pair_r(x, x, &r0, &r1);
+    return r0;
+}
+
+/*! \brief Function wrapper to return second result when testing \ref gmx_simd_invsqrt_pair_r */
+gmx_simd_real_t
+tst_invsqrt_pair1(gmx_simd_real_t x)
+{
+    gmx_simd_real_t r0, r1;
+    gmx_simd_invsqrt_pair_r(x, x, &r0, &r1);
+    return r1;
+}
+
+/* Both outputs of the pair routine are checked against the same scalar reference */
+TEST_F(SimdMathTest, gmxSimdInvsqrtPairR)
+{
+    setRange(1e-10, 1e10);
+    // The accuracy conversions lose a bit of extra accuracy compared to
+    // doing the iterations in all-double.
+    setUlpTol(4*ulpTol_);
+
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, tst_invsqrt_pair0);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_invsqrt, tst_invsqrt_pair1);
+}
+
+TEST_F(SimdMathTest, gmxSimdSqrtR)
+{
+    // Just make sure sqrt(0)=0 works and isn't evaluated as 0*1/sqrt(0)=NaN
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom3R(0, 2, 3), gmx_simd_sqrt_r(setSimdRealFrom3R(0, 4, 9)));
+}
+
+/*! \brief Function wrapper to evaluate reference 1/x */
+real ref_inv(real x)
+{
+    return 1.0/x;
+}
+
+TEST_F(SimdMathTest, gmxSimdInvR)
+{
+    // test <0
+    setRange(-1e10, -1e-10);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_inv, gmx_simd_inv_r);
+    setRange(1e-10, 1e10);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_inv, gmx_simd_inv_r);
+}
+
+/*! \brief Function wrapper for log(x), with argument/return in default Gromacs precision */
+real ref_log(real x)
+{
+    return log(x);
+}
+
+/* log() is tested over 60 decades of strictly positive normal arguments */
+TEST_F(SimdMathTest, gmxSimdLogR)
+{
+    setRange(1e-30, 1e30);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_log, gmx_simd_log_r);
+}
+
+// MSVC does not support exp2(), so we have no reference to test against
+#ifndef _MSC_VER
+/*! \brief Function wrapper for exp2(x), with argument/return in default Gromacs precision */
+real ref_exp2(real x)
+{
+    return exp2(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdExp2R)
+{
+    setRange(-100, 100);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_exp2, gmx_simd_exp2_r);
+}
+#endif
+
+/*! \brief Function wrapper for exp(x), with argument/return in default Gromacs precision */
+real ref_exp(real x)
+{
+    return exp(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdExpR)
+{
+    setRange(-75, 75);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_exp, gmx_simd_exp_r);
+}
+
+/*! \brief Function wrapper for erf(x), with argument/return in default Gromacs precision.
+ *
+ * \note The single-precision gmx_erff() in gmxlib is slightly lower precision
+ * than the SIMD flavor, so we use double for reference.
+ */
+real ref_erf(real x)
+{
+    return gmx_erfd(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdErfR)
+{
+    setRange(-9, 9);
+    /* A tiny absolute tolerance makes the comparison effectively ulp-only,
+     * which matters in the tails where erf saturates. */
+    setAbsTol(GMX_REAL_MIN);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_erf, gmx_simd_erf_r);
+}
+
+/*! \brief Function wrapper for erfc(x), with argument/return in default Gromacs precision.
+ *
+ * \note The single-precision gmx_erfcf() in gmxlib is slightly lower precision
+ * than the SIMD flavor, so we use double for reference.
+ */
+real ref_erfc(real x)
+{
+    return gmx_erfcd(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdErfcR)
+{
+    setRange(-9, 9);
+    setAbsTol(GMX_REAL_MIN);
+    // Our erfc algorithm has 4 ulp accuracy, so relax defaultTol a bit
+    setUlpTol(4*ulpTol_);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_erfc, gmx_simd_erfc_r);
+}
+
+/* The trigonometric tests first check a moderate range, then a much wider
+ * range with a doubled ulp tolerance, since the second pass exercises the
+ * argument range reduction. */
+
+/*! \brief Function wrapper for sin(x), with argument/return in default Gromacs precision */
+real ref_sin(real x)
+{
+    return sin(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdSinR)
+{
+    setRange(-8*M_PI, 8*M_PI);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_sin, gmx_simd_sin_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_sin, gmx_simd_sin_r);
+}
+
+/*! \brief Function wrapper for cos(x), with argument/return in default Gromacs precision */
+real ref_cos(real x)
+{
+    return cos(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdCosR)
+{
+    setRange(-8*M_PI, 8*M_PI);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_cos, gmx_simd_cos_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_cos, gmx_simd_cos_r);
+}
+
+/*! \brief Function wrapper for tan(x), with argument/return in default Gromacs precision */
+real ref_tan(real x)
+{
+    return tan(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdTanR)
+{
+    // Tan(x) is a little sensitive due to the division in the algorithm.
+    // Rather than using lots of extra FP operations, we accept the algorithm
+    // presently only achieves a ~3 ulp error and use the medium tolerance.
+    setRange(-8*M_PI, 8*M_PI);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_tan, gmx_simd_tan_r);
+    // Range reduction leads to accuracy loss, so we might want higher tolerance here
+    setRange(-10000, 10000);
+    setUlpTol(2*ulpTol_);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_tan, gmx_simd_tan_r);
+}
+
+/*! \brief Function wrapper for asin(x), with argument/return in default Gromacs precision */
+real ref_asin(real x)
+{
+    return asin(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAsinR)
+{
+    // Our present asin(x) algorithm achieves 2-3 ulp accuracy
+    setRange(-1, 1);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_asin, gmx_simd_asin_r);
+}
+
+/*! \brief Function wrapper for acos(x), with argument/return in default Gromacs precision */
+real ref_acos(real x)
+{
+    return acos(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAcosR)
+{
+    // Our present acos(x) algorithm achieves 2-3 ulp accuracy
+    setRange(-1, 1);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_acos, gmx_simd_acos_r);
+}
+
+/*! \brief Function wrapper for atan(x), with argument/return in default Gromacs precision */
+real ref_atan(real x)
+{
+    return atan(x);
+}
+
+TEST_F(SimdMathTest, gmxSimdAtanR)
+{
+    // Our present atan(x) algorithm achieves 1 ulp accuracy
+    setRange(-10000, 10000);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_atan, gmx_simd_atan_r);
+}
+
+/* atan2 must reproduce the reference in all four quadrants, on the
+ * coordinate axes, and at the origin. */
+TEST_F(SimdMathTest, gmxSimdAtan2R)
+{
+    // test each quadrant
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, 1.0)), gmx_simd_atan2_r(rSimd_1_2_3, rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, 1.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, -1.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, rSimd_m1_m2_m3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, -1.0)), gmx_simd_atan2_r(rSimd_1_2_3, rSimd_m1_m2_m3));
+    // cases important for calculating angles
+    // values on coordinate axes
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, 1.0)), gmx_simd_atan2_r(gmx_simd_setzero_r(), rSimd_1_2_3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(1.0, 0.0)), gmx_simd_atan2_r(rSimd_1_2_3, gmx_simd_setzero_r()));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, -1.0)), gmx_simd_atan2_r(gmx_simd_setzero_r(), rSimd_m1_m2_m3));
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(-1.0, 0.0)), gmx_simd_atan2_r(rSimd_m1_m2_m3, gmx_simd_setzero_r()));
+    // degenerate value (origin) should return 0.0
+    GMX_EXPECT_SIMD_REAL_NEAR(setSimdRealFrom1R(atan2(0.0, 0.0)), gmx_simd_atan2_r(setSimdRealFrom3R(0.0, 0.0, 0.0), gmx_simd_setzero_r()));
+}
+
+/*! \brief Evaluate reference version of PME force correction.
+ *
+ * Reference is computed with the double-precision gmx_erfd() regardless of
+ * the default Gromacs precision.
+ */
+real ref_pmecorrF(real x)
+{
+    real y = sqrt(x);
+    return 2*exp(-x)/(sqrt(M_PI)*x) - gmx_erfd(y)/(x*y);
+}
+
+// The PME corrections will be added to ~1/r2, so absolute tolerance of EPS is fine.
+TEST_F(SimdMathTest, gmxSimdPmecorrForceR)
+{
+    // Pme correction only needs to be ~1e-6 accuracy single, 1e-10 double
+#ifdef GMX_DOUBLE
+    setUlpTol((gmx_int64_t)(1e-10/GMX_REAL_EPS));
+#else
+    setUlpTol((gmx_int64_t)(1e-6/GMX_REAL_EPS));
+#endif
+
+    setRange(0.15, 4);
+    setAbsTol(GMX_REAL_EPS);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_pmecorrF, gmx_simd_pmecorrF_r);
+}
+
+/*! \brief Evaluate reference version of PME potential correction.
+ *
+ * Reference is computed with the double-precision gmx_erfd() regardless of
+ * the default Gromacs precision.
+ */
+real ref_pmecorrV(real x)
+{
+    real y = sqrt(x);
+    return gmx_erfd(y)/y;
+}
+
+// The PME corrections will be added to ~1/r, so absolute tolerance of EPS is fine.
+TEST_F(SimdMathTest, gmxSimdPmecorrPotentialR)
+{
+    // Pme correction only needs to be ~1e-6 accuracy single, 1e-10 double
+#ifdef GMX_DOUBLE
+    setUlpTol((gmx_int64_t)(1e-10/GMX_REAL_EPS));
+#else
+    setUlpTol((gmx_int64_t)(1e-6/GMX_REAL_EPS));
+#endif
+    setRange(0.15, 4);
+    setAbsTol(GMX_REAL_EPS);
+    GMX_EXPECT_SIMD_FUNC_NEAR(ref_pmecorrV, gmx_simd_pmecorrV_r);
+}
+
+} // namespace
+
+#endif // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/vector_operations.h"
+
+#include "simd.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \cond internal */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#ifdef GMX_SIMD_HAVE_REAL
+
+/*! \internal \brief Test fixture for vector operations tests (identical to the generic \ref SimdTest) */
+typedef SimdTest SimdVectorOperationsTest;
+
+/* rsq = x^2 + y^2 + z^2 per SIMD element. A couple of ulp tolerance since
+ * FMA contraction may round differently from the plain reference arithmetic. */
+TEST_F(SimdVectorOperationsTest, gmxSimdCalcRsqR)
+{
+    gmx_simd_real_t simdX = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t simdY = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t simdZ = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t simdR2 = setSimdRealFrom3R(26, 5, 98);
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(simdR2, gmx_simd_calc_rsq_r(simdX, simdY, simdZ));
+}
+
+/* Scalar products a.b, one per SIMD element */
+TEST_F(SimdVectorOperationsTest, gmxSimdIprodR)
+{
+    gmx_simd_real_t aX = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t aY = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t aZ = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t bX = setSimdRealFrom3R(8, 3, 6);
+    gmx_simd_real_t bY = setSimdRealFrom3R(2, 3, 1);
+    gmx_simd_real_t bZ = setSimdRealFrom3R(5, 7, 9);
+    gmx_simd_real_t iprod = setSimdRealFrom3R(34, 13, 95);
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(iprod, gmx_simd_iprod_r(aX, aY, aZ, bX, bY, bZ));
+}
+
+TEST_F(SimdVectorOperationsTest, gmxSimdNorm2R)
+{
+    gmx_simd_real_t simdX = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t simdY = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t simdZ = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t simdNorm2 = setSimdRealFrom3R(26, 5, 98);
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(simdNorm2, gmx_simd_norm2_r(simdX, simdY, simdZ));
+}
+
+/* Cross products c = a x b, one per SIMD element */
+TEST_F(SimdVectorOperationsTest, gmxSimdCprodR)
+{
+    gmx_simd_real_t aX = setSimdRealFrom3R(1, 2, 3);
+    gmx_simd_real_t aY = setSimdRealFrom3R(3, 0, 5);
+    gmx_simd_real_t aZ = setSimdRealFrom3R(4, 1, 8);
+    gmx_simd_real_t bX = setSimdRealFrom3R(8, 3, 6);
+    gmx_simd_real_t bY = setSimdRealFrom3R(2, 3, 1);
+    gmx_simd_real_t bZ = setSimdRealFrom3R(5, 7, 9);
+    gmx_simd_real_t refcX = setSimdRealFrom3R(7, -3, 37);
+    gmx_simd_real_t refcY = setSimdRealFrom3R(27, -11, 21);
+    gmx_simd_real_t refcZ = setSimdRealFrom3R(-22, 6, -27);
+    gmx_simd_real_t cX, cY, cZ;
+    gmx_simd_cprod_r(aX, aY, aZ, bX, bY, bZ, &cX, &cY, &cZ);
+
+    setUlpTol(2);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcX, cX);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcY, cY);
+    GMX_EXPECT_SIMD_REAL_NEAR(refcZ, cZ);
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+/*! \} */
+/*! \endcond */
+
+} // namespace
+} // namespace
+} // namespace
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* The macros in this file are intended to be used for writing
- * architecture-independent SIMD intrinsics code.
- * To support a new architecture, adding macros here should be (nearly)
- * all that is needed.
- */
-
-/* This file contains vector operation functions using SIMD intrinsics.
- * gromacs/simd/macros.h should be included before including this file.
+/*! \libinternal \file
+ *
+ * \brief SIMD operations corresponding to Gromacs rvec datatypes.
+ *
+ * \author Erik Lindahl <erik.lindahl@scilifelab.se>
+ *
+ * \inlibraryapi
+ * \ingroup module_simd
*/
#ifndef GMX_SIMD_VECTOR_OPERATIONS_H
#define GMX_SIMD_VECTOR_OPERATIONS_H
-#ifndef GMX_SIMD_MACROS_H
-#error "gromacs/simd/macros.h was not included before including gromacs/simd/vector_operations.h"
-#endif
+#include "gromacs/simd/simd.h"
+
+/*! \cond libapi */
+/*! \addtogroup module_simd */
+/*! \{ */
+
+#if (defined GMX_SIMD_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief SIMD float inner product of multiple float vectors.
+ *
+ * For normal usage you should always call the real-precision \ref gmx_simd_iprod_r.
+ *
+ * \param ax X components of first vectors
+ * \param ay Y components of first vectors
+ * \param az Z components of first vectors
+ * \param bx X components of second vectors
+ * \param by Y components of second vectors
+ * \param bz Z components of second vectors
+ *
+ * \return Element i will be res[i] = ax[i]*bx[i]+ay[i]*by[i]+az[i]*bz[i].
+ *
+ * \note The SIMD part is that we calculate many scalar products in one call.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
+                 gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz)
+{
+    gmx_simd_float_t ret;
+
+    ret = gmx_simd_mul_f(ax, bx);
+    ret = gmx_simd_fmadd_f(ay, by, ret);
+    ret = gmx_simd_fmadd_f(az, bz, ret);
+
+    return ret;
+}
+
+/*! \brief SIMD float norm squared of multiple vectors.
+ *
+ * For normal usage you should always call the real-precision \ref gmx_simd_norm2_r.
+ *
+ * \param ax X components of vectors
+ * \param ay Y components of vectors
+ * \param az Z components of vectors
+ *
+ * \return Element i will be res[i] = ax[i]*ax[i]+ay[i]*ay[i]+az[i]*az[i].
+ *
+ * \note This corresponds to the scalar product of the vector with itself, but
+ * the compiler might be able to optimize it better with identical vectors.
+ */
+static gmx_inline gmx_simd_float_t
+gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
+{
+    gmx_simd_float_t ret;
+
+    ret = gmx_simd_mul_f(ax, ax);
+    ret = gmx_simd_fmadd_f(ay, ay, ret);
+    ret = gmx_simd_fmadd_f(az, az, ret);
+
+    return ret;
+}
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd_norm2_f.
+ */
+#define gmx_simd_calc_rsq_f gmx_simd_norm2_f
-/* x^2 + y^2 + z^2 */
-static gmx_inline gmx_simd_real_t
-gmx_simd_calc_rsq_r(gmx_simd_real_t x, gmx_simd_real_t y, gmx_simd_real_t z)
+/*! \brief SIMD float cross-product of multiple vectors.
+ *
+ * For normal usage you should always call the real-precision \ref gmx_simd_cprod_r.
+ *
+ * \param ax X components of first vectors
+ * \param ay Y components of first vectors
+ * \param az Z components of first vectors
+ * \param bx X components of second vectors
+ * \param by Y components of second vectors
+ * \param bz Z components of second vectors
+ * \param[out] cx X components of cross product vectors
+ * \param[out] cy Y components of cross product vectors
+ * \param[out] cz Z components of cross product vectors
+ *
+ * \returns void
+ *
+ * This calculates C = A x B, where the cross denotes the cross product.
+ * The arguments x/y/z denote the different components, and each element
+ * corresponds to a separate vector.
+ */
+static gmx_inline void
+gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
+                 gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz,
+                 gmx_simd_float_t *cx, gmx_simd_float_t *cy, gmx_simd_float_t *cz)
 {
-    return gmx_simd_fmadd_r(z, z, gmx_simd_fmadd_r(y, y, gmx_simd_mul_r(x, x)));
+    /* Each component is mul followed by fnmadd, e.g. cx = ay*bz - az*by */
+    *cx = gmx_simd_mul_f(ay, bz);
+    *cx = gmx_simd_fnmadd_f(az, by, *cx);
+
+    *cy = gmx_simd_mul_f(az, bx);
+    *cy = gmx_simd_fnmadd_f(ax, bz, *cy);
+
+    *cz = gmx_simd_mul_f(ax, by);
+    *cz = gmx_simd_fnmadd_f(ay, bx, *cz);
 }
+#endif /* GMX_SIMD_HAVE_FLOAT */
-/* inner-product of multiple vectors */
-static gmx_inline gmx_simd_real_t
-gmx_simd_iprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
-                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz)
+#if (defined GMX_SIMD_HAVE_DOUBLE) || (defined DOXYGEN)
+/*! \brief SIMD double inner product of multiple double vectors.
+ *
+ * \copydetails gmx_simd_iprod_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
+                 gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz)
 {
-    gmx_simd_real_t ret;
+    gmx_simd_double_t ret;
-    ret = gmx_simd_mul_r(ax, bx);
-    ret = gmx_simd_fmadd_r(ay, by, ret);
-    ret = gmx_simd_fmadd_r(az, bz, ret);
+    ret = gmx_simd_mul_d(ax, bx);
+    ret = gmx_simd_fmadd_d(ay, by, ret);
+    ret = gmx_simd_fmadd_d(az, bz, ret);
     return ret;
 }
-/* norm squared of multiple vectors */
-static gmx_inline gmx_simd_real_t
-gmx_simd_norm2_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az)
+/*! \brief SIMD double norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd_double_t
+gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az)
 {
-    gmx_simd_real_t ret;
+    gmx_simd_double_t ret;
-    ret = gmx_simd_mul_r(ax, ax);
-    ret = gmx_simd_fmadd_r(ay, ay, ret);
-    ret = gmx_simd_fmadd_r(az, az, ret);
+    ret = gmx_simd_mul_d(ax, ax);
+    ret = gmx_simd_fmadd_d(ay, ay, ret);
+    ret = gmx_simd_fmadd_d(az, az, ret);
     return ret;
 }
-/* cross-product of multiple vectors */
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd_norm2_d.
+ */
+#define gmx_simd_calc_rsq_d gmx_simd_norm2_d
+
+/*! \brief SIMD double cross-product of multiple vectors.
+ *
+ * \copydetails gmx_simd_cprod_f
+ */
 static gmx_inline void
-gmx_simd_cprod_r(gmx_simd_real_t ax, gmx_simd_real_t ay, gmx_simd_real_t az,
-                 gmx_simd_real_t bx, gmx_simd_real_t by, gmx_simd_real_t bz,
-                 gmx_simd_real_t *cx, gmx_simd_real_t *cy, gmx_simd_real_t *cz)
+gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
+                 gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz,
+                 gmx_simd_double_t *cx, gmx_simd_double_t *cy, gmx_simd_double_t *cz)
+{
+    /* Each component is mul followed by fnmadd, e.g. cx = ay*bz - az*by */
+    *cx = gmx_simd_mul_d(ay, bz);
+    *cx = gmx_simd_fnmadd_d(az, by, *cx);
+
+    *cy = gmx_simd_mul_d(az, bx);
+    *cy = gmx_simd_fnmadd_d(ax, bz, *cy);
+
+    *cz = gmx_simd_mul_d(ax, by);
+    *cz = gmx_simd_fnmadd_d(ay, bx, *cz);
+}
+#endif /* GMX_SIMD_HAVE_DOUBLE */
+
+
+#if (defined GMX_SIMD4_HAVE_FLOAT) || (defined DOXYGEN)
+/*! \brief SIMD4 float norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd4_float_t
+gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t az)
 {
-    *cx = gmx_simd_mul_r(ay, bz);
-    *cx = gmx_simd_fnmadd_r(az, by, *cx);
+    gmx_simd4_float_t ret;
-    *cy = gmx_simd_mul_r(az, bx);
-    *cy = gmx_simd_fnmadd_r(ax, bz, *cy);
+    ret = gmx_simd4_mul_f(ax, ax);
+    ret = gmx_simd4_fmadd_f(ay, ay, ret);
+    ret = gmx_simd4_fmadd_f(az, az, ret);
-    *cz = gmx_simd_mul_r(ax, by);
-    *cz = gmx_simd_fnmadd_r(ay, bx, *cz);
+    return ret;
 }
-/* a + b + c + d (not really a vector operation, but where else put this?) */
-static gmx_inline gmx_simd_real_t
-gmx_simd_sum4_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c, gmx_simd_real_t d)
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd4_norm2_f.
+ */
+#define gmx_simd4_calc_rsq_f gmx_simd4_norm2_f
+
+#endif /* GMX_SIMD4_HAVE_FLOAT */
+
+#if (defined GMX_SIMD4_HAVE_DOUBLE) || (defined DOXYGEN)
+/*! \brief SIMD4 double norm squared of multiple vectors.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+static gmx_inline gmx_simd4_double_t
+gmx_simd4_norm2_d(gmx_simd4_double_t ax, gmx_simd4_double_t ay, gmx_simd4_double_t az)
 {
-    return gmx_simd_add_r(gmx_simd_add_r(a, b), gmx_simd_add_r(c, d));
+    gmx_simd4_double_t ret;
+
+    ret = gmx_simd4_mul_d(ax, ax);
+    ret = gmx_simd4_fmadd_d(ay, ay, ret);
+    ret = gmx_simd4_fmadd_d(az, az, ret);
+
+    return ret;
 }
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * For details, see \ref gmx_simd4_norm2_d.
+ */
+#define gmx_simd4_calc_rsq_d gmx_simd4_norm2_d
+
+#endif /* GMX_SIMD4_HAVE_DOUBLE */
+
+
+#ifdef GMX_DOUBLE
+/* Double-precision aliases; documented in the single-precision branch below */
+#    define gmx_simd_iprod_r gmx_simd_iprod_d
+#    define gmx_simd_norm2_r gmx_simd_norm2_d
+#    define gmx_simd_calc_rsq_r gmx_simd_calc_rsq_d
+#    define gmx_simd_cprod_r gmx_simd_cprod_d
+#    define gmx_simd4_norm2_r gmx_simd4_norm2_d
+#    define gmx_simd4_calc_rsq_r gmx_simd4_calc_rsq_d
+#else /* GMX_DOUBLE */
+
+/*! \brief SIMD real inner product of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_iprod_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_iprod_f.
+ *
+ * \copydetails gmx_simd_iprod_f
+ */
+#    define gmx_simd_iprod_r gmx_simd_iprod_f
+
+/*! \brief SIMD real norm squared of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_norm2_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_norm2_f.
+ *
+ * \copydetails gmx_simd_norm2_f
+ */
+#    define gmx_simd_norm2_r gmx_simd_norm2_f
+
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * This will call \ref gmx_simd_calc_rsq_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_calc_rsq_f.
+ *
+ * \copydetails gmx_simd_calc_rsq_f
+ */
+#    define gmx_simd_calc_rsq_r gmx_simd_calc_rsq_f
+
+/*! \brief SIMD real cross-product of multiple real vectors.
+ *
+ * This will call \ref gmx_simd_cprod_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd_cprod_f.
+ *
+ * \copydetails gmx_simd_cprod_f
+ */
+#    define gmx_simd_cprod_r gmx_simd_cprod_f
+
+/*! \brief SIMD4 real norm squared of multiple vectors.
+ *
+ * This will call \ref gmx_simd4_norm2_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd4_norm2_f.
+ *
+ * \copydetails gmx_simd4_norm2_f
+ */
+#    define gmx_simd4_norm2_r gmx_simd4_norm2_f
+
+/*! \brief Calculating r^2 is the same as evaluating the squared norm of dx.
+ *
+ * This will call \ref gmx_simd4_calc_rsq_d if GMX_DOUBLE is defined, otherwise
+ * \ref gmx_simd4_calc_rsq_f.
+ *
+ * \copydetails gmx_simd4_calc_rsq_f
+ */
+#    define gmx_simd4_calc_rsq_r gmx_simd4_calc_rsq_f
+
+#endif /* GMX_DOUBLE */
+
+/*! \} */
+/*! \endcond */
-#endif
+#endif /* GMX_SIMD_VECTOR_OPERATIONS_H */
#include "config.h"
#endif
-#ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
-#include <xmmintrin.h>
+/* Ugly hack because the openmp implementation below hacks into the SIMD
+ * settings to decide when to use _mm_pause(). This should eventually be
+ * changed into proper detection of the intrinsics uses, not SIMD.
+ */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+ (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+ (defined GMX_SIMD_X86_AVX2_256)
+# include <xmmintrin.h>
#endif
#include "types/commrec.h"
*/
static gmx_inline void gmx_pause()
{
+ /* Ugly hack because the openmp implementation below hacks into the SIMD
+ * settings to decide when to use _mm_pause(). This should eventually be
+ * changed into proper detection of the intrinsics uses, not SIMD.
+ */
+#if (defined GMX_SIMD_X86_SSE2) || (defined GMX_SIMD_X86_SSE4_1) || \
+ (defined GMX_SIMD_X86_AVX_128_FMA) || (defined GMX_SIMD_X86_AVX_256) || \
+ (defined GMX_SIMD_X86_AVX2_256)
/* Replace with tbb::internal::atomic_backoff when/if we use TBB */
-#if defined GMX_SIMD_X86_SSE2_OR_HIGHER
_mm_pause();
#elif defined __MIC__
_mm_delay_32(32);