From f1234c6422f448e3091d2bcd33053df2e1ad769e Mon Sep 17 00:00:00 2001
From: Erik Lindahl
Date: Fri, 4 Jul 2014 13:36:13 +0200
Subject: [PATCH] Add Power/PowerPC VMX SIMD support

This adds the low-level SIMD implementation for IBM VMX, which is
present on Power6 and later (and PowerPC after PPC970). It does not
yet generate nbnxn kernels, so it is not enabled by default. Unit
tests pass on Power8, where I also had to work around some vec_lvsl()
issues caused by the new little-endian PowerPC architecture.

IBM's implementation of fnmadd has a quirk: it occasionally returns
negative zero where IEEE754 would produce positive zero. This should
not cause any problems in normal Gromacs use, but to avoid problems in
SIMD math I have changed some operations there to use fmadd with the
signs of the constants swapped instead. This might even save a cycle
on some platforms.

Change-Id: I71a70b8807d5bfdba18da887adbcfc97ce3c8cc3
---
 CMakeLists.txt                               |   2 +-
 cmake/gmxTestSimd.cmake                      |  20 ++
 src/config.h.cmakein                         |   3 +
 src/gromacs/gmxlib/gmx_cpuid.c               |  78 +++-
 src/gromacs/legacyheaders/gmx_cpuid.h        |   3 +
 src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h | 360 +++++++++++++++++++
 src/gromacs/simd/simd.h                      |   2 +
 src/gromacs/simd/simd_math.h                 |  96 ++---
 8 files changed, 509 insertions(+), 55 deletions(-)
 create mode 100644 src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7482208571..28faeb375a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,7 @@ gmx_option_multichoice(
     GMX_SIMD
     "SIMD instruction set for CPU kernels and compiler optimization"
     "${GMX_SUGGESTED_SIMD}"
-    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX Sparc64_HPC_ACE Reference)
+    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX Sparc64_HPC_ACE Reference)

 gmx_option_multichoice(
     GMX_FFT_LIBRARY
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake
index e5d9e255ac..fe477d7f2f 100644
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -313,6 +313,26 @@ elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
         message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.")
     endif()

+elseif(${GMX_SIMD} STREQUAL "IBM_VMX")
+
+    gmx_find_cflag_for_source(CFLAGS_IBM_VMX "C compiler IBM VMX SIMD flag"
+                              "#include <altivec.h>
+                               int main(){vector float x,y=vec_ctf(vec_splat_s32(1),0);x=vec_madd(y,y,y);return vec_all_ge(y,x);}"
+                              SIMD_C_FLAGS
+                              "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
+    gmx_find_cxxflag_for_source(CXXFLAGS_IBM_VMX "C++ compiler IBM VMX SIMD flag"
+                                "#include <altivec.h>
+                                 int main(){vector float x,y=vec_ctf(vec_splat_s32(1),0);x=vec_madd(y,y,y);return vec_all_ge(y,x);}"
+                                SIMD_CXX_FLAGS
+                                "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
+
+    if(NOT CFLAGS_IBM_VMX OR NOT CXXFLAGS_IBM_VMX)
+        message(FATAL_ERROR "Cannot find IBM VMX SIMD compiler flag. Use a newer compiler, or disable VMX SIMD.")
+    endif()
+
+    set(GMX_SIMD_IBM_VMX 1)
+    set(SIMD_STATUS_MESSAGE "Enabling IBM VMX SIMD instructions")
+
 elseif(${GMX_SIMD} STREQUAL "SPARC64_HPC_ACE")

     # Note that GMX_RELAXED_DOUBLE_PRECISION is enabled by default in the top-level CMakeLists.txt
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 00177ee8ff..19f97bc1c4 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -123,6 +123,9 @@
 /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
 #cmakedefine GMX_SIMD_IBM_QPX

+/* IBM VMX was selected as SIMD instructions (Power6 and later) */
+#cmakedefine GMX_SIMD_IBM_VMX
+
 /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
 #cmakedefine GMX_SIMD_SPARC64_HPC_ACE

diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c
index 3ea2d42c40..ee74bcbb51 100644
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -81,7 +81,7 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "IBM",
+    "IBM", /* Used on Power and BlueGene/Q */
     "ARM"
 };

@@ -93,7 +93,7 @@ gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "ibm", /* Used on BlueGene/Q */
+    "ibm", /* Used on Power and BlueGene/Q */
     "AArch64"
 };

@@ -136,7 +136,9 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
     "x2apic",
     "xop",
     "arm_neon",
-    "arm_neon_asimd"
+    "arm_neon_asimd",
+    "QPX",
+    "VMX"
 };

 const char *
@@ -152,6 +154,7 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
     "AVX2_256",
     "Sparc64 HPC-ACE",
     "IBM_QPX",
+    "IBM_VMX",
     "ARM_NEON",
     "ARM_NEON_ASIMD"
 };

@@ -250,6 +253,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON_ASIMD;
 #elif defined GMX_SIMD_SPARC64_HPC_ACE
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
 #elif defined GMX_SIMD_IBM_QPX
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX;
+#elif defined GMX_SIMD_IBM_VMX
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_VMX;
 #elif defined GMX_SIMD_REFERENCE
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_REFERENCE;
 #else
@@ -842,6 +847,50 @@ cpuid_check_arm(gmx_cpuid_t cpuid)
 }


+static int
+cpuid_check_ibm(gmx_cpuid_t cpuid)
+{
+#if defined(__linux__) || defined(__linux)
+    FILE *fp;
+    char  buffer[GMX_CPUID_STRLEN], before_colon[GMX_CPUID_STRLEN], after_colon[GMX_CPUID_STRLEN];
+
+    if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL)
+    {
+        while (fgets(buffer, sizeof(buffer), fp) != NULL)
+        {
+            chomp_substring_before_colon(buffer, before_colon, GMX_CPUID_STRLEN);
+            chomp_substring_after_colon(buffer, after_colon, GMX_CPUID_STRLEN);
+
+            if (!strcmp(before_colon, "model name") ||
+                !strcmp(before_colon, "model") ||
+                !strcmp(before_colon, "Processor") ||
+                !strcmp(before_colon, "cpu"))
+            {
+                strncpy(cpuid->brand, after_colon, GMX_CPUID_STRLEN);
+
+                if (strstr(after_colon, "altivec"))
+                {
+                    cpuid->feature[GMX_CPUID_FEATURE_IBM_VMX] = 1;
+                }
+            }
+        }
+        /* Only close the file when fopen() actually succeeded */
+        fclose(fp);
+    }
+
+    if (strstr(cpuid->brand, "A2"))
+    {
+        /* BlueGene/Q */
+        cpuid->feature[GMX_CPUID_FEATURE_IBM_QPX] = 1;
+    }
+#else
+    strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN);
+    cpuid->feature[GMX_CPUID_FEATURE_IBM_QPX] = 0;
+    cpuid->feature[GMX_CPUID_FEATURE_IBM_VMX] = 0;
+#endif
+    return 0;
+}
+
+
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
  */
@@ -883,7 +932,7 @@ cpuid_check_vendor(void)
         while ( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer, sizeof(buffer), fp) != NULL))
         {
             chomp_substring_before_colon(buffer, before_colon, sizeof(before_colon));
-            /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture".
+            /* Intel/AMD use "vendor_id", IBM "vendor", "model", or "cpu". Fujitsu "manufacture".
              * On ARM there does not seem to be a vendor, but ARM or AArch64 is listed in the Processor string.
              * Add others if you have them!
*/ @@ -891,7 +940,8 @@ cpuid_check_vendor(void) || !strcmp(before_colon, "vendor") || !strcmp(before_colon, "manufacture") || !strcmp(before_colon, "model") - || !strcmp(before_colon, "Processor")) + || !strcmp(before_colon, "Processor") + || !strcmp(before_colon, "cpu")) { chomp_substring_after_colon(buffer, after_colon, sizeof(after_colon)); for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++) @@ -905,6 +955,15 @@ cpuid_check_vendor(void) vendor = i; } } + /* If we did not find vendor yet, check if it is IBM: + * On some Power/PowerPC systems it only says power, not IBM. + */ + if (vendor == GMX_CPUID_VENDOR_UNKNOWN && + ((strstr(after_colon, "POWER") || strstr(after_colon, "Power") || + strstr(after_colon, "power")))) + { + vendor = GMX_CPUID_VENDOR_IBM; + } } } } @@ -1017,6 +1076,9 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) case GMX_CPUID_VENDOR_ARM: cpuid_check_arm(cpuid); break; + case GMX_CPUID_VENDOR_IBM: + cpuid_check_ibm(cpuid); + break; default: /* Default value */ strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN); @@ -1172,10 +1234,14 @@ gmx_cpuid_simd_suggest (gmx_cpuid_t cpuid) } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_IBM) { - if (strstr(gmx_cpuid_brand(cpuid), "A2")) + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_IBM_QPX)) { tmpsimd = GMX_CPUID_SIMD_IBM_QPX; } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_IBM_VMX)) + { + tmpsimd = GMX_CPUID_SIMD_IBM_VMX; + } } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_ARM) { diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h index c6bfe9993a..83ad7f6081 100644 --- a/src/gromacs/legacyheaders/gmx_cpuid.h +++ b/src/gromacs/legacyheaders/gmx_cpuid.h @@ -114,6 +114,8 @@ enum gmx_cpuid_feature GMX_CPUID_FEATURE_X86_XOP, /* AMD extended instructions, only AMD for now */ GMX_CPUID_FEATURE_ARM_NEON, /* 32-bit ARM NEON */ GMX_CPUID_FEATURE_ARM_NEON_ASIMD, /* 64-bit ARM AArch64 Advanced SIMD */ + GMX_CPUID_FEATURE_IBM_QPX, /* IBM QPX SIMD (BlueGene/Q and later) */ + GMX_CPUID_FEATURE_IBM_VMX, /* IBM VMX SIMD (Altivec on Power6 and later) */ GMX_CPUID_NFEATURES }; @@ -135,6 +137,7 @@ enum gmx_cpuid_simd GMX_CPUID_SIMD_X86_AVX2_256, GMX_CPUID_SIMD_SPARC64_HPC_ACE, GMX_CPUID_SIMD_IBM_QPX, + GMX_CPUID_SIMD_IBM_VMX, GMX_CPUID_SIMD_ARM_NEON, GMX_CPUID_SIMD_ARM_NEON_ASIMD, GMX_CPUID_NSIMD diff --git a/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h b/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h new file mode 100644 index 0000000000..810a13433b --- /dev/null +++ b/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h @@ -0,0 +1,360 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VMX_H
+#define GMX_SIMD_IMPLEMENTATION_IBM_VMX_H
+
+#include <math.h>
+
+#include <altivec.h>
+
+/* Make sure we do not screw up C++ - undefine vector/bool, and rely on __vector and __bool */
+#undef vector
+#undef bool
+
+/* IBM VMX SIMD instruction wrappers. Power6 and later.
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+/* Capability definitions for IBM VMX */
+#define GMX_SIMD_HAVE_FLOAT
+#undef  GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#undef  GMX_SIMD_HAVE_LOADU
+#undef  GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+/* VMX only provides fmadd/fnmadd (our definitions), but not fmsub/fnmsub.
+ * However, fnmadd is what we need for 1/sqrt(x).
+ */
+#define GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#undef  GMX_SIMD_HAVE_FINT32_EXTRACT
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#undef  GMX_SIMD_HAVE_DINT32
+#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
+#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
+#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#undef  GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH   4
+#undef  GMX_SIMD_DOUBLE_WIDTH
+#define GMX_SIMD_FINT32_WIDTH  4
+#undef  GMX_SIMD_DINT32_WIDTH
+#define GMX_SIMD_RSQRT_BITS   14
+#define GMX_SIMD_RCP_BITS     14
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t           __vector float
+#define gmx_simd_load_f(m)         vec_ld(0, (const __vector float *)m)
+#define gmx_simd_load1_f(m)        gmx_simd_load1_f_ibm_vmx(m)
+#define gmx_simd_set1_f(x)         gmx_simd_set1_f_ibm_vmx(x)
+#define gmx_simd_store_f(m, x)     vec_st(x, 0, (__vector float *)m)
+#undef  gmx_simd_loadu_f
+#undef  gmx_simd_storeu_f
+#define gmx_simd_setzero_f()       ((__vector float)vec_splat_u32(0))
+#define gmx_simd_add_f(a, b)       vec_add(a, b)
+#define gmx_simd_sub_f(a, b)       vec_sub(a, b)
+#define gmx_simd_mul_f(a, b)       vec_mul(a, b)
+#define gmx_simd_fmadd_f(a, b, c)  vec_madd(a, b, c)
+#define gmx_simd_fmsub_f(a, b, c)  vec_sub(vec_mul(a, b), c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_f(a, b, c) vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_f(a, b, c) vec_sub(gmx_simd_setzero_f(), vec_madd(a, b, c))
+#define gmx_simd_and_f(a, b)       vec_and(a, b)
+#define gmx_simd_andnot_f(a, b)    vec_andc(b, a)
+#define gmx_simd_or_f(a, b)        vec_or(a, b)
+#define gmx_simd_xor_f(a, b)       vec_xor(a, b)
+#define gmx_simd_rsqrt_f(a)        vec_rsqrte(a)
+#define gmx_simd_rcp_f(a)          vec_re(a)
+#define gmx_simd_fabs_f(a)         vec_abs(a)
+#define gmx_simd_fneg_f(a)         vec_xor(a, (__vector float)vec_sl(vec_splat_u32(-1), vec_splat_u32(-1)))
+#define gmx_simd_max_f(a, b)       vec_max(a, b)
+#define gmx_simd_min_f(a, b)       vec_min(a, b)
+#define gmx_simd_round_f(a)        vec_round(a)
+#define gmx_simd_trunc_f(a)        vec_trunc(a)
+#define gmx_simd_fraction_f(x)     vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_f(a) gmx_simd_get_exponent_f_ibm_vmx(a)
+#define gmx_simd_get_mantissa_f(a) gmx_simd_get_mantissa_f_ibm_vmx(a)
+#define gmx_simd_set_exponent_f(a) gmx_simd_set_exponent_f_ibm_vmx(a)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t          __vector int
+#define gmx_simd_load_fi(m)        vec_ld(0, (const __vector int *)m)
+#define gmx_simd_set1_fi(i)        gmx_simd_set1_fi_ibm_vmx((int)i)
+#define gmx_simd_store_fi(m, x)    vec_st(x, 0, (__vector int *)m)
+#undef  gmx_simd_loadu_fi
+#undef  gmx_simd_storeu_fi
+#define gmx_simd_setzero_fi()      vec_splat_s32(0)
+#define gmx_simd_cvt_f2i(a)        vec_cts(vec_round(a), 0)
+#define gmx_simd_cvtt_f2i(a)       vec_cts(a, 0)
+#define gmx_simd_cvt_i2f(a)        vec_ctf(a, 0)
+#undef  gmx_simd_extract_fi
+/* Integer logical ops on gmx_simd_fint32_t */
+/* The shift constant magic requires an explanation:
+ * VMX only allows literals up to 15 to be created directly with vec_splat_u32,
+ * and we need to be able to shift up to 31 bits. The code on the right-hand
+ * side splits the constant into three parts with values in the range 0..15.
+ * Since the argument has to be a constant (both our and VMX's requirement), these
+ * constants will be evaluated at compile-time, and if one or two parts evaluate
+ * to zero they will be removed with -O2 or higher optimization (checked).
+ */
+#define gmx_simd_slli_fi(a, i)     vec_sl(a, vec_add(vec_add(vec_splat_u32( (((i&0xF)+(i/16))&0xF)+i/31 ), vec_splat_u32( (i/16)*15 )), vec_splat_u32( (i/31)*15 )))
+#define gmx_simd_srli_fi(a, i)     vec_sr(a, vec_add(vec_add(vec_splat_u32( (((i&0xF)+(i/16))&0xF)+i/31 ), vec_splat_u32( (i/16)*15 )), vec_splat_u32( (i/31)*15 )))
+#define gmx_simd_and_fi(a, b)      vec_and(a, b)
+#define gmx_simd_andnot_fi(a, b)   vec_andc(b, a)
+#define gmx_simd_or_fi(a, b)       vec_or(a, b)
+#define gmx_simd_xor_fi(a, b)      vec_xor(a, b)
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi(a, b)      vec_add(a, b)
+#define gmx_simd_sub_fi(a, b)      vec_sub(a, b)
+#define gmx_simd_mul_fi(a, b)      vec_mule((__vector short)a, (__vector short)b)
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t           __vector __bool int
+#define gmx_simd_cmpeq_f(a, b)     vec_cmpeq(a, b)
+#define gmx_simd_cmplt_f(a, b)     vec_cmplt(a, b)
+#define gmx_simd_cmple_f(a, b)     vec_cmple(a, b)
+#define gmx_simd_and_fb(a, b)      vec_and(a, b)
+#define gmx_simd_or_fb(a, b)       vec_or(a, b)
+#define gmx_simd_anytrue_fb(a)     vec_any_ne(a, (__vector __bool int)vec_splat_u32(0))
+#define gmx_simd_blendzero_f(a, sel)    vec_and(a, (__vector float)sel)
+#define gmx_simd_blendnotzero_f(a, sel) vec_andc(a, (__vector float)sel)
+#define gmx_simd_blendv_f(a, b, sel)    vec_sel(a, b, sel)
+#define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_ibm_vmx(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t          __vector __bool int
+#define gmx_simd_cmpeq_fi(a, b)    vec_cmpeq(a, b)
+#define gmx_simd_cmplt_fi(a, b)    vec_cmplt(a, b)
+#define gmx_simd_and_fib(a, b)     vec_and(a, b)
+#define gmx_simd_or_fib(a, b)      vec_or(a, b)
+#define gmx_simd_anytrue_fib(a)    vec_any_ne(a, (__vector __bool int)vec_splat_u32(0))
+#define gmx_simd_blendzero_fi(a, sel)    vec_and(a, (__vector int)sel)
+#define gmx_simd_blendnotzero_fi(a, sel) vec_andc(a, (__vector int)sel)
+#define gmx_simd_blendv_fi(a, b, sel)    vec_sel(a, b, sel)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib(x)     (x)
+#define gmx_simd_cvt_fib2fb(x)     (x)
+
+/* Double is not available with VMX SIMD */
+
+/****************************************************
+ *      IMPLEMENTATION HELPER FUNCTIONS             *
+ ****************************************************/
+static gmx_inline gmx_simd_float_t
+gmx_simd_set1_f_ibm_vmx(const float x)
+{
+    /* In the old days when PPC was all big endian we could
+     * use the vec_lvsl() instruction to permute bytes based on
+     * a load address. However, at least with gcc-4.8.2 the bytes
+     * end up reversed on Power8 running little endian (Linux).
+     * Since this is not a critical instruction we work around
+     * it by first putting the data in an aligned position before
+     * loading, so we can avoid vec_lvsl() entirely. We can
+     * do this slightly faster on GCC with alignment attributes.
+     */
+    __vector float vx;
+#ifdef __GNUC__
+    float alignedx __attribute__ ((aligned (16)));
+    alignedx = x;
+    vx       = vec_lde(0, &alignedx);
+#else
+    /* The leading vector member merely forces 16-byte alignment of x[] */
+    struct {
+        __vector float vx; float x[4];
+    } conv;
+    conv.x[0] = x;
+    vx        = vec_lde(0, conv.x);
+#endif
+    return vec_splat(vx, 0);
+}
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_load1_f_ibm_vmx(const float * m)
+{
+    return gmx_simd_set1_f_ibm_vmx(*m);
+}
+
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_set1_fi_ibm_vmx(const int x)
+{
+    /* See the comment in gmx_simd_set1_f_ibm_vmx for why we
+     * cannot use vec_lvsl().
+     */
+    __vector int vx;
+#ifdef __GNUC__
+    int alignedx __attribute__ ((aligned (16)));
+    alignedx = x;
+    vx       = vec_lde(0, &alignedx);
+#else
+    /* The leading vector member merely forces 16-byte alignment of x[] */
+    struct {
+        __vector int vx; int x[4];
+    } conv;
+    conv.x[0] = x;
+    vx        = vec_lde(0, conv.x);
+#endif
+    return vec_splat(vx, 0);
+}
+
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f_ibm_vmx(gmx_simd_float_t x)
+{
+    /* Generate 0x7F800000 without memory operations */
+    gmx_simd_float_t  expmask = (__vector float)gmx_simd_slli_fi(vec_add(vec_splat_s32(15), vec_sl(vec_splat_s32(15), vec_splat_u32(4))), 23);
+    gmx_simd_fint32_t i127    = vec_sub(vec_sl(vec_splat_s32(1), vec_splat_u32(7)), vec_splat_s32(1));
+    gmx_simd_fint32_t iexp;
+
+    iexp = (__vector int)gmx_simd_and_f(x, expmask);
+    iexp = vec_sub(gmx_simd_srli_fi(iexp, 23), i127);
+    return vec_ctf(iexp, 0);
+}
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_mantissa_f_ibm_vmx(gmx_simd_float_t x)
+{
+    gmx_simd_float_t expmask = (__vector float)gmx_simd_slli_fi(vec_add(vec_splat_s32(15), vec_sl(vec_splat_s32(15), vec_splat_u32(4))), 23);
+
+    /* Get the mantissa. By taking the absolute value (to get rid of the sign bit) we can
+     * use the same mask as for gmx_simd_get_exponent_f() (but complement it). Since
+     * these two routines are typically called together, this will save a few operations.
+ */ + x = gmx_simd_andnot_f(expmask, vec_abs(x)); + /* Reset zero (but correctly biased) exponent */ + return gmx_simd_or_f(x, vec_ctf(vec_splat_s32(1), 0)); +} + +static gmx_inline gmx_simd_float_t +gmx_simd_set_exponent_f_ibm_vmx(gmx_simd_float_t x) +{ + gmx_simd_fint32_t iexp = gmx_simd_cvt_f2i(x); + gmx_simd_fint32_t i127 = vec_sub(vec_sl(vec_splat_s32(1), vec_splat_u32(7)), vec_splat_s32(1)); + + iexp = gmx_simd_slli_fi(vec_add(iexp, i127), 23); + return (__vector float)iexp; +} + +static gmx_inline float +gmx_simd_reduce_f_ibm_vmx(gmx_simd_float_t x) +{ + float res; + x = vec_add(x, vec_sld(x, x, 8)); + x = vec_add(x, vec_sld(x, x, 4)); + vec_ste(x, 0, &res); + return res; +} + + + +/* SINGLE */ +#define gmx_simd4_float_t gmx_simd_float_t +#define gmx_simd4_load_f gmx_simd_load_f +#define gmx_simd4_load1_f gmx_simd_load1_f +#define gmx_simd4_set1_f gmx_simd_set1_f +#define gmx_simd4_store_f gmx_simd_store_f +#define gmx_simd4_loadu_f gmx_simd_loadu_f +#define gmx_simd4_storeu_f gmx_simd_storeu_f +#define gmx_simd4_setzero_f gmx_simd_setzero_f +#define gmx_simd4_add_f gmx_simd_add_f +#define gmx_simd4_sub_f gmx_simd_sub_f +#define gmx_simd4_mul_f gmx_simd_mul_f +#define gmx_simd4_fmadd_f gmx_simd_fmadd_f +#define gmx_simd4_fmsub_f gmx_simd_fmsub_f +#define gmx_simd4_fnmadd_f gmx_simd_fnmadd_f +#define gmx_simd4_fnmsub_f gmx_simd_fnmsub_f +#define gmx_simd4_and_f gmx_simd_and_f +#define gmx_simd4_andnot_f gmx_simd_andnot_f +#define gmx_simd4_or_f gmx_simd_or_f +#define gmx_simd4_xor_f gmx_simd_xor_f +#define gmx_simd4_rsqrt_f gmx_simd_rsqrt_f +#define gmx_simd4_rcp_f gmx_simd_rcp_f +#define gmx_simd4_fabs_f gmx_simd_fabs_f +#define gmx_simd4_fneg_f gmx_simd_fneg_f +#define gmx_simd4_max_f gmx_simd_max_f +#define gmx_simd4_min_f gmx_simd_min_f +#define gmx_simd4_round_f gmx_simd_round_f +#define gmx_simd4_trunc_f gmx_simd_trunc_f +#define gmx_simd4_fraction_f gmx_simd_fraction_f +#define gmx_simd4_get_exponent_f gmx_simd_get_exponent_f +#define gmx_simd4_get_mantissa_f gmx_simd_get_mantissa_f +#define gmx_simd4_set_exponent_f gmx_simd_set_exponent_f +#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_ibm_vmx +#define gmx_simd4_fint32_t gmx_simd_fint32_t +#define gmx_simd4_load_fi gmx_simd_load_fi +#define gmx_simd4_load1_fi gmx_simd_load1_fi +#define gmx_simd4_set1_fi gmx_simd_set1_fi +#define gmx_simd4_store_fi gmx_simd_store_fi +#define gmx_simd4_loadu_fi gmx_simd_loadu_fi +#define gmx_simd4_storeu_fi gmx_simd_storeu_fi +#define gmx_simd4_setzero_fi gmx_simd_setzero_fi +#define gmx_simd4_cvt_f2i gmx_simd_cvt_f2i +#define gmx_simd4_cvtt_f2i gmx_simd_cvtt_f2i +#define gmx_simd4_cvt_i2f gmx_simd_cvt_i2f +#define gmx_simd4_fbool_t gmx_simd_fbool_t +#define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f +#define gmx_simd4_cmplt_f gmx_simd_cmplt_f +#define gmx_simd4_cmple_f gmx_simd_cmple_f +#define gmx_simd4_and_fb gmx_simd_and_fb +#define gmx_simd4_or_fb gmx_simd_or_fb +#define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb +#define gmx_simd4_blendzero_f gmx_simd_blendzero_f +#define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f +#define gmx_simd4_blendv_f gmx_simd_blendv_f +#define gmx_simd4_reduce_f gmx_simd_reduce_f + +static gmx_inline float +gmx_simd4_dotproduct3_f_ibm_vmx(gmx_simd4_float_t a, gmx_simd4_float_t b) +{ + gmx_simd4_float_t c = vec_mul(a, b); + /* Keep only elements 0,1,2 by shifting in zero from right */ + c = vec_sld(c, gmx_simd_setzero_f(), 4); + /* calculate sum */ + return gmx_simd_reduce_f_ibm_vmx(c); +} + +/* Function to check whether SIMD operations have resulted in 
overflow. + * For now, this is unfortunately a dummy for this architecture. + */ +static int +gmx_simd_check_and_reset_overflow(void) +{ + return 0; +} + +#endif /* GMX_SIMD_IMPLEMENTATION_IBM_VMX_H */ diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h index 988ae65044..5e0128e392 100644 --- a/src/gromacs/simd/simd.h +++ b/src/gromacs/simd/simd.h @@ -129,6 +129,8 @@ static gmx_inline double * gmx_simd4_align_d(double *p); # include "impl_arm_neon_asimd/impl_arm_neon_asimd.h" #elif defined GMX_SIMD_IBM_QPX # include "impl_ibm_qpx/impl_ibm_qpx.h" +#elif defined GMX_SIMD_IBM_VMX +# include "impl_ibm_vmx/impl_ibm_vmx.h" #elif defined GMX_SIMD_SPARC64_HPC_ACE # include "impl_sparc64_hpc_ace/impl_sparc64_hpc_ace.h" #elif (defined GMX_SIMD_REFERENCE) || (defined DOXYGEN) diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h index 85d545c3d0..e2cbacdd3a 100644 --- a/src/gromacs/simd/simd_math.h +++ b/src/gromacs/simd/simd_math.h @@ -360,8 +360,8 @@ gmx_simd_exp_f(gmx_simd_float_t x) const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f); /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */ const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f); - const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(0.693145751953125f); - const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(1.428606765330187045e-06f); + const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(-0.693145751953125f); + const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(-1.428606765330187045e-06f); const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.00136324646882712841033936f); const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.00836596917361021041870117f); const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.0416710823774337768554688f); @@ -380,8 +380,8 @@ gmx_simd_exp_f(gmx_simd_float_t x) fexppart = gmx_simd_blendzero_f(fexppart, valuemask); /* Extended precision arithmetics */ - x = gmx_simd_fnmadd_f(invargscale0, intpart, x); - x = gmx_simd_fnmadd_f(invargscale1, intpart, x); + x = gmx_simd_fmadd_f(invargscale0, intpart, x); + x = gmx_simd_fmadd_f(invargscale1, intpart, x); p = gmx_simd_fmadd_f(CC4, x, CC3); p = gmx_simd_fmadd_f(p, x, CC2); @@ -709,10 +709,10 @@ static gmx_inline void gmx_simdcall gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval) { /* Constants to subtract Pi/4*x from y while minimizing precision loss */ - const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125); - const gmx_simd_float_t argred1 = gmx_simd_set1_f(4.83751296997070312500e-04f); - const gmx_simd_float_t argred2 = gmx_simd_set1_f(7.54953362047672271729e-08f); - const gmx_simd_float_t argred3 = gmx_simd_set1_f(2.56334406825708960298e-12f); + const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125); + const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f); + const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f); + const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f); const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI); const gmx_simd_float_t const_sin2 = gmx_simd_set1_f(-1.9515295891e-4f); const gmx_simd_float_t const_sin1 = gmx_simd_set1_f( 8.3321608736e-3f); @@ -790,10 +790,10 @@ gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t /* where mask is FALSE, set sign. 
*/ csign = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask)); #endif - x = gmx_simd_fnmadd_f(y, argred0, x); - x = gmx_simd_fnmadd_f(y, argred1, x); - x = gmx_simd_fnmadd_f(y, argred2, x); - x = gmx_simd_fnmadd_f(y, argred3, x); + x = gmx_simd_fmadd_f(y, argred0, x); + x = gmx_simd_fmadd_f(y, argred1, x); + x = gmx_simd_fmadd_f(y, argred2, x); + x = gmx_simd_fmadd_f(y, argred3, x); x2 = gmx_simd_mul_f(x, x); psin = gmx_simd_fmadd_f(const_sin2, x2, const_sin1); @@ -862,10 +862,10 @@ gmx_simd_cos_f(gmx_simd_float_t x) static gmx_inline gmx_simd_float_t gmx_simdcall gmx_simd_tan_f(gmx_simd_float_t x) { - const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125); - const gmx_simd_float_t argred1 = gmx_simd_set1_f(4.83751296997070312500e-04f); - const gmx_simd_float_t argred2 = gmx_simd_set1_f(7.54953362047672271729e-08f); - const gmx_simd_float_t argred3 = gmx_simd_set1_f(2.56334406825708960298e-12f); + const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125); + const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f); + const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f); + const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f); const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI); const gmx_simd_float_t CT6 = gmx_simd_set1_f(0.009498288995810566122993911); const gmx_simd_float_t CT5 = gmx_simd_set1_f(0.002895755790837379295226923); @@ -886,10 +886,10 @@ gmx_simd_tan_f(gmx_simd_float_t x) y = gmx_simd_round_f(z); mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione)); - x = gmx_simd_fnmadd_f(y, argred0, x); - x = gmx_simd_fnmadd_f(y, argred1, x); - x = gmx_simd_fnmadd_f(y, argred2, x); - x = gmx_simd_fnmadd_f(y, argred3, x); + x = gmx_simd_fmadd_f(y, argred0, x); + x = gmx_simd_fmadd_f(y, argred1, x); + x = gmx_simd_fmadd_f(y, argred2, x); + x = gmx_simd_fmadd_f(y, argred3, x); x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x); #else const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f); @@ -908,10 +908,10 @@ gmx_simd_tan_f(gmx_simd_float_t x) m3 = gmx_simd_cmple_f(threequarter, q); m1 = gmx_simd_and_fb(m1, m2); mask = gmx_simd_or_fb(m1, m3); - w = gmx_simd_fnmadd_f(y, argred0, w); - w = gmx_simd_fnmadd_f(y, argred1, w); - w = gmx_simd_fnmadd_f(y, argred2, w); - w = gmx_simd_fnmadd_f(y, argred3, w); + w = gmx_simd_fmadd_f(y, argred0, w); + w = gmx_simd_fmadd_f(y, argred1, w); + w = gmx_simd_fmadd_f(y, argred2, w); + w = gmx_simd_fmadd_f(y, argred3, w); w = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask); x = gmx_simd_xor_sign_f(w, x); @@ -1579,8 +1579,8 @@ gmx_simd_exp_d(gmx_simd_double_t x) { const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100); const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0); - const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(0.69314718055966295651160180568695068359375); - const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(2.8235290563031577122588448175013436025525412068e-13); + const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(-0.69314718055966295651160180568695068359375); + const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13); const gmx_simd_double_t CE12 = gmx_simd_set1_d(2.078375306791423699350304e-09); const gmx_simd_double_t CE11 = gmx_simd_set1_d(2.518173854179933105218635e-08); const gmx_simd_double_t CE10 = gmx_simd_set1_d(2.755842049600488770111608e-07); @@ -1605,8 +1605,8 
@@ gmx_simd_exp_d(gmx_simd_double_t x) fexppart = gmx_simd_blendzero_d(fexppart, valuemask); /* Extended precision arithmetics */ - x = gmx_simd_fnmadd_d(invargscale0, intpart, x); - x = gmx_simd_fnmadd_d(invargscale1, intpart, x); + x = gmx_simd_fmadd_d(invargscale0, intpart, x); + x = gmx_simd_fmadd_d(invargscale1, intpart, x); p = gmx_simd_fmadd_d(CE12, x, CE11); p = gmx_simd_fmadd_d(p, x, CE10); @@ -2001,10 +2001,10 @@ static gmx_inline void gmx_simdcall gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval) { /* Constants to subtract Pi/4*x from y while minimizing precision loss */ - const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978); - const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10); - const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18); - const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27); + const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978); + const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10); + const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18); + const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27); const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI); const gmx_simd_double_t const_sin5 = gmx_simd_set1_d( 1.58938307283228937328511e-10); const gmx_simd_double_t const_sin4 = gmx_simd_set1_d(-2.50506943502539773349318e-08); @@ -2087,10 +2087,10 @@ gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_doubl /* where mask is FALSE, set sign. */ csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask)); #endif - x = gmx_simd_fnmadd_d(y, argred0, x); - x = gmx_simd_fnmadd_d(y, argred1, x); - x = gmx_simd_fnmadd_d(y, argred2, x); - x = gmx_simd_fnmadd_d(y, argred3, x); + x = gmx_simd_fmadd_d(y, argred0, x); + x = gmx_simd_fmadd_d(y, argred1, x); + x = gmx_simd_fmadd_d(y, argred2, x); + x = gmx_simd_fmadd_d(y, argred3, x); x2 = gmx_simd_mul_d(x, x); psin = gmx_simd_fmadd_d(const_sin5, x2, const_sin4); @@ -2151,10 +2151,10 @@ gmx_simd_cos_d(gmx_simd_double_t x) static gmx_inline gmx_simd_double_t gmx_simdcall gmx_simd_tan_d(gmx_simd_double_t x) { - const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978); - const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10); - const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18); - const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27); + const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978); + const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10); + const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18); + const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27); const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI); const gmx_simd_double_t CT15 = gmx_simd_set1_d(1.01419718511083373224408e-05); const gmx_simd_double_t CT14 = gmx_simd_set1_d(-2.59519791585924697698614e-05); @@ -2184,10 +2184,10 @@ gmx_simd_tan_d(gmx_simd_double_t x) y = gmx_simd_round_d(z); mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione)); - x = gmx_simd_fnmadd_d(y, argred0, x); - x = gmx_simd_fnmadd_d(y, argred1, x); - x = gmx_simd_fnmadd_d(y, argred2, x); - x = gmx_simd_fnmadd_d(y, argred3, x); + x = gmx_simd_fmadd_d(y, argred0, x); + x = 
gmx_simd_fmadd_d(y, argred1, x); + x = gmx_simd_fmadd_d(y, argred2, x); + x = gmx_simd_fmadd_d(y, argred3, x); x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x); #else const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25); @@ -2206,10 +2206,10 @@ gmx_simd_tan_d(gmx_simd_double_t x) m3 = gmx_simd_cmple_d(threequarter, q); m1 = gmx_simd_and_db(m1, m2); mask = gmx_simd_or_db(m1, m3); - w = gmx_simd_fnmadd_d(y, argred0, w); - w = gmx_simd_fnmadd_d(y, argred1, w); - w = gmx_simd_fnmadd_d(y, argred2, w); - w = gmx_simd_fnmadd_d(y, argred3, w); + w = gmx_simd_fmadd_d(y, argred0, w); + w = gmx_simd_fmadd_d(y, argred1, w); + w = gmx_simd_fmadd_d(y, argred2, w); + w = gmx_simd_fmadd_d(y, argred3, w); w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask); x = gmx_simd_xor_sign_d(w, x); -- 2.22.0
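
Note on the fnmadd change described in the commit message: with the
definitions in gromacs/simd/simd.h, fnmadd(a, b, c) = c - a*b, which is
algebraically identical to fmadd(-a, b, c) = (-a)*b + c. IBM's
vec_nmsub(a, b, c) instead computes -(a*b - c); when a*b == c exactly,
IEEE754 round-to-nearest gives a*b - c = +0.0, so the final negation
yields -0.0 where c - a*b would yield +0.0. A scalar sketch of the quirk
and of the sign-swap workaround (plain C, separate from the patch):

    #include <stdio.h>

    int main(void)
    {
        double a = 1.5, b = 2.0, c = 3.0;  /* a*b == c exactly        */

        double nmsub_style = -(a*b - c);   /* IBM vec_nmsub form: -0.0 */
        double ieee_style  = c - a*b;      /* IEEE-style fnmadd:  +0.0 */
        double swapped     = (-a)*b + c;   /* fmadd with negated       */
                                           /* constant:           +0.0 */

        printf("%g %g %g\n", nmsub_style, ieee_style, swapped);
        return 0;                          /* prints: -0 0 0           */
    }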
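
The gmx_simd_slli_fi()/gmx_simd_srli_fi() macros in impl_ibm_vmx.h split
the shift count into three parts because vec_splat_u32() can only
materialize literals up to 15, while shifts of up to 31 bits are needed.
All three parts are compile-time constants, so parts that evaluate to
zero disappear at -O2. A throwaway scalar check of the decomposition
(the helper names part0/part1/part2 are hypothetical, not in the patch):

    #include <assert.h>
    #include <stdio.h>

    /* The three vec_splat_u32() literals used for a shift count i; each
     * must lie in 0..15, and together they must sum back to i. */
    static unsigned part0(unsigned i) { return (((i & 0xF) + (i / 16)) & 0xF) + i / 31; }
    static unsigned part1(unsigned i) { return (i / 16) * 15; }
    static unsigned part2(unsigned i) { return (i / 31) * 15; }

    int main(void)
    {
        unsigned i;
        for (i = 0; i <= 31; i++)
        {
            assert(part0(i) <= 15 && part1(i) <= 15 && part2(i) <= 15);
            assert(part0(i) + part1(i) + part2(i) == i);
        }
        printf("shift counts 0..31 decompose correctly\n");
        return 0;
    }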
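
For the same reason, gmx_simd_get_exponent_f_ibm_vmx() builds the
exponent mask and the bias from small splattable literals:
(15 + (15 << 4)) << 23 = 255 << 23 = 0x7F800000, and (1 << 7) - 1 = 127.
A scalar sketch of the same bit manipulation, including the mantissa
extraction (union-based type punning, for illustration only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint32_t expmask = ((uint32_t)(15 + (15 << 4))) << 23; /* 0x7F800000 */
        const int32_t  bias    = (1 << 7) - 1;                       /* 127        */
        union { float f; uint32_t u; } in, mant;
        int32_t exponent;

        in.f     = 48.0f;                           /* 1.5 * 2^5 */
        exponent = (int32_t)((in.u & expmask) >> 23) - bias;

        /* Clear the exponent field and re-insert the bias, mirroring
         * gmx_simd_get_mantissa_f_ibm_vmx (sign assumed cleared). */
        mant.u = (in.u & ~expmask) | ((uint32_t)bias << 23);

        printf("exponent = %d, mantissa = %g\n", exponent, mant.f);
        return 0;                                   /* 5, 1.5 */
    }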
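
gmx_simd_reduce_f_ibm_vmx() sums the four lanes with two vec_sld()
rotations: after rotating by 8 bytes and adding, every lane holds a
pairwise sum; after rotating by 4 bytes and adding again, all four lanes
hold the total, so vec_ste() can store lane 0. A scalar model of the
rotate-and-add pattern (an array stands in for the vector register):

    #include <stdio.h>

    /* Rotate a 4-lane "register" left by n lanes (vec_sld by 4*n bytes) */
    static void rotate4(const float in[4], float out[4], int n)
    {
        int i;
        for (i = 0; i < 4; i++)
        {
            out[i] = in[(i + n) % 4];
        }
    }

    int main(void)
    {
        float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float r[4];
        int   k;

        rotate4(x, r, 2);                          /* vec_sld(x, x, 8) */
        for (k = 0; k < 4; k++) { x[k] += r[k]; }  /* {4, 6, 4, 6}     */
        rotate4(x, r, 1);                          /* vec_sld(x, x, 4) */
        for (k = 0; k < 4; k++) { x[k] += r[k]; }  /* {10, 10, 10, 10} */

        printf("sum = %g\n", x[0]);                /* lane 0: 10       */
        return 0;
    }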
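
Finally, the invargscale0/invargscale1 pairs in gmx_simd_exp_f() and
gmx_simd_exp_d() split ln(2) into a high part that is exactly
representable in the working precision plus a tiny correction, so the
two chained multiply-adds remove intpart*ln(2) with extended effective
precision. A quick scalar check of the single-precision split
(illustrative only):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Constants from gmx_simd_exp_f, signs as in the patched code */
        const float hi = -0.693145751953125f;        /* 22713 * 2^-15,  */
                                                     /* exact binary32  */
        const float lo = -1.428606765330187045e-06f; /* correction term */

        /* The negated sum recovers ln(2) well beyond single precision */
        printf("hi+lo = %.17g\n", -((double)hi + (double)lo));
        printf("ln(2) = %.17g\n", log(2.0));
        return 0;
    }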