From f1234c6422f448e3091d2bcd33053df2e1ad769e Mon Sep 17 00:00:00 2001
From: Erik Lindahl
Date: Fri, 4 Jul 2014 13:36:13 +0200
Subject: [PATCH] Add Power/PowerPC VMX SIMD support

This adds the low-level SIMD implementation for IBM VMX, which is
present on Power6 and later (and PowerPC after PPC970). It does not
yet generate nbnxn kernels, so it is not enabled by default. Unit
tests pass on Power8, where I also had to work around some vec_lvsl()
issues caused by the new little-endian PowerPC architecture.

IBM's implementation of fnmadd has a quirk: it occasionally returns
negative zero where IEEE754 would produce positive zero. This should
not cause any problems in normal Gromacs use, but to avoid problems in
SIMD math I have changed some operations there to use fmadd with the
signs of the constants swapped instead. This might even save a cycle
on some platforms.

Change-Id: I71a70b8807d5bfdba18da887adbcfc97ce3c8cc3
---
 CMakeLists.txt                               |   2 +-
 cmake/gmxTestSimd.cmake                      |  20 ++
 src/config.h.cmakein                         |   3 +
 src/gromacs/gmxlib/gmx_cpuid.c               |  78 +++-
 src/gromacs/legacyheaders/gmx_cpuid.h        |   3 +
 src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h | 360 +++++++++++++++++++
 src/gromacs/simd/simd.h                      |   2 +
 src/gromacs/simd/simd_math.h                 |  96 ++---
 8 files changed, 509 insertions(+), 55 deletions(-)
 create mode 100644 src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7482208571..28faeb375a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,7 @@ gmx_option_multichoice(
     GMX_SIMD
     "SIMD instruction set for CPU kernels and compiler optimization"
     "${GMX_SUGGESTED_SIMD}"
-    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX Sparc64_HPC_ACE Reference)
+    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX IBM_VMX Sparc64_HPC_ACE Reference)

 gmx_option_multichoice(
     GMX_FFT_LIBRARY
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake
index e5d9e255ac..fe477d7f2f 100644
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -313,6 +313,26 @@ elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
         message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.")
     endif()

+elseif(${GMX_SIMD} STREQUAL "IBM_VMX")
+
+    gmx_find_cflag_for_source(CFLAGS_IBM_VMX "C compiler IBM VMX SIMD flag"
+                              "#include <altivec.h>
+                               int main(){vector float x,y=vec_ctf(vec_splat_s32(1),0);x=vec_madd(y,y,y);return vec_all_ge(y,x);}"
+                              SIMD_C_FLAGS
+                              "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
+    gmx_find_cxxflag_for_source(CXXFLAGS_IBM_VMX "C++ compiler IBM VMX SIMD flag"
+                                "#include <altivec.h>
+                                 int main(){vector float x,y=vec_ctf(vec_splat_s32(1),0);x=vec_madd(y,y,y);return vec_all_ge(y,x);}"
+                                SIMD_CXX_FLAGS
+                                "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
+
+    if(NOT CFLAGS_IBM_VMX OR NOT CXXFLAGS_IBM_VMX)
+        message(FATAL_ERROR "Cannot find IBM VMX SIMD compiler flag. Use a newer compiler, or disable VMX SIMD.")
+    endif()
+
+    set(GMX_SIMD_IBM_VMX 1)
+    set(SIMD_STATUS_MESSAGE "Enabling IBM VMX SIMD instructions")
+
 elseif(${GMX_SIMD} STREQUAL "SPARC64_HPC_ACE")

     # Note that GMX_RELAXED_DOUBLE_PRECISION is enabled by default in the top-level CMakeLists.txt
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 00177ee8ff..19f97bc1c4 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -123,6 +123,9 @@
 /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
 #cmakedefine GMX_SIMD_IBM_QPX

+/* IBM VMX was selected as SIMD instructions (Power6 and later) */
+#cmakedefine GMX_SIMD_IBM_VMX
+
 /* Fujitsu Sparc64 HPC-ACE SIMD acceleration */
 #cmakedefine GMX_SIMD_SPARC64_HPC_ACE

diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c
index 3ea2d42c40..ee74bcbb51 100644
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -81,7 +81,7 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "IBM",
+    "IBM", /* Used on Power and BlueGene/Q */
     "ARM"
 };

@@ -93,7 +93,7 @@ gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "ibm", /* Used on BlueGene/Q */
+    "ibm", /* Used on Power and BlueGene/Q */
     "AArch64"
 };

@@ -136,7 +136,9 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
     "x2apic",
     "xop",
     "arm_neon",
-    "arm_neon_asimd"
+    "arm_neon_asimd",
+    "QPX",
+    "VMX"
 };

 const char *
@@ -152,6 +154,7 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
     "AVX2_256",
     "Sparc64 HPC-ACE",
     "IBM_QPX",
+    "IBM_VMX",
     "ARM_NEON",
     "ARM_NEON_ASIMD"
 };

@@ -250,6 +253,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON_ASIMD;
 #elif defined GMX_SIMD_SPARC64_HPC_ACE
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
 #elif defined GMX_SIMD_IBM_QPX
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_QPX;
+#elif defined GMX_SIMD_IBM_VMX
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_IBM_VMX;
 #elif defined GMX_SIMD_REFERENCE
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_REFERENCE;
 #else
@@ -842,6 +847,50 @@ cpuid_check_arm(gmx_cpuid_t cpuid)
 }


+static int
+cpuid_check_ibm(gmx_cpuid_t cpuid)
+{
+#if defined(__linux__) || defined(__linux)
+    FILE *fp;
+    char  buffer[GMX_CPUID_STRLEN], before_colon[GMX_CPUID_STRLEN], after_colon[GMX_CPUID_STRLEN];
+
+    if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL)
+    {
+        while (fgets(buffer, sizeof(buffer), fp) != NULL)
+        {
+            chomp_substring_before_colon(buffer, before_colon, GMX_CPUID_STRLEN);
+            chomp_substring_after_colon(buffer, after_colon, GMX_CPUID_STRLEN);
+
+            if (!strcmp(before_colon, "model name") ||
+                !strcmp(before_colon, "model") ||
+                !strcmp(before_colon, "Processor") ||
+                !strcmp(before_colon, "cpu"))
+            {
+                strncpy(cpuid->brand, after_colon, GMX_CPUID_STRLEN);
+
+                if (strstr(after_colon, "altivec"))
+                {
+                    cpuid->feature[GMX_CPUID_FEATURE_IBM_VMX] = 1;
+                }
+            }
+        }
+        /* Only close the file when fopen() actually succeeded */
+        fclose(fp);
+    }
+
+    if (strstr(cpuid->brand, "A2"))
+    {
+        /* BlueGene/Q */
+        cpuid->feature[GMX_CPUID_FEATURE_IBM_QPX] = 1;
+    }
+#else
+    strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN);
+    cpuid->feature[GMX_CPUID_FEATURE_IBM_QPX] = 0;
+    cpuid->feature[GMX_CPUID_FEATURE_IBM_VMX] = 0;
+#endif
+    return 0;
+}
+
+
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
  */
@@ -883,7 +932,7 @@ cpuid_check_vendor(void)
         while ( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer, sizeof(buffer), fp) != NULL))
         {
             chomp_substring_before_colon(buffer, before_colon, sizeof(before_colon));
-            /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture".
+            /* Intel/AMD use "vendor_id", IBM "vendor", "model", or "cpu". Fujitsu "manufacture".
              * On ARM there does not seem to be a vendor, but ARM or AArch64 is listed in the Processor string.
              * Add others if you have them!
*/ @@ -891,7 +940,8 @@ cpuid_check_vendor(void) || !strcmp(before_colon, "vendor") || !strcmp(before_colon, "manufacture") || !strcmp(before_colon, "model") - || !strcmp(before_colon, "Processor")) + || !strcmp(before_colon, "Processor") + || !strcmp(before_colon, "cpu")) { chomp_substring_after_colon(buffer, after_colon, sizeof(after_colon)); for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++) @@ -905,6 +955,15 @@ cpuid_check_vendor(void) vendor = i; } } + /* If we did not find vendor yet, check if it is IBM: + * On some Power/PowerPC systems it only says power, not IBM. + */ + if (vendor == GMX_CPUID_VENDOR_UNKNOWN && + ((strstr(after_colon, "POWER") || strstr(after_colon, "Power") || + strstr(after_colon, "power")))) + { + vendor = GMX_CPUID_VENDOR_IBM; + } } } } @@ -1017,6 +1076,9 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) case GMX_CPUID_VENDOR_ARM: cpuid_check_arm(cpuid); break; + case GMX_CPUID_VENDOR_IBM: + cpuid_check_ibm(cpuid); + break; default: /* Default value */ strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN); @@ -1172,10 +1234,14 @@ gmx_cpuid_simd_suggest (gmx_cpuid_t cpuid) } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_IBM) { - if (strstr(gmx_cpuid_brand(cpuid), "A2")) + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_IBM_QPX)) { tmpsimd = GMX_CPUID_SIMD_IBM_QPX; } + else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_IBM_VMX)) + { + tmpsimd = GMX_CPUID_SIMD_IBM_VMX; + } } else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_ARM) { diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h index c6bfe9993a..83ad7f6081 100644 --- a/src/gromacs/legacyheaders/gmx_cpuid.h +++ b/src/gromacs/legacyheaders/gmx_cpuid.h @@ -114,6 +114,8 @@ enum gmx_cpuid_feature GMX_CPUID_FEATURE_X86_XOP, /* AMD extended instructions, only AMD for now */ GMX_CPUID_FEATURE_ARM_NEON, /* 32-bit ARM NEON */ GMX_CPUID_FEATURE_ARM_NEON_ASIMD, /* 64-bit ARM AArch64 Advanced SIMD */ + GMX_CPUID_FEATURE_IBM_QPX, /* IBM QPX SIMD (BlueGene/Q and later) */ + GMX_CPUID_FEATURE_IBM_VMX, /* IBM VMX SIMD (Altivec on Power6 and later) */ GMX_CPUID_NFEATURES }; @@ -135,6 +137,7 @@ enum gmx_cpuid_simd GMX_CPUID_SIMD_X86_AVX2_256, GMX_CPUID_SIMD_SPARC64_HPC_ACE, GMX_CPUID_SIMD_IBM_QPX, + GMX_CPUID_SIMD_IBM_VMX, GMX_CPUID_SIMD_ARM_NEON, GMX_CPUID_SIMD_ARM_NEON_ASIMD, GMX_CPUID_NSIMD diff --git a/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h b/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h new file mode 100644 index 0000000000..810a13433b --- /dev/null +++ b/src/gromacs/simd/impl_ibm_vmx/impl_ibm_vmx.h @@ -0,0 +1,360 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPLEMENTATION_IBM_VMX_H
+#define GMX_SIMD_IMPLEMENTATION_IBM_VMX_H
+
+#include <math.h>
+
+#include <altivec.h>
+
+/* Make sure we do not screw up C++ - undefine vector/bool, and rely on __vector and __bool */
+#undef vector
+#undef bool
+
+/* IBM VMX SIMD instruction wrappers. Power6 and later.
+ *
+ * Please see documentation in gromacs/simd/simd.h for the available
+ * defines.
+ */
+/* Capability definitions for IBM VMX */
+#define GMX_SIMD_HAVE_FLOAT
+#undef  GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#undef  GMX_SIMD_HAVE_LOADU
+#undef  GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+/* VMX only provides fmadd/fnmadd (our definitions), but not fmsub/fnmsub.
+ * However, fnmadd is what we need for 1/sqrt(x).
+ */
+#define GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#undef  GMX_SIMD_HAVE_FINT32_EXTRACT
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#undef  GMX_SIMD_HAVE_DINT32
+#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
+#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
+#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#undef  GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH   4
+#undef  GMX_SIMD_DOUBLE_WIDTH
+#define GMX_SIMD_FINT32_WIDTH  4
+#undef  GMX_SIMD_DINT32_WIDTH
+#define GMX_SIMD_RSQRT_BITS   14
+#define GMX_SIMD_RCP_BITS     14
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t           __vector float
+#define gmx_simd_load_f(m)         vec_ld(0, (const __vector float *)m)
+#define gmx_simd_load1_f(m)        gmx_simd_load1_f_ibm_vmx(m)
+#define gmx_simd_set1_f(x)         gmx_simd_set1_f_ibm_vmx(x)
+#define gmx_simd_store_f(m, x)     vec_st(x, 0, (__vector float *)m)
+#undef  gmx_simd_loadu_f
+#undef  gmx_simd_storeu_f
+#define gmx_simd_setzero_f()       ((__vector float)vec_splat_u32(0))
+#define gmx_simd_add_f(a, b)       vec_add(a, b)
+#define gmx_simd_sub_f(a, b)       vec_sub(a, b)
+#define gmx_simd_mul_f(a, b)       vec_mul(a, b)
+#define gmx_simd_fmadd_f(a, b, c)  vec_madd(a, b, c)
+#define gmx_simd_fmsub_f(a, b, c)  vec_sub(vec_mul(a, b), c)
+/* IBM uses an alternative FMA definition, so -a*b+c=-(a*b-c) is "nmsub" */
+#define gmx_simd_fnmadd_f(a, b, c) vec_nmsub(a, b, c)
+/* IBM uses an alternative FMA definition, so -a*b-c=-(a*b+c) is "nmadd" */
+#define gmx_simd_fnmsub_f(a, b, c) vec_sub(gmx_simd_setzero_f(), vec_madd(a, b, c))
+#define gmx_simd_and_f(a, b)       vec_and(a, b)
+#define gmx_simd_andnot_f(a, b)    vec_andc(b, a)
+#define gmx_simd_or_f(a, b)        vec_or(a, b)
+#define gmx_simd_xor_f(a, b)       vec_xor(a, b)
+#define gmx_simd_rsqrt_f(a)        vec_rsqrte(a)
+#define gmx_simd_rcp_f(a)          vec_re(a)
+#define gmx_simd_fabs_f(a)         vec_abs(a)
+#define gmx_simd_fneg_f(a)         vec_xor(a, (__vector float)vec_sl(vec_splat_u32(-1), vec_splat_u32(-1)))
+#define gmx_simd_max_f(a, b)       vec_max(a, b)
+#define gmx_simd_min_f(a, b)       vec_min(a, b)
+#define gmx_simd_round_f(a)        vec_round(a)
+#define gmx_simd_trunc_f(a)        vec_trunc(a)
+#define gmx_simd_fraction_f(x)     vec_sub(x, vec_trunc(x))
+#define gmx_simd_get_exponent_f(a) gmx_simd_get_exponent_f_ibm_vmx(a)
+#define gmx_simd_get_mantissa_f(a) gmx_simd_get_mantissa_f_ibm_vmx(a)
+#define gmx_simd_set_exponent_f(a) gmx_simd_set_exponent_f_ibm_vmx(a)
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t          __vector int
+#define gmx_simd_load_fi(m)        vec_ld(0, (const __vector int *)m)
+#define gmx_simd_set1_fi(i)        gmx_simd_set1_fi_ibm_vmx((int)i)
+#define gmx_simd_store_fi(m, x)    vec_st(x, 0, (__vector int *)m)
+#undef  gmx_simd_loadu_fi
+#undef  gmx_simd_storeu_fi
+#define gmx_simd_setzero_fi()      vec_splat_s32(0)
+#define gmx_simd_cvt_f2i(a)        vec_cts(vec_round(a), 0)
+#define gmx_simd_cvtt_f2i(a)       vec_cts(a, 0)
+#define gmx_simd_cvt_i2f(a)        vec_ctf(a, 0)
+#undef  gmx_simd_extract_fi
+/* Integer logical ops on gmx_simd_fint32_t */
+/* The shift constant magic requires an explanation:
+ * VMX only allows literals up to 15 to be created directly with vec_splat_u32,
+ * and we need to be able to shift up to 31 bits. The code on the right-hand
+ * side splits the constant into three parts with values in the range 0..15.
+ * Since the argument has to be a constant (both our and VMX's requirement), these
+ * constants will be evaluated at compile-time, and if one or two parts evaluate
+ * to zero they will be removed with -O2 or higher optimization (checked).
+ */
+#define gmx_simd_slli_fi(a, i)     vec_sl(a, vec_add(vec_add(vec_splat_u32( (((i&0xF)+(i/16))&0xF)+i/31 ), vec_splat_u32( (i/16)*15 )), vec_splat_u32( (i/31)*15 )))
+#define gmx_simd_srli_fi(a, i)     vec_sr(a, vec_add(vec_add(vec_splat_u32( (((i&0xF)+(i/16))&0xF)+i/31 ), vec_splat_u32( (i/16)*15 )), vec_splat_u32( (i/31)*15 )))
+#define gmx_simd_and_fi(a, b)      vec_and(a, b)
+#define gmx_simd_andnot_fi(a, b)   vec_andc(b, a)
+#define gmx_simd_or_fi(a, b)       vec_or(a, b)
+#define gmx_simd_xor_fi(a, b)      vec_xor(a, b)
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi(a, b)      vec_add(a, b)
+#define gmx_simd_sub_fi(a, b)      vec_sub(a, b)
+#define gmx_simd_mul_fi(a, b)      vec_mule((__vector short)a, (__vector short)b)
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t           __vector __bool int
+#define gmx_simd_cmpeq_f(a, b)     vec_cmpeq(a, b)
+#define gmx_simd_cmplt_f(a, b)     vec_cmplt(a, b)
+#define gmx_simd_cmple_f(a, b)     vec_cmple(a, b)
+#define gmx_simd_and_fb(a, b)      vec_and(a, b)
+#define gmx_simd_or_fb(a, b)       vec_or(a, b)
+#define gmx_simd_anytrue_fb(a)     vec_any_ne(a, (__vector __bool int)vec_splat_u32(0))
+#define gmx_simd_blendzero_f(a, sel)    vec_and(a, (__vector float)sel)
+#define gmx_simd_blendnotzero_f(a, sel) vec_andc(a, (__vector float)sel)
+#define gmx_simd_blendv_f(a, b, sel)    vec_sel(a, b, sel)
+#define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_ibm_vmx(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t          __vector __bool int
+#define gmx_simd_cmpeq_fi(a, b)    vec_cmpeq(a, b)
+#define gmx_simd_cmplt_fi(a, b)    vec_cmplt(a, b)
+#define gmx_simd_and_fib(a, b)     vec_and(a, b)
+#define gmx_simd_or_fib(a, b)      vec_or(a, b)
+#define gmx_simd_anytrue_fib(a)    vec_any_ne(a, (__vector __bool int)vec_splat_u32(0))
+#define gmx_simd_blendzero_fi(a, sel)    vec_and(a, (__vector int)sel)
+#define gmx_simd_blendnotzero_fi(a, sel) vec_andc(a, (__vector int)sel)
+#define gmx_simd_blendv_fi(a, b, sel)    vec_sel(a, b, sel)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib(x)     (x)
+#define gmx_simd_cvt_fib2fb(x)     (x)
+
+/* Double is not available with VMX SIMD */
+
+/****************************************************
+ *      IMPLEMENTATION HELPER FUNCTIONS             *
+ ****************************************************/
+static gmx_inline gmx_simd_float_t
+gmx_simd_set1_f_ibm_vmx(const float x)
+{
+    /* In the old days when PPC was all big endian we could
+     * use the vec_lvsl() instruction to permute bytes based on
+     * a load address. However, at least with gcc-4.8.2 the bytes
+     * end up reversed on Power8 running little endian (Linux).
+     * Since this is not a critical instruction we work around
+     * it by first putting the data in an aligned position before
+     * loading, so we can avoid vec_lvsl() entirely. We can
+     * do this slightly faster on GCC with alignment attributes.
+     */
+    __vector float vx;
+#ifdef __GNUC__
+    float alignedx __attribute__ ((aligned (16)));
+    alignedx = x;
+    vx       = vec_lde(0, &alignedx);
+#else
+    /* The leading vector member merely forces 16-byte alignment of x[] */
+    struct {
+        __vector float vx; float x[4];
+    } conv;
+    conv.x[0] = x;
+    vx        = vec_lde(0, conv.x);
+#endif
+    return vec_splat(vx, 0);
+}
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_load1_f_ibm_vmx(const float * m)
+{
+    return gmx_simd_set1_f_ibm_vmx(*m);
+}
+
+static gmx_inline gmx_simd_fint32_t
+gmx_simd_set1_fi_ibm_vmx(const int x)
+{
+    /* See the comment in gmx_simd_set1_f_ibm_vmx for why we
+     * cannot use vec_lvsl().
+     */
+    __vector int vx;
+#ifdef __GNUC__
+    int alignedx __attribute__ ((aligned (16)));
+    alignedx = x;
+    vx       = vec_lde(0, &alignedx);
+#else
+    /* The leading vector member merely forces 16-byte alignment of x[] */
+    struct {
+        __vector int vx; int x[4];
+    } conv;
+    conv.x[0] = x;
+    vx        = vec_lde(0, conv.x);
+#endif
+    return vec_splat(vx, 0);
+}
+
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f_ibm_vmx(gmx_simd_float_t x)
+{
+    /* Generate 0x7F800000 without memory operations */
+    gmx_simd_float_t  expmask = (__vector float)gmx_simd_slli_fi(vec_add(vec_splat_s32(15), vec_sl(vec_splat_s32(15), vec_splat_u32(4))), 23);
+    gmx_simd_fint32_t i127    = vec_sub(vec_sl(vec_splat_s32(1), vec_splat_u32(7)), vec_splat_s32(1));
+    gmx_simd_fint32_t iexp;
+
+    iexp = (__vector int)gmx_simd_and_f(x, expmask);
+    iexp = vec_sub(gmx_simd_srli_fi(iexp, 23), i127);
+    return vec_ctf(iexp, 0);
+}
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_mantissa_f_ibm_vmx(gmx_simd_float_t x)
+{
+    gmx_simd_float_t expmask = (__vector float)gmx_simd_slli_fi(vec_add(vec_splat_s32(15), vec_sl(vec_splat_s32(15), vec_splat_u32(4))), 23);
+
+    /* Get the mantissa. By taking the absolute value (to get rid of the sign bit) we can
+     * use the same mask as for gmx_simd_get_exponent_f() (but complement it). Since
+     * these two routines are typically called together, this will save a few operations.
+ */ + x = gmx_simd_andnot_f(expmask, vec_abs(x)); + /* Reset zero (but correctly biased) exponent */ + return gmx_simd_or_f(x, vec_ctf(vec_splat_s32(1), 0)); +} + +static gmx_inline gmx_simd_float_t +gmx_simd_set_exponent_f_ibm_vmx(gmx_simd_float_t x) +{ + gmx_simd_fint32_t iexp = gmx_simd_cvt_f2i(x); + gmx_simd_fint32_t i127 = vec_sub(vec_sl(vec_splat_s32(1), vec_splat_u32(7)), vec_splat_s32(1)); + + iexp = gmx_simd_slli_fi(vec_add(iexp, i127), 23); + return (__vector float)iexp; +} + +static gmx_inline float +gmx_simd_reduce_f_ibm_vmx(gmx_simd_float_t x) +{ + float res; + x = vec_add(x, vec_sld(x, x, 8)); + x = vec_add(x, vec_sld(x, x, 4)); + vec_ste(x, 0, &res); + return res; +} + + + +/* SINGLE */ +#define gmx_simd4_float_t gmx_simd_float_t +#define gmx_simd4_load_f gmx_simd_load_f +#define gmx_simd4_load1_f gmx_simd_load1_f +#define gmx_simd4_set1_f gmx_simd_set1_f +#define gmx_simd4_store_f gmx_simd_store_f +#define gmx_simd4_loadu_f gmx_simd_loadu_f +#define gmx_simd4_storeu_f gmx_simd_storeu_f +#define gmx_simd4_setzero_f gmx_simd_setzero_f +#define gmx_simd4_add_f gmx_simd_add_f +#define gmx_simd4_sub_f gmx_simd_sub_f +#define gmx_simd4_mul_f gmx_simd_mul_f +#define gmx_simd4_fmadd_f gmx_simd_fmadd_f +#define gmx_simd4_fmsub_f gmx_simd_fmsub_f +#define gmx_simd4_fnmadd_f gmx_simd_fnmadd_f +#define gmx_simd4_fnmsub_f gmx_simd_fnmsub_f +#define gmx_simd4_and_f gmx_simd_and_f +#define gmx_simd4_andnot_f gmx_simd_andnot_f +#define gmx_simd4_or_f gmx_simd_or_f +#define gmx_simd4_xor_f gmx_simd_xor_f +#define gmx_simd4_rsqrt_f gmx_simd_rsqrt_f +#define gmx_simd4_rcp_f gmx_simd_rcp_f +#define gmx_simd4_fabs_f gmx_simd_fabs_f +#define gmx_simd4_fneg_f gmx_simd_fneg_f +#define gmx_simd4_max_f gmx_simd_max_f +#define gmx_simd4_min_f gmx_simd_min_f +#define gmx_simd4_round_f gmx_simd_round_f +#define gmx_simd4_trunc_f gmx_simd_trunc_f +#define gmx_simd4_fraction_f gmx_simd_fraction_f +#define gmx_simd4_get_exponent_f gmx_simd_get_exponent_f +#define gmx_simd4_get_mantissa_f gmx_simd_get_mantissa_f +#define gmx_simd4_set_exponent_f gmx_simd_set_exponent_f +#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_ibm_vmx +#define gmx_simd4_fint32_t gmx_simd_fint32_t +#define gmx_simd4_load_fi gmx_simd_load_fi +#define gmx_simd4_load1_fi gmx_simd_load1_fi +#define gmx_simd4_set1_fi gmx_simd_set1_fi +#define gmx_simd4_store_fi gmx_simd_store_fi +#define gmx_simd4_loadu_fi gmx_simd_loadu_fi +#define gmx_simd4_storeu_fi gmx_simd_storeu_fi +#define gmx_simd4_setzero_fi gmx_simd_setzero_fi +#define gmx_simd4_cvt_f2i gmx_simd_cvt_f2i +#define gmx_simd4_cvtt_f2i gmx_simd_cvtt_f2i +#define gmx_simd4_cvt_i2f gmx_simd_cvt_i2f +#define gmx_simd4_fbool_t gmx_simd_fbool_t +#define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f +#define gmx_simd4_cmplt_f gmx_simd_cmplt_f +#define gmx_simd4_cmple_f gmx_simd_cmple_f +#define gmx_simd4_and_fb gmx_simd_and_fb +#define gmx_simd4_or_fb gmx_simd_or_fb +#define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb +#define gmx_simd4_blendzero_f gmx_simd_blendzero_f +#define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f +#define gmx_simd4_blendv_f gmx_simd_blendv_f +#define gmx_simd4_reduce_f gmx_simd_reduce_f + +static gmx_inline float +gmx_simd4_dotproduct3_f_ibm_vmx(gmx_simd4_float_t a, gmx_simd4_float_t b) +{ + gmx_simd4_float_t c = vec_mul(a, b); + /* Keep only elements 0,1,2 by shifting in zero from right */ + c = vec_sld(c, gmx_simd_setzero_f(), 4); + /* calculate sum */ + return gmx_simd_reduce_f_ibm_vmx(c); +} + +/* Function to check whether SIMD operations have resulted in 
overflow. + * For now, this is unfortunately a dummy for this architecture. + */ +static int +gmx_simd_check_and_reset_overflow(void) +{ + return 0; +} + +#endif /* GMX_SIMD_IMPLEMENTATION_IBM_VMX_H */ diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h index 988ae65044..5e0128e392 100644 --- a/src/gromacs/simd/simd.h +++ b/src/gromacs/simd/simd.h @@ -129,6 +129,8 @@ static gmx_inline double * gmx_simd4_align_d(double *p); # include "impl_arm_neon_asimd/impl_arm_neon_asimd.h" #elif defined GMX_SIMD_IBM_QPX # include "impl_ibm_qpx/impl_ibm_qpx.h" +#elif defined GMX_SIMD_IBM_VMX +# include "impl_ibm_vmx/impl_ibm_vmx.h" #elif defined GMX_SIMD_SPARC64_HPC_ACE # include "impl_sparc64_hpc_ace/impl_sparc64_hpc_ace.h" #elif (defined GMX_SIMD_REFERENCE) || (defined DOXYGEN) diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h index 85d545c3d0..e2cbacdd3a 100644 --- a/src/gromacs/simd/simd_math.h +++ b/src/gromacs/simd/simd_math.h @@ -360,8 +360,8 @@ gmx_simd_exp_f(gmx_simd_float_t x) const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f); /* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */ const gmx_simd_float_t arglimit = gmx_simd_set1_f(126.0f); - const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(0.693145751953125f); - const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(1.428606765330187045e-06f); + const gmx_simd_float_t invargscale0 = gmx_simd_set1_f(-0.693145751953125f); + const gmx_simd_float_t invargscale1 = gmx_simd_set1_f(-1.428606765330187045e-06f); const gmx_simd_float_t CC4 = gmx_simd_set1_f(0.00136324646882712841033936f); const gmx_simd_float_t CC3 = gmx_simd_set1_f(0.00836596917361021041870117f); const gmx_simd_float_t CC2 = gmx_simd_set1_f(0.0416710823774337768554688f); @@ -380,8 +380,8 @@ gmx_simd_exp_f(gmx_simd_float_t x) fexppart = gmx_simd_blendzero_f(fexppart, valuemask); /* Extended precision arithmetics */ - x = gmx_simd_fnmadd_f(invargscale0, intpart, x); - x = gmx_simd_fnmadd_f(invargscale1, intpart, x); + x = gmx_simd_fmadd_f(invargscale0, intpart, x); + x = gmx_simd_fmadd_f(invargscale1, intpart, x); p = gmx_simd_fmadd_f(CC4, x, CC3); p = gmx_simd_fmadd_f(p, x, CC2); @@ -709,10 +709,10 @@ static gmx_inline void gmx_simdcall gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval) { /* Constants to subtract Pi/4*x from y while minimizing precision loss */ - const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125); - const gmx_simd_float_t argred1 = gmx_simd_set1_f(4.83751296997070312500e-04f); - const gmx_simd_float_t argred2 = gmx_simd_set1_f(7.54953362047672271729e-08f); - const gmx_simd_float_t argred3 = gmx_simd_set1_f(2.56334406825708960298e-12f); + const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125); + const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f); + const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f); + const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f); const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI); const gmx_simd_float_t const_sin2 = gmx_simd_set1_f(-1.9515295891e-4f); const gmx_simd_float_t const_sin1 = gmx_simd_set1_f( 8.3321608736e-3f); @@ -790,10 +790,10 @@ gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t /* where mask is FALSE, set sign. 
*/ csign = gmx_simd_xor_sign_f(csign, gmx_simd_blendv_f(gmx_simd_set1_f(-1.0f), one, mask)); #endif - x = gmx_simd_fnmadd_f(y, argred0, x); - x = gmx_simd_fnmadd_f(y, argred1, x); - x = gmx_simd_fnmadd_f(y, argred2, x); - x = gmx_simd_fnmadd_f(y, argred3, x); + x = gmx_simd_fmadd_f(y, argred0, x); + x = gmx_simd_fmadd_f(y, argred1, x); + x = gmx_simd_fmadd_f(y, argred2, x); + x = gmx_simd_fmadd_f(y, argred3, x); x2 = gmx_simd_mul_f(x, x); psin = gmx_simd_fmadd_f(const_sin2, x2, const_sin1); @@ -862,10 +862,10 @@ gmx_simd_cos_f(gmx_simd_float_t x) static gmx_inline gmx_simd_float_t gmx_simdcall gmx_simd_tan_f(gmx_simd_float_t x) { - const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125); - const gmx_simd_float_t argred1 = gmx_simd_set1_f(4.83751296997070312500e-04f); - const gmx_simd_float_t argred2 = gmx_simd_set1_f(7.54953362047672271729e-08f); - const gmx_simd_float_t argred3 = gmx_simd_set1_f(2.56334406825708960298e-12f); + const gmx_simd_float_t argred0 = gmx_simd_set1_f(-1.5703125); + const gmx_simd_float_t argred1 = gmx_simd_set1_f(-4.83751296997070312500e-04f); + const gmx_simd_float_t argred2 = gmx_simd_set1_f(-7.54953362047672271729e-08f); + const gmx_simd_float_t argred3 = gmx_simd_set1_f(-2.56334406825708960298e-12f); const gmx_simd_float_t two_over_pi = gmx_simd_set1_f(2.0f/M_PI); const gmx_simd_float_t CT6 = gmx_simd_set1_f(0.009498288995810566122993911); const gmx_simd_float_t CT5 = gmx_simd_set1_f(0.002895755790837379295226923); @@ -886,10 +886,10 @@ gmx_simd_tan_f(gmx_simd_float_t x) y = gmx_simd_round_f(z); mask = gmx_simd_cvt_fib2fb(gmx_simd_cmpeq_fi(gmx_simd_and_fi(iy, ione), ione)); - x = gmx_simd_fnmadd_f(y, argred0, x); - x = gmx_simd_fnmadd_f(y, argred1, x); - x = gmx_simd_fnmadd_f(y, argred2, x); - x = gmx_simd_fnmadd_f(y, argred3, x); + x = gmx_simd_fmadd_f(y, argred0, x); + x = gmx_simd_fmadd_f(y, argred1, x); + x = gmx_simd_fmadd_f(y, argred2, x); + x = gmx_simd_fmadd_f(y, argred3, x); x = gmx_simd_xor_f(gmx_simd_blendzero_f(gmx_simd_set1_f(GMX_FLOAT_NEGZERO), mask), x); #else const gmx_simd_float_t quarter = gmx_simd_set1_f(0.25f); @@ -908,10 +908,10 @@ gmx_simd_tan_f(gmx_simd_float_t x) m3 = gmx_simd_cmple_f(threequarter, q); m1 = gmx_simd_and_fb(m1, m2); mask = gmx_simd_or_fb(m1, m3); - w = gmx_simd_fnmadd_f(y, argred0, w); - w = gmx_simd_fnmadd_f(y, argred1, w); - w = gmx_simd_fnmadd_f(y, argred2, w); - w = gmx_simd_fnmadd_f(y, argred3, w); + w = gmx_simd_fmadd_f(y, argred0, w); + w = gmx_simd_fmadd_f(y, argred1, w); + w = gmx_simd_fmadd_f(y, argred2, w); + w = gmx_simd_fmadd_f(y, argred3, w); w = gmx_simd_blendv_f(w, gmx_simd_fneg_f(w), mask); x = gmx_simd_xor_sign_f(w, x); @@ -1579,8 +1579,8 @@ gmx_simd_exp_d(gmx_simd_double_t x) { const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100); const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0); - const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(0.69314718055966295651160180568695068359375); - const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(2.8235290563031577122588448175013436025525412068e-13); + const gmx_simd_double_t invargscale0 = gmx_simd_set1_d(-0.69314718055966295651160180568695068359375); + const gmx_simd_double_t invargscale1 = gmx_simd_set1_d(-2.8235290563031577122588448175013436025525412068e-13); const gmx_simd_double_t CE12 = gmx_simd_set1_d(2.078375306791423699350304e-09); const gmx_simd_double_t CE11 = gmx_simd_set1_d(2.518173854179933105218635e-08); const gmx_simd_double_t CE10 = gmx_simd_set1_d(2.755842049600488770111608e-07); @@ -1605,8 +1605,8 
@@ gmx_simd_exp_d(gmx_simd_double_t x) fexppart = gmx_simd_blendzero_d(fexppart, valuemask); /* Extended precision arithmetics */ - x = gmx_simd_fnmadd_d(invargscale0, intpart, x); - x = gmx_simd_fnmadd_d(invargscale1, intpart, x); + x = gmx_simd_fmadd_d(invargscale0, intpart, x); + x = gmx_simd_fmadd_d(invargscale1, intpart, x); p = gmx_simd_fmadd_d(CE12, x, CE11); p = gmx_simd_fmadd_d(p, x, CE10); @@ -2001,10 +2001,10 @@ static gmx_inline void gmx_simdcall gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval) { /* Constants to subtract Pi/4*x from y while minimizing precision loss */ - const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978); - const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10); - const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18); - const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27); + const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978); + const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10); + const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18); + const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27); const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI); const gmx_simd_double_t const_sin5 = gmx_simd_set1_d( 1.58938307283228937328511e-10); const gmx_simd_double_t const_sin4 = gmx_simd_set1_d(-2.50506943502539773349318e-08); @@ -2087,10 +2087,10 @@ gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_doubl /* where mask is FALSE, set sign. */ csign = gmx_simd_xor_sign_d(csign, gmx_simd_blendv_d(gmx_simd_set1_d(-1.0), one, mask)); #endif - x = gmx_simd_fnmadd_d(y, argred0, x); - x = gmx_simd_fnmadd_d(y, argred1, x); - x = gmx_simd_fnmadd_d(y, argred2, x); - x = gmx_simd_fnmadd_d(y, argred3, x); + x = gmx_simd_fmadd_d(y, argred0, x); + x = gmx_simd_fmadd_d(y, argred1, x); + x = gmx_simd_fmadd_d(y, argred2, x); + x = gmx_simd_fmadd_d(y, argred3, x); x2 = gmx_simd_mul_d(x, x); psin = gmx_simd_fmadd_d(const_sin5, x2, const_sin4); @@ -2151,10 +2151,10 @@ gmx_simd_cos_d(gmx_simd_double_t x) static gmx_inline gmx_simd_double_t gmx_simdcall gmx_simd_tan_d(gmx_simd_double_t x) { - const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978); - const gmx_simd_double_t argred1 = gmx_simd_set1_d(2*4.9604678871439933374e-10); - const gmx_simd_double_t argred2 = gmx_simd_set1_d(2*1.1258708853173288931e-18); - const gmx_simd_double_t argred3 = gmx_simd_set1_d(2*1.7607799325916000908e-27); + const gmx_simd_double_t argred0 = gmx_simd_set1_d(-2*0.78539816290140151978); + const gmx_simd_double_t argred1 = gmx_simd_set1_d(-2*4.9604678871439933374e-10); + const gmx_simd_double_t argred2 = gmx_simd_set1_d(-2*1.1258708853173288931e-18); + const gmx_simd_double_t argred3 = gmx_simd_set1_d(-2*1.7607799325916000908e-27); const gmx_simd_double_t two_over_pi = gmx_simd_set1_d(2.0/M_PI); const gmx_simd_double_t CT15 = gmx_simd_set1_d(1.01419718511083373224408e-05); const gmx_simd_double_t CT14 = gmx_simd_set1_d(-2.59519791585924697698614e-05); @@ -2184,10 +2184,10 @@ gmx_simd_tan_d(gmx_simd_double_t x) y = gmx_simd_round_d(z); mask = gmx_simd_cvt_dib2db(gmx_simd_cmpeq_di(gmx_simd_and_di(iy, ione), ione)); - x = gmx_simd_fnmadd_d(y, argred0, x); - x = gmx_simd_fnmadd_d(y, argred1, x); - x = gmx_simd_fnmadd_d(y, argred2, x); - x = gmx_simd_fnmadd_d(y, argred3, x); + x = gmx_simd_fmadd_d(y, argred0, x); + x = 
gmx_simd_fmadd_d(y, argred1, x); + x = gmx_simd_fmadd_d(y, argred2, x); + x = gmx_simd_fmadd_d(y, argred3, x); x = gmx_simd_xor_d(gmx_simd_blendzero_d(gmx_simd_set1_d(GMX_DOUBLE_NEGZERO), mask), x); #else const gmx_simd_double_t quarter = gmx_simd_set1_d(0.25); @@ -2206,10 +2206,10 @@ gmx_simd_tan_d(gmx_simd_double_t x) m3 = gmx_simd_cmple_d(threequarter, q); m1 = gmx_simd_and_db(m1, m2); mask = gmx_simd_or_db(m1, m3); - w = gmx_simd_fnmadd_d(y, argred0, w); - w = gmx_simd_fnmadd_d(y, argred1, w); - w = gmx_simd_fnmadd_d(y, argred2, w); - w = gmx_simd_fnmadd_d(y, argred3, w); + w = gmx_simd_fmadd_d(y, argred0, w); + w = gmx_simd_fmadd_d(y, argred1, w); + w = gmx_simd_fmadd_d(y, argred2, w); + w = gmx_simd_fmadd_d(y, argred3, w); w = gmx_simd_blendv_d(w, gmx_simd_fneg_d(w), mask); x = gmx_simd_xor_sign_d(w, x); -- 2.22.0
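
Note on the fnmadd change described in the commit message: with the
definitions in gromacs/simd/simd.h, fnmadd(a, b, c) = c - a*b, which is
algebraically identical to fmadd(-a, b, c) = (-a)*b + c. IBM's
vec_nmsub(a, b, c) instead computes -(a*b - c); when a*b == c exactly,
IEEE754 round-to-nearest gives a*b - c = +0.0, so the final negation
yields -0.0 where c - a*b would yield +0.0. A scalar sketch of the quirk
and of the sign-swap workaround (plain C, separate from the patch):

    #include <stdio.h>

    int main(void)
    {
        double a = 1.5, b = 2.0, c = 3.0;  /* a*b == c exactly        */

        double nmsub_style = -(a*b - c);   /* IBM vec_nmsub form: -0.0 */
        double ieee_style  = c - a*b;      /* IEEE-style fnmadd:  +0.0 */
        double swapped     = (-a)*b + c;   /* fmadd with negated       */
                                           /* constant:           +0.0 */

        printf("%g %g %g\n", nmsub_style, ieee_style, swapped);
        return 0;                          /* prints: -0 0 0           */
    }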
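
The gmx_simd_slli_fi()/gmx_simd_srli_fi() macros in impl_ibm_vmx.h split
the shift count into three parts because vec_splat_u32() can only
materialize literals up to 15, while shifts of up to 31 bits are needed.
All three parts are compile-time constants, so parts that evaluate to
zero disappear at -O2. A throwaway scalar check of the decomposition
(the helper names part0/part1/part2 are hypothetical, not in the patch):

    #include <assert.h>
    #include <stdio.h>

    /* The three vec_splat_u32() literals used for a shift count i; each
     * must lie in 0..15, and together they must sum back to i. */
    static unsigned part0(unsigned i) { return (((i & 0xF) + (i / 16)) & 0xF) + i / 31; }
    static unsigned part1(unsigned i) { return (i / 16) * 15; }
    static unsigned part2(unsigned i) { return (i / 31) * 15; }

    int main(void)
    {
        unsigned i;
        for (i = 0; i <= 31; i++)
        {
            assert(part0(i) <= 15 && part1(i) <= 15 && part2(i) <= 15);
            assert(part0(i) + part1(i) + part2(i) == i);
        }
        printf("shift counts 0..31 decompose correctly\n");
        return 0;
    }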
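
For the same reason, gmx_simd_get_exponent_f_ibm_vmx() builds the
exponent mask and the bias from small splattable literals:
(15 + (15 << 4)) << 23 = 255 << 23 = 0x7F800000, and (1 << 7) - 1 = 127.
A scalar sketch of the same bit manipulation, including the mantissa
extraction (union-based type punning, for illustration only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint32_t expmask = ((uint32_t)(15 + (15 << 4))) << 23; /* 0x7F800000 */
        const int32_t  bias    = (1 << 7) - 1;                       /* 127        */
        union { float f; uint32_t u; } in, mant;
        int32_t exponent;

        in.f     = 48.0f;                           /* 1.5 * 2^5 */
        exponent = (int32_t)((in.u & expmask) >> 23) - bias;

        /* Clear the exponent field and re-insert the bias, mirroring
         * gmx_simd_get_mantissa_f_ibm_vmx (sign assumed cleared). */
        mant.u = (in.u & ~expmask) | ((uint32_t)bias << 23);

        printf("exponent = %d, mantissa = %g\n", exponent, mant.f);
        return 0;                                   /* 5, 1.5 */
    }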
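
gmx_simd_reduce_f_ibm_vmx() sums the four lanes with two vec_sld()
rotations: after rotating by 8 bytes and adding, every lane holds a
pairwise sum; after rotating by 4 bytes and adding again, all four lanes
hold the total, so vec_ste() can store lane 0. A scalar model of the
rotate-and-add pattern (an array stands in for the vector register):

    #include <stdio.h>

    /* Rotate a 4-lane "register" left by n lanes (vec_sld by 4*n bytes) */
    static void rotate4(const float in[4], float out[4], int n)
    {
        int i;
        for (i = 0; i < 4; i++)
        {
            out[i] = in[(i + n) % 4];
        }
    }

    int main(void)
    {
        float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float r[4];
        int   k;

        rotate4(x, r, 2);                          /* vec_sld(x, x, 8) */
        for (k = 0; k < 4; k++) { x[k] += r[k]; }  /* {4, 6, 4, 6}     */
        rotate4(x, r, 1);                          /* vec_sld(x, x, 4) */
        for (k = 0; k < 4; k++) { x[k] += r[k]; }  /* {10, 10, 10, 10} */

        printf("sum = %g\n", x[0]);                /* lane 0: 10       */
        return 0;
    }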
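
Finally, the invargscale0/invargscale1 pairs in gmx_simd_exp_f() and
gmx_simd_exp_d() split ln(2) into a high part that is exactly
representable in the working precision plus a tiny correction, so the
two chained multiply-adds remove intpart*ln(2) with extended effective
precision. A quick scalar check of the single-precision split
(illustrative only):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Constants from gmx_simd_exp_f, signs as in the patched code */
        const float hi = -0.693145751953125f;        /* 22713 * 2^-15,  */
                                                     /* exact binary32  */
        const float lo = -1.428606765330187045e-06f; /* correction term */

        /* The negated sum recovers ln(2) well beyond single precision */
        printf("hi+lo = %.17g\n", -((double)hi + (double)lo));
        printf("ln(2) = %.17g\n", log(2.0));
        return 0;
    }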