From c723a327aba4d1456902d348a5d710569a2d83d4 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Tue, 1 Jul 2014 10:59:57 +0200 Subject: [PATCH] Add 32-bit ARM Neon SIMD support This adds the low-level SIMD implementation for 32-bit ARM Neon instructions. We will still not generate nbnxn kernels for it; that is coming in a future update. For this reason we will also not enable ARM_NEON automatically in GMX_SIMD yet. The port passes our unit tests on tcbl04.theophys.kth.se (ARMv7). Change-Id: I61f771970777e82fcef757ab6915e07061912957 --- CMakeLists.txt | 2 +- cmake/gmxTestSimd.cmake | 20 ++ src/config.h.cmakein | 3 + src/gromacs/gmxlib/gmx_cpuid.c | 103 ++++++- src/gromacs/legacyheaders/gmx_cpuid.h | 3 + src/gromacs/mdlib/nbnxn_search.c | 2 +- .../simd/impl_arm_neon/impl_arm_neon.h | 284 ++++++++++++++++++ src/gromacs/simd/simd.h | 2 + src/gromacs/simd/simd_math.h | 4 + 9 files changed, 405 insertions(+), 18 deletions(-) create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 66144b8960..2eefa8675b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,7 @@ gmx_option_multichoice( GMX_SIMD "SIMD instruction set for CPU kernels and compiler optimization" "${GMX_SUGGESTED_SIMD}" - None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference) + None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON IBM_QPX Sparc64_HPC_ACE Reference) gmx_option_multichoice( GMX_FFT_LIBRARY diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake index f58429a239..47213b67f0 100644 --- a/cmake/gmxTestSimd.cmake +++ b/cmake/gmxTestSimd.cmake @@ -249,6 +249,26 @@ elseif(${GMX_SIMD} STREQUAL "AVX2_256") set(GMX_SIMD_X86_AVX2_256 1) set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions") +elseif(${GMX_SIMD} STREQUAL "ARM_NEON") + + gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler 32-bit ARM NEON flag" + "#include + int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}" + SIMD_C_FLAGS + "-mfpu=neon" "") + gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler 32-bit ARM NEON flag" + "#include + int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}" + SIMD_CXX_FLAGS + "-mfpu=neon" "") + + if(NOT CFLAGS_ARM_NEON OR NOT CXXFLAGS_ARM_NEON) + message(FATAL_ERROR "Cannot find ARM 32-bit NEON compiler flag. Use a newer compiler, or disable NEON SIMD.") + endif() + + set(GMX_SIMD_ARM_NEON 1) + set(SIMD_STATUS_MESSAGE "Enabling 32-bit ARM NEON SIMD instructions") + elseif(${GMX_SIMD} STREQUAL "IBM_QPX") try_compile(TEST_QPX ${CMAKE_BINARY_DIR} diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 4dffa2952a..31e700c760 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -114,6 +114,9 @@ /* AVX2 256-bit SIMD instruction set level was selected */ #cmakedefine GMX_SIMD_X86_AVX2_256 +/* 32-bit ARM NEON SIMD instruction set level was selected */ +#cmakedefine GMX_SIMD_ARM_NEON + /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */ #cmakedefine GMX_SIMD_IBM_QPX diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c index c786c0e5bb..6fd35c42f9 100644 --- a/src/gromacs/gmxlib/gmx_cpuid.c +++ b/src/gromacs/gmxlib/gmx_cpuid.c @@ -85,7 +85,8 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] = "GenuineIntel", "AuthenticAMD", "Fujitsu", - "IBM" + "IBM", + "ARM" }; const char * @@ -96,7 +97,8 @@ gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] = "GenuineIntel", "AuthenticAMD", "Fujitsu", - "ibm" /* Used on BlueGene/Q */ + "ibm", /* Used on BlueGene/Q */ + "arm" }; const char * @@ -136,7 +138,8 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] = "ssse3", "tdt", "x2apic", - "xop" + "xop", + "arm_neon" }; const char * @@ -151,18 +154,19 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] = "AVX_256", "AVX2_256", "Sparc64 HPC-ACE", - "IBM_QPX" + "IBM_QPX", + "ARM_NEON" }; /* Max length of brand string */ -#define GMX_CPUID_BRAND_MAXLEN 256 +#define GMX_CPUID_STRLEN 256 /* Contents of the abstract datatype */ struct gmx_cpuid { enum gmx_cpuid_vendor vendor; - char brand[GMX_CPUID_BRAND_MAXLEN]; + char brand[GMX_CPUID_STRLEN]; int family; int model; int stepping; @@ -240,6 +244,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA; static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE4_1; #elif defined GMX_SIMD_X86_SSE2 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2; +#elif defined GMX_SIMD_ARM_NEON +static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON; #elif defined GMX_SIMD_SPARC64_HPC_ACE static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE; #elif defined GMX_SIMD_IBM_QPX @@ -336,7 +342,7 @@ cpuid_check_common_x86(gmx_cpuid_t cpuid) { int fn, max_stdfn, max_extfn; unsigned int eax, ebx, ecx, edx; - char str[GMX_CPUID_BRAND_MAXLEN]; + char str[GMX_CPUID_STRLEN]; char * p; /* Find largest standard/extended function input value */ @@ -366,11 +372,11 @@ cpuid_check_common_x86(gmx_cpuid_t cpuid) { p++; } - strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN); + strncpy(cpuid->brand, p, GMX_CPUID_STRLEN); } else { - strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN); + strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN); } /* Find basic CPU properties */ @@ -732,7 +738,6 @@ cpuid_check_intel_x86(gmx_cpuid_t cpuid) - static void chomp_substring_before_colon(const char *in, char *s, int maxlength) { @@ -777,6 +782,50 @@ chomp_substring_after_colon(const char *in, char *s, int maxlength) } } +static int +cpuid_check_arm(gmx_cpuid_t cpuid) +{ +#if defined(__linux__) || defined(__linux) + FILE *fp; + char buffer[GMX_CPUID_STRLEN], buffer2[GMX_CPUID_STRLEN], buffer3[GMX_CPUID_STRLEN]; + + if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL) + { + while ( (fgets(buffer, sizeof(buffer), fp) != NULL)) + { + chomp_substring_before_colon(buffer, buffer2, GMX_CPUID_STRLEN); + chomp_substring_after_colon(buffer, buffer3, GMX_CPUID_STRLEN); + + if (!strcmp(buffer2, "Processor")) + { + strncpy(cpuid->brand, buffer3, GMX_CPUID_STRLEN); + } + else if (!strcmp(buffer2, "CPU architecture")) + { + cpuid->family = strtol(buffer3, NULL, 10); + } + else if (!strcmp(buffer2, "CPU part")) + { + cpuid->model = strtol(buffer3, NULL, 16); + } + else if (!strcmp(buffer2, "CPU revision")) + { + cpuid->stepping = strtol(buffer3, NULL, 10); + } + else if (!strcmp(buffer2, "Features") && strstr(buffer3, "neon")) + { + cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 1; + } + } + } + fclose(fp); +#else + /* Strange non-linux platform. We cannot assume that neon is present. */ + cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 0; +#endif + return 0; +} + /* Try to find the vendor of the current CPU, so we know what specific * detection routine to call. */ @@ -788,7 +837,9 @@ cpuid_check_vendor(void) unsigned int eax, ebx, ecx, edx; char vendorstring[13]; FILE * fp; - char buffer[255], before_colon[255], after_colon[255]; + char buffer[GMX_CPUID_STRLEN]; + char before_colon[GMX_CPUID_STRLEN]; + char after_colon[GMX_CPUID_STRLEN]; /* Set default first */ vendor = GMX_CPUID_VENDOR_UNKNOWN; @@ -816,11 +867,15 @@ cpuid_check_vendor(void) while ( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer, sizeof(buffer), fp) != NULL)) { chomp_substring_before_colon(buffer, before_colon, sizeof(before_colon)); - /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture". Add others if you have them! */ + /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture". + * On ARM there does not seem to be a vendor, but ARM is listed in the Processor string. + * Add others if you have them! + */ if (!strcmp(before_colon, "vendor_id") || !strcmp(before_colon, "vendor") || !strcmp(before_colon, "manufacture") - || !strcmp(before_colon, "model")) + || !strcmp(before_colon, "model") + || !strcmp(before_colon, "Processor")) { chomp_substring_after_colon(buffer, after_colon, sizeof(after_colon)); for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++) @@ -838,6 +893,12 @@ cpuid_check_vendor(void) } } fclose(fp); +#elif defined(__arm__) || defined (__arm) + /* If we are using ARM on something that is not linux we have to trust the compiler, + * and we cannot get the extra info that might be present in /proc/cpuinfo. + * This path will not trigger 64-bit arm, which is identified by __aarch64__ instead. + */ + vendor = GMX_CPUID_VENDOR_ARM; #endif return vendor; @@ -905,7 +966,7 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) gmx_cpuid_t cpuid; int i; FILE * fp; - char buffer[255], buffer2[255]; + char buffer[GMX_CPUID_STRLEN], buffer2[GMX_CPUID_STRLEN]; int found_brand; cpuid = malloc(sizeof(*cpuid)); @@ -939,9 +1000,12 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) cpuid_check_amd_x86(cpuid); break; #endif + case GMX_CPUID_VENDOR_ARM: + cpuid_check_arm(cpuid); + break; default: /* Default value */ - strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN); + strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN); #if defined(__linux__) || defined(__linux) /* General Linux. Try to get CPU type from /proc/cpuinfo */ if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL) @@ -953,7 +1017,7 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) /* Intel uses "model name", Fujitsu and IBM "cpu". */ if (!strcmp(buffer2, "model name") || !strcmp(buffer2, "cpu")) { - chomp_substring_after_colon(buffer, cpuid->brand, GMX_CPUID_BRAND_MAXLEN); + chomp_substring_after_colon(buffer, cpuid->brand, GMX_CPUID_STRLEN); found_brand = 1; } } @@ -1099,6 +1163,13 @@ gmx_cpuid_simd_suggest (gmx_cpuid_t cpuid) tmpsimd = GMX_CPUID_SIMD_IBM_QPX; } } + else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_ARM) + { + if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_ARM_NEON)) + { + tmpsimd = GMX_CPUID_SIMD_ARM_NEON; + } + } return tmpsimd; } diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h index d595c51a37..e9362f8c4a 100644 --- a/src/gromacs/legacyheaders/gmx_cpuid.h +++ b/src/gromacs/legacyheaders/gmx_cpuid.h @@ -55,6 +55,7 @@ enum gmx_cpuid_vendor GMX_CPUID_VENDOR_AMD, GMX_CPUID_VENDOR_FUJITSU, GMX_CPUID_VENDOR_IBM, + GMX_CPUID_VENDOR_ARM, GMX_CPUID_NVENDORS }; @@ -111,6 +112,7 @@ enum gmx_cpuid_feature GMX_CPUID_FEATURE_X86_TDT, /* TSC deadline timer */ GMX_CPUID_FEATURE_X86_X2APIC, /* Extended xAPIC Support */ GMX_CPUID_FEATURE_X86_XOP, /* AMD extended instructions, only AMD for now */ + GMX_CPUID_FEATURE_ARM_NEON, /* 32-bit ARM NEON */ GMX_CPUID_NFEATURES }; @@ -132,6 +134,7 @@ enum gmx_cpuid_simd GMX_CPUID_SIMD_X86_AVX2_256, GMX_CPUID_SIMD_SPARC64_HPC_ACE, GMX_CPUID_SIMD_IBM_QPX, + GMX_CPUID_SIMD_ARM_NEON, GMX_CPUID_NSIMD }; diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index 8108297b36..5e2882122d 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -48,7 +48,7 @@ #include "nbnxn_consts.h" /* nbnxn_internal.h included gromacs/simd/macros.h */ #include "nbnxn_internal.h" -#ifdef GMX_NBNXN_SIMD +#ifdef GMX_SIMD #include "gromacs/simd/vector_operations.h" #endif #include "nbnxn_atomdata.h" diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h new file mode 100644 index 0000000000..59d99d6300 --- /dev/null +++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h @@ -0,0 +1,284 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_SIMD_IMPL_ARM_NEON_H +#define GMX_SIMD_IMPL_ARM_NEON_H + +#include + +#include + +/* ARM 32-bit NEON SIMD instruction wrappers + * + * Please see documentation in gromacs/simd/simd.h for defines. + */ + +/* Capability definitions for ARM 32-bit NEON */ +#define GMX_SIMD_HAVE_FLOAT +#undef GMX_SIMD_HAVE_DOUBLE +#define GMX_SIMD_HAVE_HARDWARE +#define GMX_SIMD_HAVE_LOADU +#define GMX_SIMD_HAVE_STOREU +#define GMX_SIMD_HAVE_LOGICAL +#define GMX_SIMD_HAVE_FMA +#undef GMX_SIMD_HAVE_FRACTION +#define GMX_SIMD_HAVE_FINT32 +#define GMX_SIMD_HAVE_FINT32_EXTRACT +#define GMX_SIMD_HAVE_FINT32_LOGICAL +#define GMX_SIMD_HAVE_FINT32_ARITHMETICS +#undef GMX_SIMD_HAVE_DINT32 +#undef GMX_SIMD_HAVE_DINT32_EXTRACT +#undef GMX_SIMD_HAVE_DINT32_LOGICAL +#undef GMX_SIMD_HAVE_DINT32_ARITHMETICS +#define GMX_SIMD4_HAVE_FLOAT +#undef GMX_SIMD4_HAVE_DOUBLE + +/* Implementation details */ +#define GMX_SIMD_FLOAT_WIDTH 4 +#undef GMX_SIMD_DOUBLE_WIDTH +#define GMX_SIMD_FINT32_WIDTH 4 +#undef GMX_SIMD_DINT32_WIDTH +#define GMX_SIMD_RSQRT_BITS 8 +#define GMX_SIMD_RCP_BITS 8 + +/**************************************************** + * SINGLE PRECISION SIMD IMPLEMENTATION * + ****************************************************/ +#define gmx_simd_float_t float32x4_t +#define gmx_simd_load_f vld1q_f32 +#define gmx_simd_load1_f vld1q_dup_f32 +#define gmx_simd_set1_f vdupq_n_f32 +#define gmx_simd_store_f vst1q_f32 +#define gmx_simd_loadu_f vld1q_f32 +#define gmx_simd_storeu_f vst1q_f32 +#define gmx_simd_setzero_f() vdupq_n_f32(0.0f) +#define gmx_simd_add_f vaddq_f32 +#define gmx_simd_sub_f vsubq_f32 +#define gmx_simd_mul_f vmulq_f32 +#ifdef __ARM_FEATURE_FMA +# define gmx_simd_fmadd_f(a, b, c) vfmaq_f32(c, b, a) +# define gmx_simd_fmsub_f(a, b, c) vnegq_f32(vfmsq_f32(c, b, a)) +# define gmx_simd_fnmadd_f(a, b, c) vfmaq_f32(c, b, a) +# define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vfmaq_f32(c, b, a)) +#else +# define gmx_simd_fmadd_f(a, b, c) vmlaq_f32(c, b, a) +# define gmx_simd_fmsub_f(a, b, c) vnegq_f32(vmlsq_f32(c, b, a)) +# define gmx_simd_fnmadd_f(a, b, c) vmlsq_f32(c, b, a) +# define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vmlaq_f32(c, b, a)) +#endif +#define gmx_simd_and_f(a, b) vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) +#define gmx_simd_andnot_f(a, b) vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a))) +#define gmx_simd_or_f(a, b) vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) +#define gmx_simd_xor_f(a, b) vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) +#define gmx_simd_rsqrt_f vrsqrteq_f32 +#define gmx_simd_rsqrt_iter_f(lu, x) vmulq_f32(lu, vrsqrtsq_f32(vmulq_f32(lu, lu), x)) +#define gmx_simd_rcp_f vrecpeq_f32 +#define gmx_simd_rcp_iter_f(lu, x) vmulq_f32(lu, vrecpsq_f32(lu, x)) +#define gmx_simd_fabs_f(x) vabsq_f32(x) +#define gmx_simd_fneg_f(x) vnegq_f32(x) +#define gmx_simd_max_f vmaxq_f32 +#define gmx_simd_min_f vminq_f32 +#define gmx_simd_round_f(x) gmx_simd_cvt_i2f(gmx_simd_cvt_f2i(x)) +#define gmx_simd_trunc_f(x) gmx_simd_cvt_i2f(gmx_simd_cvtt_f2i(x)) +#define gmx_simd_fraction_f(x) vsubq_f32(x, gmx_simd_trunc_f(x)) +#define gmx_simd_get_exponent_f gmx_simd_get_exponent_f_arm_neon +#define gmx_simd_get_mantissa_f gmx_simd_get_mantissa_f_arm_neon +#define gmx_simd_set_exponent_f gmx_simd_set_exponent_f_arm_neon +/* integer datatype corresponding to float: gmx_simd_fint32_t */ +#define gmx_simd_fint32_t int32x4_t +#define gmx_simd_load_fi(m) vld1q_s32(m) +#define gmx_simd_set1_fi vdupq_n_s32 +#define gmx_simd_store_fi(m, x) vst1q_s32(m, x) +#define gmx_simd_loadu_fi(m) vld1q_s32(m) +#define gmx_simd_storeu_fi(m, x) vst1q_s32(m, x) +#define gmx_simd_setzero_fi() vdupq_n_s32(0) +#define gmx_simd_cvtt_f2i vcvtq_s32_f32 +#define gmx_simd_cvt_f2i(x) vcvtq_s32_f32(gmx_simd_add_f(gmx_simd_or_f(gmx_simd_and_f(vdupq_n_f32(-0.0f), x), vdupq_n_f32(0.5f)), x)) +#define gmx_simd_cvt_i2f vcvtq_f32_s32 +#define gmx_simd_extract_fi(x, i) vgetq_lane_s32(x, i) +/* Integer logical ops on gmx_simd_fint32_t */ +#define gmx_simd_slli_fi vshlq_n_s32 +#define gmx_simd_srli_fi vshrq_n_s32 +#define gmx_simd_and_fi vandq_s32 +#define gmx_simd_andnot_fi(a, b) vbicq_s32(b, a) +#define gmx_simd_or_fi vorrq_s32 +#define gmx_simd_xor_fi veorq_s32 +/* Integer arithmetic ops on gmx_simd_fint32_t */ +#define gmx_simd_add_fi vaddq_s32 +#define gmx_simd_sub_fi vsubq_s32 +#define gmx_simd_mul_fi vmulq_s32 +/* Boolean & comparison operations on gmx_simd_float_t */ +#define gmx_simd_fbool_t uint32x4_t +#define gmx_simd_cmpeq_f vceqq_f32 +#define gmx_simd_cmplt_f vcltq_f32 +#define gmx_simd_cmple_f vcleq_f32 +#define gmx_simd_and_fb vandq_u32 +#define gmx_simd_or_fb vorrq_u32 +#define gmx_simd_anytrue_fb gmx_simd_anytrue_fb_arm_neon +#define gmx_simd_blendzero_f(a, sel) vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), sel)) +#define gmx_simd_blendnotzero_f(a, sel) vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), sel)) +#define gmx_simd_blendv_f(a, b, sel) vbslq_f32(sel, b, a) +#define gmx_simd_reduce_f(a) gmx_simd_reduce_f_arm_neon(a) +/* Boolean & comparison operations on gmx_simd_fint32_t */ +#define gmx_simd_fibool_t uint32x4_t +#define gmx_simd_cmpeq_fi vceqq_s32 +#define gmx_simd_cmplt_fi vcltq_s32 +#define gmx_simd_and_fib vandq_u32 +#define gmx_simd_or_fib vorrq_u32 +#define gmx_simd_anytrue_fib gmx_simd_anytrue_fb +#define gmx_simd_blendzero_fi(a, sel) vandq_s32(a, vreinterpretq_s32_u32(sel)) +#define gmx_simd_blendnotzero_fi(a, sel) vbicq_s32(a, vreinterpretq_s32_u32(sel)) +#define gmx_simd_blendv_fi(a, b, sel) vbslq_s32(sel, b, a) +/* Conversions between different booleans */ +#define gmx_simd_cvt_fb2fib(x) (x) +#define gmx_simd_cvt_fib2fb(x) (x) + +/**************************************************** + * NO DOUBLE PRECISION SIMD AVAILABLE * + ****************************************************/ + + +/**************************************************** + * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS * + ****************************************************/ +static gmx_inline gmx_simd_float_t +gmx_simd_get_exponent_f_arm_neon(gmx_simd_float_t x) +{ + const float32x4_t expmask = vreinterpretq_f32_s32( vdupq_n_s32(0x7F800000) ); + int32x4_t iexp; + + iexp = vreinterpretq_s32_f32(gmx_simd_and_f(x, expmask)); + iexp = vsubq_s32(vshrq_n_s32(iexp, 23), vdupq_n_s32(127)); + return vcvtq_f32_s32(iexp); +} + + +static gmx_inline gmx_simd_float_t +gmx_simd_get_mantissa_f_arm_neon(gmx_simd_float_t x) +{ + const float32x4_t mantmask = vreinterpretq_f32_s32( vdupq_n_s32(0x007FFFFF) ); + const float32x4_t one = vdupq_n_f32(1.0f); + + /* Get mantissa */ + x = gmx_simd_and_f(mantmask, x); + /* Reset zero (but correctly biased) exponent */ + return gmx_simd_or_f(x, one); +} + + +static gmx_inline gmx_simd_float_t +gmx_simd_set_exponent_f_arm_neon(gmx_simd_float_t x) +{ + int32x4_t iexp = gmx_simd_cvt_f2i(x); + + iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23); + return vreinterpretq_f32_s32(iexp); +} + +static gmx_inline float +gmx_simd_reduce_f_arm_neon(gmx_simd_float_t a) +{ + float32x4_t b = vextq_f32(a, a, 2); + + a = vaddq_f32(a, b); + b = vextq_f32(a, a, 1); + a = vaddq_f32(a, b); + return vgetq_lane_f32(a, 0); +} + +static gmx_inline int +gmx_simd_anytrue_fb_arm_neon(gmx_simd_fbool_t a) +{ + uint32x4_t b = vextq_u32(a, a, 2); + + a = gmx_simd_or_fb(a, b); + b = vextq_u32(a, a, 1); + a = gmx_simd_or_fb(a, b); + return (vgetq_lane_u32(a, 0) != 0); +} + + +/* ARM 32-bit Neon is already 4-wide in single, so just reuse float type for SIMD4 */ +#define gmx_simd4_float_t gmx_simd_float_t +#define gmx_simd4_load_f gmx_simd_load_f +#define gmx_simd4_load1_f gmx_simd_load1_f +#define gmx_simd4_set1_f gmx_simd_set1_f +#define gmx_simd4_store_f gmx_simd_store_f +#define gmx_simd4_loadu_f gmx_simd_loadu_f +#define gmx_simd4_storeu_f gmx_simd_storeu_f +#define gmx_simd4_setzero_f gmx_simd_setzero_f +#define gmx_simd4_add_f gmx_simd_add_f +#define gmx_simd4_sub_f gmx_simd_sub_f +#define gmx_simd4_mul_f gmx_simd_mul_f +#define gmx_simd4_fmadd_f gmx_simd_fmadd_f +#define gmx_simd4_fmsub_f gmx_simd_fmsub_f +#define gmx_simd4_fnmadd_f gmx_simd_fnmadd_f +#define gmx_simd4_fnmsub_f gmx_simd_fnmsub_f +#define gmx_simd4_and_f gmx_simd_and_f +#define gmx_simd4_andnot_f gmx_simd_andnot_f +#define gmx_simd4_or_f gmx_simd_or_f +#define gmx_simd4_xor_f gmx_simd_xor_f +#define gmx_simd4_rsqrt_f gmx_simd_rsqrt_f +#define gmx_simd4_fabs_f gmx_simd_fabs_f +#define gmx_simd4_fneg_f gmx_simd_fneg_f +#define gmx_simd4_max_f gmx_simd_max_f +#define gmx_simd4_min_f gmx_simd_min_f +#define gmx_simd4_round_f gmx_simd_round_f +#define gmx_simd4_trunc_f gmx_simd_trunc_f +#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_arm_neon +#define gmx_simd4_fbool_t gmx_simd_fbool_t +#define gmx_simd4_cmpeq_f gmx_simd_cmpeq_f +#define gmx_simd4_cmplt_f gmx_simd_cmplt_f +#define gmx_simd4_cmple_f gmx_simd_cmple_f +#define gmx_simd4_and_fb gmx_simd_and_fb +#define gmx_simd4_or_fb gmx_simd_or_fb +#define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb +#define gmx_simd4_blendzero_f gmx_simd_blendzero_f +#define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f +#define gmx_simd4_blendv_f gmx_simd_blendv_f +#define gmx_simd4_reduce_f gmx_simd_reduce_f + +/* SIMD4 Dotproduct helper function */ +static gmx_inline float +gmx_simd4_dotproduct3_f_arm_neon(gmx_simd_float_t a, gmx_simd_float_t b) +{ + gmx_simd_float_t c; + c = gmx_simd_mul_f(a, b); + /* set 4th element to 0, then add all of them */ + c = vsetq_lane_f32(0.0f, c, 3); + return gmx_simd_reduce_f_arm_neon(c); +} + +#endif /* GMX_SIMD_IMPL_ARM_NEON_H */ diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h index 49ca593f50..3b3a2d852e 100644 --- a/src/gromacs/simd/simd.h +++ b/src/gromacs/simd/simd.h @@ -123,6 +123,8 @@ static gmx_inline double * gmx_simd4_align_d(double *p); # include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h" #elif defined GMX_SIMD_X86_SSE2 # include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h" +#elif defined GMX_SIMD_ARM_NEON +# include "gromacs/simd/impl_arm_neon/impl_arm_neon.h" #elif defined GMX_SIMD_IBM_QPX # include "gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h" #elif defined GMX_SIMD_SPARC64_HPC_ACE diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h index 7601aba820..84a43a7709 100644 --- a/src/gromacs/simd/simd_math.h +++ b/src/gromacs/simd/simd_math.h @@ -132,6 +132,7 @@ gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b) #endif } +#ifndef gmx_simd_rsqrt_iter_f /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float. * * This is a low-level routine that should only be used by SIMD math routine @@ -150,6 +151,7 @@ gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x) return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu)); # endif } +#endif /*! \brief Calculate 1/sqrt(x) for SIMD float. * @@ -194,6 +196,7 @@ gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1, *out1 = gmx_simd_invsqrt_f(x1); } +#ifndef gmx_simd_rcp_iter_f /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float. * * This is a low-level routine that should only be used by SIMD math routine @@ -208,6 +211,7 @@ gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x) { return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f))); } +#endif /*! \brief Calculate 1/x for SIMD float. * -- 2.22.0