Add 64-bit AArch64 asimd SIMD support

author Erik Lindahl <erik@kth.se>

Wed, 2 Jul 2014 12:15:32 +0000 (14:15 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 8 Sep 2014 12:21:52 +0000 (14:21 +0200)
author Erik Lindahl <erik@kth.se>
Wed, 2 Jul 2014 12:15:32 +0000 (14:15 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 8 Sep 2014 12:21:52 +0000 (14:21 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 2eefa8675b0f7086ec39e78a34a8b9294f79daae..748220857183b9628fae154aea6d960493d8e2fa 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,7 @@ gmx_option_multichoice(
      GMX_SIMD
      "SIMD instruction set for CPU kernels and compiler optimization"
      "${GMX_SUGGESTED_SIMD}"
-    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON IBM_QPX Sparc64_HPC_ACE Reference)
+    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON ARM_NEON_ASIMD IBM_QPX Sparc64_HPC_ACE Reference)
  
  gmx_option_multichoice(
      GMX_FFT_LIBRARY
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake

index 47213b67f08c01941ddad6d44ce1aa4815d0f780..e5d9e255ac15d42671d7ac9c9fb89b8ac4881484 100644 (file)
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -260,7 +260,7 @@ elseif(${GMX_SIMD} STREQUAL "ARM_NEON")
                                  "#include<arm_neon.h>
                                  int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
                                  SIMD_CXX_FLAGS
-                                "-mfpu=neon" "")
+                                "-mfpu=neon" "-D__STDC_CONSTANT_MACROS" "")
  
      if(NOT CFLAGS_ARM_NEON OR NOT CXXFLAGS_ARM_NEON)
          message(FATAL_ERROR "Cannot find ARM 32-bit NEON compiler flag. Use a newer compiler, or disable NEON SIMD.")
@@ -269,6 +269,36 @@ elseif(${GMX_SIMD} STREQUAL "ARM_NEON")
      set(GMX_SIMD_ARM_NEON 1)
      set(SIMD_STATUS_MESSAGE "Enabling 32-bit ARM NEON SIMD instructions")
  
+elseif(${GMX_SIMD} STREQUAL "ARM_NEON_ASIMD")
+    # Gcc-4.8.1 appears to have a bug where the c++ compiler requires
+    # -D__STDC_CONSTANT_MACROS if we include arm_neon.h
+
+    gmx_find_cflag_for_source(CFLAGS_ARM_NEON_ASIMD "C compiler ARM NEON Advanced SIMD flag"
+                              "#include<arm_neon.h>
+                              int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}"
+                              SIMD_C_FLAGS
+                              "")
+    gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON_ASIMD "C++ compiler ARM NEON Advanced SIMD flag"
+                                "#include<arm_neon.h>
+                                int main(){float64x2_t x=vdupq_n_f64(0.5);x=vfmaq_f64(x,x,x);return vgetq_lane_f64(x,0)>0;}"
+                                SIMD_CXX_FLAGS
+                                "-D__STDC_CONSTANT_MACROS" "")
+
+    if(NOT CFLAGS_ARM_NEON_ASIMD OR NOT CXXFLAGS_ARM_NEON_ASIMD)
+        message(FATAL_ERROR "Cannot find ARM (AArch64) NEON Advanced SIMD compiler flag. Use a newer compiler, or disable SIMD.")
+    endif()
+
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "4.9")
+        message(WARNING "At least gcc-4.8.1 has many bugs for ARM (AArch64) NEON Advanced SIMD compilation. You might need gcc version 4.9 or later.")
+    endif()
+
+    if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.4")
+        message(FATAL_ERROR "Clang version 3.4 or later is required for ARM (AArch64) NEON Advanced SIMD.")
+    endif()
+
+    set(GMX_SIMD_ARM_NEON_ASIMD 1)
+    set(SIMD_STATUS_MESSAGE "Enabling ARM (AArch64) NEON Advanced SIMD instructions")
+
  elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
  
      try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 31e700c7606815f29473606661ae11d3c68d2c19..00177ee8ff4033c612c406f373540c66f0aa048f 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -117,6 +117,9 @@
  /* 32-bit ARM NEON SIMD instruction set level was selected */
  #cmakedefine GMX_SIMD_ARM_NEON
  
+/* ARM (AArch64) NEON Advanced SIMD instruction set level was selected */
+#cmakedefine GMX_SIMD_ARM_NEON_ASIMD
+
  /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
  #cmakedefine GMX_SIMD_IBM_QPX
  
diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c

index 6fd35c42f93176dabc059d540d4238686f417af1..0e69251321b32dfe64b5bd0a8aadee0daf016069 100644 (file)
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -98,7 +98,7 @@ gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] =
      "AuthenticAMD",
      "Fujitsu",
      "ibm", /* Used on BlueGene/Q */
-    "arm"
+    "AArch64"
  };
  
  const char *
@@ -139,7 +139,8 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
      "tdt",
      "x2apic",
      "xop",
-    "arm_neon"
+    "arm_neon",
+    "arm_neon_asimd"
  };
  
  const char *
@@ -155,7 +156,8 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
      "AVX2_256",
      "Sparc64 HPC-ACE",
      "IBM_QPX",
-    "ARM_NEON"
+    "ARM_NEON",
+    "ARM_NEON_ASIMD"
  };
  
  /* Max length of brand string */
@@ -246,6 +248,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE4_1;
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2;
  #elif defined GMX_SIMD_ARM_NEON
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON;
+#elif defined GMX_SIMD_ARM_NEON_ASIMD
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON_ASIMD;
  #elif defined GMX_SIMD_SPARC64_HPC_ACE
  static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
  #elif defined GMX_SIMD_IBM_QPX
@@ -803,6 +807,10 @@ cpuid_check_arm(gmx_cpuid_t                cpuid)
              else if (!strcmp(buffer2, "CPU architecture"))
              {
                  cpuid->family = strtol(buffer3, NULL, 10);
+                if (!strcmp(buffer3, "AArch64"))
+                {
+                    cpuid->family = 8;
+                }
              }
              else if (!strcmp(buffer2, "CPU part"))
              {
@@ -816,16 +824,28 @@ cpuid_check_arm(gmx_cpuid_t                cpuid)
              {
                  cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 1;
              }
+            else if (!strcmp(buffer2, "Features") && strstr(buffer3, "asimd"))
+            {
+                cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON_ASIMD] = 1;
+            }
          }
      }
      fclose(fp);
  #else
-    /* Strange non-linux platform. We cannot assume that neon is present. */
+#    ifdef __aarch64__
+    /* Strange 64-bit non-linux platform. However, since NEON ASIMD is present on all
+     * implementations of AArch64 this far, we assume it is present for now.
+     */
+    cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON_ASIMD] = 1;
+#    else
+    /* Strange 32-bit non-linux platform. We cannot assume that neon is present. */
      cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 0;
+#    endif
  #endif
      return 0;
  }
  
+
  /* Try to find the vendor of the current CPU, so we know what specific
   * detection routine to call.
   */
@@ -868,7 +888,7 @@ cpuid_check_vendor(void)
          {
              chomp_substring_before_colon(buffer, before_colon, sizeof(before_colon));
              /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture".
-             * On ARM there does not seem to be a vendor, but ARM is listed in the Processor string.
+             * On ARM there does not seem to be a vendor, but ARM or AArch64 is listed in the Processor string.
               * Add others if you have them!
               */
              if (!strcmp(before_colon, "vendor_id")
@@ -893,14 +913,12 @@ cpuid_check_vendor(void)
          }
      }
      fclose(fp);
-#elif defined(__arm__) || defined (__arm)
+#elif defined(__arm__) || defined (__arm) || defined(__aarch64__)
      /* If we are using ARM on something that is not linux we have to trust the compiler,
       * and we cannot get the extra info that might be present in /proc/cpuinfo.
-     * This path will not trigger 64-bit arm, which is identified by __aarch64__ instead.
       */
      vendor = GMX_CPUID_VENDOR_ARM;
  #endif
-
      return vendor;
  }
  
@@ -1165,7 +1183,11 @@ gmx_cpuid_simd_suggest  (gmx_cpuid_t                 cpuid)
      }
      else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_ARM)
      {
-        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_ARM_NEON))
+        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_ARM_NEON_ASIMD))
+        {
+            tmpsimd = GMX_CPUID_SIMD_ARM_NEON_ASIMD;
+        }
+        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_ARM_NEON))
          {
              tmpsimd = GMX_CPUID_SIMD_ARM_NEON;
          }
diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h

index e9362f8c4a095364c5013ef8a5b21a59338fa13b..c6bfe9993a1a1c7cae65e0d43235842d523fd61e 100644 (file)
--- a/src/gromacs/legacyheaders/gmx_cpuid.h
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@ -113,6 +113,7 @@ enum gmx_cpuid_feature
      GMX_CPUID_FEATURE_X86_X2APIC,        /* Extended xAPIC Support                       */
      GMX_CPUID_FEATURE_X86_XOP,           /* AMD extended instructions, only AMD for now  */
      GMX_CPUID_FEATURE_ARM_NEON,          /* 32-bit ARM NEON                              */
+    GMX_CPUID_FEATURE_ARM_NEON_ASIMD,    /* 64-bit ARM AArch64 Advanced SIMD             */
      GMX_CPUID_NFEATURES
  };
  
@@ -135,6 +136,7 @@ enum gmx_cpuid_simd
      GMX_CPUID_SIMD_SPARC64_HPC_ACE,
      GMX_CPUID_SIMD_IBM_QPX,
      GMX_CPUID_SIMD_ARM_NEON,
+    GMX_CPUID_SIMD_ARM_NEON_ASIMD,
      GMX_CPUID_NSIMD
  };
  
diff --git a/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h

new file mode 100644 (file)

index 0000000..609354a
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h
@@ -0,0 +1,262 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
+#define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
+
+#include <math.h>
+
+#include <arm_neon.h>
+
+/* ARM (AArch64) NEON Advanced SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Inherit single-precision and integer part from 32-bit arm */
+#include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
+
+/* Override some capability definitions from ARM 32-bit NEON - we now have double */
+#define GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_DINT32
+#define GMX_SIMD_HAVE_DINT32_EXTRACT
+#define GMX_SIMD_HAVE_DINT32_LOGICAL
+#define GMX_SIMD_HAVE_DINT32_ARITHMETICS
+
+/* Implementation details */
+#define GMX_SIMD_DOUBLE_WIDTH        2
+#define GMX_SIMD_DINT32_WIDTH        2
+
+/* NEON ASIMD always has FMA support, so make sure we use that for single too. */
+#undef  gmx_simd_fmadd_f
+#define gmx_simd_fmadd_f(a, b, c)  vfmaq_f32(c, b, a)
+#undef  gmx_simd_fmsub_f
+#define gmx_simd_fmsub_f(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
+#undef  gmx_simd_fnmadd_f
+#define gmx_simd_fnmadd_f(a, b, c) vfmsq_f32(c, b, a)
+#undef  gmx_simd_fnmsub_f
+#define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
+
+/* The rounding instructions were actually added already in ARMv8, but most
+ * compilers did not add intrinsics for them. Make sure we use them for single
+ * precision too when enabling NEON Advanced SIMD.
+ */
+#undef  gmx_simd_round_f
+#define gmx_simd_round_f(x)        vrndnq_f32(x)
+#undef  gmx_simd_trunc_f
+#define gmx_simd_trunc_f(x)        vrndq_f32(x)
+
+/* NEON Advanced SIMD has a real rounding conversion instruction */
+#undef  gmx_simd_cvt_f2i
+#define gmx_simd_cvt_f2i(x)        vcvtnq_s32_f32(x)
+
+/* Since we redefine rounding/conversion-with-rounding, make
+ * sure we use the new operations by redefining the routine
+ * to set the exponent too.
+ */
+#undef  gmx_simd_set_exponent_f
+#define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_arm_neon_asimd
+
+/* We can do more efficient reduce with vector pairwise arithmetic */
+#undef  gmx_simd_reduce_f
+#define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_arm_neon_asimd(a)
+
+/* Pick the largest unsigned integer as a shortcut for any-true */
+#undef  gmx_simd_anytrue_fb
+#define gmx_simd_anytrue_fb(x)     (vmaxvq_u32(x) != 0)
+#undef  gmx_simd_anytrue_fib
+#define gmx_simd_anytrue_fib(x)    (vmaxvq_u32(x) != 0)
+
+/* gcc-4.8 is missing the proper vreinterpretq casts
+ * for 64-bit operands. However, since these datatypes
+ * are opaque to the compiler we can safely cast one
+ * to the other without any conversion happening.
+ */
+
+/****************************************************
+ *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_double_t          float64x2_t
+#define gmx_simd_load_d            vld1q_f64
+#define gmx_simd_load1_d           vld1q_dup_f64
+#define gmx_simd_set1_d            vdupq_n_f64
+#define gmx_simd_store_d           vst1q_f64
+#define gmx_simd_loadu_d           vld1q_f64
+#define gmx_simd_storeu_d          vst1q_f64
+#define gmx_simd_setzero_d()       vdupq_n_f64(0.0)
+#define gmx_simd_add_d             vaddq_f64
+#define gmx_simd_sub_d             vsubq_f64
+#define gmx_simd_mul_d             vmulq_f64
+#define gmx_simd_fmadd_d(a, b, c)  vfmaq_f64(c, b, a)
+#define gmx_simd_fmsub_d(a, b, c)  vnegq_f64(vfmsq_f64(c, b, a))
+#define gmx_simd_fnmadd_d(a, b, c) vfmsq_f64(c, b, a)
+#define gmx_simd_fnmsub_d(a, b, c) vnegq_f64(vfmaq_f64(c, b, a))
+#define gmx_simd_and_d(a, b)        (float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b)))
+#define gmx_simd_andnot_d(a, b)     (float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a)))
+#define gmx_simd_or_d(a, b)         (float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b)))
+#define gmx_simd_xor_d(a, b)        (float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b)))
+#define gmx_simd_rsqrt_d            vrsqrteq_f64
+#define gmx_simd_rsqrt_iter_d(lu, x) vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x))
+#define gmx_simd_rcp_d              vrecpeq_f64
+#define gmx_simd_rcp_iter_d(lu, x)   vmulq_f64(lu, vrecpsq_f64(lu, x))
+#define gmx_simd_fabs_d(x)         vabsq_f64(x)
+#define gmx_simd_fneg_d(x)         vnegq_f64(x)
+#define gmx_simd_max_d             vmaxq_f64
+#define gmx_simd_min_d             vminq_f64
+#define gmx_simd_round_d(x)        vrndnq_f64(x)
+#define gmx_simd_trunc_d(x)        vrndq_f64(x)
+#define gmx_simd_fraction_d(x)     vsubq_f64(x, gmx_simd_trunc_d(x))
+#define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_arm_neon_asimd
+#define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_arm_neon_asimd
+#define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_arm_neon_asimd
+/* integer datatype corresponding to double: gmx_simd_dint32_t */
+#define gmx_simd_dint32_t          int32x2_t
+#define gmx_simd_load_di(m)        vld1_s32(m)
+#define gmx_simd_set1_di           vdup_n_s32
+#define gmx_simd_store_di(m, x)    vst1_s32(m, x)
+#define gmx_simd_loadu_di(m)       vld1_s32(m)
+#define gmx_simd_storeu_di(m, x)   vst1_s32(m, x)
+#define gmx_simd_setzero_di()      vdup_n_s32(0)
+#define gmx_simd_cvtt_d2i(x)       vmovn_s64(vcvtq_s64_f64(x))
+#define gmx_simd_cvt_d2i(x)        vmovn_s64(vcvtnq_s64_f64(x))
+#define gmx_simd_cvt_i2d(x)        vcvtq_f64_s64(vmovl_s32(x))
+#define gmx_simd_extract_di(x, i)  vget_lane_s32(x, i)
+/* Integer logical ops on gmx_simd_dint32_t */
+#define gmx_simd_slli_di           vshl_n_s32
+#define gmx_simd_srli_di           vshr_n_s32
+#define gmx_simd_and_di            vand_s32
+#define gmx_simd_andnot_di(a, b)    vbic_s32(b, a)
+#define gmx_simd_or_di             vorr_s32
+#define gmx_simd_xor_di            veor_s32
+/* Integer arithmetic ops on gmx_simd_dint32_t */
+#define gmx_simd_add_di            vadd_s32
+#define gmx_simd_sub_di            vsub_s32
+#define gmx_simd_mul_di            vmul_s32
+/* Boolean & comparison operations on gmx_simd_double_t */
+#define gmx_simd_dbool_t           uint64x2_t
+#define gmx_simd_cmpeq_d           vceqq_f64
+#define gmx_simd_cmplt_d           vcltq_f64
+#define gmx_simd_cmple_d           vcleq_f64
+#define gmx_simd_and_db            vandq_u64
+#define gmx_simd_or_db             vorrq_u64
+#define gmx_simd_anytrue_db(x)     (vmaxvq_u32((uint32x4_t)(x)) != 0)
+#define gmx_simd_blendzero_d(a, sel)     (float64x2_t)(vandq_u64((uint64x2_t)(a), sel))
+#define gmx_simd_blendnotzero_d(a, sel)  (float64x2_t)(vbicq_u64((uint64x2_t)(a), sel))
+#define gmx_simd_blendv_d(a, b, sel)     vbslq_f64(sel, b, a)
+#define gmx_simd_reduce_d(a)       gmx_simd_reduce_d_arm_neon_asimd(a)
+/* Boolean & comparison operations on gmx_simd_dint32_t */
+#define gmx_simd_dibool_t          uint32x2_t
+#define gmx_simd_cmpeq_di          vceq_s32
+#define gmx_simd_cmplt_di          vclt_s32
+#define gmx_simd_and_dib           vand_u32
+#define gmx_simd_or_dib            vorr_u32
+#define gmx_simd_anytrue_dib(x)    (vmaxv_u32(x) != 0)
+#define gmx_simd_blendzero_di(a, sel)      vand_s32(a, vreinterpret_s32_u32(sel))
+#define gmx_simd_blendnotzero_di(a, sel)  vbic_s32(a, vreinterpret_s32_u32(sel))
+#define gmx_simd_blendv_di(a, b, sel)     vbsl_s32(sel, b, a)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_db2dib(x)     vqmovn_u64(x)
+#define gmx_simd_cvt_dib2db(x)     vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32))
+
+/* Float/double conversion */
+#define gmx_simd_cvt_f2dd(f, d0, d1)  { *d0 = vcvt_f64_f32(vget_low_f32(f)); *d1 = vcvt_high_f64_f32(f); }
+#define gmx_simd_cvt_dd2f(d0, d1)     vcvt_high_f32_f64(vcvt_f32_f64(d0), d1)
+
+/****************************************************
+ * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f_arm_neon_asimd(gmx_simd_float_t x)
+{
+    int32x4_t  iexp = vcvtnq_s32_f32(x);
+
+    iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
+    return vreinterpretq_f32_s32(iexp);
+}
+
+static gmx_inline float
+gmx_simd_reduce_f_arm_neon_asimd(gmx_simd_float_t a)
+{
+    a = vpaddq_f32(a, a);
+    a = vpaddq_f32(a, a);
+    return vgetq_lane_f32(a, 0);
+}
+
+
+/****************************************************
+ * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
+{
+    const float64x2_t expmask    = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) );
+    int64x2_t         iexp;
+
+    iexp = (int64x2_t)(gmx_simd_and_d(x, expmask));
+    iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023));
+    return vcvtq_f64_s64(iexp);
+}
+
+
+static gmx_inline gmx_simd_double_t
+gmx_simd_get_mantissa_d_arm_neon_asimd(gmx_simd_double_t x)
+{
+    const float64x2_t mantmask   = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) );
+    const float64x2_t one        = vdupq_n_f64(1.0);
+
+    /* Get mantissa */
+    x = gmx_simd_and_d(mantmask, x);
+    /* Reset zero (but correctly biased) exponent */
+    return gmx_simd_or_d(x, one);
+}
+
+
+static gmx_inline gmx_simd_double_t
+gmx_simd_set_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
+{
+    int64x2_t  iexp = vcvtnq_s64_f64(x);
+
+    iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52);
+    return (float64x2_t)(iexp);
+}
+
+static gmx_inline double
+gmx_simd_reduce_d_arm_neon_asimd(gmx_simd_double_t a)
+{
+    a = vpaddq_f64(a, a);
+    return vgetq_lane_f64(a, 0);
+}
+
+#endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h

index 3b3a2d852e60365dc5a0e2f51a147da36c002acd..a979fd3e45ad930a0f14bcc11af13eae5dccd09b 100644 (file)
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -125,6 +125,8 @@ static gmx_inline double * gmx_simd4_align_d(double *p);
  #    include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
  #elif defined GMX_SIMD_ARM_NEON
  #    include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
+#elif defined GMX_SIMD_ARM_NEON_ASIMD
+#    include "gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h"
  #elif defined GMX_SIMD_IBM_QPX
  #    include "gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h"
  #elif defined GMX_SIMD_SPARC64_HPC_ACE
diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h

index 84a43a7709572b475a5e2af134aee4c616e1ace5..177aed4b07650c0e6ae2a6f922e46e327ea7a443 100644 (file)
--- a/src/gromacs/simd/simd_math.h
+++ b/src/gromacs/simd/simd_math.h
@@ -1350,6 +1350,7 @@ gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
  #endif
  }
  
+#ifndef gmx_simd_rsqrt_iter_d
  /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD double.
   *
   * \copydetails gmx_simd_rsqrt_iter_f
@@ -1363,7 +1364,7 @@ gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
      return gmx_simd_mul_d(gmx_simd_set1_d(0.5), gmx_simd_mul_d(gmx_simd_sub_d(gmx_simd_set1_d(3.0), gmx_simd_mul_d(gmx_simd_mul_d(lu, lu), x)), lu));
  #endif
  }
-
+#endif
  
  /*! \brief Calculate 1/sqrt(x) for SIMD double
   *
@@ -1428,6 +1429,7 @@ gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0,    gmx_simd_double_t x1,
  #endif
  }
  
+#ifndef gmx_simd_rcp_iter_d
  /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD double.
   *
   * \copydetails gmx_simd_rcp_iter_f
@@ -1437,6 +1439,7 @@ gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
  {
      return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
  }
+#endif
  
  /*! \brief Calculate 1/x for SIMD double.
   *
author	Erik Lindahl <erik@kth.se>
	Wed, 2 Jul 2014 12:15:32 +0000 (14:15 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 8 Sep 2014 12:21:52 +0000 (14:21 +0200)
CMakeLists.txt		patch \| blob \| history
cmake/gmxTestSimd.cmake		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/gromacs/gmxlib/gmx_cpuid.c		patch \| blob \| history
src/gromacs/legacyheaders/gmx_cpuid.h		patch \| blob \| history
src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h	[new file with mode: 0644]	patch \| blob
src/gromacs/simd/simd.h		patch \| blob \| history
src/gromacs/simd/simd_math.h		patch \| blob \| history