From c723a327aba4d1456902d348a5d710569a2d83d4 Mon Sep 17 00:00:00 2001
From: Erik Lindahl <erik@kth.se>
Date: Tue, 1 Jul 2014 10:59:57 +0200
Subject: [PATCH] Add 32-bit ARM Neon SIMD support

This adds the low-level SIMD implementation
for 32-bit ARM Neon instructions. We will still
not generate nbnxn kernels for it; that is coming
in a future update. For this reason we will also
not enable ARM_NEON automatically in GMX_SIMD yet.
The port passes our unit tests on
tcbl04.theophys.kth.se (ARMv7).

Change-Id: I61f771970777e82fcef757ab6915e07061912957
---
 CMakeLists.txt                                |   2 +-
 cmake/gmxTestSimd.cmake                       |  20 ++
 src/config.h.cmakein                          |   3 +
 src/gromacs/gmxlib/gmx_cpuid.c                | 103 ++++++-
 src/gromacs/legacyheaders/gmx_cpuid.h         |   3 +
 src/gromacs/mdlib/nbnxn_search.c              |   2 +-
 .../simd/impl_arm_neon/impl_arm_neon.h        | 284 ++++++++++++++++++
 src/gromacs/simd/simd.h                       |   2 +
 src/gromacs/simd/simd_math.h                  |   4 +
 9 files changed, 405 insertions(+), 18 deletions(-)
 create mode 100644 src/gromacs/simd/impl_arm_neon/impl_arm_neon.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66144b8960..2eefa8675b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,7 @@ gmx_option_multichoice(
     GMX_SIMD
     "SIMD instruction set for CPU kernels and compiler optimization"
     "${GMX_SUGGESTED_SIMD}"
-    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 IBM_QPX Sparc64_HPC_ACE Reference)
+    None SSE2 SSE4.1 AVX_128_FMA AVX_256 AVX2_256 ARM_NEON IBM_QPX Sparc64_HPC_ACE Reference)
 
 gmx_option_multichoice(
     GMX_FFT_LIBRARY
diff --git a/cmake/gmxTestSimd.cmake b/cmake/gmxTestSimd.cmake
index f58429a239..47213b67f0 100644
--- a/cmake/gmxTestSimd.cmake
+++ b/cmake/gmxTestSimd.cmake
@@ -249,6 +249,26 @@ elseif(${GMX_SIMD} STREQUAL "AVX2_256")
     set(GMX_SIMD_X86_AVX2_256 1)
     set(SIMD_STATUS_MESSAGE "Enabling 256-bit AVX2 SIMD instructions")
 
+elseif(${GMX_SIMD} STREQUAL "ARM_NEON")
+
+    gmx_find_cflag_for_source(CFLAGS_ARM_NEON "C compiler 32-bit ARM NEON flag"
+                              "#include<arm_neon.h>
+                              int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
+                              SIMD_C_FLAGS
+                              "-mfpu=neon" "")
+    gmx_find_cxxflag_for_source(CXXFLAGS_ARM_NEON "C++ compiler 32-bit ARM NEON flag"
+                                "#include<arm_neon.h>
+                                int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
+                                SIMD_CXX_FLAGS
+                                "-mfpu=neon" "")
+
+    if(NOT CFLAGS_ARM_NEON OR NOT CXXFLAGS_ARM_NEON)
+        message(FATAL_ERROR "Cannot find ARM 32-bit NEON compiler flag. Use a newer compiler, or disable NEON SIMD.")
+    endif()
+
+    set(GMX_SIMD_ARM_NEON 1)
+    set(SIMD_STATUS_MESSAGE "Enabling 32-bit ARM NEON SIMD instructions")
+
 elseif(${GMX_SIMD} STREQUAL "IBM_QPX")
 
     try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 4dffa2952a..31e700c760 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -114,6 +114,9 @@
 /* AVX2 256-bit SIMD instruction set level was selected */
 #cmakedefine GMX_SIMD_X86_AVX2_256
 
+/* 32-bit ARM NEON SIMD instruction set level was selected */
+#cmakedefine GMX_SIMD_ARM_NEON
+
 /* IBM QPX was selected as SIMD instructions (e.g. BlueGene/Q) */
 #cmakedefine GMX_SIMD_IBM_QPX
 
diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c
index c786c0e5bb..6fd35c42f9 100644
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -85,7 +85,8 @@ gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "IBM"
+    "IBM",
+    "ARM"
 };
 
 const char *
@@ -96,7 +97,8 @@ gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] =
     "GenuineIntel",
     "AuthenticAMD",
     "Fujitsu",
-    "ibm" /* Used on BlueGene/Q */
+    "ibm", /* Used on BlueGene/Q */
+    "arm"
 };
 
 const char *
@@ -136,7 +138,8 @@ gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
     "ssse3",
     "tdt",
     "x2apic",
-    "xop"
+    "xop",
+    "arm_neon"
 };
 
 const char *
@@ -151,18 +154,19 @@ gmx_cpuid_simd_string[GMX_CPUID_NSIMD] =
     "AVX_256",
     "AVX2_256",
     "Sparc64 HPC-ACE",
-    "IBM_QPX"
+    "IBM_QPX",
+    "ARM_NEON"
 };
 
 /* Max length of brand string */
-#define GMX_CPUID_BRAND_MAXLEN 256
+#define GMX_CPUID_STRLEN 256
 
 
 /* Contents of the abstract datatype */
 struct gmx_cpuid
 {
     enum gmx_cpuid_vendor      vendor;
-    char                       brand[GMX_CPUID_BRAND_MAXLEN];
+    char                       brand[GMX_CPUID_STRLEN];
     int                        family;
     int                        model;
     int                        stepping;
@@ -240,6 +244,8 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_AVX_128_FMA;
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE4_1;
 #elif defined GMX_SIMD_X86_SSE2
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_X86_SSE2;
+#elif defined GMX_SIMD_ARM_NEON
+static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_ARM_NEON;
 #elif defined GMX_SIMD_SPARC64_HPC_ACE
 static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_SPARC64_HPC_ACE;
 #elif defined GMX_SIMD_IBM_QPX
@@ -336,7 +342,7 @@ cpuid_check_common_x86(gmx_cpuid_t                cpuid)
 {
     int                       fn, max_stdfn, max_extfn;
     unsigned int              eax, ebx, ecx, edx;
-    char                      str[GMX_CPUID_BRAND_MAXLEN];
+    char                      str[GMX_CPUID_STRLEN];
     char *                    p;
 
     /* Find largest standard/extended function input value */
@@ -366,11 +372,11 @@ cpuid_check_common_x86(gmx_cpuid_t                cpuid)
         {
             p++;
         }
-        strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN);
+        strncpy(cpuid->brand, p, GMX_CPUID_STRLEN);
     }
     else
     {
-        strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
+        strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN);
     }
 
     /* Find basic CPU properties */
@@ -732,7 +738,6 @@ cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 
 
 
-
 static void
 chomp_substring_before_colon(const char *in, char *s, int maxlength)
 {
@@ -777,6 +782,50 @@ chomp_substring_after_colon(const char *in, char *s, int maxlength)
     }
 }
 
+static int
+cpuid_check_arm(gmx_cpuid_t                cpuid)
+{
+#if defined(__linux__) || defined(__linux)
+    FILE *fp;
+    char  buffer[GMX_CPUID_STRLEN], buffer2[GMX_CPUID_STRLEN], buffer3[GMX_CPUID_STRLEN];
+
+    if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL)
+    {
+        while ( (fgets(buffer, sizeof(buffer), fp) != NULL))
+        {
+            chomp_substring_before_colon(buffer, buffer2, GMX_CPUID_STRLEN);
+            chomp_substring_after_colon(buffer, buffer3, GMX_CPUID_STRLEN);
+
+            if (!strcmp(buffer2, "Processor"))
+            {
+                strncpy(cpuid->brand, buffer3, GMX_CPUID_STRLEN);
+            }
+            else if (!strcmp(buffer2, "CPU architecture"))
+            {
+                cpuid->family = strtol(buffer3, NULL, 10);
+            }
+            else if (!strcmp(buffer2, "CPU part"))
+            {
+                cpuid->model = strtol(buffer3, NULL, 16);
+            }
+            else if (!strcmp(buffer2, "CPU revision"))
+            {
+                cpuid->stepping = strtol(buffer3, NULL, 10);
+            }
+            else if (!strcmp(buffer2, "Features") && strstr(buffer3, "neon"))
+            {
+                cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 1;
+            }
+        }
+    }
+    fclose(fp);
+#else
+    /* Strange non-linux platform. We cannot assume that neon is present. */
+    cpuid->feature[GMX_CPUID_FEATURE_ARM_NEON] = 0;
+#endif
+    return 0;
+}
+
 /* Try to find the vendor of the current CPU, so we know what specific
  * detection routine to call.
  */
@@ -788,7 +837,9 @@ cpuid_check_vendor(void)
     unsigned int               eax, ebx, ecx, edx;
     char                       vendorstring[13];
     FILE *                     fp;
-    char                       buffer[255], before_colon[255], after_colon[255];
+    char                       buffer[GMX_CPUID_STRLEN];
+    char                       before_colon[GMX_CPUID_STRLEN];
+    char                       after_colon[GMX_CPUID_STRLEN];
 
     /* Set default first */
     vendor = GMX_CPUID_VENDOR_UNKNOWN;
@@ -816,11 +867,15 @@ cpuid_check_vendor(void)
         while ( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer, sizeof(buffer), fp) != NULL))
         {
             chomp_substring_before_colon(buffer, before_colon, sizeof(before_colon));
-            /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture". Add others if you have them! */
+            /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture".
+             * On ARM there does not seem to be a vendor, but ARM is listed in the Processor string.
+             * Add others if you have them!
+             */
             if (!strcmp(before_colon, "vendor_id")
                 || !strcmp(before_colon, "vendor")
                 || !strcmp(before_colon, "manufacture")
-                || !strcmp(before_colon, "model"))
+                || !strcmp(before_colon, "model")
+                || !strcmp(before_colon, "Processor"))
             {
                 chomp_substring_after_colon(buffer, after_colon, sizeof(after_colon));
                 for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++)
@@ -838,6 +893,12 @@ cpuid_check_vendor(void)
         }
     }
     fclose(fp);
+#elif defined(__arm__) || defined (__arm)
+    /* If we are using ARM on something that is not linux we have to trust the compiler,
+     * and we cannot get the extra info that might be present in /proc/cpuinfo.
+     * This path will not trigger 64-bit arm, which is identified by __aarch64__ instead.
+     */
+    vendor = GMX_CPUID_VENDOR_ARM;
 #endif
 
     return vendor;
@@ -905,7 +966,7 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
     gmx_cpuid_t cpuid;
     int         i;
     FILE *      fp;
-    char        buffer[255], buffer2[255];
+    char        buffer[GMX_CPUID_STRLEN], buffer2[GMX_CPUID_STRLEN];
     int         found_brand;
 
     cpuid = malloc(sizeof(*cpuid));
@@ -939,9 +1000,12 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
             cpuid_check_amd_x86(cpuid);
             break;
 #endif
+        case GMX_CPUID_VENDOR_ARM:
+            cpuid_check_arm(cpuid);
+            break;
         default:
             /* Default value */
-            strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
+            strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_STRLEN);
 #if defined(__linux__) || defined(__linux)
             /* General Linux. Try to get CPU type from /proc/cpuinfo */
             if ( (fp = fopen("/proc/cpuinfo", "r")) != NULL)
@@ -953,7 +1017,7 @@ gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
                     /* Intel uses "model name", Fujitsu and IBM "cpu". */
                     if (!strcmp(buffer2, "model name") || !strcmp(buffer2, "cpu"))
                     {
-                        chomp_substring_after_colon(buffer, cpuid->brand, GMX_CPUID_BRAND_MAXLEN);
+                        chomp_substring_after_colon(buffer, cpuid->brand, GMX_CPUID_STRLEN);
                         found_brand = 1;
                     }
                 }
@@ -1099,6 +1163,13 @@ gmx_cpuid_simd_suggest  (gmx_cpuid_t                 cpuid)
             tmpsimd = GMX_CPUID_SIMD_IBM_QPX;
         }
     }
+    else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_ARM)
+    {
+        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_ARM_NEON))
+        {
+            tmpsimd = GMX_CPUID_SIMD_ARM_NEON;
+        }
+    }
     return tmpsimd;
 }
 
diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h
index d595c51a37..e9362f8c4a 100644
--- a/src/gromacs/legacyheaders/gmx_cpuid.h
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@ -55,6 +55,7 @@ enum gmx_cpuid_vendor
     GMX_CPUID_VENDOR_AMD,
     GMX_CPUID_VENDOR_FUJITSU,
     GMX_CPUID_VENDOR_IBM,
+    GMX_CPUID_VENDOR_ARM,
     GMX_CPUID_NVENDORS
 };
 
@@ -111,6 +112,7 @@ enum gmx_cpuid_feature
     GMX_CPUID_FEATURE_X86_TDT,           /* TSC deadline timer                           */
     GMX_CPUID_FEATURE_X86_X2APIC,        /* Extended xAPIC Support                       */
     GMX_CPUID_FEATURE_X86_XOP,           /* AMD extended instructions, only AMD for now  */
+    GMX_CPUID_FEATURE_ARM_NEON,          /* 32-bit ARM NEON                              */
     GMX_CPUID_NFEATURES
 };
 
@@ -132,6 +134,7 @@ enum gmx_cpuid_simd
     GMX_CPUID_SIMD_X86_AVX2_256,
     GMX_CPUID_SIMD_SPARC64_HPC_ACE,
     GMX_CPUID_SIMD_IBM_QPX,
+    GMX_CPUID_SIMD_ARM_NEON,
     GMX_CPUID_NSIMD
 };
 
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c
index 8108297b36..5e2882122d 100644
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -48,7 +48,7 @@
 #include "nbnxn_consts.h"
 /* nbnxn_internal.h included gromacs/simd/macros.h */
 #include "nbnxn_internal.h"
-#ifdef GMX_NBNXN_SIMD
+#ifdef GMX_SIMD
 #include "gromacs/simd/vector_operations.h"
 #endif
 #include "nbnxn_atomdata.h"
diff --git a/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
new file mode 100644
index 0000000000..59d99d6300
--- /dev/null
+++ b/src/gromacs/simd/impl_arm_neon/impl_arm_neon.h
@@ -0,0 +1,284 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_SIMD_IMPL_ARM_NEON_H
+#define GMX_SIMD_IMPL_ARM_NEON_H
+
+#include <math.h>
+
+#include <arm_neon.h>
+
+/* ARM 32-bit NEON SIMD instruction wrappers
+ *
+ * Please see documentation in gromacs/simd/simd.h for defines.
+ */
+
+/* Capability definitions for ARM 32-bit NEON */
+#define GMX_SIMD_HAVE_FLOAT
+#undef  GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_HARDWARE
+#define GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_STOREU
+#define GMX_SIMD_HAVE_LOGICAL
+#define GMX_SIMD_HAVE_FMA
+#undef  GMX_SIMD_HAVE_FRACTION
+#define GMX_SIMD_HAVE_FINT32
+#define GMX_SIMD_HAVE_FINT32_EXTRACT
+#define GMX_SIMD_HAVE_FINT32_LOGICAL
+#define GMX_SIMD_HAVE_FINT32_ARITHMETICS
+#undef  GMX_SIMD_HAVE_DINT32
+#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
+#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
+#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
+#define GMX_SIMD4_HAVE_FLOAT
+#undef  GMX_SIMD4_HAVE_DOUBLE
+
+/* Implementation details */
+#define GMX_SIMD_FLOAT_WIDTH         4
+#undef  GMX_SIMD_DOUBLE_WIDTH
+#define GMX_SIMD_FINT32_WIDTH        4
+#undef  GMX_SIMD_DINT32_WIDTH
+#define GMX_SIMD_RSQRT_BITS          8
+#define GMX_SIMD_RCP_BITS            8
+
+/****************************************************
+ *      SINGLE PRECISION SIMD IMPLEMENTATION        *
+ ****************************************************/
+#define gmx_simd_float_t           float32x4_t
+#define gmx_simd_load_f            vld1q_f32
+#define gmx_simd_load1_f           vld1q_dup_f32
+#define gmx_simd_set1_f            vdupq_n_f32
+#define gmx_simd_store_f           vst1q_f32
+#define gmx_simd_loadu_f           vld1q_f32
+#define gmx_simd_storeu_f          vst1q_f32
+#define gmx_simd_setzero_f()       vdupq_n_f32(0.0f)
+#define gmx_simd_add_f             vaddq_f32
+#define gmx_simd_sub_f             vsubq_f32
+#define gmx_simd_mul_f             vmulq_f32
+#ifdef __ARM_FEATURE_FMA
+#    define gmx_simd_fmadd_f(a, b, c)  vfmaq_f32(c, b, a)
+#    define gmx_simd_fmsub_f(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
+#    define gmx_simd_fnmadd_f(a, b, c) vfmaq_f32(c, b, a)
+#    define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
+#else
+#    define gmx_simd_fmadd_f(a, b, c)  vmlaq_f32(c, b, a)
+#    define gmx_simd_fmsub_f(a, b, c)  vnegq_f32(vmlsq_f32(c, b, a))
+#    define gmx_simd_fnmadd_f(a, b, c) vmlsq_f32(c, b, a)
+#    define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vmlaq_f32(c, b, a))
+#endif
+#define gmx_simd_and_f(a, b)        vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
+#define gmx_simd_andnot_f(a, b)     vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a)))
+#define gmx_simd_or_f(a, b)         vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
+#define gmx_simd_xor_f(a, b)        vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b)))
+#define gmx_simd_rsqrt_f            vrsqrteq_f32
+#define gmx_simd_rsqrt_iter_f(lu, x) vmulq_f32(lu, vrsqrtsq_f32(vmulq_f32(lu, lu), x))
+#define gmx_simd_rcp_f              vrecpeq_f32
+#define gmx_simd_rcp_iter_f(lu, x)   vmulq_f32(lu, vrecpsq_f32(lu, x))
+#define gmx_simd_fabs_f(x)         vabsq_f32(x)
+#define gmx_simd_fneg_f(x)         vnegq_f32(x)
+#define gmx_simd_max_f             vmaxq_f32
+#define gmx_simd_min_f             vminq_f32
+#define gmx_simd_round_f(x)        gmx_simd_cvt_i2f(gmx_simd_cvt_f2i(x))
+#define gmx_simd_trunc_f(x)        gmx_simd_cvt_i2f(gmx_simd_cvtt_f2i(x))
+#define gmx_simd_fraction_f(x)     vsubq_f32(x, gmx_simd_trunc_f(x))
+#define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_arm_neon
+#define gmx_simd_get_mantissa_f    gmx_simd_get_mantissa_f_arm_neon
+#define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_arm_neon
+/* integer datatype corresponding to float: gmx_simd_fint32_t */
+#define gmx_simd_fint32_t         int32x4_t
+#define gmx_simd_load_fi(m)        vld1q_s32(m)
+#define gmx_simd_set1_fi           vdupq_n_s32
+#define gmx_simd_store_fi(m, x)    vst1q_s32(m, x)
+#define gmx_simd_loadu_fi(m)       vld1q_s32(m)
+#define gmx_simd_storeu_fi(m, x)   vst1q_s32(m, x)
+#define gmx_simd_setzero_fi()      vdupq_n_s32(0)
+#define gmx_simd_cvtt_f2i          vcvtq_s32_f32
+#define gmx_simd_cvt_f2i(x)        vcvtq_s32_f32(gmx_simd_add_f(gmx_simd_or_f(gmx_simd_and_f(vdupq_n_f32(-0.0f), x), vdupq_n_f32(0.5f)), x))
+#define gmx_simd_cvt_i2f           vcvtq_f32_s32
+#define gmx_simd_extract_fi(x, i)  vgetq_lane_s32(x, i)
+/* Integer logical ops on gmx_simd_fint32_t */
+#define gmx_simd_slli_fi           vshlq_n_s32
+#define gmx_simd_srli_fi           vshrq_n_s32
+#define gmx_simd_and_fi            vandq_s32
+#define gmx_simd_andnot_fi(a, b)   vbicq_s32(b, a)
+#define gmx_simd_or_fi             vorrq_s32
+#define gmx_simd_xor_fi            veorq_s32
+/* Integer arithmetic ops on gmx_simd_fint32_t */
+#define gmx_simd_add_fi            vaddq_s32
+#define gmx_simd_sub_fi            vsubq_s32
+#define gmx_simd_mul_fi            vmulq_s32
+/* Boolean & comparison operations on gmx_simd_float_t */
+#define gmx_simd_fbool_t           uint32x4_t
+#define gmx_simd_cmpeq_f           vceqq_f32
+#define gmx_simd_cmplt_f           vcltq_f32
+#define gmx_simd_cmple_f           vcleq_f32
+#define gmx_simd_and_fb            vandq_u32
+#define gmx_simd_or_fb             vorrq_u32
+#define gmx_simd_anytrue_fb        gmx_simd_anytrue_fb_arm_neon
+#define gmx_simd_blendzero_f(a, sel)     vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), sel))
+#define gmx_simd_blendnotzero_f(a, sel)  vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), sel))
+#define gmx_simd_blendv_f(a, b, sel)     vbslq_f32(sel, b, a)
+#define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_arm_neon(a)
+/* Boolean & comparison operations on gmx_simd_fint32_t */
+#define gmx_simd_fibool_t          uint32x4_t
+#define gmx_simd_cmpeq_fi          vceqq_s32
+#define gmx_simd_cmplt_fi          vcltq_s32
+#define gmx_simd_and_fib           vandq_u32
+#define gmx_simd_or_fib            vorrq_u32
+#define gmx_simd_anytrue_fib       gmx_simd_anytrue_fb
+#define gmx_simd_blendzero_fi(a, sel)     vandq_s32(a, vreinterpretq_s32_u32(sel))
+#define gmx_simd_blendnotzero_fi(a, sel)  vbicq_s32(a, vreinterpretq_s32_u32(sel))
+#define gmx_simd_blendv_fi(a, b, sel)     vbslq_s32(sel, b, a)
+/* Conversions between different booleans */
+#define gmx_simd_cvt_fb2fib(x)     (x)
+#define gmx_simd_cvt_fib2fb(x)     (x)
+
+/****************************************************
+ *     NO DOUBLE PRECISION SIMD AVAILABLE           *
+ ****************************************************/
+
+
+/****************************************************
+ * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
+ ****************************************************/
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_exponent_f_arm_neon(gmx_simd_float_t x)
+{
+    const float32x4_t expmask    = vreinterpretq_f32_s32( vdupq_n_s32(0x7F800000) );
+    int32x4_t         iexp;
+
+    iexp = vreinterpretq_s32_f32(gmx_simd_and_f(x, expmask));
+    iexp = vsubq_s32(vshrq_n_s32(iexp, 23), vdupq_n_s32(127));
+    return vcvtq_f32_s32(iexp);
+}
+
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_get_mantissa_f_arm_neon(gmx_simd_float_t x)
+{
+    const float32x4_t mantmask   = vreinterpretq_f32_s32( vdupq_n_s32(0x007FFFFF) );
+    const float32x4_t one        = vdupq_n_f32(1.0f);
+
+    /* Get mantissa */
+    x = gmx_simd_and_f(mantmask, x);
+    /* Reset zero (but correctly biased) exponent */
+    return gmx_simd_or_f(x, one);
+}
+
+
+static gmx_inline gmx_simd_float_t
+gmx_simd_set_exponent_f_arm_neon(gmx_simd_float_t x)
+{
+    int32x4_t  iexp = gmx_simd_cvt_f2i(x);
+
+    iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
+    return vreinterpretq_f32_s32(iexp);
+}
+
+static gmx_inline float
+gmx_simd_reduce_f_arm_neon(gmx_simd_float_t a)
+{
+    float32x4_t b = vextq_f32(a, a, 2);
+
+    a = vaddq_f32(a, b);
+    b = vextq_f32(a, a, 1);
+    a = vaddq_f32(a, b);
+    return vgetq_lane_f32(a, 0);
+}
+
+static gmx_inline int
+gmx_simd_anytrue_fb_arm_neon(gmx_simd_fbool_t a)
+{
+    uint32x4_t b = vextq_u32(a, a, 2);
+
+    a = gmx_simd_or_fb(a, b);
+    b = vextq_u32(a, a, 1);
+    a = gmx_simd_or_fb(a, b);
+    return (vgetq_lane_u32(a, 0) != 0);
+}
+
+
+/* ARM 32-bit Neon is already 4-wide in single, so just reuse float type for SIMD4 */
+#define gmx_simd4_float_t                gmx_simd_float_t
+#define gmx_simd4_load_f                 gmx_simd_load_f
+#define gmx_simd4_load1_f                gmx_simd_load1_f
+#define gmx_simd4_set1_f                 gmx_simd_set1_f
+#define gmx_simd4_store_f                gmx_simd_store_f
+#define gmx_simd4_loadu_f                gmx_simd_loadu_f
+#define gmx_simd4_storeu_f               gmx_simd_storeu_f
+#define gmx_simd4_setzero_f              gmx_simd_setzero_f
+#define gmx_simd4_add_f                  gmx_simd_add_f
+#define gmx_simd4_sub_f                  gmx_simd_sub_f
+#define gmx_simd4_mul_f                  gmx_simd_mul_f
+#define gmx_simd4_fmadd_f                gmx_simd_fmadd_f
+#define gmx_simd4_fmsub_f                gmx_simd_fmsub_f
+#define gmx_simd4_fnmadd_f               gmx_simd_fnmadd_f
+#define gmx_simd4_fnmsub_f               gmx_simd_fnmsub_f
+#define gmx_simd4_and_f                  gmx_simd_and_f
+#define gmx_simd4_andnot_f               gmx_simd_andnot_f
+#define gmx_simd4_or_f                   gmx_simd_or_f
+#define gmx_simd4_xor_f                  gmx_simd_xor_f
+#define gmx_simd4_rsqrt_f                gmx_simd_rsqrt_f
+#define gmx_simd4_fabs_f                 gmx_simd_fabs_f
+#define gmx_simd4_fneg_f                 gmx_simd_fneg_f
+#define gmx_simd4_max_f                  gmx_simd_max_f
+#define gmx_simd4_min_f                  gmx_simd_min_f
+#define gmx_simd4_round_f                gmx_simd_round_f
+#define gmx_simd4_trunc_f                gmx_simd_trunc_f
+#define gmx_simd4_dotproduct3_f          gmx_simd4_dotproduct3_f_arm_neon
+#define gmx_simd4_fbool_t                gmx_simd_fbool_t
+#define gmx_simd4_cmpeq_f                gmx_simd_cmpeq_f
+#define gmx_simd4_cmplt_f                gmx_simd_cmplt_f
+#define gmx_simd4_cmple_f                gmx_simd_cmple_f
+#define gmx_simd4_and_fb                 gmx_simd_and_fb
+#define gmx_simd4_or_fb                  gmx_simd_or_fb
+#define gmx_simd4_anytrue_fb             gmx_simd_anytrue_fb
+#define gmx_simd4_blendzero_f            gmx_simd_blendzero_f
+#define gmx_simd4_blendnotzero_f         gmx_simd_blendnotzero_f
+#define gmx_simd4_blendv_f               gmx_simd_blendv_f
+#define gmx_simd4_reduce_f               gmx_simd_reduce_f
+
+/* SIMD4 Dotproduct helper function */
+static gmx_inline float
+gmx_simd4_dotproduct3_f_arm_neon(gmx_simd_float_t a, gmx_simd_float_t b)
+{
+    gmx_simd_float_t  c;
+    c = gmx_simd_mul_f(a, b);
+    /* set 4th element to 0, then add all of them */
+    c = vsetq_lane_f32(0.0f, c, 3);
+    return gmx_simd_reduce_f_arm_neon(c);
+}
+
+#endif /* GMX_SIMD_IMPL_ARM_NEON_H */
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h
index 49ca593f50..3b3a2d852e 100644
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -123,6 +123,8 @@ static gmx_inline double * gmx_simd4_align_d(double *p);
 #    include "gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h"
 #elif defined GMX_SIMD_X86_SSE2
 #    include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
+#elif defined GMX_SIMD_ARM_NEON
+#    include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
 #elif defined GMX_SIMD_IBM_QPX
 #    include "gromacs/simd/impl_ibm_qpx/impl_ibm_qpx.h"
 #elif defined GMX_SIMD_SPARC64_HPC_ACE
diff --git a/src/gromacs/simd/simd_math.h b/src/gromacs/simd/simd_math.h
index 7601aba820..84a43a7709 100644
--- a/src/gromacs/simd/simd_math.h
+++ b/src/gromacs/simd/simd_math.h
@@ -132,6 +132,7 @@ gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
 #endif
 }
 
+#ifndef gmx_simd_rsqrt_iter_f
 /*! \brief Perform one Newton-Raphson iteration to improve 1/sqrt(x) for SIMD float.
  *
  * This is a low-level routine that should only be used by SIMD math routine
@@ -150,6 +151,7 @@ gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
     return gmx_simd_mul_f(gmx_simd_set1_f(0.5f), gmx_simd_mul_f(gmx_simd_sub_f(gmx_simd_set1_f(3.0f), gmx_simd_mul_f(gmx_simd_mul_f(lu, lu), x)), lu));
 #    endif
 }
+#endif
 
 /*! \brief Calculate 1/sqrt(x) for SIMD float.
  *
@@ -194,6 +196,7 @@ gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0,    gmx_simd_float_t x1,
     *out1 = gmx_simd_invsqrt_f(x1);
 }
 
+#ifndef gmx_simd_rcp_iter_f
 /*! \brief Perform one Newton-Raphson iteration to improve 1/x for SIMD float.
  *
  * This is a low-level routine that should only be used by SIMD math routine
@@ -208,6 +211,7 @@ gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
 {
     return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
 }
+#endif
 
 /*! \brief Calculate 1/x for SIMD float.
  *
-- 
2.22.0