src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
  37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
  38
  39 #include <math.h>
  40
  41 #include <arm_neon.h>
  42
  43 /* ARM (AArch64) NEON Advanced SIMD instruction wrappers
  44  *
  45  * Please see documentation in gromacs/simd/simd.h for defines.
  46  */
  47
  48 /* Inherit single-precision and integer part from 32-bit arm */
  49 #include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
  50
  51 /* Override some capability definitions from ARM 32-bit NEON - we now have double */
  52 #define GMX_SIMD_HAVE_DOUBLE
  53 #define GMX_SIMD_HAVE_DINT32
  54 #define GMX_SIMD_HAVE_DINT32_EXTRACT
  55 #define GMX_SIMD_HAVE_DINT32_LOGICAL
  56 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS
  57
  58 /* Implementation details */
  59 #define GMX_SIMD_DOUBLE_WIDTH        2
  60 #define GMX_SIMD_DINT32_WIDTH        2
  61
  62 /* NEON ASIMD always has FMA support, so make sure we use that for single too. */
  63 #undef  gmx_simd_fmadd_f
  64 #define gmx_simd_fmadd_f(a, b, c)  vfmaq_f32(c, b, a)
  65 #undef  gmx_simd_fmsub_f
  66 #define gmx_simd_fmsub_f(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
  67 #undef  gmx_simd_fnmadd_f
  68 #define gmx_simd_fnmadd_f(a, b, c) vfmsq_f32(c, b, a)
  69 #undef  gmx_simd_fnmsub_f
  70 #define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
  71
  72 /* The rounding instructions were actually added already in ARMv8, but most
  73  * compilers did not add intrinsics for them. Make sure we use them for single
  74  * precision too when enabling NEON Advanced SIMD.
  75  */
  76 #undef  gmx_simd_round_f
  77 #define gmx_simd_round_f(x)        vrndnq_f32(x)
  78 #undef  gmx_simd_trunc_f
  79 #define gmx_simd_trunc_f(x)        vrndq_f32(x)
  80
  81 /* NEON Advanced SIMD has a real rounding conversion instruction */
  82 #undef  gmx_simd_cvt_f2i
  83 #define gmx_simd_cvt_f2i(x)        vcvtnq_s32_f32(x)
  84
  85 /* Since we redefine rounding/conversion-with-rounding, make
  86  * sure we use the new operations by redefining the routine
  87  * to set the exponent too.
  88  */
  89 #undef  gmx_simd_set_exponent_f
  90 #define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_arm_neon_asimd
  91
  92 /* We can do more efficient reduce with vector pairwise arithmetic */
  93 #undef  gmx_simd_reduce_f
  94 #define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_arm_neon_asimd(a)
  95
  96 /* Pick the largest unsigned integer as a shortcut for any-true */
  97 #undef  gmx_simd_anytrue_fb
  98 #define gmx_simd_anytrue_fb(x)     (vmaxvq_u32(x) != 0)
  99 #undef  gmx_simd_anytrue_fib
 100 #define gmx_simd_anytrue_fib(x)    (vmaxvq_u32(x) != 0)
 101
 102 /* gcc-4.8 is missing the proper vreinterpretq casts
 103  * for 64-bit operands. However, since these datatypes
 104  * are opaque to the compiler we can safely cast one
 105  * to the other without any conversion happening.
 106  */
 107
 108 /****************************************************
 109  *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 110  ****************************************************/
 111 #define gmx_simd_double_t          float64x2_t
 112 #define gmx_simd_load_d            vld1q_f64
 113 #define gmx_simd_load1_d           vld1q_dup_f64
 114 #define gmx_simd_set1_d            vdupq_n_f64
 115 #define gmx_simd_store_d           vst1q_f64
 116 #define gmx_simd_loadu_d           vld1q_f64
 117 #define gmx_simd_storeu_d          vst1q_f64
 118 #define gmx_simd_setzero_d()       vdupq_n_f64(0.0)
 119 #define gmx_simd_add_d             vaddq_f64
 120 #define gmx_simd_sub_d             vsubq_f64
 121 #define gmx_simd_mul_d             vmulq_f64
 122 #define gmx_simd_fmadd_d(a, b, c)  vfmaq_f64(c, b, a)
 123 #define gmx_simd_fmsub_d(a, b, c)  vnegq_f64(vfmsq_f64(c, b, a))
 124 #define gmx_simd_fnmadd_d(a, b, c) vfmsq_f64(c, b, a)
 125 #define gmx_simd_fnmsub_d(a, b, c) vnegq_f64(vfmaq_f64(c, b, a))
 126 #define gmx_simd_and_d(a, b)        (float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b)))
 127 #define gmx_simd_andnot_d(a, b)     (float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a)))
 128 #define gmx_simd_or_d(a, b)         (float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b)))
 129 #define gmx_simd_xor_d(a, b)        (float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b)))
 130 #define gmx_simd_rsqrt_d            vrsqrteq_f64
 131 #define gmx_simd_rsqrt_iter_d(lu, x) vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x))
 132 #define gmx_simd_rcp_d              vrecpeq_f64
 133 #define gmx_simd_rcp_iter_d(lu, x)   vmulq_f64(lu, vrecpsq_f64(lu, x))
 134 #define gmx_simd_fabs_d(x)         vabsq_f64(x)
 135 #define gmx_simd_fneg_d(x)         vnegq_f64(x)
 136 #define gmx_simd_max_d             vmaxq_f64
 137 #define gmx_simd_min_d             vminq_f64
 138 #define gmx_simd_round_d(x)        vrndnq_f64(x)
 139 #define gmx_simd_trunc_d(x)        vrndq_f64(x)
 140 #define gmx_simd_fraction_d(x)     vsubq_f64(x, gmx_simd_trunc_d(x))
 141 #define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_arm_neon_asimd
 142 #define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_arm_neon_asimd
 143 #define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_arm_neon_asimd
 144 /* integer datatype corresponding to double: gmx_simd_dint32_t */
 145 #define gmx_simd_dint32_t          int32x2_t
 146 #define gmx_simd_load_di(m)        vld1_s32(m)
 147 #define gmx_simd_set1_di           vdup_n_s32
 148 #define gmx_simd_store_di(m, x)    vst1_s32(m, x)
 149 #define gmx_simd_loadu_di(m)       vld1_s32(m)
 150 #define gmx_simd_storeu_di(m, x)   vst1_s32(m, x)
 151 #define gmx_simd_setzero_di()      vdup_n_s32(0)
 152 #define gmx_simd_cvtt_d2i(x)       vmovn_s64(vcvtq_s64_f64(x))
 153 #define gmx_simd_cvt_d2i(x)        vmovn_s64(vcvtnq_s64_f64(x))
 154 #define gmx_simd_cvt_i2d(x)        vcvtq_f64_s64(vmovl_s32(x))
 155 #define gmx_simd_extract_di(x, i)  vget_lane_s32(x, i)
 156 /* Integer logical ops on gmx_simd_dint32_t */
 157 #define gmx_simd_slli_di           vshl_n_s32
 158 #define gmx_simd_srli_di           vshr_n_s32
 159 #define gmx_simd_and_di            vand_s32
 160 #define gmx_simd_andnot_di(a, b)    vbic_s32(b, a)
 161 #define gmx_simd_or_di             vorr_s32
 162 #define gmx_simd_xor_di            veor_s32
 163 /* Integer arithmetic ops on gmx_simd_dint32_t */
 164 #define gmx_simd_add_di            vadd_s32
 165 #define gmx_simd_sub_di            vsub_s32
 166 #define gmx_simd_mul_di            vmul_s32
 167 /* Boolean & comparison operations on gmx_simd_double_t */
 168 #define gmx_simd_dbool_t           uint64x2_t
 169 #define gmx_simd_cmpeq_d           vceqq_f64
 170 #define gmx_simd_cmplt_d           vcltq_f64
 171 #define gmx_simd_cmple_d           vcleq_f64
 172 #define gmx_simd_and_db            vandq_u64
 173 #define gmx_simd_or_db             vorrq_u64
 174 #define gmx_simd_anytrue_db(x)     (vmaxvq_u32((uint32x4_t)(x)) != 0)
 175 #define gmx_simd_blendzero_d(a, sel)     (float64x2_t)(vandq_u64((uint64x2_t)(a), sel))
 176 #define gmx_simd_blendnotzero_d(a, sel)  (float64x2_t)(vbicq_u64((uint64x2_t)(a), sel))
 177 #define gmx_simd_blendv_d(a, b, sel)     vbslq_f64(sel, b, a)
 178 #define gmx_simd_reduce_d(a)       gmx_simd_reduce_d_arm_neon_asimd(a)
 179 /* Boolean & comparison operations on gmx_simd_dint32_t */
 180 #define gmx_simd_dibool_t          uint32x2_t
 181 #define gmx_simd_cmpeq_di          vceq_s32
 182 #define gmx_simd_cmplt_di          vclt_s32
 183 #define gmx_simd_and_dib           vand_u32
 184 #define gmx_simd_or_dib            vorr_u32
 185 #define gmx_simd_anytrue_dib(x)    (vmaxv_u32(x) != 0)
 186 #define gmx_simd_blendzero_di(a, sel)      vand_s32(a, vreinterpret_s32_u32(sel))
 187 #define gmx_simd_blendnotzero_di(a, sel)  vbic_s32(a, vreinterpret_s32_u32(sel))
 188 #define gmx_simd_blendv_di(a, b, sel)     vbsl_s32(sel, b, a)
 189 /* Conversions between different booleans */
 190 #define gmx_simd_cvt_db2dib(x)     vqmovn_u64(x)
 191 #define gmx_simd_cvt_dib2db(x)     vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32))
 192
 193 /* Float/double conversion */
 194 #define gmx_simd_cvt_f2dd(f, d0, d1)  { *d0 = vcvt_f64_f32(vget_low_f32(f)); *d1 = vcvt_high_f64_f32(f); }
 195 #define gmx_simd_cvt_dd2f(d0, d1)     vcvt_high_f32_f64(vcvt_f32_f64(d0), d1)
 196
 197 /****************************************************
 198  * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 199  ****************************************************/
 200 static gmx_inline gmx_simd_float_t
 201 gmx_simd_set_exponent_f_arm_neon_asimd(gmx_simd_float_t x)
 202 {
 203     int32x4_t  iexp = vcvtnq_s32_f32(x);
 204
 205     iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
 206     return vreinterpretq_f32_s32(iexp);
 207 }
 208
 209 static gmx_inline float
 210 gmx_simd_reduce_f_arm_neon_asimd(gmx_simd_float_t a)
 211 {
 212     a = vpaddq_f32(a, a);
 213     a = vpaddq_f32(a, a);
 214     return vgetq_lane_f32(a, 0);
 215 }
 216
 217
 218 /****************************************************
 219  * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 220  ****************************************************/
 221 static gmx_inline gmx_simd_double_t
 222 gmx_simd_get_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
 223 {
 224     const float64x2_t expmask    = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) );
 225     int64x2_t         iexp;
 226
 227     iexp = (int64x2_t)(gmx_simd_and_d(x, expmask));
 228     iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023));
 229     return vcvtq_f64_s64(iexp);
 230 }
 231
 232
 233 static gmx_inline gmx_simd_double_t
 234 gmx_simd_get_mantissa_d_arm_neon_asimd(gmx_simd_double_t x)
 235 {
 236     const float64x2_t mantmask   = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) );
 237     const float64x2_t one        = vdupq_n_f64(1.0);
 238
 239     /* Get mantissa */
 240     x = gmx_simd_and_d(mantmask, x);
 241     /* Reset zero (but correctly biased) exponent */
 242     return gmx_simd_or_d(x, one);
 243 }
 244
 245
 246 static gmx_inline gmx_simd_double_t
 247 gmx_simd_set_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
 248 {
 249     int64x2_t  iexp = vcvtnq_s64_f64(x);
 250
 251     iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52);
 252     return (float64x2_t)(iexp);
 253 }
 254
 255 static gmx_inline double
 256 gmx_simd_reduce_d_arm_neon_asimd(gmx_simd_double_t a)
 257 {
 258     a = vpaddq_f64(a, a);
 259     return vgetq_lane_f64(a, 0);
 260 }
 261
 262 #endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */