src/gromacs/simd/impl_x86_sse2/impl_x86_sse2.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_SSE2_H
  37 #define GMX_SIMD_IMPL_X86_SSE2_H
  38
  39 #include "config.h"
  40
  41 #include <math.h>
  42
  43 #include <emmintrin.h>
  44
   45 /* Set capabilities that can be inherited */
   46 #define GMX_SIMD_X86_SSE2_OR_HIGHER    /* Valueless marker, meant for #ifdef tests. NOTE(review): presumably also set by wider x86 SIMD headers - confirm */
   47
   48 /* x86 SSE2 SIMD instruction wrappers
   49  *
   50  * Please see documentation in gromacs/simd/simd.h for defines.
   51  */
   52
   53 /* Capability definitions for SSE2: a flag is #define'd when this header provides the feature and #undef'd when it does not */
   54 #define GMX_SIMD_HAVE_FLOAT            /* gmx_simd_float_t and the gmx_simd_*_f operations are provided below */
   55 #define GMX_SIMD_HAVE_DOUBLE           /* gmx_simd_double_t and the gmx_simd_*_d operations are provided below */
   56 #define GMX_SIMD_HAVE_HARDWARE         /* NOTE(review): exact meaning is documented in simd.h, not visible here */
   57 #define GMX_SIMD_HAVE_LOADU            /* gmx_simd_loadu_* map to the unaligned-load intrinsics */
   58 #define GMX_SIMD_HAVE_STOREU           /* gmx_simd_storeu_* map to the unaligned-store intrinsics */
   59 #define GMX_SIMD_HAVE_LOGICAL          /* and/andnot/or/xor are provided for the floating-point types */
   60 #undef  GMX_SIMD_HAVE_FMA              /* gmx_simd_fmadd_* and friends below are a multiply plus a separate add/subtract */
   61 #undef  GMX_SIMD_HAVE_FRACTION         /* NOTE(review): gmx_simd_fraction_* are still defined below as x - trunc(x); see simd.h for what this flag promises */
   62 #define GMX_SIMD_HAVE_FINT32           /* gmx_simd_fint32_t: 32-bit integers paired with the float type */
   63 #define GMX_SIMD_HAVE_FINT32_EXTRACT   /* No SSE2 instruction, but use shifts */
   64 #define GMX_SIMD_HAVE_FINT32_LOGICAL   /* shifts and and/andnot/or/xor on gmx_simd_fint32_t */
   65 #define GMX_SIMD_HAVE_FINT32_ARITHMETICS /* add/sub/mul on gmx_simd_fint32_t */
   66 #define GMX_SIMD_HAVE_DINT32           /* gmx_simd_dint32_t: 32-bit integers paired with the double type */
   67 #define GMX_SIMD_HAVE_DINT32_EXTRACT   /* No SSE2 instruction, but use shifts */
   68 #define GMX_SIMD_HAVE_DINT32_LOGICAL   /* shifts and and/andnot/or/xor on gmx_simd_dint32_t */
   69 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS /* add/sub/mul on gmx_simd_dint32_t */
   70 #define GMX_SIMD4_HAVE_FLOAT           /* the gmx_simd4_*_f names reuse the 4-wide float implementation */
   71 #undef  GMX_SIMD4_HAVE_DOUBLE          /* no gmx_simd4 double-precision names are provided by this header */
   72
   73 /* Implementation details: element counts per SIMD variable, and settings for the approximate rsqrt/rcp operations */
   74 #define GMX_SIMD_FLOAT_WIDTH         4 /* floats in an __m128 */
   75 #define GMX_SIMD_DOUBLE_WIDTH        2 /* doubles in an __m128d */
   76 #define GMX_SIMD_FINT32_WIDTH        4 /* all four 32-bit elements of an __m128i are used */
   77 #define GMX_SIMD_DINT32_WIDTH        2 /* only the two lowest 32-bit elements of an __m128i are used */
   78 #define GMX_SIMD_RSQRT_BITS         11 /* NOTE(review): presumably the accurate bits delivered by gmx_simd_rsqrt_* - confirm in simd.h */
   79 #define GMX_SIMD_RCP_BITS           11 /* NOTE(review): presumably the accurate bits delivered by gmx_simd_rcp_* - confirm in simd.h */
  80
  81 /****************************************************
  82  *      SINGLE PRECISION SIMD IMPLEMENTATION        *
  83  ****************************************************/
  84 #define gmx_simd_float_t          __m128
  85 #define gmx_simd_load_f           _mm_load_ps
  86 #define gmx_simd_load1_f          _mm_load1_ps
  87 #define gmx_simd_set1_f           _mm_set1_ps
  88 #define gmx_simd_store_f          _mm_store_ps
  89 #define gmx_simd_loadu_f          _mm_loadu_ps
  90 #define gmx_simd_storeu_f         _mm_storeu_ps
  91 #define gmx_simd_setzero_f        _mm_setzero_ps
  92 #define gmx_simd_add_f            _mm_add_ps
  93 #define gmx_simd_sub_f            _mm_sub_ps
  94 #define gmx_simd_mul_f            _mm_mul_ps
  95 #define gmx_simd_fmadd_f(a, b, c)   _mm_add_ps(_mm_mul_ps(a, b), c)
  96 #define gmx_simd_fmsub_f(a, b, c)   _mm_sub_ps(_mm_mul_ps(a, b), c)
  97 #define gmx_simd_fnmadd_f(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
  98 #define gmx_simd_fnmsub_f(a, b, c)  _mm_sub_ps(_mm_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
  99 #define gmx_simd_and_f            _mm_and_ps
 100 #define gmx_simd_andnot_f         _mm_andnot_ps
 101 #define gmx_simd_or_f             _mm_or_ps
 102 #define gmx_simd_xor_f            _mm_xor_ps
 103 #define gmx_simd_rsqrt_f          _mm_rsqrt_ps
 104 #define gmx_simd_rcp_f            _mm_rcp_ps
 105 #define gmx_simd_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(GMX_FLOAT_NEGZERO), x)
 106 #define gmx_simd_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(GMX_FLOAT_NEGZERO))
 107 #define gmx_simd_max_f            _mm_max_ps
 108 #define gmx_simd_min_f            _mm_min_ps
 109 #define gmx_simd_round_f(x)       _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 110 #define gmx_simd_trunc_f(x)       _mm_cvtepi32_ps(_mm_cvttps_epi32(x))
 111 #define gmx_simd_fraction_f(x)    _mm_sub_ps(x, gmx_simd_trunc_f(x))
 112 #define gmx_simd_get_exponent_f   gmx_simd_get_exponent_f_sse2
 113 #define gmx_simd_get_mantissa_f   gmx_simd_get_mantissa_f_sse2
 114 #define gmx_simd_set_exponent_f   gmx_simd_set_exponent_f_sse2
 115 /* integer datatype corresponding to float: gmx_simd_fint32_t */
 116 #define gmx_simd_fint32_t         __m128i
 117 #define gmx_simd_load_fi(m)       _mm_load_si128((const __m128i *)m)
 118 #define gmx_simd_set1_fi          _mm_set1_epi32
 119 #define gmx_simd_store_fi(m, x)    _mm_store_si128((__m128i *)m, x)
 120 #define gmx_simd_loadu_fi(m)      _mm_loadu_si128((const __m128i *)m)
 121 #define gmx_simd_storeu_fi(m, x)   _mm_storeu_si128((__m128i *)m, x)
 122 #define gmx_simd_setzero_fi       _mm_setzero_si128
 123 #define gmx_simd_cvt_f2i          _mm_cvtps_epi32
 124 #define gmx_simd_cvtt_f2i         _mm_cvttps_epi32
 125 #define gmx_simd_cvt_i2f          _mm_cvtepi32_ps
 126 #define gmx_simd_extract_fi(x, i)  _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (i)))
 127 /* Integer logical ops on gmx_simd_fint32_t */
 128 #define gmx_simd_slli_fi          _mm_slli_epi32
 129 #define gmx_simd_srli_fi          _mm_srli_epi32
 130 #define gmx_simd_and_fi           _mm_and_si128
 131 #define gmx_simd_andnot_fi        _mm_andnot_si128
 132 #define gmx_simd_or_fi            _mm_or_si128
 133 #define gmx_simd_xor_fi           _mm_xor_si128
 134 /* Integer arithmetic ops on gmx_simd_fint32_t */
 135 #define gmx_simd_add_fi           _mm_add_epi32
 136 #define gmx_simd_sub_fi           _mm_sub_epi32
 137 #define gmx_simd_mul_fi           gmx_simd_mul_fi_sse2
 138 /* Boolean & comparison operations on gmx_simd_float_t */
 139 #define gmx_simd_fbool_t          __m128
 140 #define gmx_simd_cmpeq_f          _mm_cmpeq_ps
 141 #define gmx_simd_cmplt_f          _mm_cmplt_ps
 142 #define gmx_simd_cmple_f          _mm_cmple_ps
 143 #define gmx_simd_and_fb           _mm_and_ps
 144 #define gmx_simd_or_fb            _mm_or_ps
 145 #define gmx_simd_anytrue_fb       _mm_movemask_ps
 146 #define gmx_simd_blendzero_f      _mm_and_ps
 147 #define gmx_simd_blendnotzero_f(a, sel)   _mm_andnot_ps(sel, a)
 148 #define gmx_simd_blendv_f(a, b, s)  _mm_or_ps(_mm_andnot_ps(s, a), _mm_and_ps(s, b))
 149 #define gmx_simd_reduce_f(a)      gmx_simd_reduce_f_sse2(a)
 150 /* Boolean & comparison operations on gmx_simd_fint32_t */
 151 #define gmx_simd_fibool_t         __m128i
 152 #define gmx_simd_cmpeq_fi         _mm_cmpeq_epi32
 153 #define gmx_simd_cmplt_fi         _mm_cmplt_epi32
 154 #define gmx_simd_and_fib          _mm_and_si128
 155 #define gmx_simd_or_fib           _mm_or_si128
 156 #define gmx_simd_anytrue_fib      _mm_movemask_epi8
 157 #define gmx_simd_blendzero_fi     _mm_and_si128
 158 #define gmx_simd_blendnotzero_fi(a, sel) _mm_andnot_si128(sel, a)
 159 #define gmx_simd_blendv_fi(a, b, s) _mm_or_si128(_mm_andnot_si128(s, a), _mm_and_si128(s, b))
 160 /* Conversions between different booleans */
 161 #define gmx_simd_cvt_fb2fib       _mm_castps_si128
 162 #define gmx_simd_cvt_fib2fb       _mm_castsi128_ps
 163
 164 /****************************************************
 165  *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 166  ****************************************************/
 167 #define gmx_simd_double_t          __m128d
 168 #define gmx_simd_load_d            _mm_load_pd
 169 #define gmx_simd_load1_d           _mm_load1_pd
 170 #define gmx_simd_set1_d            _mm_set1_pd
 171 #define gmx_simd_store_d           _mm_store_pd
 172 #define gmx_simd_loadu_d           _mm_loadu_pd
 173 #define gmx_simd_storeu_d          _mm_storeu_pd
 174 #define gmx_simd_setzero_d         _mm_setzero_pd
 175 #define gmx_simd_add_d             _mm_add_pd
 176 #define gmx_simd_sub_d             _mm_sub_pd
 177 #define gmx_simd_mul_d             _mm_mul_pd
 178 #define gmx_simd_fmadd_d(a, b, c)    _mm_add_pd(_mm_mul_pd(a, b), c)
 179 #define gmx_simd_fmsub_d(a, b, c)    _mm_sub_pd(_mm_mul_pd(a, b), c)
 180 #define gmx_simd_fnmadd_d(a, b, c)   _mm_sub_pd(c, _mm_mul_pd(a, b))
 181 #define gmx_simd_fnmsub_d(a, b, c)   _mm_sub_pd(_mm_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
 182 #define gmx_simd_and_d             _mm_and_pd
 183 #define gmx_simd_andnot_d          _mm_andnot_pd
 184 #define gmx_simd_or_d              _mm_or_pd
 185 #define gmx_simd_xor_d             _mm_xor_pd
 186 #define gmx_simd_rsqrt_d(x)        _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)))
 187 /* Don't use FMA for sqrt N-R iterations - this saves 1 instruction without FMA hardware */
 188 #define gmx_simd_rcp_d(x)          _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(x)))
 189 #define gmx_simd_fabs_d(x)         _mm_andnot_pd(_mm_set1_pd(GMX_DOUBLE_NEGZERO), x)
 190 #define gmx_simd_fneg_d(x)         _mm_xor_pd(x, _mm_set1_pd(GMX_DOUBLE_NEGZERO))
 191 #define gmx_simd_max_d             _mm_max_pd
 192 #define gmx_simd_min_d             _mm_min_pd
 193 #define gmx_simd_round_d(x)        _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 194 #define gmx_simd_trunc_d(x)        _mm_cvtepi32_pd(_mm_cvttpd_epi32(x))
 195 #define gmx_simd_fraction_d(x)     _mm_sub_pd(x, gmx_simd_trunc_d(x))
 196 #define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_sse2
 197 #define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_sse2
 198 #define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_sse2
 199 /* integer datatype corresponding to double: gmx_simd_dint32_t */
 200 #define gmx_simd_dint32_t          __m128i
 201 #define gmx_simd_load_di(m)        _mm_loadl_epi64((const __m128i *)m)
 202 #define gmx_simd_set1_di           _mm_set1_epi32
 203 #define gmx_simd_store_di(m, x)     _mm_storel_epi64((__m128i *)m, x)
 204 #define gmx_simd_loadu_di(m)       _mm_loadl_epi64((const __m128i *)m)
 205 #define gmx_simd_storeu_di(m, x)    _mm_storel_epi64((__m128i *)m, x)
 206 #define gmx_simd_setzero_di        _mm_setzero_si128
 207 #define gmx_simd_cvt_d2i           _mm_cvtpd_epi32
 208 #define gmx_simd_cvtt_d2i          _mm_cvttpd_epi32
 209 #define gmx_simd_cvt_i2d           _mm_cvtepi32_pd
 210 #define gmx_simd_extract_di(x, i)   _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (i)))
 211 /* Integer logical ops on gmx_simd_dint32_t */
 212 #define gmx_simd_slli_di           _mm_slli_epi32
 213 #define gmx_simd_srli_di           _mm_srli_epi32
 214 #define gmx_simd_and_di            _mm_and_si128
 215 #define gmx_simd_andnot_di         _mm_andnot_si128
 216 #define gmx_simd_or_di             _mm_or_si128
 217 #define gmx_simd_xor_di            _mm_xor_si128
 218 /* Integer arithmetic ops on integer datatype corresponding to double */
 219 #define gmx_simd_add_di            _mm_add_epi32
 220 #define gmx_simd_sub_di            _mm_sub_epi32
 221 #define gmx_simd_mul_di            gmx_simd_mul_di_sse2
 222 /* Boolean & comparison operations on gmx_simd_double_t */
 223 #define gmx_simd_dbool_t            __m128d
 224 #define gmx_simd_cmpeq_d            _mm_cmpeq_pd
 225 #define gmx_simd_cmplt_d            _mm_cmplt_pd
 226 #define gmx_simd_cmple_d            _mm_cmple_pd
 227 #define gmx_simd_and_db             _mm_and_pd
 228 #define gmx_simd_or_db              _mm_or_pd
 229 #define gmx_simd_anytrue_db         _mm_movemask_pd
 230 #define gmx_simd_blendzero_d        _mm_and_pd
 231 #define gmx_simd_blendnotzero_d(a, sel) _mm_andnot_pd(sel, a)
 232 #define gmx_simd_blendv_d(a, b, sel)  _mm_or_pd(_mm_andnot_pd(sel, a), _mm_and_pd(sel, b))
 233 #define gmx_simd_reduce_d(a)        gmx_simd_reduce_d_sse2(a)
 234
 235 /* Boolean & comparison operations on gmx_simd_dint32_t */
 236 #define gmx_simd_dibool_t           __m128i
 237 #define gmx_simd_cmpeq_di           _mm_cmpeq_epi32
 238 #define gmx_simd_cmplt_di           _mm_cmplt_epi32
 239 #define gmx_simd_and_dib            _mm_and_si128
 240 #define gmx_simd_or_dib             _mm_or_si128
 241 #define gmx_simd_anytrue_dib(x)     _mm_movemask_epi8(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 1, 0)))
 242 #define gmx_simd_blendzero_di       _mm_and_si128
 243 #define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
 244 #define gmx_simd_blendv_di(a, b, sel) _mm_or_si128(_mm_andnot_si128(sel, a), _mm_and_si128(sel, b))
 245 #define gmx_simd_cvt_db2dib(x)      _mm_shuffle_epi32(_mm_castpd_si128(x), _MM_SHUFFLE(2, 0, 2, 0))
 246 #define gmx_simd_cvt_dib2db(x)      _mm_castsi128_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(1, 1, 0, 0)))
 247 /* Float/double conversion */
 248 #define gmx_simd_cvt_f2dd(f, d0, d1)  { *d0 = _mm_cvtps_pd(f); *d1 = _mm_cvtps_pd(_mm_movehl_ps(f, f)); }
 249 #define gmx_simd_cvt_dd2f(d0, d1)    _mm_movelh_ps(_mm_cvtpd_ps(d0), _mm_cvtpd_ps(d1))
 250
 251
 252 /****************************************************
 253  * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 254  ****************************************************/
 255 static gmx_inline __m128 gmx_simdcall
 256 gmx_simd_get_exponent_f_sse2(__m128 x)
 257 {
 258     const __m128  expmask      = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
 259     const __m128i expbias      = _mm_set1_epi32(127);
 260     __m128i       iexp;
 261
 262     iexp = _mm_castps_si128(_mm_and_ps(x, expmask));
 263     iexp = _mm_sub_epi32(_mm_srli_epi32(iexp, 23), expbias);
 264     return _mm_cvtepi32_ps(iexp);
 265 }
 266
 267 static gmx_inline __m128 gmx_simdcall
 268 gmx_simd_get_mantissa_f_sse2(__m128 x)
 269 {
 270     const __m128 mantmask = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
 271     const __m128 one      = _mm_set1_ps(1.0f);
 272
 273     x = _mm_and_ps(x, mantmask);
 274     return _mm_or_ps(x, one);
 275 }
 276
 277 static gmx_inline __m128 gmx_simdcall
 278 gmx_simd_set_exponent_f_sse2(__m128 x)
 279 {
 280     const __m128i expbias      = _mm_set1_epi32(127);
 281     __m128i       iexp         = _mm_cvtps_epi32(x);
 282
 283     iexp = _mm_slli_epi32(_mm_add_epi32(iexp, expbias), 23);
 284     return _mm_castsi128_ps(iexp);
 285 }
 286
 287 static gmx_inline __m128i gmx_simdcall
 288 gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
 289 {
 290     __m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
 291     __m128i b1 = _mm_srli_si128(b, 4); /* - b[3] b[2] b[1] */
 292     __m128i c  = _mm_mul_epu32(a, b);
 293     __m128i c1 = _mm_mul_epu32(a1, b1);
 294
 295     c  = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0));  /* - - a[2]*b[2] a[0]*b[0] */
 296     c1 = _mm_shuffle_epi32(c1, _MM_SHUFFLE(3, 1, 2, 0)); /* - - a[3]*b[3] a[1]*b[1] */
 297
 298     return _mm_unpacklo_epi32(c, c1);
 299 }
 300
 301 static gmx_inline float gmx_simdcall
 302 gmx_simd_reduce_f_sse2(__m128 a)
 303 {
 304     __m128 b;
 305     float  f;
 306     b = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
 307     b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 3, 2, 1)));
 308     _mm_store_ss(&f, b);
 309     return f;
 310 }
 311
 312 /****************************************************
 313  * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 314  ****************************************************/
 315 static gmx_inline __m128d gmx_simdcall
 316 gmx_simd_get_exponent_d_sse2(__m128d x)
 317 {
 318     /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
 319     const __m128d expmask      = _mm_castsi128_pd( _mm_set_epi32(0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000) );
 320     const __m128i expbias      = _mm_set1_epi32(1023);
 321     __m128i       iexp;
 322
 323     iexp   = _mm_castpd_si128(_mm_and_pd(x, expmask));
 324     iexp   = _mm_sub_epi32(_mm_srli_epi64(iexp, 52), expbias);
 325     iexp   = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0) );
 326     return _mm_cvtepi32_pd(iexp);
 327 }
 328
 329 static gmx_inline __m128d gmx_simdcall
 330 gmx_simd_get_mantissa_d_sse2(__m128d x)
 331 {
 332     /* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
 333     const __m128d mantmask = _mm_castsi128_pd( _mm_set_epi32(0x000FFFFF, 0xFFFFFFFF, 0x000FFFFF, 0xFFFFFFFF) );
 334     const __m128d one      = _mm_set1_pd(1.0);
 335
 336     x = _mm_and_pd(x, mantmask);
 337     return _mm_or_pd(x, one);
 338 }
 339
 340 static gmx_inline __m128d gmx_simdcall
 341 gmx_simd_set_exponent_d_sse2(__m128d x)
 342 {
 343     const __m128i  expbias      = _mm_set1_epi32(1023);
 344     __m128i        iexp         = _mm_cvtpd_epi32(x);
 345
 346     /* After conversion integers will be in slot 0,1. Move them to 0,2 so
 347      * we can do a 64-bit shift and get them to the dp exponents. */
 348     iexp = _mm_shuffle_epi32(iexp, _MM_SHUFFLE(3, 1, 2, 0));
 349     iexp = _mm_slli_epi64(_mm_add_epi32(iexp, expbias), 52);
 350     return _mm_castsi128_pd(iexp);
 351 }
 352
 353 static gmx_inline __m128i gmx_simdcall
 354 gmx_simd_mul_di_sse2(__m128i a, __m128i b)
 355 {
 356     __m128i c;
 357
 358     a = _mm_unpacklo_epi32(a, _mm_setzero_si128());       /* 0 a[1] 0 a[0] */
 359     b = _mm_unpacklo_epi32(b, _mm_setzero_si128());       /* 0 b[1] 0 b[0] */
 360
 361     c  = _mm_mul_epu32(a, b);                             /* 0 a[1]*b[1] 0 a[0]*b[0] */
 362     return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
 363 }
 364
 365 static gmx_inline double gmx_simdcall
 366 gmx_simd_reduce_d_sse2(__m128d a)
 367 {
 368     __m128d b;
 369     double  f;
 370
 371     b = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(1, 1)));
 372     _mm_store_sd(&f, b);
 373     return f;
 374 }
 375
 376 /* Function to check whether SIMD operations have resulted in overflow */
 377 static int
 378 gmx_simd_check_and_reset_overflow(void)
 379 {
 380     int MXCSR;
 381     int sse_overflow;
 382
 383     MXCSR = _mm_getcsr();
 384     /* The overflow flag is bit 3 in the register */
 385     if (MXCSR & 0x0008)
 386     {
 387         sse_overflow = 1;
 388         /* Set the overflow flag to zero */
 389         MXCSR = MXCSR & 0xFFF7;
 390         _mm_setcsr(MXCSR);
 391     }
 392     else
 393     {
 394         sse_overflow = 0;
 395     }
 396     return sse_overflow;
 397 }
 398
  399 /* SSE2 is already 4-wide in single, so we just reuse float datatype for SIMD4.
  400  * SSE2 cannot do double-precision SIMD4.
  401  * Each gmx_simd4_*_f name below forwards to the gmx_simd_*_f name for the same operation; the one exception is gmx_simd4_dotproduct3_f, which maps to a dedicated helper function. */
  402 #define gmx_simd4_float_t                gmx_simd_float_t
  403 #define gmx_simd4_load_f                 gmx_simd_load_f
  404 #define gmx_simd4_load1_f                gmx_simd_load1_f
  405 #define gmx_simd4_set1_f                 gmx_simd_set1_f
  406 #define gmx_simd4_store_f                gmx_simd_store_f
  407 #define gmx_simd4_loadu_f                gmx_simd_loadu_f
  408 #define gmx_simd4_storeu_f               gmx_simd_storeu_f
  409 #define gmx_simd4_setzero_f              gmx_simd_setzero_f
  410 #define gmx_simd4_add_f                  gmx_simd_add_f
  411 #define gmx_simd4_sub_f                  gmx_simd_sub_f
  412 #define gmx_simd4_mul_f                  gmx_simd_mul_f
  413 #define gmx_simd4_fmadd_f                gmx_simd_fmadd_f
  414 #define gmx_simd4_fmsub_f                gmx_simd_fmsub_f
  415 #define gmx_simd4_fnmadd_f               gmx_simd_fnmadd_f
  416 #define gmx_simd4_fnmsub_f               gmx_simd_fnmsub_f
  417 #define gmx_simd4_and_f                  gmx_simd_and_f
  418 #define gmx_simd4_andnot_f               gmx_simd_andnot_f
  419 #define gmx_simd4_or_f                   gmx_simd_or_f
  420 #define gmx_simd4_xor_f                  gmx_simd_xor_f
  421 #define gmx_simd4_rsqrt_f                gmx_simd_rsqrt_f
  422 #define gmx_simd4_fabs_f                 gmx_simd_fabs_f
  423 #define gmx_simd4_fneg_f                 gmx_simd_fneg_f
  424 #define gmx_simd4_max_f                  gmx_simd_max_f
  425 #define gmx_simd4_min_f                  gmx_simd_min_f
  426 #define gmx_simd4_round_f                gmx_simd_round_f
  427 #define gmx_simd4_trunc_f                gmx_simd_trunc_f
  428 #define gmx_simd4_dotproduct3_f          gmx_simd4_dotproduct3_f_sse2 /* not an alias of a gmx_simd_*_f name: SSE2-specific helper function */
  429 #define gmx_simd4_fbool_t                gmx_simd_fbool_t
  430 #define gmx_simd4_cmpeq_f                gmx_simd_cmpeq_f
  431 #define gmx_simd4_cmplt_f                gmx_simd_cmplt_f
  432 #define gmx_simd4_cmple_f                gmx_simd_cmple_f
  433 #define gmx_simd4_and_fb                 gmx_simd_and_fb
  434 #define gmx_simd4_or_fb                  gmx_simd_or_fb
  435 #define gmx_simd4_anytrue_fb             gmx_simd_anytrue_fb
  436 #define gmx_simd4_blendzero_f            gmx_simd_blendzero_f
  437 #define gmx_simd4_blendnotzero_f         gmx_simd_blendnotzero_f
  438 #define gmx_simd4_blendv_f               gmx_simd_blendv_f
  439 #define gmx_simd4_reduce_f               gmx_simd_reduce_f
 440
 441 /* SIMD4 Dotproduct helper function */
 442 static gmx_inline float gmx_simdcall
 443 gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
 444 {
 445     float  f;
 446     __m128 c;
 447     a = _mm_mul_ps(a, b);
 448     c = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 2, 1)));
 449     c = _mm_add_ps(c, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 3, 2)));
 450     _mm_store_ss(&f, c);
 451     return f;
 452 }
 453
 454 #endif /* GMX_SIMD_IMPL_X86_SSE2_H */