src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_H
  37 #define GMX_SIMD_IMPL_X86_AVX_256_H
  38
  39 #include <math.h>
  40 #include <immintrin.h>
  41
  42 /* It is cleaner to start the AVX implementation from scratch rather than
  43  * first inheriting from SSE4.1, which in turn inherits from SSE2. However,
  44  * the capabilities still form a superset.
  45  */
  46 #define GMX_SIMD_X86_SSE2_OR_HIGHER
  47 #define GMX_SIMD_X86_SSE4_1_OR_HIGHER
  48 #define GMX_SIMD_X86_AVX_256_OR_HIGHER
  49
  50
  51 /* x86 256-bit AVX SIMD instruction wrappers
  52  *
  53  * Please see documentation in gromacs/simd/simd.h for defines.
  54  */
  55
  56 /* Capability definitions for 256-bit AVX - no inheritance from SSE */
  57 #define GMX_SIMD_HAVE_FLOAT
  58 #define GMX_SIMD_HAVE_DOUBLE
  59 #define GMX_SIMD_HAVE_SIMD_HARDWARE
  60 #define GMX_SIMD_HAVE_LOADU
  61 #define GMX_SIMD_HAVE_STOREU
  62 #define GMX_SIMD_HAVE_LOGICAL
  63 #undef  GMX_SIMD_HAVE_FMA
  64 #undef  GMX_SIMD_HAVE_FRACTION
  65 #define GMX_SIMD_HAVE_FINT32
  66 #define GMX_SIMD_HAVE_FINT32_EXTRACT     /* Emulated */
  67 #undef  GMX_SIMD_HAVE_FINT32_LOGICAL     /* AVX1 cannot do 256-bit int shifts */
  68 #undef  GMX_SIMD_HAVE_FINT32_ARITHMETICS /* AVX1 cannot do 256-bit int +,-,*  */
  69 #define GMX_SIMD_HAVE_DINT32
  70 #define GMX_SIMD_HAVE_DINT32_EXTRACT     /* Native, dint uses 128-bit SIMD    */
  71 #define GMX_SIMD_HAVE_DINT32_LOGICAL
  72 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS
  73 #define GMX_SIMD4_HAVE_FLOAT
  74 #define GMX_SIMD4_HAVE_DOUBLE
  75
  76 /* Implementation details */
  77 #define GMX_SIMD_FLOAT_WIDTH         8
  78 #define GMX_SIMD_DOUBLE_WIDTH        4
  79 #define GMX_SIMD_FINT32_WIDTH        8
  80 #define GMX_SIMD_DINT32_WIDTH        4
  81 #define GMX_SIMD_RSQRT_BITS         11
  82 #define GMX_SIMD_RCP_BITS           11
  83
  84 /****************************************************
  85  *      SINGLE PRECISION SIMD IMPLEMENTATION        *
  86  ****************************************************/
  87 #define gmx_simd_float_t           __m256
  88 #define gmx_simd_load_f            _mm256_load_ps
  89 #define gmx_simd_load1_f           _mm256_broadcast_ss
  90 #define gmx_simd_set1_f            _mm256_set1_ps
  91 #define gmx_simd_store_f           _mm256_store_ps
  92 #define gmx_simd_loadu_f           _mm256_loadu_ps
  93 #define gmx_simd_storeu_f          _mm256_storeu_ps
  94 #define gmx_simd_setzero_f         _mm256_setzero_ps
  95 #define gmx_simd_add_f             _mm256_add_ps
  96 #define gmx_simd_sub_f             _mm256_sub_ps
  97 #define gmx_simd_mul_f             _mm256_mul_ps
  98 #define gmx_simd_fmadd_f(a, b, c)    _mm256_add_ps(_mm256_mul_ps(a, b), c)
  99 #define gmx_simd_fmsub_f(a, b, c)    _mm256_sub_ps(_mm256_mul_ps(a, b), c)
 100 #define gmx_simd_fnmadd_f(a, b, c)   _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 101 #define gmx_simd_fnmsub_f(a, b, c)   _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c))
 102 #define gmx_simd_and_f             _mm256_and_ps
 103 #define gmx_simd_andnot_f          _mm256_andnot_ps
 104 #define gmx_simd_or_f              _mm256_or_ps
 105 #define gmx_simd_xor_f             _mm256_xor_ps
 106 #define gmx_simd_rsqrt_f           _mm256_rsqrt_ps
 107 #define gmx_simd_rcp_f             _mm256_rcp_ps
 108 #define gmx_simd_fabs_f(x)         _mm256_andnot_ps(_mm256_set1_ps(GMX_FLOAT_NEGZERO), x)
 109 #define gmx_simd_fneg_f(x)         _mm256_xor_ps(x, _mm256_set1_ps(GMX_FLOAT_NEGZERO))
 110 #define gmx_simd_max_f             _mm256_max_ps
 111 #define gmx_simd_min_f             _mm256_min_ps
 112 #define gmx_simd_round_f(x)        _mm256_round_ps(x, _MM_FROUND_NINT)
 113 #define gmx_simd_trunc_f(x)        _mm256_round_ps(x, _MM_FROUND_TRUNC)
 114 #define gmx_simd_fraction_f(x)     _mm256_sub_ps(x, gmx_simd_trunc_f(x))
 115 #define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_avx_256
 116 #define gmx_simd_get_mantissa_f    gmx_simd_get_mantissa_f_avx_256
 117 #define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_avx_256
 118 /* integer datatype corresponding to float: gmx_simd_fint32_t */
 119 #define gmx_simd_fint32_t          __m256i
 120 #define gmx_simd_load_fi(m)        _mm256_castps_si256(_mm256_load_ps((const float *)m))
 121 #define gmx_simd_set1_fi           _mm256_set1_epi32
 122 #define gmx_simd_store_fi(m, x)     _mm256_store_ps((float *)m, _mm256_castsi256_ps(x))
 123 #define gmx_simd_loadu_fi(m)       _mm256_castps_si256(_mm256_loadu_ps((const float *)m))
 124 #define gmx_simd_storeu_fi(m, x)    _mm256_storeu_ps((float *)m, _mm256_castsi256_ps(x))
 125 #define gmx_simd_setzero_fi        _mm256_setzero_si256
 126 #define gmx_simd_cvt_f2i           _mm256_cvtps_epi32
 127 #define gmx_simd_cvtt_f2i          _mm256_cvttps_epi32
 128 #define gmx_simd_cvt_i2f           _mm256_cvtepi32_ps
 129 #define gmx_simd_extract_fi(x, i)   _mm_extract_epi32(_mm256_extractf128_si256(x, (i)>>2), (i)&0x3)
 130 /* Integer logical ops on gmx_simd_fint32_t */
 131 /* gmx_simd_add_fi not supported     */
 132 /* gmx_simd_sub_fi not supported     */
 133 /* gmx_simd_mul_fi not supported     */
 134 /* gmx_simd_slli_fi not supported    */
 135 /* gmx_simd_srli_fi not supported    */
 136 /* gmx_simd_and_fi not supported     */
 137 /* gmx_simd_andnot_fi not supported  */
 138 /* gmx_simd_or_fi not supported      */
 139 /* gmx_simd_xor_fi not supported     */
 140 /* Integer arithmetic ops on gmx_simd_fint32_t */
 141 /* gmx_simd_add_fi not supported     */
 142 /* gmx_simd_sub_fi not supported     */
 143 /* gmx_simd_mul_fi not supported     */
 144 /* Boolean & comparison operations on gmx_simd_float_t */
 145 #define gmx_simd_fbool_t           __m256
 146 #define gmx_simd_cmpeq_f(a, b)      _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
 147 #define gmx_simd_cmplt_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LT_OQ)
 148 #define gmx_simd_cmple_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LE_OQ)
 149 #define gmx_simd_and_fb            _mm256_and_ps
 150 #define gmx_simd_or_fb             _mm256_or_ps
 151 #define gmx_simd_anytrue_fb        _mm256_movemask_ps
 152 #define gmx_simd_blendzero_f       _mm256_and_ps
 153 #define gmx_simd_blendnotzero_f(a, sel)  _mm256_andnot_ps(sel, a)
 154 #define gmx_simd_blendv_f          _mm256_blendv_ps
 155 #define gmx_simd_reduce_f          gmx_simd_reduce_f_avx_256
 156 /* Boolean & comparison operations on gmx_simd_fint32_t */
 157 #define gmx_simd_fibool_t          __m256i
 158 /* gmx_simd_cmpeq_fi not supported        */
 159 /* gmx_simd_cmplt_fi not supported        */
 160 /* gmx_simd_and_fib not supported         */
 161 /* gmx_simd_or_fib not supported          */
 162 /* gmx_simd_anytrue_fib not supported     */
 163 /* gmx_simd_blendzero_fi not supported    */
 164 /* gmx_simd_blendnotzero_fi not supported    */
 165 /* gmx_simd_blendv_fi not supported       */
 166 /* Conversions between different booleans */
 167 #define gmx_simd_cvt_fb2fib        _mm256_castps_si256
 168 #define gmx_simd_cvt_fib2fb        _mm256_castsi256_ps
 169
 170 /****************************************************
 171  *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 172  ****************************************************/
 173 #define gmx_simd_double_t          __m256d
 174 #define gmx_simd_load_d            _mm256_load_pd
 175 #define gmx_simd_load1_d           _mm256_broadcast_sd
 176 #define gmx_simd_set1_d            _mm256_set1_pd
 177 #define gmx_simd_store_d           _mm256_store_pd
 178 #define gmx_simd_loadu_d           _mm256_loadu_pd
 179 #define gmx_simd_storeu_d          _mm256_storeu_pd
 180 #define gmx_simd_setzero_d         _mm256_setzero_pd
 181 #define gmx_simd_add_d             _mm256_add_pd
 182 #define gmx_simd_sub_d             _mm256_sub_pd
 183 #define gmx_simd_mul_d             _mm256_mul_pd
 184 #define gmx_simd_fmadd_d(a, b, c)    _mm256_add_pd(_mm256_mul_pd(a, b), c)
 185 #define gmx_simd_fmsub_d(a, b, c)    _mm256_sub_pd(_mm256_mul_pd(a, b), c)
 186 #define gmx_simd_fnmadd_d(a, b, c)   _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 187 #define gmx_simd_fnmsub_d(a, b, c)   _mm256_sub_pd(_mm256_setzero_pd(), gmx_simd_fmadd_d(a, b, c))
 188 #define gmx_simd_and_d             _mm256_and_pd
 189 #define gmx_simd_andnot_d          _mm256_andnot_pd
 190 #define gmx_simd_or_d              _mm256_or_pd
 191 #define gmx_simd_xor_d             _mm256_xor_pd
 192 #define gmx_simd_rsqrt_d(x)        _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(x)))
 193 #define gmx_simd_rcp_d(x)          _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(x)))
 194 #define gmx_simd_fabs_d(x)         _mm256_andnot_pd(_mm256_set1_pd(-0.0), x)
 195 #define gmx_simd_fneg_d(x)         _mm256_xor_pd(x, _mm256_set1_pd(-0.0))
 196 #define gmx_simd_max_d             _mm256_max_pd
 197 #define gmx_simd_min_d             _mm256_min_pd
 198 #define gmx_simd_round_d(x)        _mm256_round_pd(x, _MM_FROUND_NINT)
 199 #define gmx_simd_trunc_d(x)        _mm256_round_pd(x, _MM_FROUND_TRUNC)
 200 #define gmx_simd_fraction_d(x)     _mm256_sub_pd(x, gmx_simd_trunc_d(x))
 201 #define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_avx_256
 202 #define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_avx_256
 203 #define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_avx_256
 204 /* integer datatype corresponding to double: gmx_simd_dint32_t */
 205 #define gmx_simd_dint32_t          __m128i
 206 #define gmx_simd_load_di(m)        _mm_load_si128((const __m128i *)m)
 207 #define gmx_simd_set1_di           _mm_set1_epi32
 208 #define gmx_simd_store_di(m, x)     _mm_store_si128((__m128i *)m, x)
 209 #define gmx_simd_loadu_di(m)       _mm_loadu_si128((const __m128i *)m)
 210 #define gmx_simd_storeu_di(m, x)    _mm_storeu_si128((__m128i *)m, x)
 211 #define gmx_simd_setzero_di        _mm_setzero_si128
 212 #define gmx_simd_cvt_d2i           _mm256_cvtpd_epi32
 213 #define gmx_simd_cvtt_d2i          _mm256_cvttpd_epi32
 214 #define gmx_simd_cvt_i2d           _mm256_cvtepi32_pd
 215 #define gmx_simd_extract_di        _mm_extract_epi32
 216 /* Integer logical ops on gmx_simd_dint32_t */
 217 #define gmx_simd_slli_di           _mm_slli_epi32
 218 #define gmx_simd_srli_di           _mm_srli_epi32
 219 #define gmx_simd_and_di            _mm_and_si128
 220 #define gmx_simd_andnot_di         _mm_andnot_si128
 221 #define gmx_simd_or_di             _mm_or_si128
 222 #define gmx_simd_xor_di            _mm_xor_si128
 223 /* Integer arithmetic ops on integer datatype corresponding to double */
 224 #define gmx_simd_add_di            _mm_add_epi32
 225 #define gmx_simd_sub_di            _mm_sub_epi32
 226 #define gmx_simd_mul_di            _mm_mullo_epi32
 227 /* Boolean & comparison operations on gmx_simd_double_t */
 228 #define gmx_simd_dbool_t           __m256d
 229 #define gmx_simd_cmpeq_d(a, b)      _mm256_cmp_pd(a, b, _CMP_EQ_OQ)
 230 #define gmx_simd_cmplt_d(a, b)      _mm256_cmp_pd(a, b, _CMP_LT_OQ)
 231 #define gmx_simd_cmple_d(a, b)      _mm256_cmp_pd(a, b, _CMP_LE_OQ)
 232 #define gmx_simd_and_db            _mm256_and_pd
 233 #define gmx_simd_or_db             _mm256_or_pd
 234 #define gmx_simd_anytrue_db        _mm256_movemask_pd
 235 #define gmx_simd_blendzero_d       _mm256_and_pd
 236 #define gmx_simd_blendnotzero_d(a, sel)  _mm256_andnot_pd(sel, a)
 237 #define gmx_simd_blendv_d          _mm256_blendv_pd
 238 #define gmx_simd_reduce_d          gmx_simd_reduce_d_avx_256
 239 /* Boolean & comparison operations on gmx_simd_dint32_t */
 240 #define gmx_simd_dibool_t          __m128i
 241 #define gmx_simd_cmpeq_di          _mm_cmpeq_epi32
 242 #define gmx_simd_cmplt_di          _mm_cmplt_epi32
 243 #define gmx_simd_and_dib           _mm_and_si128
 244 #define gmx_simd_or_dib            _mm_or_si128
 245 #define gmx_simd_anytrue_dib       _mm_movemask_epi8
 246 #define gmx_simd_blendzero_di      _mm_and_si128
 247 #define gmx_simd_blendnotzero_di(a, sel)  _mm_andnot_si128(sel, a)
 248 #define gmx_simd_blendv_di         _mm_blendv_epi8
 249 /* Conversions between different booleans */
 250 #define gmx_simd_cvt_db2dib        gmx_simd_cvt_db2dib_avx_256
 251 #define gmx_simd_cvt_dib2db        gmx_simd_cvt_dib2db_avx_256
 252 /* Float/double conversion */
 253 #define gmx_simd_cvt_f2dd          gmx_simd_cvt_f2dd_avx_256
 254 #define gmx_simd_cvt_dd2f          gmx_simd_cvt_dd2f_avx_256
 255
 256 /****************************************************
 257  *      SINGLE PRECISION SIMD4 IMPLEMENTATION       *
 258  ****************************************************/
 259 #define gmx_simd4_float_t          __m128
 260 #define gmx_simd4_load_f           _mm_load_ps
 261 #define gmx_simd4_load1_f          _mm_broadcast_ss
 262 #define gmx_simd4_set1_f           _mm_set1_ps
 263 #define gmx_simd4_store_f          _mm_store_ps
 264 #define gmx_simd4_loadu_f          _mm_loadu_ps
 265 #define gmx_simd4_storeu_f         _mm_storeu_ps
 266 #define gmx_simd4_setzero_f        _mm_setzero_ps
 267 #define gmx_simd4_add_f            _mm_add_ps
 268 #define gmx_simd4_sub_f            _mm_sub_ps
 269 #define gmx_simd4_mul_f            _mm_mul_ps
 270 #define gmx_simd4_fmadd_f(a, b, c)   _mm_add_ps(_mm_mul_ps(a, b), c)
 271 #define gmx_simd4_fmsub_f(a, b, c)   _mm_sub_ps(_mm_mul_ps(a, b), c)
 272 #define gmx_simd4_fnmadd_f(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 273 #define gmx_simd4_fnmsub_f(a, b, c)  _mm_sub_ps(_mm_setzero_ps(), gmx_simd4_fmadd_f(a, b, c))
 274 #define gmx_simd4_and_f            _mm_and_ps
 275 #define gmx_simd4_andnot_f         _mm_andnot_ps
 276 #define gmx_simd4_or_f             _mm_or_ps
 277 #define gmx_simd4_xor_f            _mm_xor_ps
 278 #define gmx_simd4_rsqrt_f          _mm_rsqrt_ps
 279 #define gmx_simd4_fabs_f(x)        _mm_andnot_ps(_mm_set1_ps(-0.0), x)
 280 #define gmx_simd4_fneg_f(x)        _mm_xor_ps(x, _mm_set1_ps(-0.0))
 281 #define gmx_simd4_max_f            _mm_max_ps
 282 #define gmx_simd4_min_f            _mm_min_ps
 283 #define gmx_simd4_round_f(x)       _mm_round_ps(x, _MM_FROUND_NINT)
 284 #define gmx_simd4_trunc_f(x)       _mm_round_ps(x, _MM_FROUND_TRUNC)
 285 #define gmx_simd4_dotproduct3_f    gmx_simd4_dotproduct3_f_avx_256
 286 #define gmx_simd4_fbool_t          __m128
 287 #define gmx_simd4_cmpeq_f          _mm_cmpeq_ps
 288 #define gmx_simd4_cmplt_f          _mm_cmplt_ps
 289 #define gmx_simd4_cmple_f          _mm_cmple_ps
 290 #define gmx_simd4_and_fb           _mm_and_ps
 291 #define gmx_simd4_or_fb            _mm_or_ps
 292 #define gmx_simd4_anytrue_fb       _mm_movemask_ps
 293 #define gmx_simd4_blendzero_f      _mm_and_ps
 294 #define gmx_simd4_blendnotzero_f(a, sel)  _mm_andnot_ps(sel, a)
 295 #define gmx_simd4_blendv_f         _mm_blendv_ps
 296 #define gmx_simd4_reduce_f         gmx_simd4_reduce_f_avx_256
 297
 298 /****************************************************
 299  *      DOUBLE PRECISION SIMD4 IMPLEMENTATION       *
 300  ****************************************************/
 301 #define gmx_simd4_double_t          gmx_simd_double_t
 302 #define gmx_simd4_load_d            gmx_simd_load_d
 303 #define gmx_simd4_load1_d           gmx_simd_load1_d
 304 #define gmx_simd4_set1_d            gmx_simd_set1_d
 305 #define gmx_simd4_store_d           gmx_simd_store_d
 306 #define gmx_simd4_loadu_d           gmx_simd_loadu_d
 307 #define gmx_simd4_storeu_d          gmx_simd_storeu_d
 308 #define gmx_simd4_setzero_d         gmx_simd_setzero_d
 309 #define gmx_simd4_add_d             gmx_simd_add_d
 310 #define gmx_simd4_sub_d             gmx_simd_sub_d
 311 #define gmx_simd4_mul_d             gmx_simd_mul_d
 312 #define gmx_simd4_fmadd_d           gmx_simd_fmadd_d
 313 #define gmx_simd4_fmsub_d           gmx_simd_fmsub_d
 314 #define gmx_simd4_fnmadd_d          gmx_simd_fnmadd_d
 315 #define gmx_simd4_fnmsub_d          gmx_simd_fnmsub_d
 316 #define gmx_simd4_and_d             gmx_simd_and_d
 317 #define gmx_simd4_andnot_d          gmx_simd_andnot_d
 318 #define gmx_simd4_or_d              gmx_simd_or_d
 319 #define gmx_simd4_xor_d             gmx_simd_xor_d
 320 #define gmx_simd4_rsqrt_d           gmx_simd_rsqrt_d
 321 #define gmx_simd4_fabs_d            gmx_simd_fabs_d
 322 #define gmx_simd4_fneg_d            gmx_simd_fneg_d
 323 #define gmx_simd4_max_d             gmx_simd_max_d
 324 #define gmx_simd4_min_d             gmx_simd_min_d
 325 #define gmx_simd4_round_d           gmx_simd_round_d
 326 #define gmx_simd4_trunc_d           gmx_simd_trunc_d
 327 #define gmx_simd4_dotproduct3_d     gmx_simd4_dotproduct3_d_avx_256
 328 #define gmx_simd4_dbool_t           gmx_simd_dbool_t
 329 #define gmx_simd4_cmpeq_d           gmx_simd_cmpeq_d
 330 #define gmx_simd4_cmplt_d           gmx_simd_cmplt_d
 331 #define gmx_simd4_cmple_d           gmx_simd_cmple_d
 332 #define gmx_simd4_and_db            gmx_simd_and_db
 333 #define gmx_simd4_or_db             gmx_simd_or_db
 334 #define gmx_simd4_anytrue_db        gmx_simd_anytrue_db
 335 #define gmx_simd4_blendzero_d       gmx_simd_blendzero_d
 336 #define gmx_simd4_blendnotzero_d    gmx_simd_blendnotzero_d
 337 #define gmx_simd4_blendv_d          gmx_simd_blendv_d
 338 #define gmx_simd4_reduce_d          gmx_simd_reduce_d
 339 /* SIMD4 float/double conversion */
 340 #define gmx_simd4_cvt_f2d           _mm256_cvtps_pd
 341 #define gmx_simd4_cvt_d2f           _mm256_cvtpd_ps
 342
 343 /*********************************************************
 344  * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 345  *********************************************************/
 346 static gmx_inline __m256 gmx_simdcall
 347 gmx_simd_get_exponent_f_avx_256(__m256 x)
 348 {
 349     const __m256  expmask      = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
 350     const __m128i expbias      = _mm_set1_epi32(127);
 351     __m256i       iexp256;
 352     __m128i       iexp128a, iexp128b;
 353
 354     iexp256   = _mm256_castps_si256(_mm256_and_ps(x, expmask));
 355     iexp128b  = _mm256_extractf128_si256(iexp256, 0x1);
 356     iexp128a  = _mm256_castsi256_si128(iexp256);
 357     iexp128a  = _mm_srli_epi32(iexp128a, 23);
 358     iexp128b  = _mm_srli_epi32(iexp128b, 23);
 359     iexp128a  = _mm_sub_epi32(iexp128a, expbias);
 360     iexp128b  = _mm_sub_epi32(iexp128b, expbias);
 361     iexp256   = _mm256_castsi128_si256(iexp128a);
 362     iexp256   = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
 363     return _mm256_cvtepi32_ps(iexp256);
 364 }
 365
 366 static gmx_inline __m256 gmx_simdcall
 367 gmx_simd_get_mantissa_f_avx_256(__m256 x)
 368 {
 369     const __m256 mantmask   = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
 370     const __m256 one        = _mm256_set1_ps(1.0);
 371
 372     x = _mm256_and_ps(x, mantmask);
 373     return _mm256_or_ps(x, one);
 374 }
 375
 376 static gmx_inline __m256 gmx_simdcall
 377 gmx_simd_set_exponent_f_avx_256(__m256 x)
 378 {
 379     const __m128i expbias      = _mm_set1_epi32(127);
 380     __m256i       iexp256;
 381     __m128i       iexp128a, iexp128b;
 382
 383     iexp256   = _mm256_cvtps_epi32(x);
 384     iexp128b  = _mm256_extractf128_si256(iexp256, 0x1);
 385     iexp128a  = _mm256_castsi256_si128(iexp256);
 386     iexp128a  = _mm_slli_epi32(_mm_add_epi32(iexp128a, expbias), 23);
 387     iexp128b  = _mm_slli_epi32(_mm_add_epi32(iexp128b, expbias), 23);
 388     iexp256   = _mm256_castsi128_si256(iexp128a);
 389     iexp256   = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
 390     return _mm256_castsi256_ps(iexp256);
 391 }
 392
 393 static gmx_inline float gmx_simdcall
 394 gmx_simd_reduce_f_avx_256(__m256 a)
 395 {
 396     float  f;
 397
 398     __m128 a0, a1;
 399     a  = _mm256_hadd_ps(a, a);
 400     a  = _mm256_hadd_ps(a, a);
 401     a0 = _mm256_castps256_ps128(a);
 402     a1 = _mm256_extractf128_ps(a, 0x1);
 403     a0 = _mm_add_ss(a0, a1);
 404     _mm_store_ss(&f, a0);
 405     return f;
 406 }
 407
 408 /*********************************************************
 409  * SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 410  *********************************************************/
 411 static gmx_inline __m256d gmx_simdcall
 412 gmx_simd_get_exponent_d_avx_256(__m256d x)
 413 {
 414     const __m256d expmask      = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
 415     const __m128i expbias      = _mm_set1_epi32(1023);
 416     __m256i       iexp256;
 417     __m128i       iexp128a, iexp128b;
 418
 419     iexp256   = _mm256_castpd_si256(_mm256_and_pd(x, expmask));
 420     iexp128b  = _mm256_extractf128_si256(iexp256, 0x1);
 421     iexp128a  = _mm256_castsi256_si128(iexp256);
 422     iexp128a  = _mm_srli_epi64(iexp128a, 52);
 423     iexp128b  = _mm_srli_epi64(iexp128b, 52);
 424     iexp128a  = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 2, 0));
 425     iexp128b  = _mm_shuffle_epi32(iexp128b, _MM_SHUFFLE(2, 0, 1, 1));
 426     iexp128a  = _mm_or_si128(iexp128a, iexp128b);
 427     iexp128a  = _mm_sub_epi32(iexp128a, expbias);
 428     return _mm256_cvtepi32_pd(iexp128a);
 429 }
 430
 431 static gmx_inline __m256d gmx_simdcall
 432 gmx_simd_get_mantissa_d_avx_256(__m256d x)
 433 {
 434     const __m256d mantmask   = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
 435     const __m256d one        = _mm256_set1_pd(1.0);
 436
 437     x = _mm256_and_pd(x, mantmask);
 438     return _mm256_or_pd(x, one);
 439 }
 440
 441 static gmx_inline __m256d gmx_simdcall
 442 gmx_simd_set_exponent_d_avx_256(__m256d x)
 443 {
 444     const __m128i expbias      = _mm_set1_epi32(1023);
 445     __m128i       iexp128a, iexp128b;
 446
 447     iexp128a = _mm256_cvtpd_epi32(x);
 448     iexp128a = _mm_add_epi32(iexp128a, expbias);
 449     iexp128b = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(3, 3, 2, 2));
 450     iexp128a = _mm_shuffle_epi32(iexp128a, _MM_SHUFFLE(1, 1, 0, 0));
 451     iexp128b = _mm_slli_epi64(iexp128b, 52);
 452     iexp128a = _mm_slli_epi64(iexp128a, 52);
 453     return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
 454 }
 455
 456 static gmx_inline double gmx_simdcall
 457 gmx_simd_reduce_d_avx_256(__m256d a)
 458 {
 459     double  f;
 460     __m128d a0, a1;
 461     a  = _mm256_hadd_pd(a, a);
 462     a0 = _mm256_castpd256_pd128(a);
 463     a1 = _mm256_extractf128_pd(a, 0x1);
 464     a0 = _mm_add_sd(a0, a1);
 465     _mm_store_sd(&f, a0);
 466     return f;
 467 }
 468
 469 static gmx_inline gmx_simd_dibool_t gmx_simdcall
 470 gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
 471 {
 472     __m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
 473     __m128i a0 = _mm256_castsi256_si128(_mm256_castpd_si256(a));
 474     a0 = _mm_shuffle_epi32(a0, _MM_SHUFFLE(2, 0, 2, 0));
 475     a1 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(2, 0, 2, 0));
 476     return _mm_blend_epi16(a0, a1, 0xF0);
 477 }
 478
 479 static gmx_inline gmx_simd_dbool_t gmx_simdcall
 480 gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
 481 {
 482     __m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
 483     __m128i a0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));
 484     return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
 485 }
 486
 487 static gmx_inline void gmx_simdcall
 488 gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
 489 {
 490     *d0 = _mm256_cvtps_pd(_mm256_castps256_ps128(f));
 491     *d1 = _mm256_cvtps_pd(_mm256_extractf128_ps(f, 0x1));
 492 }
 493
 494 static gmx_inline __m256 gmx_simdcall
 495 gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
 496 {
 497     __m128 f0 = _mm256_cvtpd_ps(d0);
 498     __m128 f1 = _mm256_cvtpd_ps(d1);
 499     return _mm256_insertf128_ps(_mm256_castps128_ps256(f0), f1, 0x1);
 500 }
 501
 502 /* SIMD4 reduce helper */
 503 static gmx_inline float gmx_simdcall
 504 gmx_simd4_reduce_f_avx_256(__m128 a)
 505 {
 506     float f;
 507     a = _mm_hadd_ps(a, a);
 508     a = _mm_hadd_ps(a, a);
 509     _mm_store_ss(&f, a);
 510     return f;
 511 }
 512
 513 /* SIMD4 Dotproduct helper function */
 514 static gmx_inline float gmx_simdcall
 515 gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
 516 {
 517     float  f;
 518     __m128 c;
 519     a = _mm_mul_ps(a, b);
 520     c = _mm_add_ps(a, _mm_permute_ps(a, _MM_SHUFFLE(0, 3, 2, 1)));
 521     c = _mm_add_ps(c, _mm_permute_ps(a, _MM_SHUFFLE(1, 0, 3, 2)));
 522     _mm_store_ss(&f, c);
 523     return f;
 524 }
 525
 526 static gmx_inline double gmx_simdcall
 527 gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
 528 {
 529     double  d;
 530     __m128d tmp1, tmp2;
 531     a    = _mm256_mul_pd(a, b);
 532     tmp1 = _mm256_castpd256_pd128(a);
 533     tmp2 = _mm256_extractf128_pd(a, 0x1);
 534
 535     tmp1 = _mm_add_pd(tmp1, _mm_permute_pd(tmp1, _MM_SHUFFLE2(0, 1)));
 536     tmp1 = _mm_add_pd(tmp1, tmp2);
 537     _mm_store_sd(&d, tmp1);
 538     return d;
 539 }
 540
 541 /* Function to check whether SIMD operations have resulted in overflow */
 542 static int
 543 gmx_simd_check_and_reset_overflow(void)
 544 {
 545     int MXCSR;
 546     int sse_overflow;
 547
 548     MXCSR = _mm_getcsr();
 549     /* The overflow flag is bit 3 in the register */
 550     if (MXCSR & 0x0008)
 551     {
 552         sse_overflow = 1;
 553         /* Set the overflow flag to zero */
 554         MXCSR = MXCSR & 0xFFF7;
 555         _mm_setcsr(MXCSR);
 556     }
 557     else
 558     {
 559         sse_overflow = 0;
 560     }
 561     return sse_overflow;
 562 }
 563
 564
 565 #endif /* GMX_SIMD_IMPL_X86_AVX_256_H */