include/gmx_simd_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* The macros in this file are intended to be used for writing
  39  * architecture-independent SIMD intrinsics code.
  40  * To support a new architecture, adding macros here should be (nearly)
  41  * all that is needed.
  42  */
  43
  44 #ifdef _gmx_simd_macros_h_
  45 #error "gmx_simd_macros.h included twice"
  46 #else
  47 #define _gmx_simd_macros_h_
  48
  49 /* NOTE: SSE2 acceleration does not include floor or blendv */
  50
  51
  52 /* Uncomment the next line, without other SIMD active, for testing plain-C */
  53 /* #define GMX_SIMD_REFERENCE_PLAIN_C */
  54 #ifdef GMX_SIMD_REFERENCE_PLAIN_C
  55 /* Plain C SIMD reference implementation, also serves as documentation */
  56 #define GMX_HAVE_SIMD_MACROS
  57
  58 /* In general the reference SIMD supports any SIMD width, including 1.
  59  * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
  60  * The nbnxn 2xnn kernels are currently not supported.
  61  */
  62 #define GMX_SIMD_REF_WIDTH  4
  63
  64 /* Include plain-C reference implementation, also serves as documentation */
  65 #include "gmx_simd_ref.h"
  66
  67 #define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
  68
  69 /* float/double SIMD register type */
  70 #define gmx_mm_pr  gmx_simd_ref_pr
  71
  72 /* boolean SIMD register type */
  73 #define gmx_mm_pb  gmx_simd_ref_pb
  74
  75 /* integer SIMD register type, only for table indexing and exclusion masks */
  76 #define gmx_epi32  gmx_simd_ref_epi32
  77 #define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
  78
   79 /* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
  80 #define gmx_load_pr       gmx_simd_ref_load_pr
  81 /* Set all SIMD register elements to *r */
  82 #define gmx_load1_pr      gmx_simd_ref_load1_pr
  83 #define gmx_set1_pr       gmx_simd_ref_set1_pr
  84 #define gmx_setzero_pr    gmx_simd_ref_setzero_pr
  85 #define gmx_store_pr      gmx_simd_ref_store_pr
  86
  87 #define gmx_add_pr        gmx_simd_ref_add_pr
  88 #define gmx_sub_pr        gmx_simd_ref_sub_pr
  89 #define gmx_mul_pr        gmx_simd_ref_mul_pr
  90 /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
  91 #define gmx_madd_pr       gmx_simd_ref_madd_pr
  92 #define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
  93
  94 #define gmx_max_pr        gmx_simd_ref_max_pr
  95 #define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
  96
  97 #define gmx_round_pr      gmx_simd_ref_round_pr
  98
  99 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
 100 #define GMX_SIMD_HAVE_FLOOR
 101 #ifdef GMX_SIMD_HAVE_FLOOR
 102 #define gmx_floor_pr      gmx_simd_ref_floor_pr
 103 #endif
 104
 105 /* Not required, only used when blendv is faster than comparison */
 106 #define GMX_SIMD_HAVE_BLENDV
 107 #ifdef GMX_SIMD_HAVE_BLENDV
 108 #define gmx_blendv_pr     gmx_simd_ref_blendv_pr
 109 #endif
 110
 111 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
 112 #define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
 113
 114 /* Very specific operation required in the non-bonded kernels */
 115 #define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
 116
 117 /* Comparison */
 118 #define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
 119
 120 /* Logical operations on SIMD booleans */
 121 #define gmx_and_pb        gmx_simd_ref_and_pb
 122 #define gmx_or_pb         gmx_simd_ref_or_pb
 123
  124 /* Returns a single int (0/1) which tells if any of the SIMD booleans is True */
 125 #define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
 126
 127 /* Conversions only used for PME table lookup */
 128 #define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
 129 #define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
 130
  131 /* These two functions only need to be approximate, Newton-Raphson iteration
 132  * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 133  */
 134 #define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
 135 #define gmx_rcp_pr        gmx_simd_ref_rcp_pr
 136
 137 /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
 138 #define GMX_SIMD_HAVE_EXP
 139 #ifdef GMX_SIMD_HAVE_EXP
 140 #define gmx_exp_pr        gmx_simd_ref_exp_pr
 141 #endif
 142 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 143 #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
 144 #define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
 145 #define gmx_sincos_pr     gmx_simd_ref_sincos_pr
 146 #define gmx_acos_pr       gmx_simd_ref_acos_pr
 147 #define gmx_atan2_pr      gmx_simd_ref_atan2_pr
 148 #endif
 149
 150 #endif /* GMX_SIMD_REFERENCE_PLAIN_C */
 151
 152
 153 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 154  * to instructions for) different SIMD width and float precision.
 155  *
 156  * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 157  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 158  * Compiler settings will decide if 128-bit intrinsics will
 159  * be translated into SSE or AVX instructions.
 160  */
 161
 162
 163 #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
 164 #if defined GMX_X86_AVX_256
 165 /* We have half SIMD width support, continue */
 166 #else
 167 #error "half SIMD width intrinsics are not supported"
 168 #endif
 169 #endif
 170
 171
 172 #ifdef GMX_X86_SSE2
 173 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 174 #define GMX_HAVE_SIMD_MACROS
 175
  176 /* Include the highest supported x86 SIMD intrinsics + math functions */
 177 #ifdef GMX_X86_AVX_256
 178 #include "gmx_x86_avx_256.h"
 179 #ifdef GMX_DOUBLE
 180 #include "gmx_math_x86_avx_256_double.h"
 181 #else
 182 #include "gmx_math_x86_avx_256_single.h"
 183 #endif
 184 #else
 185 #ifdef GMX_X86_AVX_128_FMA
 186 #include "gmx_x86_avx_128_fma.h"
 187 #ifdef GMX_DOUBLE
 188 #include "gmx_math_x86_avx_128_fma_double.h"
 189 #else
 190 #include "gmx_math_x86_avx_128_fma_single.h"
 191 #endif
 192 #else
 193 #ifdef GMX_X86_SSE4_1
 194 #include "gmx_x86_sse4_1.h"
 195 #ifdef GMX_DOUBLE
 196 #include "gmx_math_x86_sse4_1_double.h"
 197 #else
 198 #include "gmx_math_x86_sse4_1_single.h"
 199 #endif
 200 #else
 201 #ifdef GMX_X86_SSE2
 202 #include "gmx_x86_sse2.h"
 203 #ifdef GMX_DOUBLE
 204 #include "gmx_math_x86_sse2_double.h"
 205 #else
 206 #include "gmx_math_x86_sse2_single.h"
 207 #endif
 208 #else
 209 #error No x86 acceleration defined
 210 #endif
 211 #endif
 212 #endif
 213 #endif
 214 /* exp and trigonometric functions are included above */
 215 #define GMX_SIMD_HAVE_EXP
 216 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 217
 218 #if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
 219
 220 #ifndef GMX_DOUBLE
 221
 222 #define GMX_SIMD_WIDTH_HERE  4
 223
 224 #define gmx_mm_pr  __m128
 225
 226 #define gmx_mm_pb  __m128
 227
 228 #define gmx_epi32  __m128i
 229 #define GMX_SIMD_EPI32_WIDTH  4
 230
 231 #define gmx_load_pr       _mm_load_ps
 232 #define gmx_load1_pr      _mm_load1_ps
 233 #define gmx_set1_pr       _mm_set1_ps
 234 #define gmx_setzero_pr    _mm_setzero_ps
 235 #define gmx_store_pr      _mm_store_ps
 236
 237 #define gmx_add_pr        _mm_add_ps
 238 #define gmx_sub_pr        _mm_sub_ps
 239 #define gmx_mul_pr        _mm_mul_ps
 240 #ifdef GMX_X86_AVX_128_FMA
 241 #define GMX_SIMD_HAVE_FMA
 242 #define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 243 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 244 #else
 245 #define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 246 #define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 247 #endif
 248 #define gmx_max_pr        _mm_max_ps
 249 #define gmx_blendzero_pr  _mm_and_ps
 250
 251 #define gmx_cmplt_pr      _mm_cmplt_ps
 252 #define gmx_and_pb        _mm_and_ps
 253 #define gmx_or_pb         _mm_or_ps
 254
 255 #ifdef GMX_X86_SSE4_1
 256 #define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
 257 #define GMX_SIMD_HAVE_FLOOR
 258 #define gmx_floor_pr      _mm_floor_ps
 259 #else
 260 #define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 261 #endif
 262
 263 #ifdef GMX_X86_SSE4_1
 264 #define GMX_SIMD_HAVE_BLENDV
 265 #define gmx_blendv_pr     _mm_blendv_ps
 266 #endif
 267
 268 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 269 {
 270     /* The value -0.0 has only the sign-bit set */
 271     gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
 272     return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
 273 };
 274
 275 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); };
 276
 277 #define gmx_anytrue_pb    _mm_movemask_ps
 278
 279 #define gmx_cvttpr_epi32  _mm_cvttps_epi32
 280 #define gmx_cvtepi32_pr   _mm_cvtepi32_ps
 281
 282 #define gmx_rsqrt_pr      _mm_rsqrt_ps
 283 #define gmx_rcp_pr        _mm_rcp_ps
 284
 285 #define gmx_exp_pr        gmx_mm_exp_ps
 286 #define gmx_sqrt_pr       gmx_mm_sqrt_ps
 287 #define gmx_sincos_pr     gmx_mm_sincos_ps
 288 #define gmx_acos_pr       gmx_mm_acos_ps
 289 #define gmx_atan2_pr      gmx_mm_atan2_ps
 290
 291 #else /* ifndef GMX_DOUBLE */
 292
 293 #define GMX_SIMD_WIDTH_HERE  2
 294
 295 #define gmx_mm_pr  __m128d
 296
 297 #define gmx_mm_pb  __m128d
 298
 299 #define gmx_epi32  __m128i
 300 #define GMX_SIMD_EPI32_WIDTH  4
 301
 302 #define gmx_load_pr       _mm_load_pd
 303 #define gmx_load1_pr      _mm_load1_pd
 304 #define gmx_set1_pr       _mm_set1_pd
 305 #define gmx_setzero_pr    _mm_setzero_pd
 306 #define gmx_store_pr      _mm_store_pd
 307
 308 #define gmx_add_pr        _mm_add_pd
 309 #define gmx_sub_pr        _mm_sub_pd
 310 #define gmx_mul_pr        _mm_mul_pd
 311 #ifdef GMX_X86_AVX_128_FMA
 312 #define GMX_SIMD_HAVE_FMA
 313 #define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 314 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 315 #else
 316 #define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
 317 #define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
 318 #endif
 319 #define gmx_max_pr        _mm_max_pd
 320 #define gmx_blendzero_pr  _mm_and_pd
 321
 322 #ifdef GMX_X86_SSE4_1
 323 #define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
 324 #define GMX_SIMD_HAVE_FLOOR
 325 #define gmx_floor_pr      _mm_floor_pd
 326 #else
 327 #define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 328 /* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
 329 #endif
 330
 331 #ifdef GMX_X86_SSE4_1
 332 #define GMX_SIMD_HAVE_BLENDV
 333 #define gmx_blendv_pr     _mm_blendv_pd
 334 #endif
 335
 336 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 337 {
 338     gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
 339     return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
 340 };
 341
 342 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); };
 343
 344 #define gmx_cmplt_pr      _mm_cmplt_pd
 345
 346 #define gmx_and_pb        _mm_and_pd
 347 #define gmx_or_pb         _mm_or_pd
 348
 349 #define gmx_anytrue_pb    _mm_movemask_pd
 350
 351 #define gmx_cvttpr_epi32  _mm_cvttpd_epi32
 352 #define gmx_cvtepi32_pr   _mm_cvtepi32_pd
 353
 354 #define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
 355 #define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
 356
 357 #define gmx_exp_pr        gmx_mm_exp_pd
 358 #define gmx_sqrt_pr       gmx_mm_sqrt_pd
 359 #define gmx_sincos_pr     gmx_mm_sincos_pd
 360 #define gmx_acos_pr       gmx_mm_acos_pd
 361 #define gmx_atan2_pr      gmx_mm_atan2_pd
 362
 363 #endif /* ifndef GMX_DOUBLE */
 364
 365 #else
 366 /* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 367  * so we use 256-bit SIMD.
 368  */
 369
 370 #ifndef GMX_DOUBLE
 371
 372 #define GMX_SIMD_WIDTH_HERE  8
 373
 374 #define gmx_mm_pr  __m256
 375
 376 #define gmx_mm_pb  __m256
 377
 378 #define gmx_epi32  __m256i
 379 #define GMX_SIMD_EPI32_WIDTH  8
 380
 381 #define gmx_load_pr       _mm256_load_ps
 382 #define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
 383 #define gmx_set1_pr       _mm256_set1_ps
 384 #define gmx_setzero_pr    _mm256_setzero_ps
 385 #define gmx_store_pr      _mm256_store_ps
 386
 387 #define gmx_add_pr        _mm256_add_ps
 388 #define gmx_sub_pr        _mm256_sub_ps
 389 #define gmx_mul_pr        _mm256_mul_ps
 390 #define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
 391 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 392 #define gmx_max_pr        _mm256_max_ps
 393 #define gmx_blendzero_pr  _mm256_and_ps
 394
 395 #define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
 396 #define GMX_SIMD_HAVE_FLOOR
 397 #define gmx_floor_pr      _mm256_floor_ps
 398
 399 #define GMX_SIMD_HAVE_BLENDV
 400 #define gmx_blendv_pr     _mm256_blendv_ps
 401
 402 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 403 {
 404     gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
 405     return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
 406 };
 407
 408 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); };
 409
 410 /* Less-than (we use ordered, non-signaling, but that's not required) */
 411 #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
 412 #define gmx_and_pb        _mm256_and_ps
 413 #define gmx_or_pb         _mm256_or_ps
 414
 415 #define gmx_anytrue_pb    _mm256_movemask_ps
 416
 417 #define gmx_cvttpr_epi32  _mm256_cvttps_epi32
 418
 419 #define gmx_rsqrt_pr      _mm256_rsqrt_ps
 420 #define gmx_rcp_pr        _mm256_rcp_ps
 421
 422 #define gmx_exp_pr        gmx_mm256_exp_ps
 423 #define gmx_sqrt_pr       gmx_mm256_sqrt_ps
 424 #define gmx_sincos_pr     gmx_mm256_sincos_ps
 425 #define gmx_acos_pr       gmx_mm256_acos_ps
 426 #define gmx_atan2_pr      gmx_mm256_atan2_ps
 427
 428 #else /* ifndef GMX_DOUBLE */
 429
 430 #define GMX_SIMD_WIDTH_HERE  4
 431
 432 #define gmx_mm_pr  __m256d
 433
 434 #define gmx_mm_pb  __m256d
 435
 436 /* We use 128-bit integer registers because of missing 256-bit operations */
 437 #define gmx_epi32  __m128i
 438 #define GMX_SIMD_EPI32_WIDTH  4
 439
 440 #define gmx_load_pr       _mm256_load_pd
 441 #define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
 442 #define gmx_set1_pr       _mm256_set1_pd
 443 #define gmx_setzero_pr    _mm256_setzero_pd
 444 #define gmx_store_pr      _mm256_store_pd
 445
 446 #define gmx_add_pr        _mm256_add_pd
 447 #define gmx_sub_pr        _mm256_sub_pd
 448 #define gmx_mul_pr        _mm256_mul_pd
 449 #define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 450 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 451 #define gmx_max_pr        _mm256_max_pd
 452 #define gmx_blendzero_pr  _mm256_and_pd
 453
 454 #define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
 455 #define GMX_SIMD_HAVE_FLOOR
 456 #define gmx_floor_pr      _mm256_floor_pd
 457
 458 #define GMX_SIMD_HAVE_BLENDV
 459 #define gmx_blendv_pr     _mm256_blendv_pd
 460
 461 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 462 {
 463     gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
 464     return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
 465 };
 466
 467 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); };
 468
 469 /* Less-than (we use ordered, non-signaling, but that's not required) */
 470 #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 471
 472 #define gmx_and_pb        _mm256_and_pd
 473 #define gmx_or_pb         _mm256_or_pd
 474
 475 #define gmx_anytrue_pb    _mm256_movemask_pd
 476
 477 #define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
 478
 479 #define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
 480 #define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
 481
 482 #define gmx_exp_pr        gmx_mm256_exp_pd
 483 #define gmx_sqrt_pr       gmx_mm256_sqrt_pd
 484 #define gmx_sincos_pr     gmx_mm256_sincos_pd
 485 #define gmx_acos_pr       gmx_mm256_acos_pd
 486 #define gmx_atan2_pr      gmx_mm256_atan2_pd
 487
 488 #endif /* ifndef GMX_DOUBLE */
 489
 490 #endif /* 128- or 256-bit x86 SIMD */
 491
 492 #endif /* GMX_X86_SSE2 */
 493
 494
 495 #ifdef GMX_HAVE_SIMD_MACROS
 496 /* Generic functions to extract a SIMD aligned pointer from a pointer x.
 497  * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 498  * to how many you want to use, to avoid indexing outside the aligned region.
 499  */
 500
 501 static gmx_inline real *
 502 gmx_simd_align_real(const real *x)
 503 {
 504     return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
 505 }
 506
 507 static gmx_inline int *
 508 gmx_simd_align_int(const int *x)
 509 {
 510     return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
 511 }
 512
 513
 514 /* Include the math functions which only need the above macros,
 515  * generally these are the ones that don't need masking operations.
 516  */
 517 #ifdef GMX_DOUBLE
 518 #include "gmx_simd_math_double.h"
 519 #else
 520 #include "gmx_simd_math_single.h"
 521 #endif
 522
 523 #endif /* GMX_HAVE_SIMD_MACROS */
 524
 525 #endif /* _gmx_simd_macros_h_ */