/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* The macros in this file are intended to be used for writing
 * architecture-independent SIMD intrinsics code.
 * To support a new architecture, adding macros here should be (nearly)
 * all that is needed.
 */
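/* Illustration only (not part of this header's API): kernel code written
 * against these macros looks roughly like the sketch below, where x is
 * assumed to be a SIMD-aligned real array, s a scalar and n a multiple of
 * GMX_SIMD_WIDTH_HERE (all hypothetical names). The same source then
 * compiles to SSE, AVX or the plain-C reference, depending on which
 * macros are active.
 *
 *     gmx_mm_pr scale = gmx_set1_pr(s);
 *     for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
 *     {
 *         gmx_store_pr(x + i, gmx_mul_pr(gmx_load_pr(x + i), scale));
 *     }
 */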
#ifdef _gmx_simd_macros_h_
#error "gmx_simd_macros.h included twice"
#else
#define _gmx_simd_macros_h_
/* NOTE: SSE2 acceleration does not include floor or blendv */

/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD_REFERENCE_PLAIN_C
/* Plain C SIMD reference implementation, also serves as documentation */
#define GMX_HAVE_SIMD_MACROS
/* In general the reference SIMD supports any SIMD width, including 1.
 * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
 * The nbnxn 2xnn kernels are currently not supported.
 */
#define GMX_SIMD_REF_WIDTH  4
/* Include plain-C reference implementation, also serves as documentation */
#include "gmx_simd_ref.h"

#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH

/* float/double SIMD register type */
#define gmx_mm_pr  gmx_simd_ref_pr

/* boolean SIMD register type */
#define gmx_mm_pb  gmx_simd_ref_pb

/* integer SIMD register type, only for table indexing and exclusion masks */
#define gmx_epi32  gmx_simd_ref_epi32
#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH

/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
#define gmx_load_pr       gmx_simd_ref_load_pr
/* Set all SIMD register elements to *r */
#define gmx_load1_pr      gmx_simd_ref_load1_pr
#define gmx_set1_pr       gmx_simd_ref_set1_pr
#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
#define gmx_store_pr      gmx_simd_ref_store_pr

#define gmx_add_pr        gmx_simd_ref_add_pr
#define gmx_sub_pr        gmx_simd_ref_sub_pr
#define gmx_mul_pr        gmx_simd_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
#define gmx_madd_pr       gmx_simd_ref_madd_pr
#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
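/* Illustration only: gmx_madd_pr(a, b, c) computes a*b + c, so writing the
 * accumulation such that the destination is the same variable as c, e.g.
 *
 *     fscal = gmx_madd_pr(c6, rinvsix, fscal);
 *
 * allows FMA3 hardware to map it onto a single fused instruction
 * (fscal, c6 and rinvsix are hypothetical kernel variables).
 */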
#define gmx_max_pr        gmx_simd_ref_max_pr
#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr

#define gmx_round_pr      gmx_simd_ref_round_pr

/* Not required, only used to speed up the nbnxn tabulated PME kernels */
#define GMX_SIMD_HAVE_FLOOR
#ifdef GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      gmx_simd_ref_floor_pr
#endif

/* Not required, only used when blendv is faster than comparison */
#define GMX_SIMD_HAVE_BLENDV
#ifdef GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
#endif
/* Copy the sign of a to b, assumes b >= 0 for efficiency */
#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr

/* Very specific operation required in the non-bonded kernels */
#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
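/* Illustration only: gmx_masknot_add_pr(m, b, c) adds c to b only for those
 * SIMD elements where the boolean mask m is False, i.e. per element it is
 * roughly equivalent to
 *
 *     m ? b : b + c
 */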
#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_and_pb        gmx_simd_ref_and_pb
#define gmx_or_pb         gmx_simd_ref_or_pb
/* Not required, gmx_anytrue_pb(x) returns if any of the booleans in x is True.
 * If this is not present, define GMX_SIMD_IS_TRUE(real x),
 * which should return x==True, where True is True as defined in SIMD.
 */
#define GMX_SIMD_HAVE_ANYTRUE
#ifdef GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
#else
/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
#define gmx_store_pb      gmx_simd_ref_store_pb
#endif
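/* Illustration only: gmx_anytrue_pb is typically used to skip work when a
 * whole SIMD register of interactions is masked out, e.g.
 *
 *     wco = gmx_cmplt_pr(rsq, rcut2);
 *     if (gmx_anytrue_pb(wco))
 *     {
 *         compute the interactions
 *     }
 *
 * where rsq, rcut2 and wco are hypothetical kernel variables.
 */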
/* Conversions only used for PME table lookup */
#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr

/* These two functions only need to be approximate; Newton-Raphson iteration
 * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 */
#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
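/* Illustration only: a single Newton-Raphson step refining the approximate
 * gmx_rsqrt_pr result, with half = gmx_set1_pr(0.5) and three =
 * gmx_set1_pr(3.0) as hypothetical local constants:
 *
 *     y = gmx_rsqrt_pr(x);
 *     y = gmx_mul_pr(gmx_mul_pr(half, y),
 *                    gmx_nmsub_pr(gmx_mul_pr(y, y), x, three));
 */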
/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
#define GMX_SIMD_HAVE_EXP
#ifdef GMX_SIMD_HAVE_EXP
#define gmx_exp_pr        gmx_simd_ref_exp_pr
#endif
#define GMX_SIMD_HAVE_TRIGONOMETRIC
#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
#define gmx_acos_pr       gmx_simd_ref_acos_pr
#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
#endif

#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 * to instructions) for different SIMD widths and float precisions.
 *
 * On x86: the gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 * Compiler settings will decide if 128-bit intrinsics will
 * be translated into SSE or AVX instructions.
 */
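/* For example, gmx_mul_pr below becomes _mm_mul_ps (128-bit single),
 * _mm_mul_pd (128-bit double), _mm256_mul_ps or _mm256_mul_pd (256-bit).
 */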
#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
#if defined GMX_X86_AVX_256
/* We have half SIMD width support, continue */
#else
#error "half SIMD width intrinsics are not supported"
#endif
#endif

#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */
#define GMX_HAVE_SIMD_MACROS
/* Include the highest supported x86 SIMD intrinsics + math functions */
#if defined GMX_X86_AVX_256
#include "gmx_x86_avx_256.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_256_double.h"
#else
#include "gmx_math_x86_avx_256_single.h"
#endif
#elif defined GMX_X86_AVX_128_FMA
#include "gmx_x86_avx_128_fma.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_128_fma_double.h"
#else
#include "gmx_math_x86_avx_128_fma_single.h"
#endif
#elif defined GMX_X86_SSE4_1
#include "gmx_x86_sse4_1.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse4_1_double.h"
#else
#include "gmx_math_x86_sse4_1_single.h"
#endif
#elif defined GMX_X86_SSE2
#include "gmx_x86_sse2.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse2_double.h"
#else
#include "gmx_math_x86_sse2_single.h"
#endif
#else
#error No x86 acceleration defined
#endif
/* exp and trigonometric functions are included above */
#define GMX_SIMD_HAVE_EXP
#define GMX_SIMD_HAVE_TRIGONOMETRIC

#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE

#ifndef GMX_DOUBLE

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m128

#define gmx_mm_pb  __m128

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4
#define gmx_load_pr       _mm_load_ps
#define gmx_load1_pr      _mm_load1_ps
#define gmx_set1_pr       _mm_set1_ps
#define gmx_setzero_pr    _mm_setzero_ps
#define gmx_store_pr      _mm_store_ps

#define gmx_add_pr        _mm_add_ps
#define gmx_sub_pr        _mm_sub_ps
#define gmx_mul_pr        _mm_mul_ps
#ifdef GMX_X86_AVX_128_FMA
#define GMX_SIMD_HAVE_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
#define gmx_max_pr        _mm_max_ps
#define gmx_blendzero_pr  _mm_and_ps

#define gmx_cmplt_pr      _mm_cmplt_ps
#define gmx_and_pb        _mm_and_ps
#define gmx_or_pb         _mm_or_ps

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_ps
#else
#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_ps
#endif
static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    /* The value -0.0 has only the sign-bit set */
    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
};

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); };
#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_ps

#define gmx_cvttpr_epi32  _mm_cvttps_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_ps

#define gmx_rsqrt_pr      _mm_rsqrt_ps
#define gmx_rcp_pr        _mm_rcp_ps

#define gmx_exp_pr        gmx_mm_exp_ps
#define gmx_sqrt_pr       gmx_mm_sqrt_ps
#define gmx_sincos_pr     gmx_mm_sincos_ps
#define gmx_acos_pr       gmx_mm_acos_ps
#define gmx_atan2_pr      gmx_mm_atan2_ps

#else /* ifndef GMX_DOUBLE */

#define GMX_SIMD_WIDTH_HERE  2

#define gmx_mm_pr  __m128d

#define gmx_mm_pb  __m128d

#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4
#define gmx_load_pr       _mm_load_pd
#define gmx_load1_pr      _mm_load1_pd
#define gmx_set1_pr       _mm_set1_pd
#define gmx_setzero_pr    _mm_setzero_pd
#define gmx_store_pr      _mm_store_pd

#define gmx_add_pr        _mm_add_pd
#define gmx_sub_pr        _mm_sub_pd
#define gmx_mul_pr        _mm_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define GMX_SIMD_HAVE_FMA
#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
#else
#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
#endif
#define gmx_max_pr        _mm_max_pd
#define gmx_blendzero_pr  _mm_and_pd

#ifdef GMX_X86_SSE4_1
#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm_floor_pd
#else
#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
#endif

#ifdef GMX_X86_SSE4_1
#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm_blendv_pd
#endif
static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
};

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); };
#define gmx_cmplt_pr      _mm_cmplt_pd

#define gmx_and_pb        _mm_and_pd
#define gmx_or_pb         _mm_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm_movemask_pd

#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
#define gmx_cvtepi32_pr   _mm_cvtepi32_pd

#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm_exp_pd
#define gmx_sqrt_pr       gmx_mm_sqrt_pd
#define gmx_sincos_pr     gmx_mm_sincos_pd
#define gmx_acos_pr       gmx_mm_acos_pd
#define gmx_atan2_pr      gmx_mm_atan2_pd

#endif /* ifndef GMX_DOUBLE */
#else

/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 * so we use 256-bit SIMD.
 */

#ifndef GMX_DOUBLE
#define GMX_SIMD_WIDTH_HERE  8

#define gmx_mm_pr  __m256

#define gmx_mm_pb  __m256

#define gmx_epi32  __m256i
#define GMX_SIMD_EPI32_WIDTH  8

#define gmx_load_pr       _mm256_load_ps
#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
#define gmx_set1_pr       _mm256_set1_ps
#define gmx_setzero_pr    _mm256_setzero_ps
#define gmx_store_pr      _mm256_store_ps

#define gmx_add_pr        _mm256_add_ps
#define gmx_sub_pr        _mm256_sub_ps
#define gmx_mul_pr        _mm256_mul_ps
#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_max_pr        _mm256_max_ps
#define gmx_blendzero_pr  _mm256_and_ps

#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_ps

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_ps
static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
};

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); };
/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
#define gmx_and_pb        _mm256_and_ps
#define gmx_or_pb         _mm256_or_ps

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_ps

#define gmx_cvttpr_epi32  _mm256_cvttps_epi32

#define gmx_rsqrt_pr      _mm256_rsqrt_ps
#define gmx_rcp_pr        _mm256_rcp_ps

#define gmx_exp_pr        gmx_mm256_exp_ps
#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
#define gmx_sincos_pr     gmx_mm256_sincos_ps
#define gmx_acos_pr       gmx_mm256_acos_ps
#define gmx_atan2_pr      gmx_mm256_atan2_ps
#else /* ifndef GMX_DOUBLE */

#define GMX_SIMD_WIDTH_HERE  4

#define gmx_mm_pr  __m256d

#define gmx_mm_pb  __m256d

/* We use 128-bit integer registers because of missing 256-bit operations */
#define gmx_epi32  __m128i
#define GMX_SIMD_EPI32_WIDTH  4

#define gmx_load_pr       _mm256_load_pd
#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
#define gmx_set1_pr       _mm256_set1_pd
#define gmx_setzero_pr    _mm256_setzero_pd
#define gmx_store_pr      _mm256_store_pd

#define gmx_add_pr        _mm256_add_pd
#define gmx_sub_pr        _mm256_sub_pd
#define gmx_mul_pr        _mm256_mul_pd
#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#define gmx_max_pr        _mm256_max_pd
#define gmx_blendzero_pr  _mm256_and_pd

#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
#define GMX_SIMD_HAVE_FLOOR
#define gmx_floor_pr      _mm256_floor_pd

#define GMX_SIMD_HAVE_BLENDV
#define gmx_blendv_pr     _mm256_blendv_pd
static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
};

static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); };
/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)

#define gmx_and_pb        _mm256_and_pd
#define gmx_or_pb         _mm256_or_pd

#define GMX_SIMD_HAVE_ANYTRUE
#define gmx_anytrue_pb    _mm256_movemask_pd

#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32

#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))

#define gmx_exp_pr        gmx_mm256_exp_pd
#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
#define gmx_sincos_pr     gmx_mm256_sincos_pd
#define gmx_acos_pr       gmx_mm256_acos_pd
#define gmx_atan2_pr      gmx_mm256_atan2_pd

#endif /* ifndef GMX_DOUBLE */

#endif /* 128- or 256-bit x86 SIMD */

#endif /* GMX_X86_SSE2 */
#ifdef GMX_HAVE_SIMD_MACROS
/* Generic functions to extract a SIMD aligned pointer from a pointer x.
 * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 * to how many you want to use, to avoid indexing outside the aligned region.
 */
static gmx_inline real *
gmx_simd_align_real(const real *x)
{
    return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
}

static gmx_inline int *
gmx_simd_align_int(const int *x)
{
    return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
}
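/* Illustration only: to use the aligned-pointer helpers, over-allocate by
 * GMX_SIMD_WIDTH_HERE elements and keep the original pointer for freeing;
 * x_buf and x_al are hypothetical names and snew is the GROMACS allocation
 * macro from smalloc.h:
 *
 *     snew(x_buf, n + GMX_SIMD_WIDTH_HERE);
 *     x_al = gmx_simd_align_real(x_buf);
 */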
/* Include the math functions which only need the above macros,
 * generally these are the ones that don't need masking operations.
 */
#ifdef GMX_DOUBLE
#include "gmx_simd_math_double.h"
#else
#include "gmx_simd_math_single.h"
#endif

#endif /* GMX_HAVE_SIMD_MACROS */

#endif /* _gmx_simd_macros_h_ */