src/gromacs/legacyheaders/gmx_simd_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* The macros in this file are intended to be used for writing
  39  * architecture-independent SIMD intrinsics code.
  40  * To support a new architecture, adding macros here should be (nearly)
  41  * all that is needed.
  42  */
  43
  44 #ifdef _gmx_simd_macros_h_
  45 #error "gmx_simd_macros.h included twice"
  46 #else
  47 #define _gmx_simd_macros_h_
  48
  49 /* NOTE: SSE2 acceleration does not include floor or blendv */
  50
  51
  52 /* Uncomment the next line, without other SIMD active, for testing plain-C */
  53 /* #define GMX_SIMD_REFERENCE_PLAIN_C */
  54 #ifdef GMX_SIMD_REFERENCE_PLAIN_C
  55 /* Plain C SIMD reference implementation, also serves as documentation.
      * Every generic gmx_* name in this section is mapped onto the
      * gmx_simd_ref_* name of the same operation (presumably declared in
      * gmx_simd_ref.h, which is included below).
      */
  56 #define GMX_HAVE_SIMD_MACROS
  57
  58 /* In general the reference SIMD supports any SIMD width, including 1.
  59  * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
  60  * The nbnxn 2xnn kernels are currently not supported.
  61  */
  62 #define GMX_SIMD_REF_WIDTH  4
  63
  64 /* Include plain-C reference implementation, also serves as documentation */
  65 #include "gmx_simd_ref.h"
  66
     /* Number of reals in one SIMD register for the code including this file */
  67 #define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
  68
  69 /* float/double SIMD register type */
  70 #define gmx_mm_pr  gmx_simd_ref_pr
  71
  72 /* boolean SIMD register type */
  73 #define gmx_mm_pb  gmx_simd_ref_pb
  74
  75 /* integer SIMD register type, only for table indexing and exclusion masks */
  76 #define gmx_epi32  gmx_simd_ref_epi32
  77 #define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
  78
  79 /* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
  80 #define gmx_load_pr       gmx_simd_ref_load_pr
  81 /* Set all SIMD register elements to *r */
  82 #define gmx_load1_pr      gmx_simd_ref_load1_pr
  83 #define gmx_set1_pr       gmx_simd_ref_set1_pr
  84 #define gmx_setzero_pr    gmx_simd_ref_setzero_pr
  85 #define gmx_store_pr      gmx_simd_ref_store_pr
  86
     /* Element-wise arithmetic */
  87 #define gmx_add_pr        gmx_simd_ref_add_pr
  88 #define gmx_sub_pr        gmx_simd_ref_sub_pr
  89 #define gmx_mul_pr        gmx_simd_ref_mul_pr
  90 /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
  91 #define gmx_madd_pr       gmx_simd_ref_madd_pr
  92 #define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
  93
  94 #define gmx_max_pr        gmx_simd_ref_max_pr
  95 #define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
  96
  97 #define gmx_round_pr      gmx_simd_ref_round_pr
  98
  99 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
 100 #define GMX_SIMD_HAVE_FLOOR
     /* The feature macro is defined on the line above, so this branch is always
      * taken for the reference implementation.
      */
 101 #ifdef GMX_SIMD_HAVE_FLOOR
 102 #define gmx_floor_pr      gmx_simd_ref_floor_pr
 103 #endif
 104
 105 /* Not required, only used when blendv is faster than comparison */
 106 #define GMX_SIMD_HAVE_BLENDV
 107 #ifdef GMX_SIMD_HAVE_BLENDV
 108 #define gmx_blendv_pr     gmx_simd_ref_blendv_pr
 109 #endif
 110
 111 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
 112 #define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
 113
 114 /* Very specific operation required in the non-bonded kernels */
 115 #define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
 116
 117 /* Comparison */
 118 #define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
 119
 120 /* Logical operations on SIMD booleans */
 121 #define gmx_and_pb        gmx_simd_ref_and_pb
 122 #define gmx_or_pb         gmx_simd_ref_or_pb
 123
 124 /* Not required, gmx_anytrue_pb(x) returns whether any of the booleans in x
 125  * is True. If this is not present, define GMX_SIMD_IS_TRUE(real x),
 126  * which should return x==True, where True is True as defined in SIMD.
 127  */
 128 #define GMX_SIMD_HAVE_ANYTRUE
 129 #ifdef GMX_SIMD_HAVE_ANYTRUE
 130 #define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
 131 #else
 132 /* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
 133 #define gmx_store_pb      gmx_simd_ref_store_pb
 134 #endif
 135
 136 /* Conversions only used for PME table lookup */
 137 #define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
 138 #define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
 139
 140 /* These two functions only need to be approximate, Newton-Raphson iteration
 141  * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 142  */
 143 #define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
 144 #define gmx_rcp_pr        gmx_simd_ref_rcp_pr
 145
 146 /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
 147 #define GMX_SIMD_HAVE_EXP
 148 #ifdef GMX_SIMD_HAVE_EXP
 149 #define gmx_exp_pr        gmx_simd_ref_exp_pr
 150 #endif
 151 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 152 #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
 153 #define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
 154 #define gmx_sincos_pr     gmx_simd_ref_sincos_pr
 155 #define gmx_acos_pr       gmx_simd_ref_acos_pr
 156 #define gmx_atan2_pr      gmx_simd_ref_atan2_pr
 157 #endif
 158
 159 #endif /* GMX_SIMD_REFERENCE_PLAIN_C */
 160
 161
 162 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 163  * to instructions for) different SIMD width and float precision.
 164  *
 165  * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 166  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 167  * Compiler settings will decide if 128-bit intrinsics will
 168  * be translated into SSE or AVX instructions.
 169  */
 170
 171
 172 #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
 173 #if defined GMX_X86_AVX_256
 174 /* We have half SIMD width support, continue */
 175 #else
 176 #error "half SIMD width intrinsics are not supported"
 177 #endif
 178 #endif
 179
 180
 181 #ifdef GMX_X86_SSE2
 182 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 183 #define GMX_HAVE_SIMD_MACROS
 184
 185 /* Include the highest supported x86 SIMD intrisics + math functions */
 186 #ifdef GMX_X86_AVX_256
 187 #include "gmx_x86_avx_256.h"
 188 #ifdef GMX_DOUBLE
 189 #include "gmx_math_x86_avx_256_double.h"
 190 #else
 191 #include "gmx_math_x86_avx_256_single.h"
 192 #endif
 193 #else
 194 #ifdef GMX_X86_AVX_128_FMA
 195 #include "gmx_x86_avx_128_fma.h"
 196 #ifdef GMX_DOUBLE
 197 #include "gmx_math_x86_avx_128_fma_double.h"
 198 #else
 199 #include "gmx_math_x86_avx_128_fma_single.h"
 200 #endif
 201 #else
 202 #ifdef GMX_X86_SSE4_1
 203 #include "gmx_x86_sse4_1.h"
 204 #ifdef GMX_DOUBLE
 205 #include "gmx_math_x86_sse4_1_double.h"
 206 #else
 207 #include "gmx_math_x86_sse4_1_single.h"
 208 #endif
 209 #else
 210 #ifdef GMX_X86_SSE2
 211 #include "gmx_x86_sse2.h"
 212 #ifdef GMX_DOUBLE
 213 #include "gmx_math_x86_sse2_double.h"
 214 #else
 215 #include "gmx_math_x86_sse2_single.h"
 216 #endif
 217 #else
 218 #error No x86 acceleration defined
 219 #endif
 220 #endif
 221 #endif
 222 #endif
 223 /* exp and trigonometric functions are included above */
 224 #define GMX_SIMD_HAVE_EXP
 225 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 226
 227 #if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
 228
 229 #ifndef GMX_DOUBLE
 230
     /* 128-bit x86 SIMD, single precision: the generic gmx_*_pr names below
      * are mapped onto _mm_*_ps intrinsics working on __m128 registers.
      */
 231 #define GMX_SIMD_WIDTH_HERE  4
 232
     /* float SIMD register type */
 233 #define gmx_mm_pr  __m128
 234
     /* boolean SIMD register type, the same register type as the floats */
 235 #define gmx_mm_pb  __m128
 236
     /* integer SIMD register type */
 237 #define gmx_epi32  __m128i
 238 #define GMX_SIMD_EPI32_WIDTH  4
 239
 240 #define gmx_load_pr       _mm_load_ps
 241 #define gmx_load1_pr      _mm_load1_ps
 242 #define gmx_set1_pr       _mm_set1_ps
 243 #define gmx_setzero_pr    _mm_setzero_ps
 244 #define gmx_store_pr      _mm_store_ps
 245
 246 #define gmx_add_pr        _mm_add_ps
 247 #define gmx_sub_pr        _mm_sub_ps
 248 #define gmx_mul_pr        _mm_mul_ps
     /* gmx_madd_pr yields a*b+c and gmx_nmsub_pr yields c-a*b (see the
      * fallback branch); with GMX_X86_AVX_128_FMA a single fused intrinsic
      * is used instead of a separate multiply and add/subtract.
      */
 249 #ifdef GMX_X86_AVX_128_FMA
 250 #define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 251 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 252 #else
 253 #define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 254 #define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 255 #endif
 256 #define gmx_max_pr        _mm_max_ps
     /* Plain bitwise AND: presumably called with a value and a boolean mask */
 257 #define gmx_blendzero_pr  _mm_and_ps
 258
 259 #define gmx_cmplt_pr      _mm_cmplt_ps
 260 #define gmx_and_pb        _mm_and_ps
 261 #define gmx_or_pb         _mm_or_ps
 262
     /* With SSE4.1 the rounding intrinsic is used (0x0 is its rounding-mode
      * immediate, presumably round-to-nearest) and floor is available.
      * Without SSE4.1 rounding goes through a conversion to 32-bit integers
      * and back, and no gmx_floor_pr is provided.
      */
 263 #ifdef GMX_X86_SSE4_1
 264 #define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
 265 #define GMX_SIMD_HAVE_FLOOR
 266 #define gmx_floor_pr      _mm_floor_ps
 267 #else
 268 #define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 269 #endif
 270
     /* blendv is only provided from SSE4.1 on */
 271 #ifdef GMX_X86_SSE4_1
 272 #define GMX_SIMD_HAVE_BLENDV
 273 #define gmx_blendv_pr     _mm_blendv_ps
 274 #endif
 275
 276 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 277 {
 278     /* The value -0.0 has only the sign-bit set */
 279     gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
 280     return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
 281 };
 282
 283 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); };
 284
     /* Remaining 128-bit single-precision mappings: boolean reduction,
      * integer conversions, approximate reciprocals and math functions.
      */
     /* movemask collects the per-element sign bits into an int, so the
      * result is non-zero when any element of the boolean is set.
      */
 285 #define GMX_SIMD_HAVE_ANYTRUE
 286 #define gmx_anytrue_pb    _mm_movemask_ps
 287
     /* float -> int32 with truncation, and int32 -> float */
 288 #define gmx_cvttpr_epi32  _mm_cvttps_epi32
 289 #define gmx_cvtepi32_pr   _mm_cvtepi32_ps
 290
     /* Approximate 1/sqrt(x) and 1/x */
 291 #define gmx_rsqrt_pr      _mm_rsqrt_ps
 292 #define gmx_rcp_pr        _mm_rcp_ps
 293
     /* Math functions, presumably provided by the gmx_math_x86_*_single.h
      * header included for the selected instruction set.
      */
 294 #define gmx_exp_pr        gmx_mm_exp_ps
 295 #define gmx_sqrt_pr       gmx_mm_sqrt_ps
 296 #define gmx_sincos_pr     gmx_mm_sincos_ps
 297 #define gmx_acos_pr       gmx_mm_acos_ps
 298 #define gmx_atan2_pr      gmx_mm_atan2_ps
 299
 300 #else /* ifndef GMX_DOUBLE */
 301
     /* 128-bit x86 SIMD, double precision: the generic gmx_*_pr names below
      * are mapped onto _mm_*_pd intrinsics working on __m128d registers.
      */
 302 #define GMX_SIMD_WIDTH_HERE  2
 303
     /* double SIMD register type */
 304 #define gmx_mm_pr  __m128d
 305
     /* boolean SIMD register type, the same register type as the doubles */
 306 #define gmx_mm_pb  __m128d
 307
     /* integer SIMD register type; note that its width (4) differs from
      * GMX_SIMD_WIDTH_HERE (2) in this configuration.
      */
 308 #define gmx_epi32  __m128i
 309 #define GMX_SIMD_EPI32_WIDTH  4
 310
 311 #define gmx_load_pr       _mm_load_pd
 312 #define gmx_load1_pr      _mm_load1_pd
 313 #define gmx_set1_pr       _mm_set1_pd
 314 #define gmx_setzero_pr    _mm_setzero_pd
 315 #define gmx_store_pr      _mm_store_pd
 316
 317 #define gmx_add_pr        _mm_add_pd
 318 #define gmx_sub_pr        _mm_sub_pd
 319 #define gmx_mul_pr        _mm_mul_pd
     /* gmx_madd_pr yields a*b+c and gmx_nmsub_pr yields c-a*b (see the
      * fallback branch); with GMX_X86_AVX_128_FMA a single fused intrinsic
      * is used instead of a separate multiply and add/subtract.
      */
 320 #ifdef GMX_X86_AVX_128_FMA
 321 #define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 322 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 323 #else
 324 #define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
 325 #define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
 326 #endif
 327 #define gmx_max_pr        _mm_max_pd
     /* Plain bitwise AND: presumably called with a value and a boolean mask */
 328 #define gmx_blendzero_pr  _mm_and_pd
 329
     /* With SSE4.1 the rounding intrinsic is used (0x0 is its rounding-mode
      * immediate, presumably round-to-nearest) and floor is available.
      * Without SSE4.1 rounding goes through a conversion to 32-bit integers
      * and back.
      */
 330 #ifdef GMX_X86_SSE4_1
 331 #define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
 332 #define GMX_SIMD_HAVE_FLOOR
 333 #define gmx_floor_pr      _mm_floor_pd
 334 #else
 335 #define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 336 /* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
 337 #endif
 338
     /* blendv is only provided from SSE4.1 on */
 339 #ifdef GMX_X86_SSE4_1
 340 #define GMX_SIMD_HAVE_BLENDV
 341 #define gmx_blendv_pr     _mm_blendv_pd
 342 #endif
 343
 344 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 345 {
 346     gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
 347     return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
 348 };
 349
 350 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); };
 351
     /* Remaining 128-bit double-precision mappings: comparison, boolean
      * logic and reduction, integer conversions, approximate reciprocals
      * and math functions.
      */
 352 #define gmx_cmplt_pr      _mm_cmplt_pd
 353
 354 #define gmx_and_pb        _mm_and_pd
 355 #define gmx_or_pb         _mm_or_pd
 356
     /* movemask collects the per-element sign bits into an int, so the
      * result is non-zero when any element of the boolean is set.
      */
 357 #define GMX_SIMD_HAVE_ANYTRUE
 358 #define gmx_anytrue_pb    _mm_movemask_pd
 359
     /* double -> int32 with truncation, and int32 -> double */
 360 #define gmx_cvttpr_epi32  _mm_cvttpd_epi32
 361 #define gmx_cvtepi32_pr   _mm_cvtepi32_pd
 362
     /* The approximate 1/sqrt(x) and 1/x are computed in single precision:
      * convert to float, apply the _ps approximation, convert back.
      */
 363 #define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
 364 #define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
 365
     /* Math functions, presumably provided by the gmx_math_x86_*_double.h
      * header included for the selected instruction set.
      */
 366 #define gmx_exp_pr        gmx_mm_exp_pd
 367 #define gmx_sqrt_pr       gmx_mm_sqrt_pd
 368 #define gmx_sincos_pr     gmx_mm_sincos_pd
 369 #define gmx_acos_pr       gmx_mm_acos_pd
 370 #define gmx_atan2_pr      gmx_mm_atan2_pd
 371
 372 #endif /* ifndef GMX_DOUBLE */
 373
 374 #else
 375 /* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 376  * so we use 256-bit SIMD.
 377  */
 378
 379 #ifndef GMX_DOUBLE
 380
     /* 256-bit AVX, single precision: the generic gmx_*_pr names below are
      * mapped onto _mm256_*_ps intrinsics working on __m256 registers.
      */
 381 #define GMX_SIMD_WIDTH_HERE  8
 382
     /* float SIMD register type */
 383 #define gmx_mm_pr  __m256
 384
     /* boolean SIMD register type, the same register type as the floats */
 385 #define gmx_mm_pb  __m256
 386
     /* integer SIMD register type */
 387 #define gmx_epi32  __m256i
 388 #define GMX_SIMD_EPI32_WIDTH  8
 389
 390 #define gmx_load_pr       _mm256_load_ps
     /* load1 is expressed as a broadcast of the first element x points to */
 391 #define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
 392 #define gmx_set1_pr       _mm256_set1_ps
 393 #define gmx_setzero_pr    _mm256_setzero_ps
 394 #define gmx_store_pr      _mm256_store_ps
 395
 396 #define gmx_add_pr        _mm256_add_ps
 397 #define gmx_sub_pr        _mm256_sub_ps
 398 #define gmx_mul_pr        _mm256_mul_ps
     /* No fused intrinsic is used here: a*b+c and c-a*b are built from a
      * separate multiply and add/subtract.
      */
 399 #define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
 400 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 401 #define gmx_max_pr        _mm256_max_ps
     /* Plain bitwise AND: presumably called with a value and a boolean mask */
 402 #define gmx_blendzero_pr  _mm256_and_ps
 403
     /* 0x0 is the rounding-mode immediate, presumably round-to-nearest */
 404 #define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
 405 #define GMX_SIMD_HAVE_FLOOR
 406 #define gmx_floor_pr      _mm256_floor_ps
 407
 408 #define GMX_SIMD_HAVE_BLENDV
 409 #define gmx_blendv_pr     _mm256_blendv_ps
 410
 411 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 412 {
 413     gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
 414     return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
 415 };
 416
 417 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); };
 418
 419 /* Less-than (we use ordered, non-signaling, but that's not required).
      * 0x11 is the comparison-predicate immediate passed to _mm256_cmp_ps.
      */
 420 #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
 421 #define gmx_and_pb        _mm256_and_ps
 422 #define gmx_or_pb         _mm256_or_ps
 423
     /* movemask collects the per-element sign bits into an int, so the
      * result is non-zero when any element of the boolean is set.
      */
 424 #define GMX_SIMD_HAVE_ANYTRUE
 425 #define gmx_anytrue_pb    _mm256_movemask_ps
 426
     /* float -> int32 with truncation.
      * NOTE(review): unlike the 128-bit variants, no gmx_cvtepi32_pr is
      * defined here - presumably unused because gmx_floor_pr is always
      * available with AVX-256; confirm against the callers.
      */
 427 #define gmx_cvttpr_epi32  _mm256_cvttps_epi32
 428
     /* Approximate 1/sqrt(x) and 1/x */
 429 #define gmx_rsqrt_pr      _mm256_rsqrt_ps
 430 #define gmx_rcp_pr        _mm256_rcp_ps
 431
     /* Math functions, presumably provided by gmx_math_x86_avx_256_single.h */
 432 #define gmx_exp_pr        gmx_mm256_exp_ps
 433 #define gmx_sqrt_pr       gmx_mm256_sqrt_ps
 434 #define gmx_sincos_pr     gmx_mm256_sincos_ps
 435 #define gmx_acos_pr       gmx_mm256_acos_ps
 436 #define gmx_atan2_pr      gmx_mm256_atan2_ps
 437
 438 #else /* ifndef GMX_DOUBLE */
 439
     /* 256-bit AVX, double precision: the generic gmx_*_pr names below are
      * mapped onto _mm256_*_pd intrinsics working on __m256d registers.
      */
 440 #define GMX_SIMD_WIDTH_HERE  4
 441
     /* double SIMD register type */
 442 #define gmx_mm_pr  __m256d
 443
     /* boolean SIMD register type, the same register type as the doubles */
 444 #define gmx_mm_pb  __m256d
 445
 446 /* We use 128-bit integer registers because of missing 256-bit operations */
 447 #define gmx_epi32  __m128i
 448 #define GMX_SIMD_EPI32_WIDTH  4
 449
 450 #define gmx_load_pr       _mm256_load_pd
     /* load1 is expressed as a broadcast of the first element x points to */
 451 #define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
 452 #define gmx_set1_pr       _mm256_set1_pd
 453 #define gmx_setzero_pr    _mm256_setzero_pd
 454 #define gmx_store_pr      _mm256_store_pd
 455
 456 #define gmx_add_pr        _mm256_add_pd
 457 #define gmx_sub_pr        _mm256_sub_pd
 458 #define gmx_mul_pr        _mm256_mul_pd
     /* No fused intrinsic is used here: a*b+c and c-a*b are built from a
      * separate multiply and add/subtract.
      */
 459 #define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 460 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 461 #define gmx_max_pr        _mm256_max_pd
     /* Plain bitwise AND: presumably called with a value and a boolean mask */
 462 #define gmx_blendzero_pr  _mm256_and_pd
 463
     /* 0x0 is the rounding-mode immediate, presumably round-to-nearest */
 464 #define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
 465 #define GMX_SIMD_HAVE_FLOOR
 466 #define gmx_floor_pr      _mm256_floor_pd
 467
 468 #define GMX_SIMD_HAVE_BLENDV
 469 #define gmx_blendv_pr     _mm256_blendv_pd
 470
 471 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 472 {
 473     gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
 474     return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
 475 };
 476
 477 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); };
 478
 479 /* Less-than (we use ordered, non-signaling, but that's not required).
      * 0x11 is the comparison-predicate immediate passed to _mm256_cmp_pd.
      */
 480 #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 481
 482 #define gmx_and_pb        _mm256_and_pd
 483 #define gmx_or_pb         _mm256_or_pd
 484
     /* movemask collects the per-element sign bits into an int, so the
      * result is non-zero when any element of the boolean is set.
      */
 485 #define GMX_SIMD_HAVE_ANYTRUE
 486 #define gmx_anytrue_pb    _mm256_movemask_pd
 487
     /* double -> int32 with truncation.
      * NOTE(review): unlike the 128-bit variants, no gmx_cvtepi32_pr is
      * defined here - presumably unused because gmx_floor_pr is always
      * available with AVX-256; confirm against the callers.
      */
 488 #define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
 489
     /* The approximate 1/sqrt(x) and 1/x are computed in single precision:
      * the four doubles are converted to four floats in a 128-bit register,
      * the _ps approximation is applied and the result is converted back.
      */
 490 #define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
 491 #define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
 492
     /* Math functions, presumably provided by gmx_math_x86_avx_256_double.h */
 493 #define gmx_exp_pr        gmx_mm256_exp_pd
 494 #define gmx_sqrt_pr       gmx_mm256_sqrt_pd
 495 #define gmx_sincos_pr     gmx_mm256_sincos_pd
 496 #define gmx_acos_pr       gmx_mm256_acos_pd
 497 #define gmx_atan2_pr      gmx_mm256_atan2_pd
 498
 499 #endif /* ifndef GMX_DOUBLE */
 500
 501 #endif /* 128- or 256-bit x86 SIMD */
 502
 503 #endif /* GMX_X86_SSE2 */
 504
 505
 506 #ifdef GMX_HAVE_SIMD_MACROS
 507 /* Generic functions to extract a SIMD aligned pointer from a pointer x.
 508  * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 509  * to how many you want to use, to avoid indexing outside the aligned region.
 510  */
 511
 512 static gmx_inline real *
 513 gmx_simd_align_real(const real *x)
 514 {
 515     return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
 516 }
 517
 518 static gmx_inline int *
 519 gmx_simd_align_int(const int *x)
 520 {
 521     return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
 522 }
 523
 524
 525 /* Include the math functions which only need the above macros,
 526  * generally these are the ones that don't need masking operations.
 527  */
 528 #ifdef GMX_DOUBLE
 529 #include "gmx_simd_math_double.h"
 530 #else
 531 #include "gmx_simd_math_single.h"
 532 #endif
 533
 534 #endif /* GMX_HAVE_SIMD_MACROS */
 535
 536 #endif /* _gmx_simd_macros_h_ */