include/gmx_simd_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* The macros in this file are intended to be used for writing
  39  * architecture-independent SIMD intrinsics code.
  40  * To support a new architecture, adding macros here should be (nearly)
  41  * all that is needed.
  42  */
  43
  44 #ifdef _gmx_simd_macros_h_
  45 #error "gmx_simd_macros.h included twice"
  46 #else
  47 #define _gmx_simd_macros_h_
  48
  49 /* NOTE: SSE2 acceleration does not include floor or blendv */
  50
  51
  52 /* Uncomment the next line, without other SIMD active, for testing plain-C */
  53 /* #define GMX_SIMD_REFERENCE_PLAIN_C */
  54 #ifdef GMX_SIMD_REFERENCE_PLAIN_C
  55 /* Plain C SIMD reference implementation, also serves as documentation */
  56 #define GMX_HAVE_SIMD_MACROS
  57
  58 /* In general the reference SIMD supports any SIMD width, including 1.
  59  * See types/nb_verlet.h for details
  60  */
  61 #define GMX_SIMD_REF_WIDTH  4
  62
  63 /* Include plain-C reference implementation, also serves as documentation */
  64 #include "gmx_simd_ref.h"
  65
  66 #define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
  67
  68 /* float/double SIMD register type */
  69 #define gmx_mm_pr  gmx_simd_ref_pr
  70
  71 /* boolean SIMD register type */
  72 #define gmx_mm_pb  gmx_simd_ref_pb
  73
  74 /* integer SIMD register type, only for table indexing and exclusion masks */
  75 #define gmx_epi32  gmx_simd_ref_epi32
  76 #define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
  77
  78 /* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
  79 #define gmx_load_pr       gmx_simd_ref_load_pr
  80 /* Set all SIMD register elements to *r */
  81 #define gmx_load1_pr      gmx_simd_ref_load1_pr
  82 #define gmx_set1_pr       gmx_simd_ref_set1_pr
  83 #define gmx_setzero_pr    gmx_simd_ref_setzero_pr
  84 #define gmx_store_pr      gmx_simd_ref_store_pr
  85
  86 #define gmx_add_pr        gmx_simd_ref_add_pr
  87 #define gmx_sub_pr        gmx_simd_ref_sub_pr
  88 #define gmx_mul_pr        gmx_simd_ref_mul_pr
  89 /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
  90 #define gmx_madd_pr       gmx_simd_ref_madd_pr
  91 #define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
  92
  93 #define gmx_max_pr        gmx_simd_ref_max_pr
  94 #define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
  95
  96 #define gmx_round_pr      gmx_simd_ref_round_pr
  97
  98 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
  99 #define GMX_SIMD_HAVE_FLOOR
 100 #ifdef GMX_SIMD_HAVE_FLOOR
 101 #define gmx_floor_pr      gmx_simd_ref_floor_pr
 102 #endif
 103
 104 /* Not required, only used when blendv is faster than comparison */
 105 #define GMX_SIMD_HAVE_BLENDV
 106 #ifdef GMX_SIMD_HAVE_BLENDV
 107 #define gmx_blendv_pr     gmx_simd_ref_blendv_pr
 108 #endif
 109
 110 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
 111 #define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
 112
 113 /* Very specific operation required in the non-bonded kernels */
 114 #define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
 115
 116 /* Comparison */
 117 #define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
 118
 119 /* Logical operations on SIMD booleans */
 120 #define gmx_and_pb        gmx_simd_ref_and_pb
 121 #define gmx_or_pb         gmx_simd_ref_or_pb
 122
 123 /* Returns a single int (0/1) which tells if any of the GMX_SIMD_WIDTH_HERE booleans is True */
 124 #define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
 125
 126 /* Conversions only used for PME table lookup */
 127 #define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
 128 #define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
 129
 130 /* These two functions only need to be approximate, Newton-Raphson iteration
 131  * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 132  */
 133 #define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
 134 #define gmx_rcp_pr        gmx_simd_ref_rcp_pr
 135
 136 /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
 137 #define GMX_SIMD_HAVE_EXP
 138 #ifdef GMX_SIMD_HAVE_EXP
 139 #define gmx_exp_pr        gmx_simd_ref_exp_pr
 140 #endif
 141 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 142 #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
 143 #define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
 144 #define gmx_sincos_pr     gmx_simd_ref_sincos_pr
 145 #define gmx_acos_pr       gmx_simd_ref_acos_pr
 146 #define gmx_atan2_pr      gmx_simd_ref_atan2_pr
 147 #endif
 148
 149 #endif /* GMX_SIMD_REFERENCE_PLAIN_C */
 150
 151
 152 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 153  * to instructions for) different SIMD width and float precision.
 154  *
 155  * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 156  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 157  * Compiler settings will decide if 128-bit intrinsics will
 158  * be translated into SSE or AVX instructions.
 159  */
 160
 161
 162 #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
 163 #if defined GMX_X86_AVX_256
 164 /* We have half SIMD width support, continue */
 165 #else
 166 #error "half SIMD width intrinsics are not supported"
 167 #endif
 168 #endif
 169
 170
 171 #ifdef GMX_X86_SSE2
 172 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 173 #define GMX_HAVE_SIMD_MACROS
 174
 175 /* Include the highest supported x86 SIMD intrinsics + math functions */
 176 #ifdef GMX_X86_AVX_256
 177 #include "gmx_x86_avx_256.h"
 178 #ifdef GMX_DOUBLE
 179 #include "gmx_math_x86_avx_256_double.h"
 180 #else
 181 #include "gmx_math_x86_avx_256_single.h"
 182 #endif
 183 #else
 184 #ifdef GMX_X86_AVX_128_FMA
 185 #include "gmx_x86_avx_128_fma.h"
 186 #ifdef GMX_DOUBLE
 187 #include "gmx_math_x86_avx_128_fma_double.h"
 188 #else
 189 #include "gmx_math_x86_avx_128_fma_single.h"
 190 #endif
 191 #else
 192 #ifdef GMX_X86_SSE4_1
 193 #include "gmx_x86_sse4_1.h"
 194 #ifdef GMX_DOUBLE
 195 #include "gmx_math_x86_sse4_1_double.h"
 196 #else
 197 #include "gmx_math_x86_sse4_1_single.h"
 198 #endif
 199 #else
 200 #ifdef GMX_X86_SSE2
 201 #include "gmx_x86_sse2.h"
 202 #ifdef GMX_DOUBLE
 203 #include "gmx_math_x86_sse2_double.h"
 204 #else
 205 #include "gmx_math_x86_sse2_single.h"
 206 #endif
 207 #else
 208 #error No x86 acceleration defined
 209 #endif
 210 #endif
 211 #endif
 212 #endif
 213 /* exp and trigonometric functions are included above */
 214 #define GMX_SIMD_HAVE_EXP
 215 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 216
 217 #if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
 218
 219 #ifndef GMX_DOUBLE
 220 /* 128-bit x86 SIMD in single precision: gmx_*_pr maps to _mm_*_ps */
 221 #define GMX_SIMD_WIDTH_HERE  4
 222 /* float SIMD register type */
 223 #define gmx_mm_pr  __m128
 224 /* boolean SIMD register type, the same register type as the reals */
 225 #define gmx_mm_pb  __m128
 226 /* integer SIMD register type and the number of 32-bit integers in it */
 227 #define gmx_epi32  __m128i
 228 #define GMX_SIMD_EPI32_WIDTH  4
 229 /* Load, set and store */
 230 #define gmx_load_pr       _mm_load_ps
 231 #define gmx_load1_pr      _mm_load1_ps
 232 #define gmx_set1_pr       _mm_set1_ps
 233 #define gmx_setzero_pr    _mm_setzero_ps
 234 #define gmx_store_pr      _mm_store_ps
 235 /* Arithmetic; madd is a*b+c and nmsub is c-a*b, FMA intrinsics are only used with GMX_X86_AVX_128_FMA */
 236 #define gmx_add_pr        _mm_add_ps
 237 #define gmx_sub_pr        _mm_sub_ps
 238 #define gmx_mul_pr        _mm_mul_ps
 239 #ifdef GMX_X86_AVX_128_FMA
 240 #define GMX_SIMD_HAVE_FMA
 241 #define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 242 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 243 #else
 244 #define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 245 #define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 246 #endif
 247 #define gmx_max_pr        _mm_max_ps
 248 #define gmx_blendzero_pr  _mm_and_ps
 249 /* Comparison and logical operations on SIMD booleans */
 250 #define gmx_cmplt_pr      _mm_cmplt_ps
 251 #define gmx_and_pb        _mm_and_ps
 252 #define gmx_or_pb         _mm_or_ps
 253 /* Rounding; without SSE4.1 round goes via an int32 conversion and no floor is provided */
 254 #ifdef GMX_X86_SSE4_1
 255 #define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
 256 #define GMX_SIMD_HAVE_FLOOR
 257 #define gmx_floor_pr      _mm_floor_ps
 258 #else
 259 #define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 260 #endif
 261 /* blendv is only provided with SSE4.1 */
 262 #ifdef GMX_X86_SSE4_1
 263 #define GMX_SIMD_HAVE_BLENDV
 264 #define gmx_blendv_pr     _mm_blendv_ps
 265 #endif
 266 /* Copy the sign of a to b; assumes b >= 0, as the sign bit of a is OR-ed into b */
 267 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 268 {
 269     /* The value -0.0f has only the sign-bit set, so the AND keeps just the sign of a */
 270     gmx_mm_pr sign_mask = _mm_set1_ps(-0.0f);
 271     return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
 272 }
 273 /* Returns b + (c with the bits that are set in mask a cleared), i.e. b + (~a & c) */
 274 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }
 275 /* Reduce a SIMD boolean to a single int using movemask */
 276 #define gmx_anytrue_pb    _mm_movemask_ps
 277 /* Conversions between reals and 32-bit integers */
 278 #define gmx_cvttpr_epi32  _mm_cvttps_epi32
 279 #define gmx_cvtepi32_pr   _mm_cvtepi32_ps
 280 /* Inverse square root and reciprocal, mapped directly to the SSE intrinsics */
 281 #define gmx_rsqrt_pr      _mm_rsqrt_ps
 282 #define gmx_rcp_pr        _mm_rcp_ps
 283 /* gmx_mm_* math functions, expected to come from the x86 math header included above */
 284 #define gmx_exp_pr        gmx_mm_exp_ps
 285 #define gmx_sqrt_pr       gmx_mm_sqrt_ps
 286 #define gmx_sincos_pr     gmx_mm_sincos_ps
 287 #define gmx_acos_pr       gmx_mm_acos_ps
 288 #define gmx_atan2_pr      gmx_mm_atan2_ps
 289
 290 #else /* ifndef GMX_DOUBLE */
 291 /* 128-bit x86 SIMD in double precision: gmx_*_pr maps to _mm_*_pd */
 292 #define GMX_SIMD_WIDTH_HERE  2
 293 /* double SIMD register type */
 294 #define gmx_mm_pr  __m128d
 295 /* boolean SIMD register type, the same register type as the reals */
 296 #define gmx_mm_pb  __m128d
 297 /* integer SIMD register type; its int32 width (4) differs from the real width (2) */
 298 #define gmx_epi32  __m128i
 299 #define GMX_SIMD_EPI32_WIDTH  4
 300 /* Load, set and store */
 301 #define gmx_load_pr       _mm_load_pd
 302 #define gmx_load1_pr      _mm_load1_pd
 303 #define gmx_set1_pr       _mm_set1_pd
 304 #define gmx_setzero_pr    _mm_setzero_pd
 305 #define gmx_store_pr      _mm_store_pd
 306 /* Arithmetic; madd is a*b+c and nmsub is c-a*b, FMA intrinsics are only used with GMX_X86_AVX_128_FMA */
 307 #define gmx_add_pr        _mm_add_pd
 308 #define gmx_sub_pr        _mm_sub_pd
 309 #define gmx_mul_pr        _mm_mul_pd
 310 #ifdef GMX_X86_AVX_128_FMA
 311 #define GMX_SIMD_HAVE_FMA
 312 #define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 313 #define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 314 #else
 315 #define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
 316 #define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
 317 #endif
 318 #define gmx_max_pr        _mm_max_pd
 319 #define gmx_blendzero_pr  _mm_and_pd
 320 /* Rounding; without SSE4.1 round goes via an int32 conversion and no floor is provided */
 321 #ifdef GMX_X86_SSE4_1
 322 #define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
 323 #define GMX_SIMD_HAVE_FLOOR
 324 #define gmx_floor_pr      _mm_floor_pd
 325 #else
 326 #define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 327 /* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
 328 #endif
 329 /* blendv is only provided with SSE4.1 */
 330 #ifdef GMX_X86_SSE4_1
 331 #define GMX_SIMD_HAVE_BLENDV
 332 #define gmx_blendv_pr     _mm_blendv_pd
 333 #endif
 334 /* Copy the sign of a to b; assumes b >= 0, as the sign bit of a is OR-ed into b */
 335 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 336 {
 337     gmx_mm_pr sign_mask = _mm_set1_pd(-0.0); /* -0.0 has only the sign-bit set */
 338     return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
 339 }
 340 /* Returns b + (c with the bits that are set in mask a cleared), i.e. b + (~a & c) */
 341 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }
 342 /* Comparison */
 343 #define gmx_cmplt_pr      _mm_cmplt_pd
 344 /* Logical operations on SIMD booleans */
 345 #define gmx_and_pb        _mm_and_pd
 346 #define gmx_or_pb         _mm_or_pd
 347 /* Reduce a SIMD boolean to a single int using movemask */
 348 #define gmx_anytrue_pb    _mm_movemask_pd
 349 /* Conversions between reals and 32-bit integers */
 350 #define gmx_cvttpr_epi32  _mm_cvttpd_epi32
 351 #define gmx_cvtepi32_pr   _mm_cvtepi32_pd
 352 /* rsqrt and rcp are evaluated in single precision: convert to float, apply _ps, convert back */
 353 #define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
 354 #define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
 355 /* gmx_mm_* math functions, expected to come from the x86 math header included above */
 356 #define gmx_exp_pr        gmx_mm_exp_pd
 357 #define gmx_sqrt_pr       gmx_mm_sqrt_pd
 358 #define gmx_sincos_pr     gmx_mm_sincos_pd
 359 #define gmx_acos_pr       gmx_mm_acos_pd
 360 #define gmx_atan2_pr      gmx_mm_atan2_pd
 361
 362 #endif /* ifndef GMX_DOUBLE */
 363
 364 #else
 365 /* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 366  * so we use 256-bit SIMD.
 367  */
 368
 369 #ifndef GMX_DOUBLE
 370 /* 256-bit AVX in single precision: gmx_*_pr maps to _mm256_*_ps */
 371 #define GMX_SIMD_WIDTH_HERE  8
 372 /* float SIMD register type */
 373 #define gmx_mm_pr  __m256
 374 /* boolean SIMD register type, the same register type as the reals */
 375 #define gmx_mm_pb  __m256
 376 /* integer SIMD register type and the number of 32-bit integers in it */
 377 #define gmx_epi32  __m256i
 378 #define GMX_SIMD_EPI32_WIDTH  8
 379 /* Load, set and store; gmx_load1_pr sets all elements to (x)[0] through set1 */
 380 #define gmx_load_pr       _mm256_load_ps
 381 #define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
 382 #define gmx_set1_pr       _mm256_set1_ps
 383 #define gmx_setzero_pr    _mm256_setzero_ps
 384 #define gmx_store_pr      _mm256_store_ps
 385 /* Arithmetic; madd (a*b+c) and nmsub (c-a*b) use a separate multiply and add/subtract */
 386 #define gmx_add_pr        _mm256_add_ps
 387 #define gmx_sub_pr        _mm256_sub_ps
 388 #define gmx_mul_pr        _mm256_mul_ps
 389 #define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
 390 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 391 #define gmx_max_pr        _mm256_max_ps
 392 #define gmx_blendzero_pr  _mm256_and_ps
 393 /* Rounding; round and floor are always defined in this branch */
 394 #define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
 395 #define GMX_SIMD_HAVE_FLOOR
 396 #define gmx_floor_pr      _mm256_floor_ps
 397 /* blendv is always defined in this branch */
 398 #define GMX_SIMD_HAVE_BLENDV
 399 #define gmx_blendv_pr     _mm256_blendv_ps
 400 /* Copy the sign of a to b; assumes b >= 0, as the sign bit of a is OR-ed into b */
 401 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 402 {
 403     gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0f); /* -0.0f has only the sign-bit set */
 404     return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
 405 }
 406 /* Returns b + (c with the bits that are set in mask a cleared), i.e. b + (~a & c) */
 407 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }
 408 /* Comparison and logical operations on SIMD booleans */
 409 /* Less-than (we use ordered, non-signaling, but that's not required) */
 410 #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
 411 #define gmx_and_pb        _mm256_and_ps
 412 #define gmx_or_pb         _mm256_or_ps
 413 /* Reduce a SIMD boolean to a single int using movemask */
 414 #define gmx_anytrue_pb    _mm256_movemask_ps
 415 /* Only the real to int32 conversion is defined in this branch, there is no gmx_cvtepi32_pr */
 416 #define gmx_cvttpr_epi32  _mm256_cvttps_epi32
 417 /* Inverse square root and reciprocal, mapped directly to the AVX intrinsics */
 418 #define gmx_rsqrt_pr      _mm256_rsqrt_ps
 419 #define gmx_rcp_pr        _mm256_rcp_ps
 420 /* gmx_mm256_* math functions, expected to come from the x86 math header included above */
 421 #define gmx_exp_pr        gmx_mm256_exp_ps
 422 #define gmx_sqrt_pr       gmx_mm256_sqrt_ps
 423 #define gmx_sincos_pr     gmx_mm256_sincos_ps
 424 #define gmx_acos_pr       gmx_mm256_acos_ps
 425 #define gmx_atan2_pr      gmx_mm256_atan2_ps
 426
 427 #else /* ifndef GMX_DOUBLE */
 428 /* 256-bit AVX in double precision: gmx_*_pr maps to _mm256_*_pd */
 429 #define GMX_SIMD_WIDTH_HERE  4
 430 /* double SIMD register type */
 431 #define gmx_mm_pr  __m256d
 432 /* boolean SIMD register type, the same register type as the reals */
 433 #define gmx_mm_pb  __m256d
 434
 435 /* We use 128-bit integer registers because of missing 256-bit operations */
 436 #define gmx_epi32  __m128i
 437 #define GMX_SIMD_EPI32_WIDTH  4
 438 /* Load, set and store; gmx_load1_pr sets all elements to (x)[0] through set1 */
 439 #define gmx_load_pr       _mm256_load_pd
 440 #define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
 441 #define gmx_set1_pr       _mm256_set1_pd
 442 #define gmx_setzero_pr    _mm256_setzero_pd
 443 #define gmx_store_pr      _mm256_store_pd
 444 /* Arithmetic; madd (a*b+c) and nmsub (c-a*b) use a separate multiply and add/subtract */
 445 #define gmx_add_pr        _mm256_add_pd
 446 #define gmx_sub_pr        _mm256_sub_pd
 447 #define gmx_mul_pr        _mm256_mul_pd
 448 #define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 449 #define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 450 #define gmx_max_pr        _mm256_max_pd
 451 #define gmx_blendzero_pr  _mm256_and_pd
 452 /* Rounding; round and floor are always defined in this branch */
 453 #define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
 454 #define GMX_SIMD_HAVE_FLOOR
 455 #define gmx_floor_pr      _mm256_floor_pd
 456 /* blendv is always defined in this branch */
 457 #define GMX_SIMD_HAVE_BLENDV
 458 #define gmx_blendv_pr     _mm256_blendv_pd
 459 /* Copy the sign of a to b; assumes b >= 0, as the sign bit of a is OR-ed into b */
 460 static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 461 {
 462     gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0); /* -0.0 has only the sign-bit set */
 463     return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
 464 }
 465 /* Returns b + (c with the bits that are set in mask a cleared), i.e. b + (~a & c) */
 466 static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }
 467 /* Comparison */
 468 /* Less-than (we use ordered, non-signaling, but that's not required) */
 469 #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 470 /* Logical operations on SIMD booleans */
 471 #define gmx_and_pb        _mm256_and_pd
 472 #define gmx_or_pb         _mm256_or_pd
 473 /* Reduce a SIMD boolean to a single int using movemask */
 474 #define gmx_anytrue_pb    _mm256_movemask_pd
 475 /* Only the real to int32 conversion is defined in this branch, there is no gmx_cvtepi32_pr */
 476 #define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
 477 /* rsqrt and rcp are evaluated in 128-bit single precision: convert to float, apply _ps, convert back */
 478 #define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
 479 #define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
 480 /* gmx_mm256_* math functions, expected to come from the x86 math header included above */
 481 #define gmx_exp_pr        gmx_mm256_exp_pd
 482 #define gmx_sqrt_pr       gmx_mm256_sqrt_pd
 483 #define gmx_sincos_pr     gmx_mm256_sincos_pd
 484 #define gmx_acos_pr       gmx_mm256_acos_pd
 485 #define gmx_atan2_pr      gmx_mm256_atan2_pd
 486
 487 #endif /* ifndef GMX_DOUBLE */
 488
 489 #endif /* 128- or 256-bit x86 SIMD */
 490
 491 #endif /* GMX_X86_SSE2 */
 492
 493
 494 #ifdef GMX_HAVE_SIMD_MACROS
 495 /* Generic functions to extract a SIMD aligned pointer from a pointer x.
 496  * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 497  * to how many you want to use, to avoid indexing outside the aligned region.
 498  */
 499 /* Returns the address of x + GMX_SIMD_WIDTH_HERE rounded down to a multiple of the SIMD width in bytes */
 500 static gmx_inline real *
 501 gmx_simd_align_real(const real *x)
 502 {
 503     return (real *)((size_t)(x + GMX_SIMD_WIDTH_HERE) & ~(size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real) - 1));
 504 }
 505 /* Returns the address of x + GMX_SIMD_WIDTH_HERE rounded down to a multiple of GMX_SIMD_WIDTH_HERE*sizeof(int) bytes */
 506 static gmx_inline int *
 507 gmx_simd_align_int(const int *x)
 508 {
 509     return (int *)((size_t)(x + GMX_SIMD_WIDTH_HERE) & ~(size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int) - 1));
 510 }
 511
 512
 513 /* Include the math functions which only need the above macros,
 514  * generally these are the ones that don't need masking operations.
 515  */
 516 #ifdef GMX_DOUBLE
 517 #include "gmx_simd_math_double.h"
 518 #else
 519 #include "gmx_simd_math_single.h"
 520 #endif
 521
 522 #endif /* GMX_HAVE_SIMD_MACROS */
 523
 524 #endif /* _gmx_simd_macros_h_ */