src/gromacs/simd/macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* The macros in this file are intended to be used for writing
  37  * architecture-independent SIMD intrinsics code.
  38  * To support a new architecture, adding macros here should be (nearly)
  39  * all that is needed.
  40  */
  41
  42 #ifdef GMX_SIMD_MACROS_H
  43 #error "gromacs/simd/macros.h included twice"
  44 #else
  45 #define GMX_SIMD_MACROS_H
  46
  47 /* NOTE: SSE2 acceleration does not include floor or blendv */
  48
  49 #ifdef GMX_SIMD_REFERENCE
  50 /* Plain C SIMD reference implementation, also serves as documentation */
  51 #define GMX_HAVE_SIMD_MACROS
  52
  53 /* Include plain-C reference implementation, also serves as documentation */
  54 #include "gromacs/simd/macros_ref.h"
  55
  56 #define GMX_SIMD_REAL_WIDTH  GMX_SIMD_REF_WIDTH
  57
  58 /* float/double SIMD register type */
  59 #define gmx_simd_real_t  gmx_simd_ref_pr
  60
  61 /* boolean SIMD register type */
  62 #define gmx_simd_bool_t  gmx_simd_ref_pb
  63
  64 /* integer SIMD register type, only for table indexing and exclusion masks */
  65 #define gmx_simd_int32_t  gmx_simd_ref_epi32
  66 #define GMX_SIMD_INT32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
  67
  68 /* Load GMX_SIMD_REAL_WIDTH reals for memory starting at r */
  69 #define gmx_simd_load_r       gmx_simd_ref_load_pr
  70 /* Set all SIMD register elements to *r */
  71 #define gmx_simd_load1_r      gmx_simd_ref_load1_pr
  72 #define gmx_simd_set1_r       gmx_simd_ref_set1_pr
  73 #define gmx_simd_setzero_r    gmx_simd_ref_setzero_pr
  74 #define gmx_simd_store_r      gmx_simd_ref_store_pr
  75
  76 #define gmx_simd_add_r        gmx_simd_ref_add_pr
  77 #define gmx_simd_sub_r        gmx_simd_ref_sub_pr
  78 #define gmx_simd_mul_r        gmx_simd_ref_mul_pr
  79 /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
  80 #define gmx_simd_fmadd_r       gmx_simd_ref_madd_pr
  81 #define gmx_simd_fnmadd_r      gmx_simd_ref_nmsub_pr
  82
  83 #define gmx_simd_max_r        gmx_simd_ref_max_pr
  84 #define gmx_simd_blendzero_r  gmx_simd_ref_blendzero_pr
  85
  86 #define gmx_simd_round_r      gmx_simd_ref_round_pr
  87
  88 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
  89 #define GMX_SIMD_HAVE_FLOOR
  90 #ifdef GMX_SIMD_HAVE_FLOOR
  91 #define gmx_simd_floor_r      gmx_simd_ref_floor_pr
  92 #endif
  93
  94 /* Not required, only used when blendv is faster than comparison */
  95 #define GMX_SIMD_HAVE_BLENDV
  96 #ifdef GMX_SIMD_HAVE_BLENDV
  97 #define gmx_simd_blendv_r     gmx_simd_ref_blendv_pr
  98 #endif
  99
 100 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
 101 #define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
 102
 103 /* Very specific operation required in the non-bonded kernels */
 104 #define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
 105
 106 /* Comparison */
 107 #define gmx_simd_cmplt_r      gmx_simd_ref_cmplt_pr
 108
 109 /* Logical operations on SIMD booleans */
 110 #define gmx_simd_and_b        gmx_simd_ref_and_pb
 111 #define gmx_simd_or_b         gmx_simd_ref_or_pb
 112
 113 /* Returns a single int (0/1) which tells if any of the 4 booleans is True */
 114 #define gmx_simd_anytrue_b    gmx_simd_ref_anytrue_pb
 115
 116 /* Conversions only used for PME table lookup */
 117 #define gmx_simd_cvtt_r2i  gmx_simd_ref_cvttpr_epi32
 118 #define gmx_simd_cvt_i2r   gmx_simd_ref_cvtepi32_pr
 119
 120 /* These two function only need to be approximate, Newton-Raphson iteration
 121  * is used for full accuracy in gmx_simd_invsqrt_r and gmx_simd_inv_r.
 122  */
 123 #define gmx_simd_rsqrt_r      gmx_simd_ref_rsqrt_pr
 124 #define gmx_simd_rcp_r        gmx_simd_ref_rcp_pr
 125
 126 /* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
 127 #define GMX_SIMD_HAVE_EXP
 128 #ifdef GMX_SIMD_HAVE_EXP
 129 #define gmx_simd_exp_r        gmx_simd_ref_exp_pr
 130 #endif
 131 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 132 #ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
 133 #define gmx_simd_sqrt_r       gmx_simd_ref_sqrt_pr
 134 #define gmx_simd_sincos_r     gmx_simd_ref_sincos_pr
 135 #define gmx_simd_acos_r       gmx_simd_ref_acos_pr
 136 #define gmx_simd_atan2_r      gmx_simd_ref_atan2_pr
 137 #endif
 138
 139 #endif /* GMX_SIMD_REFERENCE */
 140
 141
 142 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 143  * to instructions for) different SIMD width and float precision.
 144  *
 145  * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 146  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 147  * Compiler settings will decide if 128-bit intrinsics will
 148  * be translated into SSE or AVX instructions.
 149  */
 150
 151
 152 #ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
 153 #if defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined __MIC__
 154 /* We have half SIMD width support, continue */
 155 #else
 156 #error "half SIMD width intrinsics are not supported"
 157 #endif
 158 #endif
 159
 160 #if defined GMX_TARGET_X86 && !defined __MIC__
 161
 162 #ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
 163 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 164 #define GMX_HAVE_SIMD_MACROS
 165
 166 /* Include the highest supported x86 SIMD intrisics + math functions */
 167 #ifdef GMX_SIMD_X86_AVX_256_OR_HIGHER
 168 #include "general_x86_avx_256.h"
 169 #ifdef GMX_DOUBLE
 170 #include "math_x86_avx_256_double.h"
 171 #else  /* GMX_DOUBLE */
 172 #include "math_x86_avx_256_single.h"
 173 #endif /* GMX_DOUBLE */
 174 #else  /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
 175 #ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
 176 #include "general_x86_avx_128_fma.h"
 177 #ifdef GMX_DOUBLE
 178 #include "math_x86_avx_128_fma_double.h"
 179 #else  /* GMX_DOUBLE */
 180 #include "math_x86_avx_128_fma_single.h"
 181 #endif /* GMX_DOUBLE */
 182 #else  /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
 183 #ifdef GMX_SIMD_X86_SSE4_1
 184 #include "general_x86_sse4_1.h"
 185 #ifdef GMX_DOUBLE
 186 #include "math_x86_sse4_1_double.h"
 187 #else  /* GMX_DOUBLE */
 188 #include "math_x86_sse4_1_single.h"
 189 #endif /* GMX_DOUBLE */
 190 #else  /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
 191 #ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
 192 #include "general_x86_sse2.h"
 193 #ifdef GMX_DOUBLE
 194 #include "math_x86_sse2_double.h"
 195 #else  /* GMX_DOUBLE */
 196 #include "math_x86_sse2_single.h"
 197 #endif /* GMX_DOUBLE */
 198 #else  /* GMX_SIMD_X86_SSE2_OR_HIGHER */
 199 #error No x86 acceleration defined
 200 #endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
 201 #endif /* GMX_SIMD_X86_SSE4_1_OR_HIGHER */
 202 #endif /* GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER */
 203 #endif /* GMX_SIMD_X86_AVX_256_OR_HIGHER */
 204
 205 /* exp and trigonometric functions are included above */
 206 #define GMX_SIMD_HAVE_EXP
 207 #define GMX_SIMD_HAVE_ERFC
 208 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 209
 210 #if !defined GMX_SIMD_X86_AVX_256_OR_HIGHER || defined GMX_USE_HALF_WIDTH_SIMD_HERE
 211
 212 #ifndef GMX_DOUBLE
 213
 214 #define GMX_SIMD_REAL_WIDTH  4
 215
 216 #define gmx_simd_real_t  __m128
 217
 218 #define gmx_simd_bool_t  __m128
 219
 220 #define gmx_simd_int32_t  __m128i
 221 #define GMX_SIMD_INT32_WIDTH  4
 222
 223 #define gmx_simd_load_r       _mm_load_ps
 224 #define gmx_simd_load1_r      _mm_load1_ps
 225 #define gmx_simd_set1_r       _mm_set1_ps
 226 #define gmx_simd_setzero_r    _mm_setzero_ps
 227 #define gmx_simd_store_r      _mm_store_ps
 228
 229 #define gmx_simd_add_r        _mm_add_ps
 230 #define gmx_simd_sub_r        _mm_sub_ps
 231 #define gmx_simd_mul_r        _mm_mul_ps
 232 #ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
 233 #define GMX_SIMD_HAVE_FMA
 234 #define gmx_simd_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
 235 #define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
 236 #else
 237 #define gmx_simd_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 238 #define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 239 #endif
 240 #define gmx_simd_max_r        _mm_max_ps
 241 #define gmx_simd_blendzero_r  _mm_and_ps
 242
 243 #define gmx_simd_cmplt_r      _mm_cmplt_ps
 244 #define gmx_simd_and_b        _mm_and_ps
 245 #define gmx_simd_or_b         _mm_or_ps
 246
 247 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 248 #define gmx_simd_round_r(x)   _mm_round_ps(x, 0x0)
 249 #define GMX_SIMD_HAVE_FLOOR
 250 #define gmx_simd_floor_r      _mm_floor_ps
 251 #else
 252 #define gmx_simd_round_r(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 253 #endif
 254
 255 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 256 #define GMX_SIMD_HAVE_BLENDV
 257 #define gmx_simd_blendv_r     _mm_blendv_ps
 258 #endif
 259
 260 static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 261 {
 262     /* The value -0.0 has only the sign-bit set */
 263     gmx_simd_real_t sign_mask = _mm_set1_ps(-0.0);
 264     return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
 265 };
 266
 267 static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 268 {
 269     return _mm_add_ps(b, _mm_andnot_ps(a, c));
 270 };
 271
 272 #define gmx_simd_anytrue_b    _mm_movemask_ps
 273
 274 #define gmx_simd_cvtt_r2i  _mm_cvttps_epi32
 275 #define gmx_simd_cvt_i2r   _mm_cvtepi32_ps
 276
 277 #define gmx_simd_rsqrt_r      _mm_rsqrt_ps
 278 #define gmx_simd_rcp_r        _mm_rcp_ps
 279
 280 #define gmx_simd_exp_r        gmx_mm_exp_ps
 281 #define gmx_simd_sqrt_r       gmx_mm_sqrt_ps
 282 #define gmx_simd_sincos_r     gmx_mm_sincos_ps
 283 #define gmx_simd_acos_r       gmx_mm_acos_ps
 284 #define gmx_simd_atan2_r      gmx_mm_atan2_ps
 285 #define gmx_simd_erfc_r       gmx_mm_erfc_ps
 286
 287 #else /* ifndef GMX_DOUBLE */
 288
 289 #define GMX_SIMD_REAL_WIDTH  2
 290
 291 #define gmx_simd_real_t  __m128d
 292
 293 #define gmx_simd_bool_t  __m128d
 294
 295 #define gmx_simd_int32_t  __m128i
 296 #define GMX_SIMD_INT32_WIDTH  4
 297
 298 #define gmx_simd_load_r       _mm_load_pd
 299 #define gmx_simd_load1_r      _mm_load1_pd
 300 #define gmx_simd_set1_r       _mm_set1_pd
 301 #define gmx_simd_setzero_r    _mm_setzero_pd
 302 #define gmx_simd_store_r      _mm_store_pd
 303
 304 #define gmx_simd_add_r        _mm_add_pd
 305 #define gmx_simd_sub_r        _mm_sub_pd
 306 #define gmx_simd_mul_r        _mm_mul_pd
 307 #ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
 308 #define GMX_SIMD_HAVE_FMA
 309 #define gmx_simd_fmadd_r(a, b, c)   _mm_macc_pd(a, b, c)
 310 #define gmx_simd_fnmadd_r(a, b, c)  _mm_nmacc_pd(a, b, c)
 311 #else
 312 #define gmx_simd_fmadd_r(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
 313 #define gmx_simd_fnmadd_r(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
 314 #endif
 315 #define gmx_simd_max_r        _mm_max_pd
 316 #define gmx_simd_blendzero_r  _mm_and_pd
 317
 318 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 319 #define gmx_simd_round_r(x)   _mm_round_pd(x, 0x0)
 320 #define GMX_SIMD_HAVE_FLOOR
 321 #define gmx_simd_floor_r      _mm_floor_pd
 322 #else
 323 #define gmx_simd_round_r(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 324 /* gmx_simd_floor_r is not used in code for pre-SSE4_1 hardware */
 325 #endif
 326
 327 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 328 #define GMX_SIMD_HAVE_BLENDV
 329 #define gmx_simd_blendv_r     _mm_blendv_pd
 330 #endif
 331
 332 static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 333 {
 334     gmx_simd_real_t sign_mask = _mm_set1_pd(-0.0);
 335     return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
 336 };
 337
 338 static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 339 {
 340     return _mm_add_pd(b, _mm_andnot_pd(a, c));
 341 };
 342
 343 #define gmx_simd_cmplt_r      _mm_cmplt_pd
 344
 345 #define gmx_simd_and_b        _mm_and_pd
 346 #define gmx_simd_or_b         _mm_or_pd
 347
 348 #define gmx_simd_anytrue_b    _mm_movemask_pd
 349
 350 #define gmx_simd_cvtt_r2i  _mm_cvttpd_epi32
 351 #define gmx_simd_cvt_i2r   _mm_cvtepi32_pd
 352
 353 #define gmx_simd_rsqrt_r(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
 354 #define gmx_simd_rcp_r(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
 355
 356 #define gmx_simd_exp_r        gmx_mm_exp_pd
 357 #define gmx_simd_sqrt_r       gmx_mm_sqrt_pd
 358 #define gmx_simd_sincos_r     gmx_mm_sincos_pd
 359 #define gmx_simd_acos_r       gmx_mm_acos_pd
 360 #define gmx_simd_atan2_r      gmx_mm_atan2_pd
 361 #define gmx_simd_erfc_r       gmx_mm_erfc_pd
 362
 363 #endif /* ifndef GMX_DOUBLE */
 364
 365 #else
 366 /* We have GMX_SIMD_X86_AVX_256_OR_HIGHER and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 367  * so we use 256-bit SIMD.
 368  */
 369
 370 #ifndef GMX_DOUBLE
 371
 372 #define GMX_SIMD_REAL_WIDTH  8
 373
 374 #define gmx_simd_real_t  __m256
 375
 376 #define gmx_simd_bool_t  __m256
 377
 378 #define gmx_simd_int32_t  __m256i
 379 #define GMX_SIMD_INT32_WIDTH  8
 380
 381 #define gmx_simd_load_r       _mm256_load_ps
 382 #define gmx_simd_load1_r(x)   _mm256_set1_ps((x)[0])
 383 #define gmx_simd_set1_r       _mm256_set1_ps
 384 #define gmx_simd_setzero_r    _mm256_setzero_ps
 385 #define gmx_simd_store_r      _mm256_store_ps
 386
 387 #define gmx_simd_add_r        _mm256_add_ps
 388 #define gmx_simd_sub_r        _mm256_sub_ps
 389 #define gmx_simd_mul_r        _mm256_mul_ps
 390 #define gmx_simd_fmadd_r(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
 391 #define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 392 #define gmx_simd_max_r        _mm256_max_ps
 393 #define gmx_simd_blendzero_r  _mm256_and_ps
 394
 395 #define gmx_simd_round_r(x)   _mm256_round_ps(x, 0x0)
 396 #define GMX_SIMD_HAVE_FLOOR
 397 #define gmx_simd_floor_r      _mm256_floor_ps
 398
 399 #define GMX_SIMD_HAVE_BLENDV
 400 #define gmx_simd_blendv_r     _mm256_blendv_ps
 401
 402 static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 403 {
 404     gmx_simd_real_t sign_mask = _mm256_set1_ps(-0.0);
 405     return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
 406 };
 407
 408 static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 409 {
 410     return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
 411 };
 412
 413 /* Less-than (we use ordered, non-signaling, but that's not required) */
 414 #define gmx_simd_cmplt_r(x, y) _mm256_cmp_ps(x, y, 0x11)
 415 #define gmx_simd_and_b        _mm256_and_ps
 416 #define gmx_simd_or_b         _mm256_or_ps
 417
 418 #define gmx_simd_anytrue_b    _mm256_movemask_ps
 419
 420 #define gmx_simd_cvtt_r2i  _mm256_cvttps_epi32
 421
 422 #define gmx_simd_rsqrt_r      _mm256_rsqrt_ps
 423 #define gmx_simd_rcp_r        _mm256_rcp_ps
 424
 425 #define gmx_simd_exp_r        gmx_mm256_exp_ps
 426 #define gmx_simd_sqrt_r       gmx_mm256_sqrt_ps
 427 #define gmx_simd_sincos_r     gmx_mm256_sincos_ps
 428 #define gmx_simd_acos_r       gmx_mm256_acos_ps
 429 #define gmx_simd_atan2_r      gmx_mm256_atan2_ps
 430 #define gmx_simd_erfc_r       gmx_mm256_erfc_ps
 431
 432 #else /* ifndef GMX_DOUBLE */
 433
 434 #define GMX_SIMD_REAL_WIDTH  4
 435
 436 #define gmx_simd_real_t  __m256d
 437
 438 #define gmx_simd_bool_t  __m256d
 439
 440 /* We use 128-bit integer registers because of missing 256-bit operations */
 441 #define gmx_simd_int32_t  __m128i
 442 #define GMX_SIMD_INT32_WIDTH  4
 443
 444 #define gmx_simd_load_r       _mm256_load_pd
 445 #define gmx_simd_load1_r(x)   _mm256_set1_pd((x)[0])
 446 #define gmx_simd_set1_r       _mm256_set1_pd
 447 #define gmx_simd_setzero_r    _mm256_setzero_pd
 448 #define gmx_simd_store_r      _mm256_store_pd
 449
 450 #define gmx_simd_add_r        _mm256_add_pd
 451 #define gmx_simd_sub_r        _mm256_sub_pd
 452 #define gmx_simd_mul_r        _mm256_mul_pd
 453 #define gmx_simd_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 454 #define gmx_simd_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 455 #define gmx_simd_max_r        _mm256_max_pd
 456 #define gmx_simd_blendzero_r  _mm256_and_pd
 457
 458 #define gmx_simd_round_r(x)   _mm256_round_pd(x, 0x0)
 459 #define GMX_SIMD_HAVE_FLOOR
 460 #define gmx_simd_floor_r      _mm256_floor_pd
 461
 462 #define GMX_SIMD_HAVE_BLENDV
 463 #define gmx_simd_blendv_r     _mm256_blendv_pd
 464
 465 static gmx_inline gmx_simd_real_t gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 466 {
 467     gmx_simd_real_t sign_mask = _mm256_set1_pd(-0.0);
 468     return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
 469 };
 470
 471 static gmx_inline gmx_simd_real_t gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 472 {
 473     return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
 474 };
 475
 476 /* Less-than (we use ordered, non-signaling, but that's not required) */
 477 #define gmx_simd_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
 478
 479 #define gmx_simd_and_b        _mm256_and_pd
 480 #define gmx_simd_or_b         _mm256_or_pd
 481
 482 #define gmx_simd_anytrue_b    _mm256_movemask_pd
 483
 484 #define gmx_simd_cvtt_r2i  _mm256_cvttpd_epi32
 485
 486 #define gmx_simd_rsqrt_r(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
 487 #define gmx_simd_rcp_r(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
 488
 489 #define gmx_simd_exp_r        gmx_mm256_exp_pd
 490 #define gmx_simd_sqrt_r       gmx_mm256_sqrt_pd
 491 #define gmx_simd_sincos_r     gmx_mm256_sincos_pd
 492 #define gmx_simd_acos_r       gmx_mm256_acos_pd
 493 #define gmx_simd_atan2_r      gmx_mm256_atan2_pd
 494 #define gmx_simd_erfc_r       gmx_mm256_erfc_pd
 495
 496 #endif /* ifndef GMX_DOUBLE */
 497
 498 #endif /* 128- or 256-bit x86 SIMD */
 499
 500 #endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
 501
 502 #endif /* GMX_TARGET_X86 */
 503
 504 #ifdef GMX_SIMD_IBM_QPX
 505
 506 /* This hack works on the compilers that can reach this code. A real
 507    solution with broader scope will be proposed in master branch. */
 508 #define gmx_always_inline __attribute__((always_inline))
 509
 510 /* This is for the A2 core on BlueGene/Q that supports IBM's QPX
 511    vector built-in functions */
 512 #include <mass_simd.h>
 513 #define GMX_HAVE_SIMD_MACROS
 514 #ifdef __clang__
 515 #include <qpxmath.h>
 516 #endif
 517
 518 /* No need to version the code by the precision, because the QPX AXU
 519    extends to and truncates from double precision for free. */
 520
 521 #define GMX_SIMD_REAL_WIDTH  4
 522 typedef vector4double gmx_simd_real_t;
 523 typedef vector4double gmx_simd_bool_t;
 524 typedef vector4double gmx_simd_int32_t;
 525 #define GMX_SIMD_INT32_WIDTH  4
 526
 527 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load_r(const real *a)
 528 {
 529 #ifdef NDEBUG
 530     return vec_ld(0, (real *) a);
 531 #else
 532     return vec_lda(0, (real *) a);
 533 #endif
 534 }
 535
 536 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_load1_r(const real *a)
 537 {
 538     return vec_splats(*a);
 539 }
 540
 541 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_set1_r(real a)
 542 {
 543     return vec_splats(a);
 544 }
 545
 546 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_setzero_r()
 547 {
 548     return vec_splats(0.0);
 549 }
 550
 551 static gmx_inline void gmx_always_inline gmx_simd_store_r(real *a, gmx_simd_real_t b)
 552 {
 553 #ifdef NDEBUG
 554     vec_st(b, 0, a);
 555 #else
 556     vec_sta(b, 0, a);
 557 #endif
 558 }
 559
 560 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_add_r(gmx_simd_real_t a, gmx_simd_real_t b)
 561 {
 562     return vec_add(a, b);
 563 }
 564
 565 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sub_r(gmx_simd_real_t a, gmx_simd_real_t b)
 566 {
 567     return vec_sub(a, b);
 568 }
 569
 570 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_mul_r(gmx_simd_real_t a, gmx_simd_real_t b)
 571 {
 572     return vec_mul(a, b);
 573 }
 574
 575 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 576 {
 577     return vec_madd(a, b, c);
 578 }
 579
 580 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_fnmadd_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 581 {
 582     return vec_nmsub(a, b, c);
 583 }
 584
 585 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_max_r(gmx_simd_real_t a, gmx_simd_real_t b)
 586 {
 587     return vec_sel(b, a, vec_sub(a, b));
 588 }
 589
 590 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendzero_r(gmx_simd_real_t a, gmx_simd_real_t b)
 591 {
 592     return vec_sel(gmx_simd_setzero_r(), a, b);
 593 }
 594
 595 static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_cmplt_r(gmx_simd_real_t a, gmx_simd_real_t b)
 596 {
 597     return vec_cmplt(a, b);
 598 }
 599
 600 static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_and_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
 601 {
 602     return vec_and(a, b);
 603 }
 604
 605 static gmx_inline gmx_simd_bool_t gmx_always_inline gmx_simd_or_b(gmx_simd_bool_t a, gmx_simd_bool_t b)
 606 {
 607     return vec_or(a, b);
 608 }
 609
 610 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_round_r(gmx_simd_real_t a)
 611 {
 612     return vec_round(a);
 613 }
 614
 615 #define GMX_SIMD_HAVE_FLOOR
 616 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_floor_r(gmx_simd_real_t a)
 617 {
 618     return vec_floor(a);
 619 }
 620
 621 #define GMX_SIMD_HAVE_BLENDV
 622 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_blendv_r(gmx_simd_real_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 623 {
 624     return vec_sel(b, a, gmx_simd_cmplt_r(gmx_simd_setzero_r(), c));
 625 }
 626
 627 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 628 {
 629     return vec_cpsgn(a, b);
 630 };
 631
 632 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_masknot_add_pr(gmx_simd_bool_t a, gmx_simd_real_t b, gmx_simd_real_t c)
 633 {
 634     return vec_add(b, vec_sel(c, gmx_simd_setzero_r(), a));
 635 };
 636
 637 static gmx_inline gmx_bool gmx_always_inline
 638 GMX_SIMD_IS_TRUE(real x)
 639 {
 640     return x >= 0.0;
 641 }
 642
 643 static gmx_inline gmx_simd_int32_t gmx_always_inline gmx_simd_cvtt_r2i(gmx_simd_real_t a)
 644 {
 645     return vec_ctiwuz(a);
 646 }
 647 /* Don't want this, we have floor */
 648 /* #define gmx_simd_cvt_i2r   vec_cvtepi32 */
 649
 650 /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
 651    Architecture only promises 2^-8. So probably no need for
 652    Newton-Raphson iterates at single or double. */
 653 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rsqrt_r(gmx_simd_real_t a)
 654 {
 655     return vec_rsqrte(a);
 656 }
 657
 658 /* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
 659    Architecture only promises 2^-5. So probably no need for
 660    Newton-Raphson iterates at single or double. */
 661 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_rcp_r(gmx_simd_real_t a)
 662 {
 663     return vec_re(a);
 664 }
 665
 666 /* Note that here, and below, we use the built-in SLEEF port when
 667    compiling on BlueGene/Q with clang */
 668
 669 #define GMX_SIMD_HAVE_EXP
 670 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_exp_r(gmx_simd_real_t a)
 671 {
 672 #ifdef __clang__
 673 #ifndef GMX_DOUBLE
 674     return xexpf(a);
 675 #else
 676     return xexp(a);
 677 #endif
 678 #else
 679 #ifndef GMX_DOUBLE
 680     return expf4(a);
 681 #else
 682     return expd4(a);
 683 #endif
 684 #endif
 685 }
 686
 687 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_sqrt_r(gmx_simd_real_t a)
 688 {
 689 #ifdef NDEBUG
 690     return vec_swsqrt_nochk(a);
 691 #else
 692     return vec_swsqrt(a);
 693 #endif
 694 }
 695
 696 #define GMX_SIMD_HAVE_TRIGONOMETRIC
 697 static gmx_inline int gmx_always_inline gmx_simd_sincos_r(gmx_simd_real_t a, gmx_simd_real_t *b, gmx_simd_real_t *c)
 698 {
 699 #ifdef __clang__
 700 #ifndef GMX_DOUBLE
 701     xsincosf(a, b, c);
 702 #else
 703     xsincos(a, b, c);
 704 #endif
 705 #else
 706 #ifndef GMX_DOUBLE
 707     sincosf4(a, b, c);
 708 #else
 709     sincosd4(a, b, c);
 710 #endif
 711 #endif
 712     return 1;
 713 }
 714
 715 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_acos_r(gmx_simd_real_t a)
 716 {
 717 #ifdef __clang__
 718 #ifndef GMX_DOUBLE
 719     return xacosf(a);
 720 #else
 721     return xacos(a);
 722 #endif
 723 #else
 724 #ifndef GMX_DOUBLE
 725     return acosf4(a);
 726 #else
 727     return acosd4(a);
 728 #endif
 729 #endif
 730 }
 731
 732 /* NB The order of parameters here is correct; the
 733    documentation of atan2[df]4 in SIMD MASS is wrong. */
 734 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_atan2_r(gmx_simd_real_t a, gmx_simd_real_t b)
 735 {
 736 #ifdef __clang__
 737 #ifndef GMX_DOUBLE
 738     return xatan2f(a, b);
 739 #else
 740     return xatan2(a, b);
 741 #endif
 742 #else
 743 #ifndef GMX_DOUBLE
 744     return atan2f4(a, b);
 745 #else
 746     return atan2d4(a, b);
 747 #endif
 748 #endif
 749 }
 750
 751 #define GMX_SIMD_HAVE_ERFC
 752 static gmx_inline gmx_simd_real_t gmx_always_inline gmx_simd_erfc_r(gmx_simd_real_t a)
 753 {
 754     /* The BG/Q qpxmath.h vector math library intended for use with
 755        bgclang does not have erfc, so we need to use a function from
 756        mass_simd.h. If this changes, then the #include <mass_simd.h> can
 757        become conditional. */
 758 #ifndef GMX_DOUBLE
 759     return erfcf4(a);
 760 #else
 761     return erfcd4(a);
 762 #endif
 763 }
 764
 765 /* TODO: gmx_mm_erfc_p[sd] should be generalized using gmx_*_pr, so that it just works on BlueGene */
 766
 767 static gmx_inline int gmx_always_inline
 768 gmx_simd_anytrue_b(gmx_simd_bool_t a)
 769 {
 770     /* The "anytrue" is done solely on the QPX AXU (which is the only
 771        available FPU). This is awkward, because pretty much no
 772        "horizontal" SIMD-vector operations exist, unlike x86 where
 773        SSE4.1 added various kinds of horizontal operations. So we have
 774        to make do with shifting vector elements and operating on the
 775        results. This makes for lots of data dependency, but the main
 776        alternative of storing to memory and reloading is not going to
 777        help, either. OpenMP over 2 or 4 hardware threads per core will
 778        hide much of the latency from the data dependency. The
 779        vec_extract() lets the compiler correctly use a floating-point
 780        comparison on the zeroth vector element, which avoids needing
 781        memory at all.
 782      */
 783     gmx_simd_bool_t vec_shifted_left_0 = a;
 784     gmx_simd_bool_t vec_shifted_left_1 = vec_sldw(a, a, 1);
 785     gmx_simd_bool_t vec_shifted_left_2 = vec_sldw(a, a, 2);
 786     gmx_simd_bool_t vec_shifted_left_3 = vec_sldw(a, a, 3);
 787
 788     gmx_simd_bool_t vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
 789                                         vec_or(vec_shifted_left_0, vec_shifted_left_1));
 790     return (0.0 < vec_extract(vec_return, 0));
 791 };
 792
 793 #undef gmx_always_inline
 794
 795 #endif /* GMX_SIMD_IBM_QPX */
 796
 797 #ifdef __MIC__
 798 #include "general_x86_mic.h"
 799 #endif
 800
 801 #ifdef GMX_HAVE_SIMD_MACROS
 802 /* Generic functions to extract a SIMD aligned pointer from a pointer x.
 803  * x should have at least GMX_SIMD_REAL_WIDTH elements extra compared
 804  * to how many you want to use, to avoid indexing outside the aligned region.
 805  */
 806
 807 static gmx_inline real *
 808 gmx_simd_align_r(const real *x)
 809 {
 810     return (real *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(real)-1))));
 811 }
 812
 813 static gmx_inline int *
 814 gmx_simd_align_i(const int *x)
 815 {
 816     return (int  *)(((size_t)((x)+GMX_SIMD_REAL_WIDTH)) & (~((size_t)(GMX_SIMD_REAL_WIDTH*sizeof(int )-1))));
 817 }
 818
 819
 820 /* Include the math functions which only need the above macros,
 821  * generally these are the ones that don't need masking operations.
 822  */
 823 #ifdef GMX_DOUBLE
 824 #include "math_double.h"
 825 #else
 826 #include "math_single.h"
 827 #endif
 828
 829
 830 #endif /* GMX_HAVE_SIMD_MACROS */
 831
 832 #endif