src/gromacs/simd/four_wide_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* The macros in this file are intended to be used for writing
  37  * architecture-independent SIMD intrinsics code with a SIMD width of 4.
  38  * To support a new architecture, adding macros here should be all
  39  * that is needed.
  40  *
  41  * Note that this file is intended only for SIMD operations that require
  42  * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
  43  * support, more functionality and higher performance, but the SIMD width is
  44  * not necessarily equal to 4.
  45  */
  46
  47 #ifdef GMX_SIMD_FOUR_WIDE_MACROS_H
  48 #error "four_wide_macros.h included twice"
  49 #else
  50 #define GMX_SIMD_FOUR_WIDE_MACROS_H
  51
  52
  53 /* The SIMD width here is always 4, since that is the whole point */
  54 #define GMX_SIMD4_WIDTH  4
  55
  56
  57 #if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
  58 /* Precision set before inclusion, honour that request */
  59 #else
  60 /* Match precision to the Gromacs real precision */
  61 #ifdef GMX_DOUBLE
  62 #define GMX_SIMD4_DOUBLE
  63 #else
  64 #define GMX_SIMD4_SINGLE
  65 #endif
  66 #endif
  67
  68 #ifdef GMX_SIMD4_DOUBLE
  69 typedef double  gmx_simd4_real;
  70 #endif
  71 #ifdef GMX_SIMD4_SINGLE
  72 typedef float   gmx_simd4_real;
  73 #endif
  74
  75 /* Uncomment the next line, without other SIMD active, for testing plain-C */
  76 /* #define GMX_SIMD4_REFERENCE */
  77 #ifdef GMX_SIMD4_REFERENCE
  78 /* Plain C SIMD reference implementation, also serves as documentation */
  79 #define GMX_HAVE_SIMD4_MACROS
  80
  81 /* Include plain-C reference implementation, also serves as documentation */
  82 #include "four_wide_macros_ref.h"
  83
  84 /* float/double SIMD register type */
  85 #define gmx_simd4_real_t  gmx_simd4_ref_pr
  86
  87 /* boolean SIMD register type */
  88 #define gmx_simd4_bool_t  gmx_simd4_ref_pb
  89
  90 #define gmx_simd4_load_r       gmx_simd4_ref_load_pr
  91 #define gmx_simd4_load_bb_pr    gmx_simd4_ref_load_pr
  92 #define gmx_simd4_set1_r       gmx_simd4_ref_set1_pr
  93 #define gmx_simd4_setzero_r    gmx_simd4_ref_setzero_pr
  94 #define gmx_simd4_store_r      gmx_simd4_ref_store_pr
  95
  96 /* Unaligned load+store are not required,
  97  * but they can speed up the PME spread+gather operations.
  98  */
  99 #define GMX_SIMD4_HAVE_UNALIGNED
 100 #ifdef GMX_SIMD4_HAVE_UNALIGNED
 101 #define gmx_simd4_loadu_r      gmx_simd4_ref_load_pr
 102 #define gmx_simd4_storeu_r     gmx_simd4_ref_store_pr
 103 #endif
 104
 105 #define gmx_simd4_add_r        gmx_simd4_ref_add_pr
 106 #define gmx_simd4_sub_r        gmx_simd4_ref_sub_pr
 107 #define gmx_simd4_mul_r        gmx_simd4_ref_mul_pr
 108 /* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
 109 #define gmx_simd4_fmadd_r       gmx_simd4_ref_madd_pr
 110 #define gmx_simd4_fnmadd_r      gmx_simd4_ref_nmsub_pr
 111
 112 #define gmx_simd4_dotproduct3_r   gmx_simd4_ref_dotproduct3
 113
 114 #define gmx_simd4_min_r        gmx_simd4_ref_min_pr
 115 #define gmx_simd4_max_r        gmx_simd4_ref_max_pr
 116
 117 #define gmx_simd4_blendzero_r  gmx_simd4_ref_blendzero_pr
 118
 119 /* Comparison */
 120 #define gmx_simd4_cmplt_r      gmx_simd4_ref_cmplt_pr
 121
 122 /* Logical operations on SIMD booleans */
 123 #define gmx_simd4_and_b        gmx_simd4_ref_and_pb
 124 #define gmx_simd4_or_b         gmx_simd4_ref_or_pb
 125
 126 /* Returns a single int (0/1) which tells if any of the 4 booleans is True */
 127 #define gmx_simd4_anytrue_b    gmx_simd4_ref_anytrue_pb
 128
 129 #endif /* GMX_SIMD4_REFERENCE */
 130
 131
 132 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 133  * to instructions for) different SIMD width and float precision.
 134  *
 135  * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 136  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 137  * Compiler settings will decide if 128-bit intrinsics will
 138  * be translated into SSE or AVX instructions.
 139  */
 140
 141
 142 #ifdef GMX_SIMD_X86_SSE2_OR_HIGHER
 143 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 144
 145 #ifdef GMX_SIMD4_SINGLE
 146 #define GMX_HAVE_SIMD4_MACROS
 147 #endif
 148
 149 #ifdef GMX_SIMD4_DOUBLE
 150 /* Note that here we will use 256-bit SIMD with GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER.
 151  * This is inconsistent naming wise, but should give the best performance.
 152  */
 153 #if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
 154 #define GMX_HAVE_SIMD4_MACROS
 155 #endif
 156 #endif
 157
 158 #ifdef GMX_HAVE_SIMD4_MACROS
 159
 160 #if defined GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER || defined GMX_SIMD_X86_AVX_256_OR_HIGHER
 161
 162 #include <immintrin.h>
 163 #ifdef HAVE_X86INTRIN_H
 164 #include <x86intrin.h> /* FMA */
 165 #endif
 166 #ifdef HAVE_INTRIN_H
 167 #include <intrin.h> /* FMA MSVC */
 168 #endif
 169
 170 #else
 171 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 172 #include <smmintrin.h>
 173 #else
 174 /* We only have SSE2 */
 175 #include <emmintrin.h>
 176 #endif
 177 #endif
 178
 179 #ifdef GMX_SIMD4_SINGLE
 180
 181 #define gmx_simd4_real_t  __m128
 182
 183 #define gmx_simd4_bool_t  __m128
 184
 185 #define gmx_simd4_load_r       _mm_load_ps
 186 #define gmx_simd4_load_bb_pr    _mm_load_ps
 187 #define gmx_simd4_set1_r       _mm_set1_ps
 188 #define gmx_simd4_setzero_r    _mm_setzero_ps
 189 #define gmx_simd4_store_r      _mm_store_ps
 190
 191 /* Some old AMD processors could have problems with unaligned loads+stores */
 192 #ifndef GMX_FAHCORE
 193 #define GMX_SIMD4_HAVE_UNALIGNED
 194 #endif
 195 #ifdef GMX_SIMD4_HAVE_UNALIGNED
 196 #define gmx_simd4_loadu_r      _mm_loadu_ps
 197 #define gmx_simd4_storeu_r     _mm_storeu_ps
 198 #endif
 199
 200 #define gmx_simd4_add_r        _mm_add_ps
 201 #define gmx_simd4_sub_r        _mm_sub_ps
 202 #define gmx_simd4_mul_r        _mm_mul_ps
 203
 204 #ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
 205 #define gmx_simd4_fmadd_r(a, b, c)   _mm_macc_ps(a, b, c)
 206 #define gmx_simd4_fnmadd_r(a, b, c)  _mm_nmacc_ps(a, b, c)
 207 #else
 208 #define gmx_simd4_fmadd_r(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 209 #define gmx_simd4_fnmadd_r(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 210 #endif
 211
 212 static inline float gmx_simd4_dotproduct3_r(__m128 a, __m128 b)
 213 #ifdef GMX_SIMD_X86_SSE4_1_OR_HIGHER
 214 {
 215     float dp;
 216
 217     /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
 218     _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));
 219
 220     return dp;
 221 }
 222 #else
 223 {
 224     float        dp_array[7], *dp;
 225
 226     /* Generate an aligned pointer */
 227     dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));
 228
 229     _mm_store_ps(dp, _mm_mul_ps(a, b));
 230
 231     return dp[0] + dp[1] + dp[2];
 232 }
 233 #endif
 234
 235 #define gmx_simd4_min_r        _mm_min_ps
 236 #define gmx_simd4_max_r        _mm_max_ps
 237
 238 #define gmx_simd4_blendzero_r  _mm_and_ps
 239
 240 #define gmx_simd4_cmplt_r      _mm_cmplt_ps
 241 #define gmx_simd4_and_b        _mm_and_ps
 242 #define gmx_simd4_or_b         _mm_or_ps
 243
 244 #define gmx_simd4_anytrue_b    _mm_movemask_ps
 245
 246 #endif /* GMX_SIMD4_SINGLE */
 247
 248
 249 #ifdef GMX_SIMD4_DOUBLE
 250
 251 #define gmx_simd4_real_t  __m256d
 252
 253 #define gmx_simd4_bool_t  __m256d
 254
 255 #define gmx_simd4_load_r       _mm256_load_pd
 256 #define gmx_simd4_load_bb_pr    _mm256_load_pd
 257 #define gmx_simd4_set1_r       _mm256_set1_pd
 258 #define gmx_simd4_setzero_r    _mm256_setzero_pd
 259 #define gmx_simd4_store_r      _mm256_store_pd
 260
 261 #define GMX_SIMD4_HAVE_UNALIGNED
 262 #define gmx_simd4_loadu_r      _mm256_loadu_pd
 263 #define gmx_simd4_storeu_r     _mm256_storeu_pd
 264
 265 #define gmx_simd4_add_r        _mm256_add_pd
 266 #define gmx_simd4_sub_r        _mm256_sub_pd
 267 #define gmx_simd4_mul_r        _mm256_mul_pd
 268 #ifdef GMX_SIMD_X86_AVX_128_FMA_OR_HIGHER
 269 #define gmx_simd4_fmadd_r(a, b, c)   _mm256_macc_pd(a, b, c)
 270 #define gmx_simd4_fnmadd_r(a, b, c)  _mm256_nmacc_pd(a, b, c)
 271 #else
 272 #define gmx_simd4_fmadd_r(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 273 #define gmx_simd4_fnmadd_r(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 274 #endif
 275 #define gmx_simd4_min_r        _mm256_min_pd
 276 #define gmx_simd4_max_r        _mm256_max_pd
 277
 278 #define gmx_simd4_blendzero_r  _mm256_and_pd
 279
 280 /* Less-than (we use ordered, non-signaling, but that's not required) */
 281 #define gmx_simd4_cmplt_r(x, y) _mm256_cmp_pd(x, y, 0x11)
 282 #define gmx_simd4_and_b        _mm256_and_pd
 283 #define gmx_simd4_or_b         _mm256_or_pd
 284
 285 #define gmx_simd4_anytrue_b    _mm256_movemask_pd
 286
 287 #endif /* GMX_SIMD4_DOUBLE */
 288
 289
 290 #endif /* GMX_HAVE_SIMD4_MACROS */
 291
 292 #endif /* GMX_SIMD_X86_SSE2_OR_HIGHER */
 293
 294 #ifdef GMX_SIMD_IBM_QPX
 295 /* i.e. BlueGene/Q */
 296
 297 /* This hack works on the compilers that can reach this code. A real
 298    solution with broader scope will be proposed in master branch. */
 299 #define gmx_always_inline __attribute__((always_inline))
 300
 301 #ifdef GMX_SIMD4_SINGLE
 302 #define GMX_HAVE_SIMD4_MACROS
 303 #endif
 304
 305 typedef vector4double gmx_simd4_real_t;
 306 typedef vector4double gmx_simd4_bool_t;
 307
 308 /* The declarations of vec_ld* use non-const pointers, and IBM
 309    can't/won't fix this any time soon. So GROMACS has to cast away the
 310    const-ness of its pointers before loads. Four-wide SIMD loads
 311    sometimes occur from variables of type real, and sometimes from
 312    variables of type float (even at double precison), so the correct
 313    cast cannot be done easily. The correct cast is necessary because
 314    the resulting type determines the alignment assumption of vec_ld*,
 315    which is different for float and double. So the loads of
 316    always-float variables have to be done with a function that does
 317    the correct cast. Since functions cannot be overloaded by type in
 318    C, they have to have different names. Thus we have
 319    gmx_simd4_load_r and gmx_simd4_load_bb_pr.
 320  */
 321
 322 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_r(const real *a)
 323 {
 324 #ifdef NDEBUG
 325     return vec_ld(0, (real *) a);
 326 #else
 327     return vec_lda(0, (real *) a);
 328 #endif
 329 }
 330
 331 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
 332 {
 333 #ifdef NDEBUG
 334     return vec_ld(0, (float *) a);
 335 #else
 336     return vec_lda(0, (float *) a);
 337 #endif
 338 }
 339
 340 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_set1_r(const real a)
 341 {
 342     return vec_splats(a);
 343 }
 344
 345 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_setzero_r()
 346 {
 347     return vec_splats(0.0);
 348 }
 349
 350 /* TODO this will not yet work, because the function might be passed a
 351    pointer to a float when running in double precision.
 352  */
 353 static gmx_inline void gmx_always_inline gmx_simd4_store_r(real *a, gmx_simd4_real_t b)
 354 {
 355 #ifdef NDEBUG
 356     vec_st(b, 0, a);
 357 #else
 358     vec_sta(b, 0, a);
 359 #endif
 360 }
 361
 362 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 363 {
 364     return vec_add(a, b);
 365 }
 366
 367 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_sub_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 368 {
 369     return vec_sub(a, b);
 370 }
 371
 372 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_mul_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 373 {
 374     return vec_mul(a, b);
 375 }
 376
 377 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
 378 {
 379     return vec_madd(a, b, c);
 380 }
 381
 382 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_fnmadd_r(gmx_simd4_real_t a, gmx_simd4_real_t b, gmx_simd4_real_t c)
 383 {
 384     return vec_nmsub(a, b, c);
 385 }
 386
 387 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_min_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 388 {
 389     /* Implemented the same way as max, but with the subtraction
 390        operands swapped. */
 391     return vec_sel(b, a, vec_sub(b, a));
 392 }
 393
 394 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_max_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 395 {
 396     return vec_sel(b, a, vec_sub(a, b));
 397 }
 398
 399 static gmx_inline gmx_simd4_real_t gmx_always_inline gmx_simd4_blendzero_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 400 {
 401     return vec_sel(gmx_simd_setzero_r(), a, b);
 402 }
 403
 404 static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_cmplt_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 405 {
 406     return vec_cmplt(a, b);
 407 }
 408
 409 static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_and_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
 410 {
 411     return vec_and(a, b);
 412 }
 413
 414 static gmx_inline gmx_simd4_bool_t gmx_always_inline gmx_simd4_or_b(gmx_simd4_bool_t a, gmx_simd4_bool_t b)
 415 {
 416     return vec_or(a, b);
 417 }
 418
 419 static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
 420 {
 421     /* The dot product is done solely on the QPX AXU (which is the
 422        only available FPU). This is awkward, because pretty much no
 423        "horizontal" SIMD-vector operations exist, unlike x86 where
 424        SSE4.1 added various kinds of horizontal operations. So we have
 425        to make do with shifting vector elements and operating on the
 426        results. This makes for lots of data dependency, but the main
 427        alternative of storing to memory and reloading is not going to
 428        help, either. OpenMP over 2 or 4 hardware threads per core will
 429        hide much of the latency from the data dependency. The
 430        vec_extract() lets the compiler correctly use a floating-point
 431        comparison on the zeroth vector element, which avoids needing
 432        memory at all.
 433      */
 434
 435     gmx_simd4_real_t dp_shifted_left_0 = vec_mul(a, b);
 436     gmx_simd4_real_t dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
 437     gmx_simd4_real_t dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
 438     gmx_simd4_real_t dp                = vec_add(dp_shifted_left_2,
 439                                                  vec_add(dp_shifted_left_0, dp_shifted_left_1));
 440
 441     /* See comment in nbnxn_make_pairlist_part() about how this should
 442        be able to return a double on PowerPC. */
 443     return (float) vec_extract(dp, 0);
 444 }
 445
 446 static gmx_inline int gmx_always_inline gmx_simd4_anytrue_b(gmx_simd4_bool_t a)
 447 {
 448     return gmx_simd_anytrue_b(a);
 449 }
 450
 451 #undef gmx_always_inline
 452
 453 #endif /* GMX_SIMD_IBM_QPX */
 454
 455 #ifdef GMX_HAVE_SIMD4_MACROS
 456 /* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
 457  * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
 458  * to how many you want to use, to avoid indexing outside the aligned region.
 459  */
 460
 461 static gmx_inline gmx_simd4_real *
 462 gmx_simd4_align_real(const gmx_simd4_real *x)
 463 {
 464     return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
 465 }
 466 #endif
 467
 468
 469 #endif