/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* The macros in this file are intended to be used for writing
 * architecture-independent SIMD intrinsics code with a SIMD width of 4.
 * To support a new architecture, adding macros here should be all
 * that is needed.
 *
 * Note that this file is intended only for SIMD operations that require
 * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
 * support, more functionality and higher performance, but the SIMD width
 * is not necessarily equal to 4.
 */

#ifdef _gmx_simd4_macros_h_
#error "gmx_simd4_macros.h included twice"
#else
#define _gmx_simd4_macros_h_


/* The SIMD width here is always 4, since that is the whole point */
#define GMX_SIMD4_WIDTH  4
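
/* Usage sketch (illustrative only; the function and variable names here
 * are hypothetical, not part of this header): with the macros defined
 * below, a 4-wide y += a*x kernel can be written once and compiled for
 * any supported architecture:
 *
 *     static void axpy4(gmx_simd4_real a, const gmx_simd4_real *x,
 *                       gmx_simd4_real *y)
 *     {
 *         gmx_simd4_pr ax = gmx_simd4_load_pr(x);
 *         gmx_simd4_pr ay = gmx_simd4_load_pr(y);
 *
 *         ay = gmx_simd4_madd_pr(gmx_simd4_set1_pr(a), ax, ay);
 *         gmx_simd4_store_pr(y, ay);
 *     }
 *
 * x and y are assumed to be SIMD4-aligned; see gmx_simd4_align_real at
 * the end of this file.
 */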

#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
/* Precision set before inclusion, honour that request */
#else
/* Match precision to the Gromacs real precision */
#ifdef GMX_DOUBLE
#define GMX_SIMD4_DOUBLE
#else
#define GMX_SIMD4_SINGLE
#endif
#endif

#ifdef GMX_SIMD4_DOUBLE
typedef double  gmx_simd4_real;
#endif
#ifdef GMX_SIMD4_SINGLE
typedef float   gmx_simd4_real;
#endif

/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD4_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD4_REFERENCE_PLAIN_C
/* Plain-C SIMD reference implementation, also serves as documentation */
#define GMX_HAVE_SIMD4_MACROS

/* Include the plain-C reference implementation */
#include "gmx_simd4_ref.h"

/* float/double SIMD register type */
#define gmx_simd4_pr  gmx_simd4_ref_pr

/* boolean SIMD register type */
#define gmx_simd4_pb  gmx_simd4_ref_pb

#define gmx_simd4_load_pr       gmx_simd4_ref_load_pr
#define gmx_simd4_load_bb_pr    gmx_simd4_ref_load_pr
#define gmx_simd4_set1_pr       gmx_simd4_ref_set1_pr
#define gmx_simd4_setzero_pr    gmx_simd4_ref_setzero_pr
#define gmx_simd4_store_pr      gmx_simd4_ref_store_pr

/* Unaligned load+store are not required,
 * but they can speed up the PME spread+gather operations.
 */
#define GMX_SIMD4_HAVE_UNALIGNED
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      gmx_simd4_ref_load_pr
#define gmx_simd4_storeu_pr     gmx_simd4_ref_store_pr
#endif

#define gmx_simd4_add_pr        gmx_simd4_ref_add_pr
#define gmx_simd4_sub_pr        gmx_simd4_ref_sub_pr
#define gmx_simd4_mul_pr        gmx_simd4_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
#define gmx_simd4_madd_pr       gmx_simd4_ref_madd_pr
#define gmx_simd4_nmsub_pr      gmx_simd4_ref_nmsub_pr
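
/* Illustration of the c=d advice above (hypothetical accumulator name):
 *
 *     sum = gmx_simd4_madd_pr(a, b, sum);
 *
 * Here the accumulator is both the input c and the output d, which lets
 * a destructive FMA3 instruction set implement the whole operation in a
 * single instruction instead of a multiply plus a separate add or move.
 */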

#define gmx_simd4_dotproduct3   gmx_simd4_ref_dotproduct3

#define gmx_simd4_min_pr        gmx_simd4_ref_min_pr
#define gmx_simd4_max_pr        gmx_simd4_ref_max_pr

#define gmx_simd4_blendzero_pr  gmx_simd4_ref_blendzero_pr

/* Comparison */
#define gmx_simd4_cmplt_pr      gmx_simd4_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_simd4_and_pb        gmx_simd4_ref_and_pb
#define gmx_simd4_or_pb         gmx_simd4_ref_or_pb

/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
#define gmx_simd4_anytrue_pb    gmx_simd4_ref_anytrue_pb
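
/* Typical pattern (sketch with hypothetical register names): compute a
 * boolean register with cmplt and skip work when nothing is in range:
 *
 *     gmx_simd4_pb mask = gmx_simd4_cmplt_pr(r2, rcut2);
 *     if (gmx_simd4_anytrue_pb(mask))
 *     {
 *         ...
 *     }
 */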

#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */


/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 * into instructions for) different SIMD widths and float precisions.
 *
 * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 * Compiler settings will decide if 128-bit intrinsics will
 * be translated into SSE or AVX instructions.
 */


#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */

#ifdef GMX_SIMD4_SINGLE
#define GMX_HAVE_SIMD4_MACROS
#endif

#ifdef GMX_SIMD4_DOUBLE
/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA.
 * This is inconsistent naming-wise, but should give the best performance.
 */
#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
#define GMX_HAVE_SIMD4_MACROS
#endif
#endif

#ifdef GMX_HAVE_SIMD4_MACROS

#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256

#include <immintrin.h>
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h>  /* FMA */
#endif
#ifdef HAVE_INTRIN_H
#include <intrin.h>  /* FMA MSVC */
#endif

#else
#ifdef GMX_X86_SSE4_1
#include <smmintrin.h>
#else
/* We only have SSE2 */
#include <emmintrin.h>
#endif
#endif

#ifdef GMX_SIMD4_SINGLE

#define gmx_simd4_pr  __m128

#define gmx_simd4_pb  __m128

#define gmx_simd4_load_pr       _mm_load_ps
#define gmx_simd4_load_bb_pr    _mm_load_ps
#define gmx_simd4_set1_pr       _mm_set1_ps
#define gmx_simd4_setzero_pr    _mm_setzero_ps
#define gmx_simd4_store_pr      _mm_store_ps

/* Some old AMD processors could have problems with unaligned loads+stores */
#ifndef GMX_FAHCORE
#define GMX_SIMD4_HAVE_UNALIGNED
#endif
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      _mm_loadu_ps
#define gmx_simd4_storeu_pr     _mm_storeu_ps
#endif

#define gmx_simd4_add_pr        _mm_add_ps
#define gmx_simd4_sub_pr        _mm_sub_ps
#define gmx_simd4_mul_pr        _mm_mul_ps

#ifdef GMX_X86_AVX_128_FMA
#define gmx_simd4_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_simd4_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
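
/* Note that the fallback madd/nmsub macros above use a separate multiply
 * and add, so the intermediate product is rounded, unlike with a true
 * fused multiply-add; results can differ in the last bit from FMA hardware.
 */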

static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
#ifdef GMX_X86_SSE4_1
{
    float dp;

    /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
    _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));

    return dp;
}
#else
{
    float dp_array[7], *dp;

    /* Generate a 16-byte aligned pointer into dp_array; the 3 extra
     * elements guarantee that 4 aligned floats fit within the array.
     */
    dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));

    _mm_store_ps(dp, _mm_mul_ps(a, b));

    return dp[0] + dp[1] + dp[2];
}
#endif
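
/* Usage sketch (hypothetical values): the dot product of the first three
 * components, e.g. for two 3D vectors padded to SIMD4 width:
 *
 *     __m128 x = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);  (lanes {1,2,3,0})
 *     __m128 y = _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f);  (lanes {1,1,1,0})
 *     float  d = gmx_simd4_dotproduct3(x, y);         (1+2+3 = 6.0f)
 */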

#define gmx_simd4_min_pr        _mm_min_ps
#define gmx_simd4_max_pr        _mm_max_ps

#define gmx_simd4_blendzero_pr  _mm_and_ps

#define gmx_simd4_cmplt_pr      _mm_cmplt_ps
#define gmx_simd4_and_pb        _mm_and_ps
#define gmx_simd4_or_pb         _mm_or_ps

#define gmx_simd4_anytrue_pb    _mm_movemask_ps
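
/* SSE comparisons return all-ones or all-zeros per lane, so a bitwise
 * AND with a boolean register implements blendzero, and _mm_movemask_ps
 * collects the four sign bits into an int that is non-zero iff any of
 * the 4 SIMD booleans is true.
 */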

#endif /* GMX_SIMD4_SINGLE */


#ifdef GMX_SIMD4_DOUBLE

#define gmx_simd4_pr  __m256d

#define gmx_simd4_pb  __m256d

#define gmx_simd4_load_pr       _mm256_load_pd
#define gmx_simd4_load_bb_pr    _mm256_load_pd
#define gmx_simd4_set1_pr       _mm256_set1_pd
#define gmx_simd4_setzero_pr    _mm256_setzero_pd
#define gmx_simd4_store_pr      _mm256_store_pd

#define GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      _mm256_loadu_pd
#define gmx_simd4_storeu_pr     _mm256_storeu_pd

#define gmx_simd4_add_pr        _mm256_add_pd
#define gmx_simd4_sub_pr        _mm256_sub_pd
#define gmx_simd4_mul_pr        _mm256_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define gmx_simd4_madd_pr(a, b, c)   _mm256_macc_pd(a, b, c)
#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_nmacc_pd(a, b, c)
#else
#define gmx_simd4_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#endif
#define gmx_simd4_min_pr        _mm256_min_pd
#define gmx_simd4_max_pr        _mm256_max_pd

#define gmx_simd4_blendzero_pr  _mm256_and_pd

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
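/* 0x11 is the immintrin.h comparison predicate _CMP_LT_OQ
 * (less-than, ordered, quiet/non-signaling).
 */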
#define gmx_simd4_and_pb        _mm256_and_pd
#define gmx_simd4_or_pb         _mm256_or_pd

#define gmx_simd4_anytrue_pb    _mm256_movemask_pd

#endif /* GMX_SIMD4_DOUBLE */


#endif /* GMX_HAVE_SIMD4_MACROS */

#endif /* GMX_X86_SSE2 */

#ifdef GMX_CPU_ACCELERATION_IBM_QPX
/* i.e. BlueGene/Q */

/* This hack works on the compilers that can reach this code. A real
   solution with broader scope will be proposed in the master branch. */
#define gmx_always_inline __attribute__((always_inline))

#ifdef GMX_SIMD4_SINGLE
#define GMX_HAVE_SIMD4_MACROS
#endif

typedef vector4double gmx_simd4_pr;
typedef vector4double gmx_simd4_pb;

/* The declarations of vec_ld* use non-const pointers, and IBM
   can't/won't fix this any time soon. So GROMACS has to cast away the
   const-ness of its pointers before loads. Four-wide SIMD loads
   sometimes occur from variables of type real, and sometimes from
   variables of type float (even at double precision), so the correct
   cast cannot be done easily. The correct cast is necessary because
   the resulting type determines the alignment assumption of vec_ld*,
   which is different for float and double. So the loads of
   always-float variables have to be done with a function that does
   the correct cast. Since functions cannot be overloaded by type in
   C, they have to have different names. Thus we have
   gmx_simd4_load_pr and gmx_simd4_load_bb_pr. */

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a)
{
#ifdef NDEBUG
    return vec_ld(0, (real *) a);
#else
    return vec_lda(0, (real *) a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
{
#ifdef NDEBUG
    return vec_ld(0, (float *) a);
#else
    return vec_lda(0, (float *) a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a)
{
    return vec_splats(a);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr()
{
    return vec_splats(0.0);
}

/* TODO this will not yet work, because the function might be passed a
   pointer to a float when running in double precision.
 */
static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b)
{
#ifdef NDEBUG
    vec_st(b, 0, a);
#else
    vec_sta(b, 0, a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_add(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sub(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_mul(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
{
    return vec_madd(a, b, c);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
{
    return vec_nmsub(a, b, c);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    /* Implemented the same way as max, but with the subtraction
       operands swapped. */
    return vec_sel(b, a, vec_sub(b, a));
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sel(b, a, vec_sub(a, b));
}
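
/* How the selection trick above works (implementation note): vec_sel
 * picks its second value operand in lanes where the selector is
 * non-negative, and its first elsewhere. For max, a - b >= 0 selects a;
 * for min the subtraction is reversed so b - a >= 0 selects a.
 * Behaviour with NaN inputs is not specified here.
 */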

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sel(gmx_simd4_setzero_pr(), a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_cmplt(a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b)
{
    return vec_and(a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b)
{
    return vec_or(a, b);
}

static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b)
{
    /* The dot product is done solely on the QPX AXU (which is the
       only available FPU). This is awkward, because pretty much no
       "horizontal" SIMD-vector operations exist, unlike x86 where
       SSE4.1 added various kinds of horizontal operations. So we have
       to make do with shifting vector elements and operating on the
       results. This makes for lots of data dependency, but the main
       alternative of storing to memory and reloading is not going to
       help, either. OpenMP over 2 or 4 hardware threads per core will
       hide much of the latency from the data dependency. The
       vec_extract() lets the compiler correctly use a floating-point
       comparison on the zeroth vector element, which avoids needing
       a round trip through memory to get at the scalar result. */

    gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b);
    gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
    gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
    gmx_simd4_pr dp                = vec_add(dp_shifted_left_2,
                                             vec_add(dp_shifted_left_0, dp_shifted_left_1));

    /* See comment in nbnxn_make_pairlist_part() about how this should
       be able to return a double on PowerPC. */
    return (float) vec_extract(dp, 0);
}
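
/* Lane-by-lane view of the reduction above (illustrative): with
 * p = a*b = {p0, p1, p2, p3}, vec_sldw with identical operands rotates
 * the words left, giving {p1, p2, p3, p0} and {p2, p3, p0, p1}. Lane 0
 * of the sum is then p0 + p1 + p2, exactly the 3-component dot product
 * that vec_extract(dp, 0) returns.
 */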

static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a)
{
    /* gmx_anytrue_pb comes from gmx_simd_macros.h */
    return gmx_anytrue_pb(a);
}

#undef gmx_always_inline

#endif /* GMX_CPU_ACCELERATION_IBM_QPX */


#ifdef GMX_HAVE_SIMD4_MACROS
/* Generic function to extract a SIMD4-aligned pointer from a pointer x.
 * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
 * to how many you want to use, to avoid indexing outside the aligned region.
 */

static gmx_inline gmx_simd4_real *
gmx_simd4_align_real(const gmx_simd4_real *x)
{
    return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
}
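
/* Usage sketch (hypothetical buffer and size): over-allocate by
 * GMX_SIMD4_WIDTH elements, then operate on the aligned interior:
 *
 *     gmx_simd4_real  buf[64 + GMX_SIMD4_WIDTH];
 *     gmx_simd4_real *abuf = gmx_simd4_align_real(buf);
 *
 *     gmx_simd4_store_pr(abuf, gmx_simd4_setzero_pr());
 */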
#endif /* GMX_HAVE_SIMD4_MACROS */


#endif /* _gmx_simd4_macros_h_ */