include/gmx_simd4_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* The macros in this file are intended to be used for writing
  39  * architecture-independent SIMD intrinsics code with a SIMD width of 4.
  40  * To support a new architecture, adding macros here should be all
  41  * that is needed.
  42  *
  43  * Note that this file is intended only for SIMD operations that require
  44  * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
  45  * support, more functionality and higher performance, but the SIMD width is
  46  * not necessarily equal to 4.
  47  */
  48
  49 #ifdef _gmx_simd4_macros_h_
  50 #error "gmx_simd4_macros.h included twice"
  51 #else
  52 #define _gmx_simd4_macros_h_
  53
  54
  55 /* The SIMD width here is always 4, since that is the whole point */
  56 #define GMX_SIMD4_WIDTH  4
  57
  58
  59 #if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
  60 /* Precision set before inclusion, honour that request */
  61 #else
  62 /* Match precision to the Gromacs real precision */
  63 #ifdef GMX_DOUBLE
  64 #define GMX_SIMD4_DOUBLE
  65 #else
  66 #define GMX_SIMD4_SINGLE
  67 #endif
  68 #endif
  69
  70 #ifdef GMX_SIMD4_DOUBLE
  71 typedef double  gmx_simd4_real;
  72 #endif
  73 #ifdef GMX_SIMD4_SINGLE
  74 typedef float   gmx_simd4_real;
  75 #endif
  76
/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD4_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD4_REFERENCE_PLAIN_C
/* Plain-C SIMD4 reference build: every gmx_simd4_* name below is an alias
 * for the gmx_simd4_ref_* name provided by gmx_simd4_ref.h.
 */
#define GMX_HAVE_SIMD4_MACROS

/* The plain-C reference implementation also serves as documentation
 * for the operations aliased in this section.
 */
#include "gmx_simd4_ref.h"

/* float/double SIMD register type */
#define gmx_simd4_pr  gmx_simd4_ref_pr

/* boolean SIMD register type */
#define gmx_simd4_pb  gmx_simd4_ref_pb

/* Load, broadcast, zero and store */
#define gmx_simd4_load_pr       gmx_simd4_ref_load_pr
#define gmx_simd4_set1_pr       gmx_simd4_ref_set1_pr
#define gmx_simd4_setzero_pr    gmx_simd4_ref_setzero_pr
#define gmx_simd4_store_pr      gmx_simd4_ref_store_pr

/* Unaligned load+store are not required,
 * but they can speed up the PME spread+gather operations.
 * In this reference build the unaligned names alias the same reference
 * load/store functions as the aligned names above.
 */
#define GMX_SIMD4_HAVE_UNALIGNED
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      gmx_simd4_ref_load_pr
#define gmx_simd4_storeu_pr     gmx_simd4_ref_store_pr
#endif

/* Element-wise arithmetic */
#define gmx_simd4_add_pr        gmx_simd4_ref_add_pr
#define gmx_simd4_sub_pr        gmx_simd4_ref_sub_pr
#define gmx_simd4_mul_pr        gmx_simd4_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction
 * (presumably d denotes the destination of d = madd(a, b, c) - TODO confirm).
 * The x86 non-FMA definitions in this file spell the operations out as
 * madd(a, b, c) = c + a*b and nmsub(a, b, c) = c - a*b.
 */
#define gmx_simd4_madd_pr       gmx_simd4_ref_madd_pr
#define gmx_simd4_nmsub_pr      gmx_simd4_ref_nmsub_pr

/* Dot product over the first 3 elements; the name and the x86
 * single-precision version in this file indicate element 3 is ignored.
 */
#define gmx_simd4_dotproduct3   gmx_simd4_ref_dotproduct3

#define gmx_simd4_min_pr        gmx_simd4_ref_min_pr
#define gmx_simd4_max_pr        gmx_simd4_ref_max_pr

/* On x86 this name maps to a bitwise AND of a value with a boolean mask;
 * the reference version presumably zeroes the elements whose boolean is
 * false - confirm against gmx_simd4_ref.h.
 */
#define gmx_simd4_blendzero_pr  gmx_simd4_ref_blendzero_pr

/* Comparison: element-wise less-than, producing a SIMD boolean */
#define gmx_simd4_cmplt_pr      gmx_simd4_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_simd4_and_pb        gmx_simd4_ref_and_pb
#define gmx_simd4_or_pb         gmx_simd4_ref_or_pb

/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
#define gmx_simd4_anytrue_pb    gmx_simd4_ref_anytrue_pb

#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */
 131
 132
 133 /* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 134  * to instructions for) different SIMD width and float precision.
 135  *
 136  * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 137  * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 138  * Compiler settings will decide if 128-bit intrinsics will
 139  * be translated into SSE or AVX instructions.
 140  */
 141
 142
 143 #ifdef GMX_X86_SSE2
 144 /* This is for general x86 SIMD instruction sets that also support SSE2 */
 145
 146 #ifdef GMX_SIMD4_SINGLE
 147 #define GMX_HAVE_SIMD4_MACROS
 148 #endif
 149
 150 #ifdef GMX_SIMD4_DOUBLE
 151 /* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA.
 152  * This is inconsistent naming wise, but should give the best performance.
 153  */
 154 #if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
 155 #define GMX_HAVE_SIMD4_MACROS
 156 #endif
 157 #endif
 158
 159 #ifdef GMX_HAVE_SIMD4_MACROS
 160
 161 #if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
 162
 163 #include <immintrin.h>
 164 #ifdef HAVE_X86INTRIN_H
 165 #include <x86intrin.h> /* FMA */
 166 #endif
 167 #ifdef HAVE_INTRIN_H
 168 #include <intrin.h> /* FMA MSVC */
 169 #endif
 170
 171 #else
 172 #ifdef GMX_X86_SSE4_1
 173 #include <smmintrin.h>
 174 #else
 175 /* We only have SSE2 */
 176 #include <emmintrin.h>
 177 #endif
 178 #endif
 179
 180 #ifdef GMX_SIMD4_SINGLE
 181
/* Single precision: 4 floats fill one 128-bit SSE register */
#define gmx_simd4_pr  __m128

/* SIMD booleans are stored in the same register type as the reals */
#define gmx_simd4_pb  __m128

/* Aligned load/store, broadcast of one value and zeroing */
#define gmx_simd4_load_pr       _mm_load_ps
#define gmx_simd4_set1_pr       _mm_set1_ps
#define gmx_simd4_setzero_pr    _mm_setzero_ps
#define gmx_simd4_store_pr      _mm_store_ps

/* Some old AMD processors could have problems with unaligned loads+stores,
 * so unaligned access is not offered when GMX_FAHCORE is defined.
 */
#ifndef GMX_FAHCORE
#define GMX_SIMD4_HAVE_UNALIGNED
#endif
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      _mm_loadu_ps
#define gmx_simd4_storeu_pr     _mm_storeu_ps
#endif

/* Element-wise arithmetic */
#define gmx_simd4_add_pr        _mm_add_ps
#define gmx_simd4_sub_pr        _mm_sub_ps
#define gmx_simd4_mul_pr        _mm_mul_ps

/* madd(a, b, c) = a*b + c and nmsub(a, b, c) = c - a*b.
 * With GMX_X86_AVX_128_FMA these map to the fused _mm_macc_ps and
 * _mm_nmacc_ps intrinsics; otherwise they are built from a separate
 * multiply followed by an add or subtract.
 */
#ifdef GMX_X86_AVX_128_FMA
#define gmx_simd4_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_simd4_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
 211
 212 static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
 213 #ifdef GMX_X86_SSE4_1
 214 {
 215     float dp;
 216
 217     /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
 218     _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));
 219
 220     return dp;
 221 }
 222 #else
 223 {
 224     float        dp_array[7], *dp;
 225
 226     /* Generate an aligned pointer */
 227     dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));
 228
 229     _mm_store_ps(dp, _mm_mul_ps(a, b));
 230
 231     return dp[0] + dp[1] + dp[2];
 232 }
 233 #endif
 234
/* Element-wise minimum and maximum */
#define gmx_simd4_min_pr        _mm_min_ps
#define gmx_simd4_max_pr        _mm_max_ps

/* Bitwise AND of a value with a SIMD boolean; with the masks produced by
 * gmx_simd4_cmplt_pr this keeps the elements where the boolean is true
 * and zeroes the others.
 */
#define gmx_simd4_blendzero_pr  _mm_and_ps

/* Element-wise less-than comparison and logical operations on booleans */
#define gmx_simd4_cmplt_pr      _mm_cmplt_ps
#define gmx_simd4_and_pb        _mm_and_ps
#define gmx_simd4_or_pb         _mm_or_ps

/* Non-zero when any of the 4 booleans is true.
 * NOTE: per the Intel intrinsics documentation _mm_movemask_ps returns
 * the 4 sign bits as a bit mask (0..15), not strictly 0/1, so callers
 * should only test the result against zero.
 */
#define gmx_simd4_anytrue_pb    _mm_movemask_ps
 245
 246 #endif /* GMX_SIMD4_SINGLE */
 247
 248
 249 #ifdef GMX_SIMD4_DOUBLE
 250
 251 #define gmx_simd4_pr  __m256d
 252
 253 #define gmx_simd4_pb  __m256d
 254
 255 #define gmx_simd4_load_pr       _mm256_load_pd
 256 #define gmx_simd4_set1_pr       _mm256_set1_pd
 257 #define gmx_simd4_setzero_pr    _mm256_setzero_pd
 258 #define gmx_simd4_store_pr      _mm256_store_pd
 259
 260 #define GMX_SIMD4_HAVE_UNALIGNED
 261 #define gmx_simd4_loadu_pr      _mm256_loadu_pd
 262 #define gmx_simd4_storeu_pr     _mm256_storeu_pd
 263
 264 #define gmx_simd4_add_pr        _mm256_add_pd
 265 #define gmx_simd4_sub_pr        _mm256_sub_pd
 266 #define gmx_simd4_mul_pr        _mm256_mul_pd
 267 #ifdef GMX_X86_AVX_128_FMA
 268 #define gmx_simd4_madd_pr(a, b, c)   _mm256_macc_pd(a, b, c)
 269 #define gmx_simd4_nmsub_pr(a, b, c)  _mm256_nmacc_pd(a, b, c)
 270 #else
 271 #define gmx_simd4_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 272 #define gmx_simd4_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 273 #endif
 274 #define gmx_simd4_min_pr        _mm256_min_pd
 275 #define gmx_simd4_max_pr        _mm256_max_pd
 276
 277 #define gmx_simd4_blendzero_pr  _mm256_and_pd
 278
 279 /* Less-than (we use ordered, non-signaling, but that's not required) */
 280 #define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 281 #define gmx_simd4_and_pb        _mm256_and_pd
 282 #define gmx_simd4_or_pb         _mm256_or_pd
 283
 284 #define gmx_simd4_anytrue_pb    _mm256_movemask_pd
 285
 286 #endif /* GMX_SIMD4_DOUBLE */
 287
 288
 289 #endif /* GMX_HAVE_SIMD4_MACROS */
 290
 291
 292 #endif /* GMX_X86_SSE2 */
 293
 294
 295 #ifdef GMX_HAVE_SIMD4_MACROS
 296 /* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
 297  * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
 298  * to how many you want to use, to avoid indexing outside the aligned region.
 299  */
 300
 301 static gmx_inline gmx_simd4_real *
 302 gmx_simd4_align_real(const gmx_simd4_real *x)
 303 {
 304     return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
 305 }
 306 #endif
 307
 308
 309 #endif /* _gmx_simd4_macros_h_ */