src/gromacs/legacyheaders/gmx_simd_macros.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* The macros in this file are intended to be used for writing
  39  * architecture independent SIMD intrinsics code.
  40  * To support a new architecture, adding macros here should be (nearly)
  41  * all that is needed.
  42  */
  43
  44 /* Undefine all defines used below so we can include this file multiple times
  45  * with different settings from the same source file.
  46  */
  47
  48 /* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
  49
  50 #undef GMX_SIMD_WIDTH_HERE
  51
  52 #undef gmx_epi32
  53
  54 /* float/double SIMD register type */
  55 #undef gmx_mm_pr
  56
  57 #undef gmx_load_pr
  58 #undef gmx_load1_pr
  59 #undef gmx_set1_pr
  60 #undef gmx_setzero_pr
  61 #undef gmx_store_pr
  62 /* Only used for debugging */
  63 #undef gmx_storeu_pr
  64
  65 #undef gmx_add_pr
  66 #undef gmx_sub_pr
  67 #undef gmx_mul_pr
  68 #undef gmx_max_pr
  69 #undef gmx_cmplt_pr
  70 #undef gmx_and_pr
  71 #undef gmx_or_pr
  72 #undef gmx_andnot_pr
  73
  74 /* Only used to speed up the nbnxn tabulated PME kernels */
  75 #undef gmx_floor_pr
  76 /* Only used with x86 when blendv is faster than comparison */
  77 #undef gmx_blendv_pr
  78
  79 #undef gmx_movemask_pr
  80
  81 /* Integer casts are only used for nbnxn x86 exclusion masks */
  82 #undef gmx_mm_castsi128_pr
  83 #undef gmx_mm_castsi256_pr
  84
  85 /* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
  86 #undef gmx_cvttpr_epi32
  87 #undef gmx_cvtepi32_pr
  88
  89 #undef gmx_invsqrt_pr
  90 #undef gmx_calc_rsq_pr
  91 #undef gmx_sum4_pr
  92
  93 /* Only required for nbnxn analytical PME kernels */
  94 #undef gmx_pmecorrF_pr
  95 #undef gmx_pmecorrV_pr
  96
  97
  98 /* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
  99 #undef gmx_mm_hpr
 100
 101 #undef gmx_load_hpr
 102 #undef gmx_load1_hpr
 103 #undef gmx_store_hpr
 104 #undef gmx_add_hpr
 105 #undef gmx_sub_hpr
 106
 107 #undef gmx_sum4_hpr
 108
 109 #undef gmx_2hpr_to_pr
 110
 111
 112 /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
 113  * the same intrinsics, with defines, can be compiled for either 128 or 256
 114  * bit wide SSE or AVX instructions.
 115  * The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 116  * The _pr suffix is replaced by _ps or _pd (single or double precision).
 117  * Note that compiler settings will decide if 128-bit intrinsics will
 118  * be translated into SSE or AVX instructions.
 119  */
 120
 121 #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
 122 #error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
 123 #endif
 124
 125 #if defined GMX_MM128_HERE && defined GMX_MM256_HERE
 126 #error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
 127 #endif
 128
 129
 130 #ifdef GMX_X86_SSE2
 131
 132 #ifdef GMX_MM128_HERE
 133
 134 #define gmx_epi32  __m128i
 135
 136 #ifndef GMX_DOUBLE
 137
 138 #include "gmx_x86_simd_single.h"
 139
 140 #define GMX_SIMD_WIDTH_HERE  4
 141
 142 #define gmx_mm_pr  __m128
 143
 144 #define gmx_load_pr       _mm_load_ps
 145 #define gmx_load1_pr      _mm_load1_ps
 146 #define gmx_set1_pr       _mm_set1_ps
 147 #define gmx_setzero_pr    _mm_setzero_ps
 148 #define gmx_store_pr      _mm_store_ps
 149 #define gmx_storeu_pr     _mm_storeu_ps
 150
 151 #define gmx_add_pr        _mm_add_ps
 152 #define gmx_sub_pr        _mm_sub_ps
 153 #define gmx_mul_pr        _mm_mul_ps
 154 #define gmx_max_pr        _mm_max_ps
 155 #define gmx_cmplt_pr      _mm_cmplt_ps
 156 #define gmx_and_pr        _mm_and_ps
 157 #define gmx_or_pr         _mm_or_ps
 158 #define gmx_andnot_pr     _mm_andnot_ps
 159
 160 #define gmx_floor_pr      _mm_floor_ps
 161 #define gmx_blendv_pr     _mm_blendv_ps
 162
 163 #define gmx_movemask_pr   _mm_movemask_ps
 164
 165 #define gmx_mm_castsi128_pr gmx_mm_castsi128_ps
 166
 167 #define gmx_cvttpr_epi32  _mm_cvttps_epi32
 168 #define gmx_cvtepi32_pr   _mm_cvtepi32_ps
 169
 170 #define gmx_invsqrt_pr    gmx_mm_invsqrt_ps
 171 #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_ps
 172 #define gmx_sum4_pr       gmx_mm_sum4_ps
 173
 174 #define gmx_pmecorrF_pr   gmx_mm_pmecorrF_ps
 175 #define gmx_pmecorrV_pr   gmx_mm_pmecorrV_ps
 176
 177 #else /* ifndef GMX_DOUBLE */
 178
 179 #include "gmx_x86_simd_double.h"
 180
 181 #define GMX_SIMD_WIDTH_HERE  2
 182
 183 #define gmx_mm_pr  __m128d
 184
 185 #define gmx_load_pr       _mm_load_pd
 186 #define gmx_load1_pr      _mm_load1_pd
 187 #define gmx_set1_pr       _mm_set1_pd
 188 #define gmx_setzero_pr    _mm_setzero_pd
 189 #define gmx_store_pr      _mm_store_pd
 190 #define gmx_storeu_pr     _mm_storeu_pd
 191
 192 #define gmx_add_pr        _mm_add_pd
 193 #define gmx_sub_pr        _mm_sub_pd
 194 #define gmx_mul_pr        _mm_mul_pd
 195 #define gmx_max_pr        _mm_max_pd
 196 #define gmx_cmplt_pr      _mm_cmplt_pd
 197 #define gmx_and_pr        _mm_and_pd
 198 #define gmx_or_pr         _mm_or_pd
 199 #define gmx_andnot_pr     _mm_andnot_pd
 200
 201 #define gmx_floor_pr      _mm_floor_pd
 202 #define gmx_blendv_pr     _mm_blendv_pd
 203
 204 #define gmx_movemask_pr   _mm_movemask_pd
 205
 206 #define gmx_mm_castsi128_pr gmx_mm_castsi128_pd
 207
 208 #define gmx_cvttpr_epi32  _mm_cvttpd_epi32
 209 #define gmx_cvtepi32_pr   _mm_cvtepi32_pd
 210
 211 #define gmx_invsqrt_pr    gmx_mm_invsqrt_pd
 212 #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_pd
 213 #define gmx_sum4_pr       gmx_mm_sum4_pd
 214
 215 #define gmx_pmecorrF_pr   gmx_mm_pmecorrF_pd
 216 #define gmx_pmecorrV_pr   gmx_mm_pmecorrV_pd
 217
 218 #endif /* ifndef GMX_DOUBLE */
 219
 220 #endif /* GMX_MM128_HERE */
 221
 222 #ifdef GMX_MM256_HERE
 223
 224 #define gmx_epi32 __m256i
 225
 226 #ifndef GMX_DOUBLE
 227
 228 #include "gmx_x86_simd_single.h"
 229
 230 #define GMX_SIMD_WIDTH_HERE  8
 231
 232 #define gmx_mm_pr  __m256
 233
 234 #define gmx_load_pr       _mm256_load_ps
 235 #define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
 236 #define gmx_set1_pr       _mm256_set1_ps
 237 #define gmx_setzero_pr    _mm256_setzero_ps
 238 #define gmx_store_pr      _mm256_store_ps
 239 #define gmx_storeu_pr     _mm256_storeu_ps
 240
 241 #define gmx_add_pr        _mm256_add_ps
 242 #define gmx_sub_pr        _mm256_sub_ps
 243 #define gmx_mul_pr        _mm256_mul_ps
 244 #define gmx_max_pr        _mm256_max_ps
 245 /* Not-equal (ordered, non-signaling)  */
 246 #define gmx_cmpneq_pr(x, y)  _mm256_cmp_ps(x, y, 0x0c)
 247 /* Less-than (ordered, non-signaling)  */
 248 #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
 249 #define gmx_and_pr        _mm256_and_ps
 250 #define gmx_or_pr         _mm256_or_ps
 251 #define gmx_andnot_pr     _mm256_andnot_ps
 252
 253 #define gmx_floor_pr      _mm256_floor_ps
 254 #define gmx_blendv_pr     _mm256_blendv_ps
 255
 256 #define gmx_movemask_pr   _mm256_movemask_ps
 257
 258 #define gmx_mm_castsi256_pr _mm256_castsi256_ps
 259
 260 #define gmx_cvttpr_epi32  _mm256_cvttps_epi32
 261
 262 #define gmx_invsqrt_pr    gmx_mm256_invsqrt_ps
 263 #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_ps
 264 #define gmx_sum4_pr       gmx_mm256_sum4_ps
 265
 266 #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_ps
 267 #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_ps
 268
 269 #define gmx_loaddh_pr     gmx_mm256_load4_ps
 270
 271 /* Half SIMD-width type */
 272 #define gmx_mm_hpr  __m128
 273
 274 /* Half SIMD-width macros */
 275 #define gmx_load_hpr      _mm_load_ps
 276 #define gmx_load1_hpr(x)  _mm_set1_ps((x)[0])
 277 #define gmx_store_hpr     _mm_store_ps
 278 #define gmx_add_hpr       _mm_add_ps
 279 #define gmx_sub_hpr       _mm_sub_ps
 280
 281 #define gmx_sum4_hpr      gmx_mm256_sum4h_m128
 282
 283 /* Conversion between half and full SIMD-width */
 284 #define gmx_2hpr_to_pr    gmx_mm256_set_m128
 285
 286 #else
 287
 288 #include "gmx_x86_simd_double.h"
 289
 290 #define GMX_SIMD_WIDTH_HERE  4
 291
 292 #define gmx_mm_pr  __m256d
 293
 294 #define gmx_load_pr       _mm256_load_pd
 295 #define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
 296 #define gmx_set1_pr       _mm256_set1_pd
 297 #define gmx_setzero_pr    _mm256_setzero_pd
 298 #define gmx_store_pr      _mm256_store_pd
 299 #define gmx_storeu_pr     _mm256_storeu_pd
 300
 301 #define gmx_add_pr        _mm256_add_pd
 302 #define gmx_sub_pr        _mm256_sub_pd
 303 #define gmx_mul_pr        _mm256_mul_pd
 304 #define gmx_max_pr        _mm256_max_pd
 305 /* Not-equal (ordered, non-signaling)  */
 306 #define gmx_cmpneq_pr(x, y)  _mm256_cmp_pd(x, y, 0x0c)
 307 /* Less-than (ordered, non-signaling)  */
 308 #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 309 #define gmx_and_pr        _mm256_and_pd
 310 #define gmx_or_pr         _mm256_or_pd
 311 #define gmx_andnot_pr     _mm256_andnot_pd
 312
 313 #define gmx_floor_pr      _mm256_floor_pd
 314 #define gmx_blendv_pr     _mm256_blendv_pd
 315
 316 #define gmx_movemask_pr   _mm256_movemask_pd
 317
 318 #define gmx_mm_castsi256_pr _mm256_castsi256_pd
 319
 320 #define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
 321
 322 #define gmx_invsqrt_pr    gmx_mm256_invsqrt_pd
 323 #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_pd
 324 #define gmx_sum4_pr       gmx_mm256_sum4_pd
 325
 326 #define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_pd
 327 #define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_pd
 328
 329 #endif
 330
 331 #endif /* GMX_MM256_HERE */
 332
 333 #endif /* GMX_X86_SSE2 */