src/gromacs/simd/simd.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   5  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \libinternal
  38  * \defgroup module_simd SIMD intrinsics interface (simd)
  39  * \ingroup group_utilitymodules
  40  *
  41  * \brief Provides an architecture-independent way of doing SIMD coding.
  42  *
  43  * Overview of the SIMD implementation is provided in \ref page_simd.
  44  * The details are documented in gromacs/simd/simd.h and the reference
  45  * implementation impl_reference.h.
  46  *
  47  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  48  */
  49
  50 #ifndef GMX_SIMD_SIMD_H
  51 #define GMX_SIMD_SIMD_H
  52
  53 /*! \libinternal \file
  54  *
  55  * \brief Definitions, capabilities, and wrappers for SIMD module.
  56  *
  57  * The macros in this file are intended to be used for writing
  58  * architecture-independent SIMD intrinsics code.
  59  * To support a new architecture, adding a new sub-include with macros here
  60  * should be (nearly) all that is needed.
  61  *
  62  * The defines in this top-level file will set default Gromacs real precision
  63  * operations to either single or double precision based on whether
  64  * GMX_DOUBLE is 1. The actual implementation - including e.g.
  65  * conversion operations specifically between single and double - is documented
  66  * in impl_reference.h.
  67  *
  68  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  69  *
  70  * \inlibraryapi
  71  * \ingroup module_simd
  72  */
  73
  74 #include "config.h"
  75
  76 #include <cstddef>
  77 #include <cstdint>
  78
  79 #include <array>
  80 #include <type_traits>
  81
  82 #include "gromacs/utility/basedefinitions.h"
  83 #include "gromacs/utility/classhelpers.h"
  84 #include "gromacs/utility/real.h"
  85
  86 //! \cond libapi
  87
  88
  89 /*! \addtogroup module_simd
  90  * \{
  91  */
  92
  93 namespace gmx
  94 {
  95 /*! \libinternal \brief Tag type to select to load SimdFloat with simdLoad(U) */
  96 struct SimdFloatTag
  97 {
  98 };
  99 /*! \libinternal \brief Tag type to select to load SimdDouble with simdLoad(U) */
 100 struct SimdDoubleTag
 101 {
 102 };
 103 /*! \libinternal \brief Tag type to select to load SimdFInt32 with simdLoad(U) */
 104 struct SimdFInt32Tag
 105 {
 106 };
 107 /*! \libinternal \brief Tag type to select to load SimdDInt32 with simdLoad(U) */
 108 struct SimdDInt32Tag
 109 {
 110 };
 111 } // namespace gmx
 112
 113 /*! \name SIMD predefined macros to describe high-level capabilities
 114  *
 115  *  These macros are used to describe the features available in default
 116  *  Gromacs real precision. They are set from the lower-level implementation
 117  *  files that have macros describing single and double precision individually,
 118  *  as well as the implementation details.
 119  *  \{
 120  */
 121
 122 #ifdef __clang__
 123 #    pragma clang diagnostic push
 124 /* reinterpret_cast is used for SIMD->scalar conversion
 125  *
 126  * In general using reinterpret_cast for bit_cast is UB but
 127  * for intrinsics types it works for all known compilers
 128  * and not all compilers produce as good code for memcpy.
 129  */
 130 #    pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
 131 #endif
 132
 133 #if GMX_SIMD_X86_SSE2
 134 #    include "impl_x86_sse2/impl_x86_sse2.h"
 135 #elif GMX_SIMD_X86_SSE4_1
 136 #    include "impl_x86_sse4_1/impl_x86_sse4_1.h"
 137 #elif GMX_SIMD_X86_AVX_128_FMA
 138 #    include "impl_x86_avx_128_fma/impl_x86_avx_128_fma.h"
 139 #elif GMX_SIMD_X86_AVX_256
 140 #    include "impl_x86_avx_256/impl_x86_avx_256.h"
 141 #elif GMX_SIMD_X86_AVX2_256
 142 #    include "impl_x86_avx2_256/impl_x86_avx2_256.h"
 143 #elif GMX_SIMD_X86_AVX2_128
 144 #    include "impl_x86_avx2_128/impl_x86_avx2_128.h"
 145 #elif GMX_SIMD_X86_MIC
 146 #    include "impl_x86_mic/impl_x86_mic.h"
 147 #elif GMX_SIMD_X86_AVX_512
 148 #    include "impl_x86_avx_512/impl_x86_avx_512.h"
 149 #elif GMX_SIMD_X86_AVX_512_KNL
 150 #    include "impl_x86_avx_512_knl/impl_x86_avx_512_knl.h"
 151 #elif GMX_SIMD_ARM_NEON
 152 #    include "impl_arm_neon/impl_arm_neon.h"
 153 #elif GMX_SIMD_ARM_NEON_ASIMD
 154 #    include "impl_arm_neon_asimd/impl_arm_neon_asimd.h"
 155 #elif GMX_SIMD_IBM_VMX
 156 #    include "impl_ibm_vmx/impl_ibm_vmx.h"
 157 #elif GMX_SIMD_IBM_VSX
 158 #    include "impl_ibm_vsx/impl_ibm_vsx.h"
 159 #elif (GMX_SIMD_REFERENCE || defined DOXYGEN)
 160 #    include "impl_reference/impl_reference.h" // Includes doxygen documentation
 161 #else
 162 #    include "impl_none/impl_none.h"
 163 #endif
 164
 165 #ifdef __clang__
 166 #    pragma clang diagnostic pop
 167 #endif
 168
 169 // The scalar SIMD-mimicking functions are always included so we can use
 170 // templated functions even without SIMD support.
 171 #include "gromacs/simd/scalar/scalar.h"
 172 #include "gromacs/simd/scalar/scalar_math.h"
 173 #include "gromacs/simd/scalar/scalar_util.h"
 174
 175
 176 #if GMX_DOUBLE
 177 #    define GMX_SIMD_HAVE_REAL GMX_SIMD_HAVE_DOUBLE
 178 #    define GMX_SIMD_REAL_WIDTH GMX_SIMD_DOUBLE_WIDTH
 179 #    define GMX_SIMD_HAVE_INT32_EXTRACT GMX_SIMD_HAVE_DINT32_EXTRACT
 180 #    define GMX_SIMD_HAVE_INT32_LOGICAL GMX_SIMD_HAVE_DINT32_LOGICAL
 181 #    define GMX_SIMD_HAVE_INT32_ARITHMETICS GMX_SIMD_HAVE_DINT32_ARITHMETICS
 182 #    define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_REAL \
 183         GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE
 184 #    define GMX_SIMD_HAVE_HSIMD_UTIL_REAL GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE
 185 #    define GMX_SIMD4_HAVE_REAL GMX_SIMD4_HAVE_DOUBLE
 186 #else // GMX_DOUBLE
 187
 188 /*! \brief 1 if SimdReal is available, otherwise 0.
 189  *
 190  *  \ref GMX_SIMD_HAVE_DOUBLE if GMX_DOUBLE is 1, otherwise \ref GMX_SIMD_HAVE_FLOAT.
 191  */
 192 #    define GMX_SIMD_HAVE_REAL GMX_SIMD_HAVE_FLOAT
 193
 194 /*! \brief Width of SimdReal.
 195  *
 196  *  \ref GMX_SIMD_DOUBLE_WIDTH if GMX_DOUBLE is 1, otherwise \ref GMX_SIMD_FLOAT_WIDTH.
 197  */
 198 #    define GMX_SIMD_REAL_WIDTH GMX_SIMD_FLOAT_WIDTH
 199
 200 /*! \brief 1 if support is available for extracting elements from SimdInt32, otherwise 0
 201  *
 202  *  \ref GMX_SIMD_HAVE_DINT32_EXTRACT if GMX_DOUBLE is 1, otherwise
 203  *  \ref GMX_SIMD_HAVE_FINT32_EXTRACT.
 204  */
 205 #    define GMX_SIMD_HAVE_INT32_EXTRACT GMX_SIMD_HAVE_FINT32_EXTRACT
 206
 207 /*! \brief 1 if logical ops are supported on SimdInt32, otherwise 0.
 208  *
 209  *  \ref GMX_SIMD_HAVE_DINT32_LOGICAL if GMX_DOUBLE is 1, otherwise
 210  *  \ref GMX_SIMD_HAVE_FINT32_LOGICAL.
 211  */
 212 #    define GMX_SIMD_HAVE_INT32_LOGICAL GMX_SIMD_HAVE_FINT32_LOGICAL
 213
 214 /*! \brief 1 if arithmetic ops are supported on SimdInt32, otherwise 0.
 215  *
 216  *  \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS if GMX_DOUBLE is 1, otherwise
 217  *  \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS.
 218  */
 219 #    define GMX_SIMD_HAVE_INT32_ARITHMETICS GMX_SIMD_HAVE_FINT32_ARITHMETICS
 220
 221 /*! \brief 1 if gmx::simdGatherLoadUBySimdIntTranspose is present, otherwise 0
 222  *
 223  *  \ref GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE if GMX_DOUBLE is 1, otherwise
 224  *  \ref GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT.
 225  */
 226 #    define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_REAL \
 227         GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_FLOAT
 228
 229 /*! \brief 1 if real half-register load/store/reduce utils present, otherwise 0
 230  *
 231  *  \ref GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE if GMX_DOUBLE is 1, otherwise
 232  *  \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT.
 233  */
 234 #    define GMX_SIMD_HAVE_HSIMD_UTIL_REAL GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT
 235
 236 /*! \brief 1 if Simd4Real is available, otherwise 0.
 237  *
 238  *  \ref GMX_SIMD4_HAVE_DOUBLE if GMX_DOUBLE is 1, otherwise \ref GMX_SIMD4_HAVE_FLOAT.
 239  */
 240 #    define GMX_SIMD4_HAVE_REAL GMX_SIMD4_HAVE_FLOAT
 241
 242 #endif // GMX_DOUBLE
 243
 244 //! \}  end of name-group describing high-level capabilities
 245
 246 namespace gmx
 247 {
 248
 249 template<class T, size_t N>
 250 struct AlignedArray;
 251
 252 #if GMX_SIMD_HAVE_FLOAT
 253 /*! \libinternal \brief Identical to std::array with GMX_SIMD_FLOAT_WIDTH alignment.
 254  *  Should not be deleted through base pointer (destructor is non-virtual).
 255  */
 256 template<size_t N>
 257 struct alignas(GMX_SIMD_FLOAT_WIDTH * sizeof(float)) AlignedArray<float, N> :
 258     public std::array<float, N>
 259 {
 260 };
 261 #endif
 262
 263 #if GMX_SIMD_HAVE_DOUBLE
 264 /*! \libinternal \brief  Identical to std::array with GMX_SIMD_DOUBLE_WIDTH alignment.
 265  *  Should not be deleted through base pointer (destructor is non-virtual).
 266  */
 267 template<size_t N>
 268 struct alignas(GMX_SIMD_DOUBLE_WIDTH * sizeof(double)) AlignedArray<double, N> :
 269     public std::array<double, N>
 270 {
 271 };
 272 #endif
 273
 274 #if GMX_SIMD_HAVE_REAL
 275
 276 /*! \name SIMD data types
 277  *
 278  *  The actual storage of these types is implementation dependent. The
 279  *  documentation is generated from the reference implementation, but for
 280  *  normal usage this will likely not be what you are using.
 281  * \{
 282  */
 283
 284 /*! \brief Real precision floating-point SIMD datatype.
 285  *
 286  * This type is only available if \ref GMX_SIMD_HAVE_REAL is 1.
 287  *
 288  * \ref SimdDouble if GMX_DOUBLE is 1, otherwise \ref SimdFloat.
 289  *
 290  * \note This variable cannot be placed inside other structures or classes, since
 291  *       some compilers (including at least clang-3.7) appear to lose the
 292  *       alignment. This is likely particularly severe when allocating such
 293  *       memory on the heap, but it occurs for stack structures too.
 294  */
 295 #    if GMX_DOUBLE
 296 typedef SimdDouble SimdReal;
 297 #    else
 298 typedef SimdFloat  SimdReal;
 299 #    endif
 300
 301
 302 /*! \brief Boolean SIMD type for usage with \ref SimdReal.
 303  *
 304  * This type is only available if \ref GMX_SIMD_HAVE_REAL is 1.
 305  *
 306  * If GMX_DOUBLE is 1, this will be set to \ref SimdDBool
 307  * internally, otherwise \ref SimdFBool. This is necessary since some
 308  * SIMD implementations use bitpatterns for marking truth, so single-
 309  * vs. double precision booleans are not necessarily exchangable.
 310  * As long as you just use this type you will not have to worry about precision.
 311  *
 312  * See \ref SimdIBool for an explanation of real vs. integer booleans.
 313  *
 314  * \note This variable cannot be placed inside other structures or classes, since
 315  *       some compilers (including at least clang-3.7) appear to lose the
 316  *       alignment. This is likely particularly severe when allocating such
 317  *       memory on the heap, but it occurs for stack structures too.
 318  */
 319 #    if GMX_DOUBLE
 320 typedef SimdDBool SimdBool;
 321 #    else
 322 typedef SimdFBool  SimdBool;
 323 #    endif
 324
 325
 326 /*! \brief 32-bit integer SIMD type.
 327  *
 328  * If GMX_DOUBLE is 1, this will be set to \ref SimdDInt32
 329  * internally, otherwise \ref SimdFInt32. This might seem a strange
 330  * implementation detail, but it is because some SIMD implementations use
 331  * different types/widths of integers registers when converting from
 332  * double vs. single precision floating point. As long as you just use
 333  * this type you will not have to worry about precision.
 334  *
 335  * \note This variable cannot be placed inside other structures or classes, since
 336  *       some compilers (including at least clang-3.7) appear to lose the
 337  *       alignment. This is likely particularly severe when allocating such
 338  *       memory on the heap, but it occurs for stack structures too.
 339  */
 340 #    if GMX_DOUBLE
 341 typedef SimdDInt32 SimdInt32;
 342 #    else
 343 typedef SimdFInt32 SimdInt32;
 344 #    endif
 345
 346 #    if GMX_SIMD_HAVE_INT32_ARITHMETICS
 347 /*! \brief Boolean SIMD type for usage with \ref SimdInt32.
 348  *
 349  * This type is only available if \ref GMX_SIMD_HAVE_INT32_ARITHMETICS is 1.
 350  *
 351  * If GMX_DOUBLE is 1, this will be set to \ref SimdDIBool
 352  * internally, otherwise \ref SimdFIBool. This is necessary since some
 353  * SIMD implementations use bitpatterns for marking truth, so single-
 354  * vs. double precision booleans are not necessarily exchangable, and while
 355  * a double-precision boolean might be represented with a 64-bit mask, the
 356  * corresponding integer might only use a 32-bit mask.
 357  *
 358  * We provide conversion routines for these cases, so the only thing you need to
 359  * keep in mind is to use \ref SimdBool when working with
 360  * \ref SimdReal while you pick \ref SimdIBool when working with
 361  * \ref SimdInt32 .
 362  *
 363  * To convert between them, use \ref cvtB2IB and \ref cvtIB2B.
 364  *
 365  * \note This variable cannot be placed inside other structures or classes, since
 366  *       some compilers (including at least clang-3.7) appear to lose the
 367  *       alignment. This is likely particularly severe when allocating such
 368  *       memory on the heap, but it occurs for stack structures too.
 369  */
 370 #        if GMX_DOUBLE
 371 typedef SimdDIBool SimdIBool;
 372 #        else
 373 typedef SimdFIBool SimdIBool;
 374 #        endif
 375 #    endif // GMX_SIMD_HAVE_INT32_ARITHMETICS
 376
 377
 378 #    if GMX_DOUBLE
 379 const int c_simdBestPairAlignment = c_simdBestPairAlignmentDouble;
 380 #    else
 381 const int          c_simdBestPairAlignment = c_simdBestPairAlignmentFloat;
 382 #    endif
 383
 384 #endif // GMX_SIMD_HAVE_REAL
 385
 386 #if GMX_SIMD4_HAVE_REAL
 387 /*! \brief Real precision floating-point SIMD4 datatype.
 388  *
 389  * This type is only available if \ref GMX_SIMD4_HAVE_REAL is 1.
 390  *
 391  * \ref Simd4Double if GMX_DOUBLE is 1, otherwise \ref Simd4Float.
 392  *
 393  * \note This variable cannot be placed inside other structures or classes, since
 394  *       some compilers (including at least clang-3.7) appear to lose the
 395  *       alignment. This is likely particularly severe when allocating such
 396  *       memory on the heap, but it occurs for stack structures too.
 397  */
 398 #    if GMX_DOUBLE
 399 typedef Simd4Double Simd4Real;
 400 #    else
 401 typedef Simd4Float Simd4Real;
 402 #    endif
 403
 404
 405 /*! \brief Boolean SIMD4 type for usage with \ref SimdReal.
 406  *
 407  * This type is only available if \ref GMX_SIMD4_HAVE_REAL is 1.
 408  *
 409  * If GMX_DOUBLE is 1, this will be set to \ref Simd4DBool
 410  * internally, otherwise \ref Simd4FBool. This is necessary since some
 411  * SIMD implementations use bitpatterns for marking truth, so single-
 412  * vs. double precision booleans are not necessarily exchangable.
 413  * As long as you just use this type you will not have to worry about precision.
 414  *
 415  * \note This variable cannot be placed inside other structures or classes, since
 416  *       some compilers (including at least clang-3.7) appear to lose the
 417  *       alignment. This is likely particularly severe when allocating such
 418  *       memory on the heap, but it occurs for stack structures too.
 419  */
 420 #    if GMX_DOUBLE
 421 typedef Simd4DBool Simd4Bool;
 422 #    else
 423 typedef Simd4FBool Simd4Bool;
 424 #    endif
 425 #endif // GMX_SIMD4_HAVE_REAL
 426
 427 //! \}  end of name-group describing SIMD data types
 428
 429 /*! \name High-level SIMD proxy objects to disambiguate load/set operations
 430  * \{
 431  */
 432
 433 namespace internal
 434 {
 435 /*! \libinternal \brief Simd traits
 436  *
 437  * These traits are used to query data about SIMD types. Currently provided
 438  * data is useful for SIMD loads (load function and helper classes for
 439  * ArrayRef<> in simd_memory.h). Provided data:
 440  *  - type: scalar type corresponding to the SIMD type
 441  *  - width: SIMD width
 442  *  - tag: tag used for type dispatch of load function
 443  */
 444 template<typename T>
 445 struct SimdTraits
 446 {
 447 };
 448
 449 #if GMX_SIMD_HAVE_FLOAT
 450 template<>
 451 struct SimdTraits<SimdFloat>
 452 {
 453     using type                 = float;
 454     static constexpr int width = GMX_SIMD_FLOAT_WIDTH;
 455     using tag                  = SimdFloatTag;
 456 };
 457 #endif
 458 #if GMX_SIMD_HAVE_DOUBLE
 459 template<>
 460 struct SimdTraits<SimdDouble>
 461 {
 462     using type                 = double;
 463     static constexpr int width = GMX_SIMD_DOUBLE_WIDTH;
 464     using tag                  = SimdDoubleTag;
 465 };
 466 #endif
 467 #if GMX_SIMD_HAVE_FLOAT
 468 template<>
 469 struct SimdTraits<SimdFInt32>
 470 {
 471     using type                 = int;
 472     static constexpr int width = GMX_SIMD_FINT32_WIDTH;
 473     using tag                  = SimdFInt32Tag;
 474 };
 475 #endif
 476 #if GMX_SIMD_HAVE_DOUBLE
 477 template<>
 478 struct SimdTraits<SimdDInt32>
 479 {
 480     using type                 = int;
 481     static constexpr int width = GMX_SIMD_DINT32_WIDTH;
 482     using tag                  = SimdDInt32Tag;
 483 };
 484 #endif
 485
 486 template<typename T>
 487 struct SimdTraits<const T>
 488 {
 489     using type                 = const typename SimdTraits<T>::type;
 490     static constexpr int width = SimdTraits<T>::width;
 491     using tag                  = typename SimdTraits<T>::tag;
 492 };
 493 } // namespace internal
 494
 495 /*! \brief Load function that returns SIMD or scalar
 496  *
 497  * Note that a load of T* where T is const returns a value, which is a
 498  * copy, and the caller cannot be constrained to not change it, so the
 499  * return type uses std::remove_const_t.
 500  *
 501  * \tparam T Type to load (type is always mandatory)
 502  * \param  m Pointer to aligned memory
 503  * \return   Loaded value
 504  */
 505 template<typename T>
 506 static inline std::remove_const_t<T>
 507 load(const typename internal::SimdTraits<T>::type* m) // disabled by SFINAE for non-SIMD types
 508 {
 509     return simdLoad(m, typename internal::SimdTraits<T>::tag());
 510 }
 511
 512 template<typename T>
 513 static inline T
 514 /* the enable_if serves to prevent two different type of misuse:
 515  * 1) load<SimdReal>(SimdReal*); should only be called on real* or int*
 516  * 2) load(real*); template parameter is mandatory because otherwise ambiguity is
 517  *    created. The dependent type disables type deduction.
 518  */
 519 load(const std::enable_if_t<std::is_arithmetic<T>::value, T> *m)
 520 {
 521     return *m;
 522 }
 523
 524 template<typename T, size_t N>
 525 static inline T gmx_simdcall load(const AlignedArray<typename internal::SimdTraits<T>::type, N>& m)
 526 {
 527     return simdLoad(m.data(), typename internal::SimdTraits<T>::tag());
 528 }
 529
 530 /*! \brief Load function that returns SIMD or scalar based on template argument
 531  *
 532  * \tparam T Type to load (type is always mandatory)
 533  * \param m Pointer to unaligned memory
 534  * \return Loaded SimdFloat/Double/Int or basic scalar type
 535  */
 536 template<typename T>
 537 static inline T loadU(const typename internal::SimdTraits<T>::type* m)
 538 {
 539     return simdLoadU(m, typename internal::SimdTraits<T>::tag());
 540 }
 541
 542 template<typename T>
 543 static inline T loadU(const std::enable_if_t<std::is_arithmetic<T>::value, T>* m)
 544 {
 545     return *m;
 546 }
 547
 548 template<typename T, size_t N>
 549 static inline T gmx_simdcall loadU(const AlignedArray<typename internal::SimdTraits<T>::type, N>& m)
 550 {
 551     return simdLoadU(m.data(), typename internal::SimdTraits<T>::tag());
 552 }
 553
 554 /*! \libinternal \brief Proxy object to enable setZero() for SIMD and real types.
 555  *
 556  * This object is returned by setZero(), and depending on what type you assign
 557  * the result to the conversion method will call the right low-level function.
 558  */
 559 class SimdSetZeroProxy
 560 {
 561 public:
 562     //!\brief Conversion method that returns 0.0 as float
 563     operator float() const { return 0.0F; }
 564     //!\brief Conversion method that returns 0.0 as double
 565     operator double() const { return 0.0; }
 566     //!\brief Conversion method that returns 0.0 as int32
 567     operator std::int32_t() const { return 0; }
 568 #if GMX_SIMD_HAVE_FLOAT
 569     //!\brief Conversion method that will execute setZero() for SimdFloat
 570     operator SimdFloat() const { return setZeroF(); }
 571     //!\brief Conversion method that will execute setZero() for SimdFInt32
 572     operator SimdFInt32() const { return setZeroFI(); }
 573 #endif
 574 #if GMX_SIMD4_HAVE_FLOAT
 575     //!\brief Conversion method that will execute setZero() for Simd4Float
 576     operator Simd4Float() const { return simd4SetZeroF(); }
 577 #endif
 578 #if GMX_SIMD_HAVE_DOUBLE
 579     //!\brief Conversion method that will execute setZero() for SimdDouble
 580     operator SimdDouble() const { return setZeroD(); }
 581     //!\brief Conversion method that will execute setZero() for SimdDInt32
 582     operator SimdDInt32() const { return setZeroDI(); }
 583 #endif
 584 #if GMX_SIMD4_HAVE_DOUBLE
 585     //!\brief Conversion method that will execute setZero() for Simd4Double
 586     operator Simd4Double() const { return simd4SetZeroD(); }
 587 #endif
 588 };
 589
 590 /*! \brief Helper function to set any SIMD or scalar variable to zero
 591  *
 592  * \return Proxy object that will call the actual function to set a SIMD/scalar
 593  *         variable to zero based on the conversion function called when you
 594  *         assign the result.
 595  */
 596 static inline SimdSetZeroProxy gmx_simdcall setZero()
 597 {
 598     return {};
 599 }
 600
 601 namespace internal
 602 {
 603 // TODO: Don't foward function but properly rename them and use proper traits
 604 template<typename T>
 605 struct Simd4Traits
 606 {
 607 };
 608
 609 #if GMX_SIMD4_HAVE_FLOAT
 610 template<>
 611 struct Simd4Traits<Simd4Float>
 612 {
 613     using type = float;
 614 };
 615 #endif
 616
 617 #if GMX_SIMD4_HAVE_DOUBLE
 618 template<>
 619 struct Simd4Traits<Simd4Double>
 620 {
 621     using type = double;
 622 };
 623 #endif
 624 } // namespace internal
 625
 626 #if GMX_SIMD4_HAVE_REAL
 627 template<typename T>
 628 T load(const typename internal::Simd4Traits<T>::type* m)
 629 {
 630     return load4(m);
 631 }
 632 template<typename T>
 633 T loadU(const typename internal::Simd4Traits<T>::type* m)
 634 {
 635     return load4U(m);
 636 }
 637 #endif
 638
 639 /* Implement most of 4xn functions by forwarding them to other functions when possible.
 640  * The functions forwarded here don't need to be implemented by each implementation.
 641  * For width=4 all functions are forwarded and for width=8 all but loadU4NOffset are forwarded.
 642  */
 643 #if GMX_SIMD_HAVE_FLOAT
 644 #    if GMX_SIMD_FLOAT_WIDTH < 4
 645 #        define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_FLOAT)
 646 #    elif GMX_SIMD_FLOAT_WIDTH == 4
 647 #        define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT GMX_SIMD_HAVE_LOADU
 648 // For GMX_SIMD_FLOAT_WIDTH>4 it is the reponsibility of the implementation to set
 649 // GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 650 #    endif
 651
 652 #    if GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 653 #        if GMX_SIMD_FLOAT_WIDTH < 4
 654 using Simd4NFloat = Simd4Float;
 655 #            define GMX_SIMD4N_FLOAT_WIDTH 4
 656 #        else
 657 using Simd4NFloat = SimdFloat;
 658 #            define GMX_SIMD4N_FLOAT_WIDTH GMX_SIMD_FLOAT_WIDTH
 659 #        endif
 660
 661 #        if GMX_SIMD_FLOAT_WIDTH <= 4
 662 static inline Simd4NFloat gmx_simdcall loadUNDuplicate4(const float* f)
 663 {
 664     return Simd4NFloat(*f);
 665 }
 666 static inline Simd4NFloat gmx_simdcall load4DuplicateN(const float* f)
 667 {
 668     return load<Simd4NFloat>(f);
 669 }
 670 static inline Simd4NFloat gmx_simdcall loadU4NOffset(const float* f, int)
 671 {
 672     return loadU<Simd4NFloat>(f);
 673 }
 674 #        elif GMX_SIMD_FLOAT_WIDTH == 8
 675 static inline Simd4NFloat gmx_simdcall loadUNDuplicate4(const float* f)
 676 {
 677     return loadU1DualHsimd(f);
 678 }
 679 static inline Simd4NFloat gmx_simdcall load4DuplicateN(const float* f)
 680 {
 681     return loadDuplicateHsimd(f);
 682 }
 683 #        endif
 684 #    endif // GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 685 #else      // GMX_SIMD_HAVE_FLOAT
 686 #    define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
 687 #endif
 688
 689 #if GMX_SIMD_HAVE_DOUBLE
 690 #    if GMX_SIMD_DOUBLE_WIDTH < 4
 691 #        define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_DOUBLE)
 692 #    elif GMX_SIMD_DOUBLE_WIDTH == 4
 693 #        define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE GMX_SIMD_HAVE_LOADU
 694 // For GMX_SIMD_DOUBLE_WIDTH>4 it is the reponsibility of the implementation to set
 695 // GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
 696 #    endif
 697
 698 #    if GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
 699 #        if GMX_SIMD_DOUBLE_WIDTH < 4
 700 using Simd4NDouble = Simd4Double;
 701 #            define GMX_SIMD4N_DOUBLE_WIDTH 4
 702 #        else
 703 using Simd4NDouble = SimdDouble;
 704 #            define GMX_SIMD4N_DOUBLE_WIDTH GMX_SIMD_DOUBLE_WIDTH
 705 #        endif
 706
 707 #        if GMX_SIMD_DOUBLE_WIDTH <= 4
 708 static inline Simd4NDouble gmx_simdcall loadUNDuplicate4(const double* f)
 709 {
 710     return Simd4NDouble(*f);
 711 }
 712 static inline Simd4NDouble gmx_simdcall load4DuplicateN(const double* f)
 713 {
 714     return load<Simd4NDouble>(f);
 715 }
 716 static inline Simd4NDouble gmx_simdcall loadU4NOffset(const double* f, int /*unused*/)
 717 {
 718     return loadU<Simd4NDouble>(f);
 719 }
 720 #        elif GMX_SIMD_DOUBLE_WIDTH == 8
 721 static inline Simd4NDouble gmx_simdcall loadUNDuplicate4(const double* f)
 722 {
 723     return loadU1DualHsimd(f);
 724 }
 725 static inline Simd4NDouble gmx_simdcall load4DuplicateN(const double* f)
 726 {
 727     return loadDuplicateHsimd(f);
 728 }
 729 #        endif
 730 #    endif // GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
 731 #else      // GMX_SIMD_HAVE_DOUBLE
 732 #    define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
 733 #endif
 734
 735 #if GMX_DOUBLE
 736 #    define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
 737 #else
 738 #    define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
 739 #endif
 740
 741 #if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
 742 #    if GMX_DOUBLE
 743 using Simd4NReal = Simd4NDouble;
 744 #        define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_DOUBLE_WIDTH
 745 #    else
 746 using Simd4NReal = Simd4NFloat;
 747 #        define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_FLOAT_WIDTH
 748 #    endif
 749 #endif
 750
 751 //! \}  end of name-group proxy objects
 752
 753 } // namespace gmx
 754
 755 // \}          end of module_simd
 756
 757 //! \endcond   end of condition libapi
 758
 759
 760 #if GMX_SIMD_HAVE_FLOAT
 761
 762 /*! \brief Returns whether a pointer to float is aligned to a SIMD boundary
 763  *
 764  * \param[in] ptr  A pointer to a float
 765  */
 766 static inline bool isSimdAligned(const float* ptr)
 767 {
 768     return reinterpret_cast<std::size_t>(ptr) % (GMX_SIMD_FLOAT_WIDTH * sizeof(float)) == 0;
 769 }
 770
 771 #endif // GMX_SIMD_HAVE_FLOAT
 772
 773 #if GMX_SIMD_HAVE_DOUBLE
 774
 775 /*! \brief Returns whether a pointer to double is aligned to a SIMD boundary
 776  *
 777  * \param[in] ptr  A pointer to a double
 778  */
 779 static inline bool isSimdAligned(const double* ptr)
 780 {
 781     return reinterpret_cast<std::size_t>(ptr) % (GMX_SIMD_DOUBLE_WIDTH * sizeof(double)) == 0;
 782 }
 783
 784 #endif // GMX_SIMD_HAVE_DOUBLE
 785
 786
 787 #if GMX_SIMD_HAVE_REAL
 788 #    if GMX_SIMD_REAL_WIDTH > GMX_REAL_MAX_SIMD_WIDTH
 789 #        error "GMX_SIMD_REAL_WIDTH > GMX_REAL_MAX_SIMD_WIDTH: increase GMX_REAL_MAX_SIMD_WIDTH in real.h"
 790 #    endif
 791 #endif
 792
 793
 794 #if 0
 795 /* This is a hack to cover the corner case of using an
 796    explicit GMX_SIMD_HAVE_FLOAT or GMX_SIMD_HAVE_DOUBLE, rather than
 797    GMX_SIMD_HAVE_REAL.
 798
 799    Such code is expected to include simd.h to get those symbols
 800    defined, but the actual definitions are in the implemention headers
 801    included by simd.h. check-source.py is not a full preprocessor, so
 802    it does not see the definitions in the implementation headers as
 803    belonging to simd.h, thus it cannot check that simd.h is being used
 804    correctly in the above hypothetical corner case. However, the
 805    checker also does not parse #if 0, so we can fool the checker into
 806    thinking that definition occurs here, and that will work well
 807    enough.
 808
 809    If there's ever other kinds of SIMD code that might have the same
 810    problem, we might want to add other variables here.
 811  */
 812 #    define GMX_SIMD_HAVE_FLOAT 1
 813 #    define GMX_SIMD_HAVE_DOUBLE 1
 814
 815 #endif // end of hack
 816
 817 // The ArrayRef<SimdReal> specialization is always included, because compiler
 818 // errors are confusing when template specialization aren't available.
 819 #include "gromacs/simd/simd_memory.h"
 820
 821 #endif // GMX_SIMD_SIMD_H