src/gromacs/simd/impl_reference/impl_reference.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_H
  37 #define GMX_SIMD_IMPL_REFERENCE_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference SIMD implementation, including SIMD documentation.
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48
  49 #include <math.h>
  50
  51 #include "gromacs/utility/fatalerror.h"
  52
  53 /*! \cond libapi */
  54 /*! \addtogroup module_simd */
  55 /*! \{ */
  56
  57 /*! \name SIMD implementation capability definitions
  58  *  \{
  59  */
  60
  61 /*! \brief
  62  * Defined when SIMD float support is present.
  63  *
  64  * You should only use this to specifically check for single precision SIMD,
  65  * support, even when the rest of Gromacs uses double precision.
  66  * \sa GMX_SIMD_HAVE_REAL, GMX_SIMD_HAVE_DOUBLE
  67  */
  68 #define GMX_SIMD_HAVE_FLOAT
  69
  70 /*! \brief Defined if SIMD double support is present. */
  71 #define GMX_SIMD_HAVE_DOUBLE
  72
  73 /*! \brief Defined if SIMD is implemented with real hardware instructions. */
  74 #define GMX_SIMD_HAVE_HARDWARE /* For Doxygen */
  75 #undef  GMX_SIMD_HAVE_HARDWARE /* Reference implementation setting */
  76
  77 /*! \brief Defined if the SIMD implementation supports unaligned loads. */
  78 #define GMX_SIMD_HAVE_LOADU
  79
  80 /*! \brief Defined if the SIMD implementation supports unaligned stores. */
  81 #define GMX_SIMD_HAVE_STOREU
  82
  83 /*! \brief Defined if SIMD implementation has logical operations on floating-point data. */
  84 #define GMX_SIMD_HAVE_LOGICAL
  85
  86 /*! \brief Defined if SIMD fused multiply-add uses hardware instructions */
  87 #define GMX_SIMD_HAVE_FMA  /* For Doxygen */
  88 #undef  GMX_SIMD_HAVE_FMA  /* Reference implementation setting */
  89
  90 /*! \brief Defined if the SIMD fraction has a direct hardware instruction. */
  91 #define GMX_SIMD_HAVE_FRACTION /* For Doxygen */
  92 #undef  GMX_SIMD_HAVE_FRACTION /* Reference implementation setting */
  93
  94 /*! \brief Defined if the SIMD implementation has \ref gmx_simd_fint32_t. */
  95 #define GMX_SIMD_HAVE_FINT32
  96
  97 /*! \brief Support for extracting integers from \ref gmx_simd_fint32_t. */
  98 #define GMX_SIMD_HAVE_FINT32_EXTRACT
  99
 100 /*! \brief Defined if SIMD logical operations are supported for \ref gmx_simd_fint32_t */
 101 #define GMX_SIMD_HAVE_FINT32_LOGICAL
 102
 103 /*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_fint32_t */
 104 #define GMX_SIMD_HAVE_FINT32_ARITHMETICS
 105
 106 /*! \brief Defined if the SIMD implementation has \ref gmx_simd_dint32_t.
 107  *
 108  * \note The Gromacs SIMD module works entirely with 32 bit integers, both
 109  * in single and double precision, since some platforms do not support 64 bit
 110  * SIMD integers at all. In particular, this means it is up to each
 111  * implementation to get this working even if the architectures internal
 112  * representation uses 64 bit integers when converting to/from double SIMD
 113  * variables. For now we will try HARD to use conversions, packing or shuffling
 114  * so the integer datatype has the same width as the floating-point type, i.e.
 115  * if you use double precision SIMD with a width of 8, we want the integers
 116  * we work with to also use a SIMD width of 8 to make it easy to load/store
 117  * indices from arrays. This refers entirely to the function calls
 118  * and how many integers we load/store in one call; the actual SIMD registers
 119  * might be wider for integers internally (e.g. on x86 gmx_simd_dint32_t will
 120  * only fill half the register), but this is none of the user's business.
 121  * While this works for all current architectures, and we think it will work
 122  * for future ones, we might have to alter this decision in the future. To
 123  * avoid rewriting every single instance that refers to the SIMD width we still
 124  * provide separate defines for the width of SIMD integer variables that you
 125  * should use.
 126  */
 127 #define GMX_SIMD_HAVE_DINT32
 128
 129 /*! \brief Support for extracting integer from \ref gmx_simd_dint32_t */
 130 #define GMX_SIMD_HAVE_DINT32_EXTRACT
 131
 132 /*! \brief Defined if logical operations are supported for \ref gmx_simd_dint32_t */
 133 #define GMX_SIMD_HAVE_DINT32_LOGICAL
 134
 135 /*! \brief Defined if SIMD arithmetic operations are supported for \ref gmx_simd_dint32_t */
 136 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS
 137
 138 /*! \brief Defined if the implementation provides \ref gmx_simd4_float_t. */
 139 #define GMX_SIMD4_HAVE_FLOAT
 140
 141 /*! \brief Defined if the implementation provides \ref gmx_simd4_double_t. */
 142 #define GMX_SIMD4_HAVE_DOUBLE
 143
 144 #ifdef GMX_SIMD_REF_FLOAT_WIDTH
 145 #    define GMX_SIMD_FLOAT_WIDTH             GMX_SIMD_REF_FLOAT_WIDTH
 146 #else
 147 /*! \brief Width of the \ref gmx_simd_float_t datatype. */
 148 #    define GMX_SIMD_FLOAT_WIDTH             4
 149 #endif
 150
 151 #ifdef GMX_SIMD_REF_DOUBLE_WIDTH
 152 #    define GMX_SIMD_DOUBLE_WIDTH            GMX_SIMD_REF_DOUBLE_WIDTH
 153 #else
 154 /*! \brief Width of the \ref gmx_simd_double_t datatype. */
 155 #    define GMX_SIMD_DOUBLE_WIDTH            4
 156 #endif
 157
 158 /*! \brief Width of the \ref gmx_simd_fint32_t datatype. */
 159 #define GMX_SIMD_FINT32_WIDTH            GMX_SIMD_FLOAT_WIDTH
 160
 161 /*! \brief Width of the \ref gmx_simd_dint32_t datatype. */
 162 #define GMX_SIMD_DINT32_WIDTH            GMX_SIMD_DOUBLE_WIDTH
 163
 164 /*! \brief Accuracy of SIMD 1/sqrt(x) lookup. Used to determine number of iterations. */
 165 #define GMX_SIMD_RSQRT_BITS             23
 166
 167 /*! \brief Accuracy of SIMD 1/x lookup. Used to determine number of iterations. */
 168 #define GMX_SIMD_RCP_BITS               23
 169
 170 /*! \}
 171  *
 172  * \name SIMD implementation data types
 173  * \{
 174  */
 175 /*! \libinternal \brief Float SIMD variable. Supported with GMX_SIMD_HAVE_FLOAT.
 176  */
 177 typedef struct
 178 {
 179     float r[GMX_SIMD_FLOAT_WIDTH]; /**< Implementation dependent. Don't touch. */
 180 }
 181 gmx_simd_float_t;
 182
 183 /*! \libinternal \brief Floating-point SIMD variable type in double precision.
 184  *
 185  * Supported with GMX_SIMD_HAVE_DOUBLE.
 186  */
 187 typedef struct
 188 {
 189     double r[GMX_SIMD_DOUBLE_WIDTH]; /**< Implementation dependent. Don't touch. */
 190 }
 191 gmx_simd_double_t;
 192
 193 /*! \libinternal \brief Integer SIMD variable type to use for conversions to/from float.
 194  *
 195  * This is also the widest integer SIMD type.
 196  */
 197 typedef struct
 198 {
 199     gmx_int32_t i[GMX_SIMD_FINT32_WIDTH]; /**< Implementation dependent. Don't touch. */
 200 }
 201 gmx_simd_fint32_t;
 202
 203 /*! \libinternal \brief Integer SIMD variable type to use for conversions to/from double.
 204  *
 205  * Available with GMX_SIMD_HAVE_DINT32.
 206  */
 207 typedef struct
 208 {
 209     gmx_int32_t i[GMX_SIMD_DINT32_WIDTH]; /**< Implementation dependent. Don't touch. */
 210 }
 211 gmx_simd_dint32_t;
 212
 213 /*! \libinternal \brief Boolean type for float SIMD data.
 214  *
 215  * You should likely use gmx_simd_bool_t
 216  * (for gmx_simd_real_t) instead, unless you really know what you are doing.
 217  */
 218 typedef struct
 219 {
 220     gmx_int32_t b[GMX_SIMD_FLOAT_WIDTH]; /**< Implementation dependent. Don't touch. */
 221 }
 222 gmx_simd_fbool_t;
 223
 224 /*! \libinternal \brief Boolean type for double precision SIMD data.
 225  *
 226  * Use the generic gmx_simd_bool_t
 227  * (for gmx_simd_real_t) instead, unless you really know what you are doing.
 228  */
 229 typedef struct
 230 {
 231     gmx_int32_t b[GMX_SIMD_DOUBLE_WIDTH]; /**< Implementation dependent. Don't touch. */
 232 }
 233 gmx_simd_dbool_t;
 234
 235 /*! \libinternal \brief Boolean type for integer datatypes corresponding to float SIMD. */
 236 typedef struct
 237 {
 238     gmx_int32_t b[GMX_SIMD_FINT32_WIDTH]; /**< Implementation dependent. Don't touch. */
 239 }
 240 gmx_simd_fibool_t;
 241
 242 /*! \libinternal \brief Boolean type for integer datatypes corresponding to double SIMD.
 243  *
 244  * You should likely use gmx_simd_ibool_t (for gmx_simd_int32_t) instead,
 245  * unless you really know what you are doing.
 246  */
 247 typedef struct
 248 {
 249     gmx_int32_t b[GMX_SIMD_DINT32_WIDTH]; /**< Implementation dependent. Don't touch. */
 250 }
 251 gmx_simd_dibool_t;
 252
 253 /*! \}
 254  *
 255  * \name SIMD implementation load/store operations for single precision floating point
 256  * \{
 257  */
 258
 259 /*! \brief Load \ref GMX_SIMD_FLOAT_WIDTH numbers from aligned memory.
 260  *
 261  * \param m Pointer to memory aligned to the SIMD width.
 262  * \return SIMD variable with data loaded.
 263  */
 264 static gmx_inline gmx_simd_float_t
 265 gmx_simd_load_f(const float *m)
 266 {
 267     gmx_simd_float_t  a;
 268     int               i;
 269
 270     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 271     {
 272         a.r[i] = m[i];
 273     }
 274     return a;
 275 }
 276
 277 /*! \brief Set all SIMD variable elements to float pointed to by m (unaligned).
 278  *
 279  * \param m Pointer to single value in memory.
 280  * \return SIMD variable with all elements set to *m.
 281  */
 282 static gmx_inline gmx_simd_float_t
 283 gmx_simd_load1_f(const float *m)
 284 {
 285     gmx_simd_float_t  a;
 286     int               i;
 287     float             f = *m;
 288
 289     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 290     {
 291         a.r[i] = f;
 292     }
 293     return a;
 294 }
 295
 296 /*! \brief Set all SIMD float variable elements to the value r.
 297  *
 298  *  \param r floating-point constant
 299  *  \return SIMD variable with all elements set to r.
 300  */
 301 static gmx_inline gmx_simd_float_t
 302 gmx_simd_set1_f(float r)
 303 {
 304     gmx_simd_float_t  a;
 305     int               i;
 306
 307     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 308     {
 309         a.r[i] = r;
 310     }
 311     return a;
 312 }
 313
 314 /*! \brief Set all SIMD float variable elements to 0.0f.
 315  *
 316  *  \return The value 0.0 in all elements of a SIMD variable.
 317  */
 318 static gmx_inline gmx_simd_float_t
 319 gmx_simd_setzero_f()
 320 {
 321     gmx_simd_float_t  a;
 322     int               i;
 323
 324     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 325     {
 326         a.r[i] = 0.0;
 327     }
 328     return a;
 329 }
 330
 331 /*! \brief Store the contents of the SIMD float variable pr to aligned memory m.
 332  *
 333  * \param[out] m Pointer to memory, aligned to SIMD width.
 334  * \param a SIMD variable to store
 335  */
 336 static gmx_inline void
 337 gmx_simd_store_f(float *m, gmx_simd_float_t a)
 338 {
 339     int i;
 340
 341     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 342     {
 343         m[i] = a.r[i];
 344     }
 345 }
 346
 347 /*! \brief Load SIMD float from unaligned memory.
 348  *
 349  * Available with \ref GMX_SIMD_HAVE_LOADU.
 350  *
 351  * \param m Pointer to memory, no alignment requirement.
 352  * \return SIMD variable with data loaded.
 353  */
 354 #define gmx_simd_loadu_f gmx_simd_load_f
 355
 356 /*! \brief Store SIMD float to unaligned memory.
 357  *
 358  * Available with \ref GMX_SIMD_HAVE_STOREU.
 359  *
 360  * \param[out] m Pointer to memory, no alignment requirement.
 361  * \param a SIMD variable to store.
 362  */
 363 #define gmx_simd_storeu_f gmx_simd_store_f
 364
 365 /*! \}
 366  *
 367  * \name SIMD implementation load/store operations for double precision floating point
 368  * \{
 369  */
 370
 371 /*! \brief Load \ref GMX_SIMD_DOUBLE_WIDTH numbers from aligned memory.
 372  *
 373  * \copydetails gmx_simd_load_f
 374  */
 375 static gmx_inline gmx_simd_double_t
 376 gmx_simd_load_d(const double *m)
 377 {
 378     gmx_simd_double_t  a;
 379     int                i;
 380
 381     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
 382     {
 383         a.r[i] = m[i];
 384     }
 385     return a;
 386 }
 387
 388 /*! \brief Set all SIMD variable elements to double pointed to by m (unaligned).
 389  *
 390  * \copydetails gmx_simd_load1_f
 391  */
 392 static gmx_inline gmx_simd_double_t
 393 gmx_simd_load1_d(const double *m)
 394 {
 395     gmx_simd_double_t  a;
 396     int                i;
 397     double             d = *m;
 398
 399     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
 400     {
 401         a.r[i] = d;
 402     }
 403     return a;
 404 }
 405
 406 /*! \brief Set all SIMD double variable elements to the value r.
 407  *
 408  * \copydetails gmx_simd_set1_f
 409  */
 410 static gmx_inline gmx_simd_double_t
 411 gmx_simd_set1_d(double r)
 412 {
 413     gmx_simd_double_t  a;
 414     int                i;
 415
 416     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
 417     {
 418         a.r[i] = r;
 419     }
 420     return a;
 421 }
 422
 423 /*! \brief Set all SIMD double variable elements to 0.0.
 424  *
 425  * \copydetails gmx_simd_setzero_f
 426  */
 427 static gmx_inline gmx_simd_double_t
 428 gmx_simd_setzero_d()
 429 {
 430     gmx_simd_double_t  a;
 431     int                i;
 432
 433     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
 434     {
 435         a.r[i] = 0.0;
 436     }
 437     return a;
 438 }
 439
 440 /*! \brief Store the contents of the SIMD double variable pr to aligned memory m.
 441  *
 442  * \copydetails gmx_simd_store_f
 443  */
 444 static gmx_inline void
 445 gmx_simd_store_d(double *m, gmx_simd_double_t a)
 446 {
 447     int i;
 448
 449     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
 450     {
 451         m[i] = a.r[i];
 452     }
 453 }
 454
 455 /*! \brief Load SIMD double from unaligned memory.
 456  *
 457  * Available with \ref GMX_SIMD_HAVE_LOADU.
 458  *
 459  * \copydetails gmx_simd_loadu_f
 460  */
 461 #define gmx_simd_loadu_d gmx_simd_load_d
 462
 463 /*! \brief Store SIMD double to unaligned memory.
 464  *
 465  * Available with \ref GMX_SIMD_HAVE_STOREU.
 466  *
 467  * \copydetails gmx_simd_storeu_f
 468  */
 469 #define gmx_simd_storeu_d gmx_simd_store_d
 470
 471 /*! \}
 472  *
 473  * \name SIMD implementation load/store operations for integers (corresponding to float)
 474  * \{
 475  */
 476
 477 /*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
 478  *
 479  * You should typically call the real-precision \ref gmx_simd_load_i.
 480  *
 481  * \param m Pointer to memory, aligned to integer SIMD width.
 482  * \return SIMD integer variable.
 483  */
 484 static gmx_inline gmx_simd_fint32_t
 485 gmx_simd_load_fi(const gmx_int32_t * m)
 486 {
 487     gmx_simd_fint32_t  a;
 488     int                i;
 489     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
 490     {
 491         a.i[i] = m[i];
 492     }
 493     return a;
 494 };
 495
 496 /*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_float_t.
 497  *
 498  * You should typically call the real-precision \ref gmx_simd_set1_i.
 499  *
 500  *  \param b integer value to set variable to.
 501  *  \return SIMD variable with all elements set to b.
 502  */
 503 static gmx_inline gmx_simd_fint32_t
 504 gmx_simd_set1_fi(gmx_int32_t b)
 505 {
 506     gmx_simd_fint32_t  a;
 507     int                i;
 508     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
 509     {
 510         a.i[i] = b;
 511     }
 512     return a;
 513 }
 514
 515 /*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_float_t.
 516  *
 517  * You should typically call the real-precision \ref gmx_simd_setzero_i.
 518  *
 519  * \return SIMD integer variable with all bits set to zero.
 520  */
 521 static gmx_inline gmx_simd_fint32_t
 522 gmx_simd_setzero_fi()
 523 {
 524     gmx_simd_fint32_t  a;
 525     int                i;
 526
 527     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
 528     {
 529         a.i[i] = 0;
 530     }
 531     return a;
 532 }
 533
 534 /*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
 535  *
 536  * You should typically call the real-precision \ref gmx_simd_store_i.
 537  *
 538  * \param m Memory aligned to integer SIMD width.
 539  * \param a SIMD variable to store.
 540  */
 541 static gmx_inline gmx_simd_fint32_t
 542 gmx_simd_store_fi(int * m, gmx_simd_fint32_t a)
 543 {
 544     int                i;
 545     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
 546     {
 547         m[i] = a.i[i];
 548     }
 549     return a;
 550 };
 551
 552 /*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_float_t.
 553  *
 554  * You should typically call the real-precision \ref gmx_simd_loadu_i.
 555  *
 556  * Supported with \ref GMX_SIMD_HAVE_LOADU.
 557  *
 558  * \param m Pointer to memory, no alignment requirements.
 559  * \return SIMD integer variable.
 560  */
 561 #define gmx_simd_loadu_fi  gmx_simd_load_fi
 562
 563 /*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_float_t.
 564  *
 565  * You should typically call the real-precision \ref gmx_simd_storeu_i.
 566  *
 567  * Supported with \ref GMX_SIMD_HAVE_STOREU.
 568  *
 569  * \param m Memory pointer, no alignment requirements.
 570  * \param a SIMD variable to store.
 571  */
 572 #define gmx_simd_storeu_fi gmx_simd_store_fi
 573
 574 /*! \brief Extract element with index i from \ref gmx_simd_fint32_t.
 575  *
 576  * You should typically call the real-precision \ref gmx_simd_extract_i.
 577  *
 578  * Available with \ref GMX_SIMD_HAVE_FINT32_EXTRACT.
 579  *
 580  * \param a SIMD variable
 581  * \param index Position to extract integer from
 582  * \return Single integer from position index in SIMD variable.
 583  */
 584 static gmx_inline gmx_int32_t
 585 gmx_simd_extract_fi(gmx_simd_fint32_t a, int index)
 586 {
 587     return a.i[index];
 588 }
 589
 590 /*! \}
 591  *
 592  * \name SIMD implementation load/store operations for integers (corresponding to double)
 593  * \{
 594  */
 595
 596 /*! \brief Load aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
 597  *
 598  * \copydetails gmx_simd_load_fi
 599  */
 600 static gmx_inline gmx_simd_dint32_t
 601 gmx_simd_load_di(const gmx_int32_t * m)
 602 {
 603     gmx_simd_dint32_t  a;
 604     int                i;
 605     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
 606     {
 607         a.i[i] = m[i];
 608     }
 609     return a;
 610 };
 611
 612 /*! \brief Set SIMD from integer, width corresponds to \ref gmx_simd_double_t.
 613  *
 614  *  \copydetails gmx_simd_set1_fi
 615  */
 616 static gmx_inline gmx_simd_dint32_t
 617 gmx_simd_set1_di(gmx_int32_t b)
 618 {
 619     gmx_simd_dint32_t  a;
 620     int                i;
 621     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
 622     {
 623         a.i[i] = b;
 624     }
 625     return a;
 626 }
 627
 628 /*! \brief Set all SIMD variable elements to 0, width corresponds to \ref gmx_simd_double_t.
 629  *
 630  * \copydetails gmx_simd_setzero_fi
 631  */
 632 static gmx_inline gmx_simd_dint32_t
 633 gmx_simd_setzero_di()
 634 {
 635     gmx_simd_dint32_t  a;
 636     int                i;
 637
 638     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
 639     {
 640         a.i[i] = 0;
 641     }
 642     return a;
 643 }
 644
 645 /*! \brief Store aligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
 646  *
 647  * \copydetails gmx_simd_store_fi
 648  */
 649 static gmx_inline gmx_simd_dint32_t
 650 gmx_simd_store_di(gmx_int32_t * m, gmx_simd_dint32_t a)
 651 {
 652     int                i;
 653     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
 654     {
 655         m[i] = a.i[i];
 656     }
 657     return a;
 658 };
 659
 660 /*! \brief Load unaligned integer SIMD data, width corresponds to \ref gmx_simd_double_t.
 661  *
 662  * \copydetails gmx_simd_loadu_fi
 663  */
 664 #define gmx_simd_loadu_di  gmx_simd_load_di
 665
 666 /*! \brief Store unaligned SIMD integer data, width corresponds to \ref gmx_simd_double_t.
 667  *
 668  * \copydetails gmx_simd_storeu_fi
 669  */
 670 #define gmx_simd_storeu_di gmx_simd_store_di
 671
 672 /*! \brief Extract element with index i from \ref gmx_simd_dint32_t.
 673  *
 674  * \copydetails gmx_simd_extract_fi
 675  */
 676 static gmx_inline gmx_int32_t
 677 gmx_simd_extract_di(gmx_simd_dint32_t a, int index)
 678 {
 679     return a.i[index];
 680 }
 681
 682 /*! \}
 683  *
 684  * \name SIMD implementation single precision floating-point bitwise logical operations
 685  * \{
 686  */
 687
 688 /*! \brief Bitwise and for two SIMD float variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
 689  *
 690  * You should typically call the real-precision \ref gmx_simd_and_r.
 691  *
 692  * \param a data1
 693  * \param b data2
 694  * \return data1 & data2
 695  */
 696 static gmx_inline gmx_simd_float_t
 697 gmx_simd_and_f(gmx_simd_float_t a, gmx_simd_float_t b)
 698 {
 699     gmx_simd_float_t  c;
 700     int               i;
 701 #ifdef __cplusplus
 702     gmx_int32_t       val1, val2, res;
 703 #else
 704     union
 705     {
 706         float        r;
 707         gmx_int32_t  i;
 708     }
 709     conv1, conv2;
 710 #endif
 711
 712     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 713     {
 714 #ifdef __cplusplus
 715         val1   = reinterpret_cast<int &>(a.r[i]);
 716         val2   = reinterpret_cast<int &>(b.r[i]);
 717         res    = val1 & val2;
 718         c.r[i] = reinterpret_cast<float &>(res);
 719 #else
 720         conv1.r = a.r[i];
 721         conv2.r = b.r[i];
 722         conv1.i = conv1.i & conv2.i;
 723         c.r[i]  = conv1.r;
 724 #endif
 725     }
 726     return c;
 727 }
 728
 729 /*! \brief Bitwise andnot for SIMD float. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
 730  *
 731  * You should typically call the real-precision \ref gmx_simd_andnot_r.
 732  *
 733  * \param a data1
 734  * \param b data2
 735  * \return (~data1) & data2
 736  */
 737 static gmx_inline gmx_simd_float_t
 738 gmx_simd_andnot_f(gmx_simd_float_t a, gmx_simd_float_t b)
 739 {
 740     gmx_simd_float_t  c;
 741     int               i;
 742 #ifdef __cplusplus
 743     gmx_int32_t       val1, val2, res;
 744 #else
 745     union
 746     {
 747         float        r;
 748         gmx_int32_t  i;
 749     }
 750     conv1, conv2;
 751 #endif
 752
 753     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 754     {
 755 #ifdef __cplusplus
 756         val1   = reinterpret_cast<int &>(a.r[i]);
 757         val2   = reinterpret_cast<int &>(b.r[i]);
 758         res    = (~val1) & val2;
 759         c.r[i] = reinterpret_cast<float &>(res);
 760 #else
 761         conv1.r = a.r[i];
 762         conv2.r = b.r[i];
 763         conv1.i = (~conv1.i) & conv2.i;
 764         c.r[i]  = conv1.r;
 765 #endif
 766     }
 767     return c;
 768 }
 769
 770 /*! \brief Bitwise or for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
 771  *
 772  * You should typically call the real-precision \ref gmx_simd_or_r.
 773  *
 774  * \param a data1
 775  * \param b data2
 776  * \return data1 | data2
 777  */
 778 static gmx_inline gmx_simd_float_t
 779 gmx_simd_or_f(gmx_simd_float_t a, gmx_simd_float_t b)
 780 {
 781     gmx_simd_float_t  c;
 782     int               i;
 783 #ifdef __cplusplus
 784     gmx_int32_t       val1, val2, res;
 785 #else
 786     union
 787     {
 788         float        r;
 789         gmx_int32_t  i;
 790     }
 791     conv1, conv2;
 792 #endif
 793
 794     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 795     {
 796 #ifdef __cplusplus
 797         val1   = reinterpret_cast<int &>(a.r[i]);
 798         val2   = reinterpret_cast<int &>(b.r[i]);
 799         res    = val1 | val2;
 800         c.r[i] = reinterpret_cast<float &>(res);
 801 #else
 802         conv1.r = a.r[i];
 803         conv2.r = b.r[i];
 804         conv1.i = conv1.i | conv2.i;
 805         c.r[i]  = conv1.r;
 806 #endif
 807     }
 808     return c;
 809 }
 810
 811 /*! \brief Bitwise xor for SIMD float. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
 812  *
 813  * You should typically call the real-precision \ref gmx_simd_xor_r.
 814  *
 815  * \param a data1
 816  * \param b data2
 817  * \return data1 ^ data2
 818  */
 819 static gmx_inline gmx_simd_float_t
 820 gmx_simd_xor_f(gmx_simd_float_t a, gmx_simd_float_t b)
 821 {
 822     gmx_simd_float_t  c;
 823     int               i;
 824 #ifdef __cplusplus
 825     gmx_int32_t       val1, val2, res;
 826 #else
 827     union
 828     {
 829         float        r;
 830         gmx_int32_t  i;
 831     }
 832     conv1, conv2;
 833 #endif
 834
 835     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 836     {
 837 #ifdef __cplusplus
 838         val1   = reinterpret_cast<int &>(a.r[i]);
 839         val2   = reinterpret_cast<int &>(b.r[i]);
 840         res    = val1 ^ val2;
 841         c.r[i] = reinterpret_cast<float &>(res);
 842 #else
 843         conv1.r = a.r[i];
 844         conv2.r = b.r[i];
 845         conv1.i = conv1.i ^ conv2.i;
 846         c.r[i]  = conv1.r;
 847 #endif
 848     }
 849     return c;
 850 }
 851
 852 /*! \}
 853  *
 854  * \name SIMD implementation single precision floating-point arithmetics
 855  * \{
 856  */
 857 /*! \brief Add two float SIMD variables.
 858  *
 859  * You should typically call the real-precision \ref gmx_simd_add_r.
 860  *
 861  * \param a term1
 862  * \param b term2
 863  * \return a+b
 864  */
 865 static gmx_inline gmx_simd_float_t
 866 gmx_simd_add_f(gmx_simd_float_t a, gmx_simd_float_t b)
 867 {
 868     gmx_simd_float_t  c;
 869     int               i;
 870
 871     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 872     {
 873         c.r[i] = a.r[i] + b.r[i];
 874     }
 875     return c;
 876 }
 877
 878 /*! \brief Subtract two SIMD variables.
 879  *
 880  * You should typically call the real-precision \ref gmx_simd_sub_r.
 881  *
 882  * \param a term1
 883  * \param b term2
 884  * \return a-b
 885  */
 886 static gmx_inline gmx_simd_float_t
 887 gmx_simd_sub_f(gmx_simd_float_t a, gmx_simd_float_t b)
 888 {
 889     gmx_simd_float_t  c;
 890     int               i;
 891
 892     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 893     {
 894         c.r[i] = a.r[i] - b.r[i];
 895     }
 896     return c;
 897 }
 898
 899 /*! \brief Multiply two SIMD variables.
 900  *
 901  * You should typically call the real-precision \ref gmx_simd_mul_r.
 902  *
 903  * \param a factor1
 904  * \param b factor2
 905  * \return a*b.
 906  */
 907 static gmx_inline gmx_simd_float_t
 908 gmx_simd_mul_f(gmx_simd_float_t a, gmx_simd_float_t b)
 909 {
 910     gmx_simd_float_t  c;
 911     int               i;
 912
 913     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
 914     {
 915         c.r[i] = a.r[i]*b.r[i];
 916     }
 917     return c;
 918 }
 919
 920 /*! \brief Fused-multiply-add. Result is a*b+c.
 921  *
 922  * You should typically call the real-precision \ref gmx_simd_fmadd_r.
 923  *
 924  *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
 925  *
 926  * \param a value
 927  * \param b value
 928  * \param c value
 929  * \return a*b+c
 930  *
 931  * For some implementations you save an instruction if you assign the result
 932  * to c.
 933  */
 934 #define gmx_simd_fmadd_f(a, b, c) gmx_simd_add_f(gmx_simd_mul_f(a, b), c)
 935
 936
 937 /*! \brief Fused-multiply-subtract. Result is a*b-c.
 938  *
 939  * You should typically call the real-precision \ref gmx_simd_fmsub_r.
 940  *
 941  *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
 942  *
 943  * \param a value
 944  * \param b value
 945  * \param c value
 946  * \return a*b-c
 947  *
 948  * For some implementations you save an instruction if you assign the result
 949  * to c.
 950  */
 951 #define gmx_simd_fmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_mul_f(a, b), c)
 952
 953
 954 /*! \brief Fused-negated-multiply-add. Result is -a*b+c.
 955  *
 956  * You should typically call the real-precision \ref gmx_simd_fnmadd_r.
 957  *
 958  *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
 959  *
 960  * \param a value
 961  * \param b value
 962  * \param c value
 963  * \return -a*b+c
 964  *
 965  * For some implementations you save an instruction if you assign the result
 966  * to c.
 967  */
 968 #define gmx_simd_fnmadd_f(a, b, c) gmx_simd_sub_f(c, gmx_simd_mul_f(a, b))
 969
 970
 971 /*! \brief Fused-negated-multiply-sub. Result is -a*b-c.
 972  *
 973  * You should typically call the real-precision \ref gmx_simd_fnmsub_r.
 974  *
 975  *  If \ref GMX_SIMD_HAVE_FMA is defined this is a single hardware instruction.
 976  *
 977  * \param a value
 978  * \param b value
 979  * \param c value
 980  * \return -a*b-c
 981  *
 982  * For some implementations you save an instruction if you assign the result
 983  * to c.
 984  */
 985 #define gmx_simd_fnmsub_f(a, b, c) gmx_simd_sub_f(gmx_simd_setzero_f(), gmx_simd_fmadd_f(a, b, c))
 986
 987 /*! \brief SIMD 1.0/sqrt(x) lookup.
 988  *
 989  * You should typically call the real-precision \ref gmx_simd_rsqrt_r.
 990  *
 991  * This is a low-level instruction that should only be called from routines
 992  * implementing the inverse square root in simd_math.h.
 993  *
 994  * \param x Argument, x>0
 995  * \return Approximation of 1/sqrt(x), accuracy is \ref GMX_SIMD_RSQRT_BITS.
 996  */
 997 static gmx_inline gmx_simd_float_t
 998 gmx_simd_rsqrt_f(gmx_simd_float_t x)
 999 {
1000     gmx_simd_float_t  b;
1001     int               i;
1002
1003     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1004     {
1005         b.r[i] = (x.r[i] > 0.0f) ? 1.0f/sqrtf(x.r[i]) : 0.0f;
1006     }
1007     return b;
1008 };
1009
1010 /*! \brief SIMD 1.0/x lookup.
1011  *
1012  * You should typically call the real-precision \ref gmx_simd_rcp_r.
1013  *
1014  * This is a low-level instruction that should only be called from routines
1015  * implementing the reciprocal in simd_math.h.
1016  *
1017  * \param x Argument, x!=0
1018  * \return Approximation of 1/x, accuracy is \ref GMX_SIMD_RCP_BITS.
1019  */
1020 static gmx_inline gmx_simd_float_t
1021 gmx_simd_rcp_f(gmx_simd_float_t x)
1022 {
1023     gmx_simd_float_t  b;
1024     int               i;
1025
1026     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1027     {
1028         b.r[i] = (x.r[i] != 0.0f) ? 1.0f/x.r[i] : 0.0f;
1029     }
1030     return b;
1031 };
1032
1033 /*! \brief SIMD Floating-point fabs().
1034  *
1035  * You should typically call the real-precision \ref gmx_simd_fabs_r.
1036  *
1037  * \param a any floating point values
1038  * \return fabs(a) for each element.
1039  */
1040 static gmx_inline gmx_simd_float_t
1041 gmx_simd_fabs_f(gmx_simd_float_t a)
1042 {
1043     gmx_simd_float_t  c;
1044     int               i;
1045
1046     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1047     {
1048         c.r[i] = fabsf(a.r[i]);
1049     }
1050     return c;
1051 }
1052
1053 /*! \brief SIMD floating-point negate.
1054  *
1055  * You should typically call the real-precision \ref gmx_simd_fneg_r.
1056  *
1057  * \param a Any floating-point value
1058  * \return -a
1059  */
1060 static gmx_inline gmx_simd_float_t
1061 gmx_simd_fneg_f(gmx_simd_float_t a)
1062 {
1063     gmx_simd_float_t  c;
1064     int               i;
1065
1066     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1067     {
1068         c.r[i] = -a.r[i];
1069     }
1070     return c;
1071 }
1072
1073 /*! \brief Set each SIMD element to the largest from two variables.
1074  *
1075  * You should typically call the real-precision \ref gmx_simd_max_r.
1076  *
1077  * \param a Any floating-point value
1078  * \param b Any floating-point value
1079  * \return max(a,b) for each element.
1080  */
1081 static gmx_inline gmx_simd_float_t
1082 gmx_simd_max_f(gmx_simd_float_t a, gmx_simd_float_t b)
1083 {
1084     gmx_simd_float_t  c;
1085     int               i;
1086
1087     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1088     {
1089         c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
1090     }
1091     return c;
1092 }
1093
1094 /*! \brief Set each SIMD element to the smallest from two variables.
1095  *
1096  * You should typically call the real-precision \ref gmx_simd_min_r.
1097  *
1098  * \param a Any floating-point value
1099  * \param b Any floating-point value
1100  * \return min(a,b) for each element.
1101  */
1102 static gmx_inline gmx_simd_float_t
1103 gmx_simd_min_f(gmx_simd_float_t a, gmx_simd_float_t b)
1104 {
1105     gmx_simd_float_t  c;
1106     int               i;
1107
1108     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1109     {
1110         c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
1111     }
1112     return c;
1113 }
1114
1115 /*! \brief Round to nearest integer value (in floating-point format).
1116  *
1117  * You should typically call the real-precision \ref gmx_simd_round_r.
1118  *
1119  * \param a Any floating-point value
1120  * \return The nearest integer, represented in floating-point format.
1121  *
1122  * \note The reference implementation rounds exact half-way cases
1123  * away from zero, whereas most SIMD intrinsics will round to nearest even.
1124  * This could be fixed by using rint/rintf, but the bigger problem is that
1125  * MSVC does not support full C99, and none of the round or rint
1126  * functions are defined. It's much easier to approximately implement
1127  * round() than rint(), so we do that and hope we never get bitten in
1128  * testing. (Thanks, Microsoft.)
1129  */
1130 static gmx_inline gmx_simd_float_t
1131 gmx_simd_round_f(gmx_simd_float_t a)
1132 {
1133     gmx_simd_float_t  b;
1134     int               i;
1135
1136     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1137     {
1138 #ifdef _MSC_VER
1139         int temp = (a.r[i] >= 0.0f) ? (a.r[i] + 0.5f) : (a.r[i] - 0.5f);
1140         b.r[i] = temp;
1141 #else
1142         b.r[i] = roundf(a.r[i]);
1143 #endif
1144     }
1145     return b;
1146 }
1147
1148 /*! \brief Truncate SIMD, i.e. round towards zero - common hardware instruction.
1149  *
1150  * You should typically call the real-precision \ref gmx_simd_trunc_r.
1151  *
1152  * \param a Any floating-point value
1153  * \return Integer rounded towards zero, represented in floating-point format.
1154  *
1155  * \note This is truncation towards zero, not floor(). The reason for this
1156  * is that truncation is virtually always present as a dedicated hardware
1157  * instruction, but floor() frequently isn't.
1158  */
1159 static gmx_inline gmx_simd_float_t
1160 gmx_simd_trunc_f(gmx_simd_float_t a)
1161 {
1162     gmx_simd_float_t  b;
1163     int               i;
1164
1165     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1166     {
1167         b.r[i] = truncf(a.r[i]);
1168     }
1169     return b;
1170 }
1171
1172
1173 /*! \brief Fraction of the SIMD floating point number.
1174  *
1175  * You should typically call the real-precision \ref gmx_simd_fraction_r.
1176  *
1177  * \param a Any floating-point value
1178  * \return a-trunc(r)
1179  *
1180  * To maximize compatibility, we use the same definition of fractions as used
1181  * e.g. for the AMD64 hardware instructions. This relies on truncation towards
1182  * zero for the integer part, and the remaining fraction can thus be either
1183  * positive or negative. As an example, -1.42 would return the fraction -0.42.
1184  *
1185  * Hardware support with \ref GMX_SIMD_HAVE_FRACTION, otherwise emulated.
1186  */
1187 static gmx_inline gmx_simd_float_t
1188 gmx_simd_fraction_f(gmx_simd_float_t a)
1189 {
1190     return gmx_simd_sub_f(a, gmx_simd_trunc_f(a));
1191 }
1192
1193 /*! \brief Extract (integer) exponent from single precision SIMD.
1194  *
1195  * You should typically call the real-precision \ref gmx_simd_get_exponent_r.
1196  *
1197  * \param a Any floating-point value
1198  * \return Exponent value, represented in floating-point format.
1199  *
1200  * The IEEE754 exponent field is selected, the bias removed, and it is converted to
1201  * a normal floating-point SIMD.
1202  */
1203 static gmx_inline gmx_simd_float_t
1204 gmx_simd_get_exponent_f(gmx_simd_float_t a)
1205 {
1206     /* Mask with ones for the exponent field of single precision fp */
1207     const gmx_int32_t  expmask = 0x7f800000;
1208     gmx_simd_float_t   b;
1209     int                i;
1210     union
1211     {
1212         float        f;
1213         gmx_int32_t  i;
1214     }
1215     conv;
1216
1217     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1218     {
1219         conv.f = a.r[i];
1220         /* Keep exponent, shift 23 right (float mantissa), remove bias (127) */
1221         b.r[i] = ((conv.i & expmask) >> 23) - 127;
1222     }
1223     return b;
1224 }
1225
1226 /*! \brief Get SIMD mantissa.
1227  *
1228  * You should typically call the real-precision \ref gmx_simd_get_mantissa_r.
1229  *
1230  * \param a Any floating-point value
1231  * \return Mantissa, represented in floating-point format.
1232  *
1233  * The mantissa field is selected, and a new neutral exponent created.
1234  */
1235 static gmx_inline gmx_simd_float_t
1236 gmx_simd_get_mantissa_f(gmx_simd_float_t a)
1237 {
1238     const gmx_int32_t  mantmask = 0x007fffff;
1239     const gmx_int32_t  one      = 0x3f800000;
1240     gmx_simd_float_t   b;
1241     int                i;
1242     union
1243     {
1244         float        f;
1245         gmx_int32_t  i;
1246     }
1247     conv;
1248
1249     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1250     {
1251         conv.f = a.r[i];
1252         /* remove current exponent, add a biased exponent for 1.0 (i.e., 2^0=1) */
1253         conv.i = (conv.i & (mantmask)) | one;
1254         b.r[i] = conv.f;
1255     }
1256     return b;
1257 }
1258
1259 /*! \brief Set (integer) exponent from single precision floating-point SIMD.
1260  *
1261  * You should typically call the real-precision \ref gmx_simd_set_exponent_r.
1262  *
1263  * \param a A floating point value that will not overflow as 2^a.
1264  * \return 2^(round(a)).
1265  *
1266  * The input is \a rounded to the nearest integer, the exponent bias is added
1267  * to this integer, and the bits are shifted to the IEEE754 exponent part of the number.
1268  *
1269  * \note The argument will be \a rounded to nearest integer since that is what
1270  * we need for the exponential functions, and this integer x will be set as the
1271  * exponent so the new floating-point number will be 2^x.
1272  */
1273 static gmx_inline gmx_simd_float_t
1274 gmx_simd_set_exponent_f(gmx_simd_float_t a)
1275 {
1276     gmx_simd_float_t   b;
1277     gmx_int32_t        iexp;
1278     int                i;
1279     union
1280     {
1281         float        f;
1282         gmx_int32_t  i;
1283     }
1284     conv;
1285
1286     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1287     {
1288         /* Critical to use same algorithm as for gmx_simd_round_f() */
1289 #ifdef _MSC_VER
1290         iexp = (a.r[i] >= 0.0f) ? (a.r[i] + 0.5f) : (a.r[i] - 0.5f);
1291 #else
1292         iexp = roundf(a.r[i]);
1293 #endif
1294         /* Add bias (127), and shift 23 bits left (mantissa size) */
1295         conv.i = (iexp + 127) << 23;
1296         b.r[i] = conv.f;
1297     }
1298     return b;
1299 }
1300
1301 /*! \}
1302  *
1303  * \name SIMD implementation single precision floating-point comparisons, boolean, selection.
1304  * \{
1305  */
1306 /*! \brief SIMD a==b for single SIMD.
1307  *
1308  * You should typically call the real-precision \ref gmx_simd_cmpeq_r.
1309  *
1310  * \param a value1
1311  * \param b value2
1312  * \return Each element of the boolean will be set to true if a==b.
1313  *
1314  * Beware that exact floating-point comparisons are difficult.
1315  */
1316 static gmx_inline gmx_simd_fbool_t
1317 gmx_simd_cmpeq_f(gmx_simd_float_t a, gmx_simd_float_t b)
1318 {
1319     gmx_simd_fbool_t  c;
1320     int               i;
1321
1322     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1323     {
1324         c.b[i] = (a.r[i] == b.r[i]);
1325     }
1326     return c;
1327 }
1328
1329 /*! \brief SIMD a<b for single SIMD.
1330  *
1331  * You should typically call the real-precision \ref gmx_simd_cmplt_r.
1332  *
1333  * \param a value1
1334  * \param b value2
1335  * \return Each element of the boolean will be set to true if a<b.
1336  */
1337 static gmx_inline gmx_simd_fbool_t
1338 gmx_simd_cmplt_f(gmx_simd_float_t a, gmx_simd_float_t b)
1339 {
1340     gmx_simd_fbool_t   c;
1341     int                i;
1342
1343     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1344     {
1345         c.b[i] = (a.r[i] < b.r[i]);
1346     }
1347     return c;
1348 }
1349
1350 /*! \brief SIMD a<=b for single SIMD.
1351  *
1352  * You should typically call the real-precision \ref gmx_simd_cmple_r.
1353  *
1354  * \param a value1
1355  * \param b value2
1356  * \return Each element of the boolean will be set to true if a<=b.
1357  */
1358 static gmx_inline gmx_simd_fbool_t
1359 gmx_simd_cmple_f(gmx_simd_float_t a, gmx_simd_float_t b)
1360 {
1361     gmx_simd_fbool_t   c;
1362     int                i;
1363
1364     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1365     {
1366         c.b[i] = (a.r[i] <= b.r[i]);
1367     }
1368     return c;
1369 }
1370
1371 /*! \brief Logical \a and on single precision SIMD booleans.
1372  *
1373  * You should typically call the real-precision \ref gmx_simd_and_r.
1374  *
1375  * \param a logical vars 1
1376  * \param b logical vars 2
1377  * \return For each element, the result boolean is true if a \& b are true.
1378  *
1379  * \note This is not necessarily a bitwise operation - the storage format
1380  * of booleans is implementation-dependent.
1381  *
1382  * \sa gmx_simd_and_ib
1383  */
1384 static gmx_inline gmx_simd_fbool_t
1385 gmx_simd_and_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
1386 {
1387     gmx_simd_fbool_t  c;
1388     int               i;
1389
1390     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1391     {
1392         c.b[i] = (a.b[i] && b.b[i]);
1393     }
1394     return c;
1395 }
1396
1397 /*! \brief Logical \a or on single precision SIMD booleans.
1398  *
1399  * You should typically call the real-precision \ref gmx_simd_or_r.
1400  *
1401  * \param a logical vars 1
1402  * \param b logical vars 2
1403  * \return For each element, the result boolean is true if a or b is true.
1404  *
1405  * Note that this is not necessarily a bitwise operation - the storage format
1406  * of booleans is implementation-dependent.
1407  *
1408  * \sa gmx_simd_or_ib
1409  */
1410 static gmx_inline gmx_simd_fbool_t
1411 gmx_simd_or_fb(gmx_simd_fbool_t a, gmx_simd_fbool_t b)
1412 {
1413     gmx_simd_fbool_t  c;
1414     int               i;
1415
1416     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1417     {
1418         c.b[i] = (a.b[i] || b.b[i]);
1419     }
1420     return c;
1421 }
1422
1423 /*! \brief Returns non-zero if any of the boolean in x is True, otherwise 0.
1424  *
1425  * You should typically call the real-precision \ref gmx_simd_anytrue_b.
1426  *
1427  * \param a Logical variable.
1428  * \return non-zero if any element in a is true, otherwise 0.
1429  *
1430  * The actual return value for truth will depend on the architecture,
1431  * so any non-zero value is considered truth.
1432  */
1433 static gmx_inline int
1434 gmx_simd_anytrue_fb(gmx_simd_fbool_t a)
1435 {
1436     int             anytrue;
1437     int             i;
1438
1439     anytrue = 0;
1440     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1441     {
1442         anytrue = anytrue || a.b[i];
1443     }
1444     return anytrue;
1445 }
1446
1447 /*! \brief Select from single precision SIMD variable where boolean is true.
1448  *
1449  * You should typically call the real-precision \ref gmx_simd_blendzero_r.
1450  *
1451  * \param a Floating-point variable to select from
1452  * \param sel Boolean selector
1453  * \return  For each element, a is selected for true, 0 for false.
1454  */
1455 static gmx_inline gmx_simd_float_t
1456 gmx_simd_blendzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
1457 {
1458     gmx_simd_float_t   c;
1459     int                i;
1460
1461     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1462     {
1463         c.r[i] = sel.b[i] ? a.r[i] : 0.0;
1464     }
1465     return c;
1466 }
1467
1468 /*! \brief Select from single precision SIMD variable where boolean is false.
1469  *
1470  * You should typically call the real-precision \ref gmx_simd_blendnotzero_r.
1471  *
1472  * \param a Floating-point variable to select from
1473  * \param sel Boolean selector
1474  * \return  For each element, a is selected for false, 0 for true (sic).
1475  */
1476 static gmx_inline gmx_simd_float_t
1477 gmx_simd_blendnotzero_f(gmx_simd_float_t a, gmx_simd_fbool_t sel)
1478 {
1479     gmx_simd_float_t   c;
1480     int                i;
1481
1482     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1483     {
1484         c.r[i] = sel.b[i] ? 0.0 : a.r[i];
1485     }
1486     return c;
1487 }
1488
1489 /*! \brief Vector-blend SIMD selection.
1490  *
1491  * You should typically call the real-precision \ref gmx_simd_blendv_r.
1492  *
1493  * \param a First source
1494  * \param b Second source
1495  * \param sel Boolean selector
1496  * \return For each element, select b if sel is true, a otherwise.
1497  */
1498 static gmx_inline gmx_simd_float_t
1499 gmx_simd_blendv_f(gmx_simd_float_t a, gmx_simd_float_t b, gmx_simd_fbool_t sel)
1500 {
1501     gmx_simd_float_t  d;
1502     int               i;
1503
1504     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1505     {
1506         d.r[i] = sel.b[i] ? b.r[i] : a.r[i];
1507     }
1508     return d;
1509 }
1510
1511 /*! \brief Return sum of all elements in SIMD float variable.
1512  *
1513  * You should typically call the real-precision \ref gmx_simd_reduce_r.
1514  *
1515  * \param a SIMD variable to reduce/sum.
1516  * \return The sum of all elements in the argument variable.
1517  *
1518  */
1519 static gmx_inline float
1520 gmx_simd_reduce_f(gmx_simd_float_t a)
1521 {
1522     float     sum = 0.0;
1523     int       i;
1524
1525     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
1526     {
1527         sum += a.r[i];
1528     }
1529     return sum;
1530 }
1531
1532 /*! \}
1533  *
1534  * \name SIMD implementation double precision floating-point bitwise logical operations
1535  * \{
1536  */
1537 /*! \brief Bitwise and for two SIMD double variables. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
1538  *
1539  * \copydetails gmx_simd_and_f
1540  */
1541 static gmx_inline gmx_simd_double_t
1542 gmx_simd_and_d(gmx_simd_double_t a, gmx_simd_double_t b)
1543 {
1544     gmx_simd_double_t  c;
1545     int                i;
1546 #ifdef __cplusplus
1547     gmx_int64_t        val1, val2, res;
1548 #else
1549     union
1550     {
1551         double       r;
1552         gmx_int64_t  i;
1553     }
1554     conv1, conv2;
1555 #endif
1556
1557     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1558     {
1559 #ifdef __cplusplus
1560         val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
1561         val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
1562         res    = val1 & val2;
1563         c.r[i] = reinterpret_cast<double &>(res);
1564 #else
1565         conv1.r = a.r[i];
1566         conv2.r = b.r[i];
1567         conv1.i = conv1.i & conv2.i;
1568         c.r[i]  = conv1.r;
1569 #endif
1570     }
1571     return c;
1572 }
1573
1574 /*! \brief Bitwise andnot for SIMD double. c=(~a) & b. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
1575  *
1576  * \copydetails gmx_simd_andnot_f
1577  */
1578 static gmx_inline gmx_simd_double_t
1579 gmx_simd_andnot_d(gmx_simd_double_t a, gmx_simd_double_t b)
1580 {
1581     gmx_simd_double_t  c;
1582     int                i;
1583 #ifdef __cplusplus
1584     gmx_int64_t        val1, val2, res;
1585 #else
1586     union
1587     {
1588         double       r;
1589         gmx_int64_t  i;
1590     }
1591     conv1, conv2;
1592 #endif
1593
1594     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1595     {
1596 #ifdef __cplusplus
1597         val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
1598         val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
1599         res    = (~val1) & val2;
1600         c.r[i] = reinterpret_cast<double &>(res);
1601 #else
1602         conv1.r = a.r[i];
1603         conv2.r = b.r[i];
1604         conv1.i = conv1.i & conv2.i;
1605         c.r[i]  = conv1.r;
1606 #endif
1607     }
1608     return c;
1609 }
1610
1611 /*! \brief Bitwise or for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
1612  *
1613  * \copydetails gmx_simd_or_f
1614  */
1615 static gmx_inline gmx_simd_double_t
1616 gmx_simd_or_d(gmx_simd_double_t a, gmx_simd_double_t b)
1617 {
1618     gmx_simd_double_t  c;
1619     int                i;
1620 #ifdef __cplusplus
1621     gmx_int64_t        val1, val2, res;
1622 #else
1623     union
1624     {
1625         double       r;
1626         gmx_int64_t  i;
1627     }
1628     conv1, conv2;
1629 #endif
1630
1631     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1632     {
1633 #ifdef __cplusplus
1634         val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
1635         val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
1636         res    = val1 | val2;
1637         c.r[i] = reinterpret_cast<double &>(res);
1638 #else
1639         conv1.r = a.r[i];
1640         conv2.r = b.r[i];
1641         conv1.i = conv1.i & conv2.i;
1642         c.r[i]  = conv1.r;
1643 #endif
1644     }
1645     return c;
1646 }
1647
1648 /*! \brief Bitwise xor for SIMD double. Supported with \ref GMX_SIMD_HAVE_LOGICAL.
1649  *
1650  * \copydetails gmx_simd_xor_f
1651  */
1652 static gmx_inline gmx_simd_double_t
1653 gmx_simd_xor_d(gmx_simd_double_t a, gmx_simd_double_t b)
1654 {
1655     gmx_simd_double_t  c;
1656     int                i;
1657 #ifdef __cplusplus
1658     gmx_int64_t        val1, val2, res;
1659 #else
1660     union
1661     {
1662         double       r;
1663         gmx_int64_t  i;
1664     }
1665     conv1, conv2;
1666 #endif
1667
1668     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1669     {
1670 #ifdef __cplusplus
1671         val1   = reinterpret_cast<gmx_int64_t &>(a.r[i]);
1672         val2   = reinterpret_cast<gmx_int64_t &>(b.r[i]);
1673         res    = val1 ^ val2;
1674         c.r[i] = reinterpret_cast<double &>(res);
1675 #else
1676         conv1.r = a.r[i];
1677         conv2.r = b.r[i];
1678         conv1.i = conv1.i & conv2.i;
1679         c.r[i]  = conv1.r;
1680 #endif
1681     }
1682     return c;
1683 }
1684
1685 /*! \}
1686  *
1687  * \name SIMD implementation double precision floating-point arithmetics
1688  * \{
1689  */
1690 /*! \brief Add two double SIMD variables.
1691  *
1692  * \copydetails gmx_simd_add_f
1693  */
1694 static gmx_inline gmx_simd_double_t
1695 gmx_simd_add_d(gmx_simd_double_t a, gmx_simd_double_t b)
1696 {
1697     gmx_simd_double_t  c;
1698     int                i;
1699
1700     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1701     {
1702         c.r[i] = a.r[i] + b.r[i];
1703     }
1704     return c;
1705 }
1706
1707 /*! \brief Add two float SIMD variables.
1708  *
1709  * \copydetails gmx_simd_sub_f
1710  */
1711 static gmx_inline gmx_simd_double_t
1712 gmx_simd_sub_d(gmx_simd_double_t a, gmx_simd_double_t b)
1713 {
1714     gmx_simd_double_t  c;
1715     int                i;
1716
1717     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1718     {
1719         c.r[i] = a.r[i] - b.r[i];
1720     }
1721     return c;
1722 }
1723
1724 /*! \brief Multiply two SIMD variables.
1725  *
1726  * \copydetails gmx_simd_mul_f
1727  */
1728 static gmx_inline gmx_simd_double_t
1729 gmx_simd_mul_d(gmx_simd_double_t a, gmx_simd_double_t b)
1730 {
1731     gmx_simd_double_t  c;
1732     int                i;
1733
1734     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1735     {
1736         c.r[i] = a.r[i]*b.r[i];
1737     }
1738     return c;
1739 }
1740
1741 /*! \brief Fused-multiply-add. Result is a*b+c.
1742  *
1743  * \copydetails gmx_simd_fmadd_f
1744  */
1745 #define gmx_simd_fmadd_d(a, b, c) gmx_simd_add_d(gmx_simd_mul_d(a, b), c)
1746
1747 /*! \brief Fused-multiply-subtract. Result is a*b-c.
1748  *
1749  * \copydetails gmx_simd_fmsub_f
1750  */
1751 #define gmx_simd_fmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_mul_d(a, b), c)
1752
1753 /*! \brief Fused-negated-multiply-add. Result is -a*b+c.
1754  *
1755  * \copydetails gmx_simd_fnmadd_f
1756  */
1757 #define gmx_simd_fnmadd_d(a, b, c) gmx_simd_sub_d(c, gmx_simd_mul_d(a, b))
1758
1759 /*! \brief Fused-negated-multiply-add. Result is -a*b-c.
1760  *
1761  * \copydetails gmx_simd_fnmsub_f
1762  */
1763 #define gmx_simd_fnmsub_d(a, b, c) gmx_simd_sub_d(gmx_simd_setzero_d(), gmx_simd_fmadd_d(a, b, c))
1764
1765 /*! \brief SIMD 1.0/sqrt(x) lookup.
1766  *
1767  * \copydetails gmx_simd_rsqrt_f
1768  */
1769 static gmx_inline gmx_simd_double_t
1770 gmx_simd_rsqrt_d(gmx_simd_double_t x)
1771 {
1772     gmx_simd_double_t  b;
1773     int                i;
1774
1775     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1776     {
1777         /* Sic - we only need single precision for the reference lookup, since
1778          * we have defined GMX_SIMD_RSQRT_BITS to 23.
1779          */
1780         b.r[i] = (x.r[i] > 0.0) ? 1.0f/sqrtf(x.r[i]) : 0.0;
1781     }
1782     return b;
1783 };
1784
1785 /*! \brief 1.0/x lookup.
1786  *
1787  * \copydetails gmx_simd_rcp_f
1788  */
1789 static gmx_inline gmx_simd_double_t
1790 gmx_simd_rcp_d(gmx_simd_double_t x)
1791 {
1792     gmx_simd_double_t  b;
1793     int                i;
1794
1795     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1796     {
1797         /* Sic - we only need single precision for the reference lookup, since
1798          * we have defined GMX_SIMD_RCP_BITS to 23.
1799          */
1800         b.r[i] = (x.r[i] != 0.0) ? 1.0f/x.r[i] : 0.0;
1801     }
1802     return b;
1803 };
1804
1805 /*! \brief SIMD Floating-point fabs().
1806  *
1807  * \copydetails gmx_simd_fabs_f
1808  */
1809 static gmx_inline gmx_simd_double_t
1810 gmx_simd_fabs_d(gmx_simd_double_t a)
1811 {
1812     gmx_simd_double_t  c;
1813     int                i;
1814
1815     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1816     {
1817         c.r[i] = fabs(a.r[i]);
1818     }
1819     return c;
1820 }
1821
1822 /*! \brief SIMD floating-point negate.
1823  *
1824  * \copydetails gmx_simd_fneg_f
1825  */
1826 static gmx_inline gmx_simd_double_t
1827 gmx_simd_fneg_d(gmx_simd_double_t a)
1828 {
1829     gmx_simd_double_t  c;
1830     int                i;
1831
1832     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1833     {
1834         c.r[i] = -a.r[i];
1835     }
1836     return c;
1837 }
1838
1839 /*! \brief Set each SIMD element to the largest from two variables.
1840  *
1841  * \copydetails gmx_simd_max_f
1842  */
1843 static gmx_inline gmx_simd_double_t
1844 gmx_simd_max_d(gmx_simd_double_t a, gmx_simd_double_t b)
1845 {
1846     gmx_simd_double_t  c;
1847     int                i;
1848
1849     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1850     {
1851         c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
1852     }
1853     return c;
1854 }
1855
1856 /*! \brief Set each SIMD element to the smallest from two variables.
1857  *
1858  * \copydetails gmx_simd_min_f
1859  */
1860 static gmx_inline gmx_simd_double_t
1861 gmx_simd_min_d(gmx_simd_double_t a, gmx_simd_double_t b)
1862 {
1863     gmx_simd_double_t  c;
1864     int                i;
1865
1866     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1867     {
1868         c.r[i] = (a.r[i] <= b.r[i] ? a.r[i] : b.r[i]);
1869     }
1870     return c;
1871 }
1872
1873 /*! \brief Round to nearest integer value (in double floating-point format).
1874  *
1875  * \copydetails gmx_simd_round_f
1876  */
1877 static gmx_inline gmx_simd_double_t
1878 gmx_simd_round_d(gmx_simd_double_t a)
1879 {
1880     gmx_simd_double_t  b;
1881     int                i;
1882
1883     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1884     {
1885 #ifdef _MSC_VER
1886         int temp = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
1887         b.r[i] = temp;
1888 #else
1889         b.r[i] = round(a.r[i]);
1890 #endif
1891     }
1892     return b;
1893 }
1894
1895 /*! \brief Truncate SIMD, i.e. round towards zero - common hardware instruction.
1896  *
1897  * \copydetails gmx_simd_trunc_f
1898  */
1899 static gmx_inline gmx_simd_double_t
1900 gmx_simd_trunc_d(gmx_simd_double_t a)
1901 {
1902     gmx_simd_double_t  b;
1903     int                i;
1904
1905     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1906     {
1907         b.r[i] = trunc(a.r[i]);
1908     }
1909     return b;
1910 }
1911
1912 /*! \brief Fraction of the SIMD floating point number.
1913  *
1914  * \copydetails gmx_simd_fraction_f
1915  */
1916 static gmx_inline gmx_simd_double_t
1917 gmx_simd_fraction_d(gmx_simd_double_t a)
1918 {
1919     return gmx_simd_sub_d(a, gmx_simd_trunc_d(a));
1920 }
1921
1922
1923 /*! \brief Extract (integer) exponent from double precision SIMD.
1924  *
1925  * \copydetails gmx_simd_get_exponent_f
1926  */
1927 static gmx_inline gmx_simd_double_t
1928 gmx_simd_get_exponent_d(gmx_simd_double_t a)
1929 {
1930     /* Mask with ones for the exponent field of double precision fp */
1931     const gmx_int64_t      expmask = 0x7ff0000000000000LL;
1932     gmx_simd_double_t      b;
1933     int                    i;
1934     union
1935     {
1936         double             d;
1937         gmx_int64_t        i;
1938     }
1939     conv;
1940
1941     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1942     {
1943         conv.d = a.r[i];
1944         /* Zero everything but exponent field (remove sign),
1945          * shift 23 bits right (mantissa size), and remove exponent bias (1023).
1946          */
1947         b.r[i] = ((conv.i & expmask) >> 52) - 1023;
1948     }
1949     return b;
1950 }
1951
1952 /*! \brief Get SIMD doublemantissa.
1953  *
1954  * \copydetails gmx_simd_get_mantissa_f
1955  */
1956 static gmx_inline gmx_simd_double_t
1957 gmx_simd_get_mantissa_d(gmx_simd_double_t a)
1958 {
1959     const gmx_int64_t      mantmask = 0x000fffffffffffffLL;
1960     const gmx_int64_t      one      = 0x3ff0000000000000LL;
1961     gmx_simd_double_t      b;
1962     int                    i;
1963     union
1964     {
1965         double          d;
1966         gmx_int64_t     i;
1967     }
1968     conv;
1969
1970     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1971     {
1972         conv.d = a.r[i];
1973         conv.i = (conv.i & (mantmask)) | one;
1974         b.r[i] = conv.d;
1975     }
1976     return b;
1977 }
1978
1979 /*! \brief Set (integer) exponent from single precision floating-point SIMD.
1980  *
1981  * \copydetails gmx_simd_set_exponent_f
1982  */
1983 static gmx_inline gmx_simd_double_t
1984 gmx_simd_set_exponent_d(gmx_simd_double_t a)
1985 {
1986     gmx_simd_double_t      b;
1987     int                    i;
1988     gmx_int64_t            iexp;
1989     union
1990     {
1991         double          d;
1992         gmx_int64_t     i;
1993     }
1994     conv;
1995
1996     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
1997     {
1998         /* Critical to use same algorithm as for gmx_simd_round_d() */
1999 #ifdef _MSC_VER
2000         iexp = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
2001 #else
2002         iexp = round(a.r[i]);
2003 #endif
2004         /* Add bias (1023), and shift 52 bits left (mantissa size) */
2005         conv.i = (iexp + 1023) << 52;
2006         b.r[i] = conv.d;
2007     }
2008     return b;
2009 }
2010
2011 /*! \}
2012  *
2013  * \name SIMD implementation double precision floating-point comparison, boolean, selection.
2014  * \{
2015  */
2016 /*! \brief SIMD a==b for double SIMD.
2017  *
2018  * \copydetails gmx_simd_cmpeq_f
2019  */
2020 static gmx_inline gmx_simd_dbool_t
2021 gmx_simd_cmpeq_d(gmx_simd_double_t a, gmx_simd_double_t b)
2022 {
2023     gmx_simd_dbool_t  c;
2024     int               i;
2025
2026     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2027     {
2028         c.b[i] = (a.r[i] == b.r[i]);
2029     }
2030     return c;
2031 }
2032
2033 /*! \brief SIMD a<b for double SIMD.
2034  *
2035  * \copydetails gmx_simd_cmplt_f
2036  */
2037 static gmx_inline gmx_simd_dbool_t
2038 gmx_simd_cmplt_d(gmx_simd_double_t a, gmx_simd_double_t b)
2039 {
2040     gmx_simd_dbool_t  c;
2041     int               i;
2042
2043     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2044     {
2045         c.b[i] = (a.r[i] < b.r[i]);
2046     }
2047     return c;
2048 }
2049
2050 /*! \brief SIMD a<=b for double SIMD.
2051  *
2052  * \copydetails gmx_simd_cmple_f
2053  */
2054 static gmx_inline gmx_simd_dbool_t
2055 gmx_simd_cmple_d(gmx_simd_double_t a, gmx_simd_double_t b)
2056 {
2057     gmx_simd_dbool_t  c;
2058     int               i;
2059
2060     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2061     {
2062         c.b[i] = (a.r[i] <= b.r[i]);
2063     }
2064     return c;
2065 }
2066
2067
2068 /*! \brief Logical \a and on double precision SIMD booleans.
2069  *
2070  * \copydetails gmx_simd_and_fb
2071  */
2072 static gmx_inline gmx_simd_dbool_t
2073 gmx_simd_and_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
2074 {
2075     gmx_simd_dbool_t  c;
2076     int               i;
2077
2078     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2079     {
2080         c.b[i] = (a.b[i] && b.b[i]);
2081     }
2082     return c;
2083 }
2084
2085 /*! \brief Logical \a or on double precision SIMD booleans.
2086  *
2087  * \copydetails gmx_simd_or_fb
2088  */
2089 static gmx_inline gmx_simd_dbool_t
2090 gmx_simd_or_db(gmx_simd_dbool_t a, gmx_simd_dbool_t b)
2091 {
2092     gmx_simd_dbool_t  c;
2093     int               i;
2094
2095     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2096     {
2097         c.b[i] = (a.b[i] || b.b[i]);
2098     }
2099     return c;
2100 }
2101
2102
2103 /*! \brief Returns non-zero if any of the boolean in x is True, otherwise 0.
2104  *
2105  * \copydetails gmx_simd_anytrue_fb
2106  */
2107 static gmx_inline int
2108 gmx_simd_anytrue_db(gmx_simd_dbool_t a)
2109 {
2110     int         anytrue;
2111     int         i;
2112
2113     anytrue = 0;
2114     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2115     {
2116         anytrue = anytrue || a.b[i];
2117     }
2118     return anytrue;
2119 }
2120
2121
2122 /*! \brief Select from double SIMD variable where boolean is true.
2123  *
2124  * \copydetails gmx_simd_blendzero_f
2125  */
2126 static gmx_inline gmx_simd_double_t
2127 gmx_simd_blendzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
2128 {
2129     gmx_simd_double_t  c;
2130     int                i;
2131
2132     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2133     {
2134         c.r[i] = sel.b[i] ? a.r[i] : 0.0;
2135     }
2136     return c;
2137 }
2138
2139 /*! \brief Select from double SIMD variable where boolean is false.
2140  *
2141  * \copydetails gmx_simd_blendnotzero_f
2142  */
2143 static gmx_inline gmx_simd_double_t
2144 gmx_simd_blendnotzero_d(gmx_simd_double_t a, gmx_simd_dbool_t sel)
2145 {
2146     gmx_simd_double_t  c;
2147     int                i;
2148
2149     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2150     {
2151         c.r[i] = sel.b[i] ? 0.0 : a.r[i];
2152     }
2153     return c;
2154 }
2155
2156 /*! \brief Vector-blend double SIMD selection.
2157  *
2158  * \copydetails gmx_simd_blendv_f
2159  */
2160 static gmx_inline gmx_simd_double_t
2161 gmx_simd_blendv_d(gmx_simd_double_t a, gmx_simd_double_t b, gmx_simd_dbool_t sel)
2162 {
2163     gmx_simd_double_t  d;
2164     int                i;
2165
2166     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2167     {
2168         d.r[i] = sel.b[i] ? b.r[i] : a.r[i];
2169     }
2170     return d;
2171 }
2172
2173 /*! \brief Return sum of all elements in SIMD double variable.
2174  *
2175  * \copydetails gmx_simd_reduce_f
2176  *
2177  */
2178 static gmx_inline double
2179 gmx_simd_reduce_d(gmx_simd_double_t a)
2180 {
2181     double    sum = 0.0;
2182     int       i;
2183
2184     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
2185     {
2186         sum += a.r[i];
2187     }
2188     return sum;
2189 }
2190
2191 /*! \}
2192  *
2193  * \name SIMD implementation integer (corresponding to float) bitwise logical operations
2194  * \{
2195  */
2196
2197 /*! \brief SIMD integer shift left logical, based on immediate value.
2198  *
2199  * You should typically call the real-precision \ref gmx_simd_slli_i.
2200  *
2201  *  Logical shift. Each element is shifted (independently) up to 32 positions
2202  *  left, while zeros are shifted in from the right. Only available if
2203  * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
2204  *  (double) is defined.
2205  *
2206  * \param a integer data to shift
2207  * \param n number of positions to shift left. n<=32.
2208  * \return shifted values
2209  */
2210 static gmx_inline gmx_simd_fint32_t
2211 gmx_simd_slli_fi(gmx_simd_fint32_t a, int n)
2212 {
2213     gmx_simd_fint32_t  c;
2214     int                i;
2215
2216     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2217     {
2218         c.i[i] = a.i[i] << n;
2219     }
2220     return c;
2221 }
2222
2223 /*! \brief SIMD integer shift right logical, based on immediate value.
2224  *
2225  * You should typically call the real-precision \ref gmx_simd_srli_i.
2226  *
2227  *  Logical shift. Each element is shifted (independently) up to 32 positions
2228  *  right, while zeros are shifted in from the left. Only available if
2229  * \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single) or \ref GMX_SIMD_HAVE_DINT32_LOGICAL
2230  *  (double) is defined.
2231  *
2232  * \param a integer data to shift
2233  * \param n number of positions to shift right. n<=32.
2234  * \return shifted values
2235  */
2236 static gmx_inline gmx_simd_fint32_t
2237 gmx_simd_srli_fi(gmx_simd_fint32_t a, int n)
2238 {
2239     gmx_simd_fint32_t  c;
2240     int                i;
2241
2242     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2243     {
2244         c.i[i] = a.i[i] >> n;
2245     }
2246     return c;
2247 }
2248
2249 /*! \brief Integer SIMD bitwise and.
2250  *
2251  * You should typically call the real-precision \ref gmx_simd_and_i.
2252  *
2253  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
2254  *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
2255  *
2256  * \note You can \a not use this operation directly to select based on a boolean
2257  * SIMD variable, since booleans are separate from integer SIMD. If that
2258  * is what you need, have a look at \ref gmx_simd_blendzero_i instead.
2259  *
2260  * \param a first integer SIMD
2261  * \param b second integer SIMD
2262  * \return a \& b (bitwise and)
2263  */
2264 static gmx_inline gmx_simd_fint32_t
2265 gmx_simd_and_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2266 {
2267     gmx_simd_fint32_t  c;
2268     int                i;
2269
2270     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2271     {
2272         c.i[i] = a.i[i] & b.i[i];
2273     }
2274     return c;
2275 }
2276
2277 /*! \brief Integer SIMD bitwise not-and.
2278  *
2279  * You should typically call the real-precision \ref gmx_simd_andnot_i.
2280  *
2281  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
2282  *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
2283  *
2284  * Note that you can NOT use this operation directly to select based on a boolean
2285  * SIMD variable, since booleans are separate from integer SIMD. If that
2286  * is what you need, have a look at \ref gmx_simd_blendnotzero_i instead.
2287  *
2288  * \param a first integer SIMD
2289  * \param b second integer SIMD
2290  * \return (~a) \& b (bitwise andnot)
2291  */
2292 static gmx_inline gmx_simd_fint32_t
2293 gmx_simd_andnot_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2294 {
2295     gmx_simd_fint32_t  c;
2296     int                i;
2297
2298     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2299     {
2300         c.i[i] = (~a.i[i]) & b.i[i];
2301     }
2302     return c;
2303 }
2304
2305 /*! \brief Integer SIMD bitwise or.
2306  *
2307  * You should typically call the real-precision \ref gmx_simd_or_i.
2308  *
2309  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
2310  *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
2311  *
2312  * \param a first integer SIMD
2313  * \param b second integer SIMD
2314  * \return a \| b (bitwise or)
2315  */
2316 static gmx_inline gmx_simd_fint32_t
2317 gmx_simd_or_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2318 {
2319     gmx_simd_fint32_t  c;
2320     int                i;
2321
2322     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2323     {
2324         c.i[i] = a.i[i] | b.i[i];
2325     }
2326     return c;
2327 }
2328
2329 /*! \brief Integer SIMD bitwise xor.
2330  *
2331  * You should typically call the real-precision \ref gmx_simd_xor_i.
2332  *
2333  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_LOGICAL (single)
2334  *  or \ref GMX_SIMD_HAVE_DINT32_LOGICAL (double) is defined.
2335  *
2336  * \param a first integer SIMD
2337  * \param b second integer SIMD
2338  * \return a ^ b (bitwise xor)
2339  */
2340 static gmx_inline gmx_simd_fint32_t
2341 gmx_simd_xor_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2342 {
2343     gmx_simd_fint32_t  c;
2344     int                i;
2345
2346     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2347     {
2348         c.i[i] = a.i[i] ^ b.i[i];
2349     }
2350     return c;
2351 }
2352
2353 /*! \}
2354  *
2355  * \name SIMD implementation integer (corresponding to float) arithmetics
2356  * \{
2357  */
2358 /*! \brief Add SIMD integers.
2359  *
2360  * You should typically call the real-precision \ref gmx_simd_xor_i.
2361  *
2362  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2363  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2364  *
2365  * \param a term1
2366  * \param b term2
2367  * \return a+b
2368  */
2369 static gmx_inline gmx_simd_fint32_t
2370 gmx_simd_add_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2371 {
2372     gmx_simd_fint32_t  c;
2373     int                i;
2374
2375     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2376     {
2377         c.i[i] = a.i[i] + b.i[i];
2378     }
2379     return c;
2380 }
2381
2382 /*! \brief Subtract SIMD integers.
2383  *
2384  * You should typically call the real-precision \ref gmx_simd_xor_i.
2385  *
2386  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2387  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2388  *
2389  * \param a term1
2390  * \param b term2
2391  * \return a-b
2392  */
2393 static gmx_inline gmx_simd_fint32_t
2394 gmx_simd_sub_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2395 {
2396     gmx_simd_fint32_t  c;
2397     int                i;
2398
2399     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2400     {
2401         c.i[i] = a.i[i] - b.i[i];
2402     }
2403     return c;
2404 }
2405
2406 /*! \brief Multiply SIMD integers.
2407  *
2408  * You should typically call the real-precision \ref gmx_simd_xor_i.
2409  *
2410  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2411  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2412  *
2413  * \param a factor1
2414  * \param b factor2
2415  * \return a*b.
2416  *
2417  * \note Only the low 32 bits are retained, so this can overflow.
2418  */
2419 static gmx_inline gmx_simd_fint32_t
2420 gmx_simd_mul_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2421 {
2422     gmx_simd_fint32_t  c;
2423     int                i;
2424
2425     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2426     {
2427         c.i[i] = a.i[i]*b.i[i];
2428     }
2429     return c;
2430 }
2431
2432 /*! \}
2433  *
2434  * \name SIMD implementation integer (corresponding to float) comparisons, boolean, selection
2435  * \{
2436  */
2437
2438 /*! \brief Equality comparison of two integers corresponding to float values.
2439  *
2440  * You should typically call the real-precision \ref gmx_simd_cmpeq_i.
2441  *
2442  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2443  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2444  *
2445  * \param a SIMD integer1
2446  * \param b SIMD integer2
2447  * \return SIMD integer boolean with true for elements where a==b
2448  */
2449 static gmx_inline gmx_simd_fibool_t
2450 gmx_simd_cmpeq_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2451 {
2452     gmx_simd_fibool_t  c;
2453     int                i;
2454
2455     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2456     {
2457         c.b[i] = (a.i[i] == b.i[i]);
2458     }
2459     return c;
2460 }
2461
2462 /*! \brief Less-than comparison of two SIMD integers corresponding to float values.
2463  *
2464  * You should typically call the real-precision \ref gmx_simd_cmplt_i.
2465  *
2466  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2467  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2468  *
2469  * \param a SIMD integer1
2470  * \param b SIMD integer2
2471  * \return SIMD integer boolean with true for elements where a<b
2472  */
2473 static gmx_inline gmx_simd_fibool_t
2474 gmx_simd_cmplt_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b)
2475 {
2476     gmx_simd_fibool_t  c;
2477     int                i;
2478
2479     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2480     {
2481         c.b[i] = (a.i[i] < b.i[i]);
2482     }
2483     return c;
2484 }
2485
2486 /*! \brief Logical AND on gmx_simd_fibool_t.
2487  *
2488  * You should typically call the real-precision \ref gmx_simd_and_ib.
2489  *
2490  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2491  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2492  *
2493  * \param a SIMD boolean 1
2494  * \param b SIMD boolean 2
2495  * \return True for elements where both a and b are true.
2496  */
2497 static gmx_inline gmx_simd_fibool_t
2498 gmx_simd_and_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
2499 {
2500     gmx_simd_fibool_t c;
2501     int               i;
2502
2503     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2504     {
2505         c.b[i] = (a.b[i] && b.b[i]);
2506     }
2507     return c;
2508 }
2509
2510 /*! \brief Logical OR on gmx_simd_fibool_t.
2511  *
2512  * You should typically call the real-precision \ref gmx_simd_or_ib.
2513  *
2514  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2515  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2516  *
2517  * \param a SIMD boolean 1
2518  * \param b SIMD boolean 2
2519  * \return True for elements where both a and b are true.
2520  */
2521 static gmx_inline gmx_simd_fibool_t
2522 gmx_simd_or_fib(gmx_simd_fibool_t a, gmx_simd_fibool_t b)
2523 {
2524     gmx_simd_fibool_t  c;
2525     int                i;
2526
2527     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2528     {
2529         c.b[i] = (a.b[i] || b.b[i]);
2530     }
2531     return c;
2532 }
2533
2534 /*! \brief Returns non-zero if any of the boolean in x is True, otherwise 0.
2535  *
2536  * You should typically call the real-precision \ref gmx_simd_anytrue_ib.
2537  *
2538  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2539  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2540  *
2541  * The actual return value for "any true" will depend on the architecture.
2542  * Any non-zero value should be considered truth.
2543  *
2544  * \param a SIMD boolean
2545  * \return Nonzero integer if any of the elements in a is true, otherwise 0.
2546  */
2547 static gmx_inline int
2548 gmx_simd_anytrue_fib(gmx_simd_fibool_t a)
2549 {
2550     int             anytrue;
2551     int             i;
2552
2553     anytrue = 0;
2554     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2555     {
2556         anytrue = anytrue || a.b[i];
2557     }
2558     return anytrue;
2559 }
2560
2561 /*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is true.
2562  *
2563  * You should typically call the real-precision \ref gmx_simd_blendzero_i.
2564  *
2565  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2566  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2567  *
2568  * \param a SIMD integer to select from
2569  * \param sel Boolean selector
2570  * \return Elements from a where sel is true, 0 otherwise.
2571  */
2572 static gmx_inline gmx_simd_fint32_t
2573 gmx_simd_blendzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
2574 {
2575     gmx_simd_fint32_t  c;
2576     int                i;
2577
2578     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2579     {
2580         c.i[i] = sel.b[i] ? a.i[i] : 0.0;
2581     }
2582     return c;
2583 }
2584
2585 /*! \brief Select from \ref gmx_simd_fint32_t variable where boolean is false.
2586  *
2587  * You should typically call the real-precision \ref gmx_simd_blendnotzero_i.
2588  *
2589  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2590  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2591  *
2592  * \param a SIMD integer to select from
2593  * \param sel Boolean selector
2594  * \return Elements from a where sel is false, 0 otherwise (sic).
2595  */
2596 static gmx_inline gmx_simd_fint32_t
2597 gmx_simd_blendnotzero_fi(gmx_simd_fint32_t a, gmx_simd_fibool_t sel)
2598 {
2599     gmx_simd_fint32_t  c;
2600     int                i;
2601
2602     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2603     {
2604         c.i[i] = sel.b[i] ? 0.0 : a.i[i];
2605     }
2606     return c;
2607 }
2608
2609 /*! \brief Vector-blend SIMD selection.
2610  *
2611  * You should typically call the real-precision \ref gmx_simd_blendv_i.
2612  *
2613  * This routine is only available if \ref GMX_SIMD_HAVE_FINT32_ARITHMETICS (single)
2614  *  or \ref GMX_SIMD_HAVE_DINT32_ARITHMETICS (double) is defined.
2615  *
2616  * \param a First source
2617  * \param b Second source
2618  * \param sel Boolean selector
2619  * \return For each element, select b if sel is true, a otherwise.
2620  */
2621 static gmx_inline gmx_simd_fint32_t
2622 gmx_simd_blendv_fi(gmx_simd_fint32_t a, gmx_simd_fint32_t b, gmx_simd_fibool_t sel)
2623 {
2624     gmx_simd_fint32_t d;
2625     int               i;
2626
2627     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2628     {
2629         d.i[i] = sel.b[i] ? b.i[i] : a.i[i];
2630     }
2631     return d;
2632 }
2633
2634 /*! \}
2635  *
2636  * \name SIMD implementation integer (corresponding to double) bitwise logical operations
2637  * \{
2638  */
2639
2640 /*! \brief SIMD integer shift left, based on immediate value.
2641  *
2642  * \copydetails gmx_simd_slli_fi
2643  */
2644 static gmx_inline gmx_simd_dint32_t
2645 gmx_simd_slli_di(gmx_simd_dint32_t a, int n)
2646 {
2647     gmx_simd_dint32_t  c;
2648     int                i;
2649
2650     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2651     {
2652         c.i[i] = a.i[i] << n;
2653     }
2654     return c;
2655 }
2656
2657 /*! \brief SIMD integer shift right, based on immediate value.
2658  *
2659  * \copydetails gmx_simd_srli_fi
2660  */
2661 static gmx_inline gmx_simd_dint32_t
2662 gmx_simd_srli_di(gmx_simd_dint32_t a, int n)
2663 {
2664     gmx_simd_dint32_t  c;
2665     int                i;
2666
2667     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2668     {
2669         c.i[i] = a.i[i] >> n;
2670     }
2671     return c;
2672 }
2673
2674 /*! \brief Integer bitwise and for SIMD variables.
2675  *
2676  * \copydetails gmx_simd_and_fi
2677  */
2678 static gmx_inline gmx_simd_dint32_t
2679 gmx_simd_and_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2680 {
2681     gmx_simd_dint32_t  c;
2682     int                i;
2683
2684     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2685     {
2686         c.i[i] = a.i[i] & b.i[i];
2687     }
2688     return c;
2689 }
2690
2691 /*! \brief Integer bitwise not-and for SIMD variables.
2692  *
2693  * \copydetails gmx_simd_andnot_fi
2694  */
2695 static gmx_inline gmx_simd_dint32_t
2696 gmx_simd_andnot_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2697 {
2698     gmx_simd_dint32_t  c;
2699     int                i;
2700
2701     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2702     {
2703         c.i[i] = (~a.i[i]) & b.i[i];
2704     }
2705     return c;
2706 }
2707
2708 /*! \brief Integer bitwise or for SIMD variables.
2709  *
2710  * \copydetails gmx_simd_or_fi
2711  */
2712 static gmx_inline gmx_simd_dint32_t
2713 gmx_simd_or_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2714 {
2715     gmx_simd_dint32_t  c;
2716     int                i;
2717
2718     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2719     {
2720         c.i[i] = a.i[i] | b.i[i];
2721     }
2722     return c;
2723 }
2724
2725 /*! \brief Integer bitwise xor for SIMD variables.
2726  *
2727  * \copydetails gmx_simd_xor_fi
2728  */
2729 static gmx_inline gmx_simd_dint32_t
2730 gmx_simd_xor_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2731 {
2732     gmx_simd_dint32_t  c;
2733     int                i;
2734
2735     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2736     {
2737         c.i[i] = a.i[i] ^ b.i[i];
2738     }
2739     return c;
2740 }
2741
2742 /*! \}
2743  *
2744  * \name SIMD implementation integer (corresponding to double) arithmetics
2745  * \{
2746  */
2747 /*! \brief Add SIMD integers, corresponding to double precision.
2748  *
2749  * \copydetails gmx_simd_add_fi
2750  */
2751 static gmx_inline gmx_simd_dint32_t
2752 gmx_simd_add_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2753 {
2754     gmx_simd_dint32_t  c;
2755     int                i;
2756
2757     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2758     {
2759         c.i[i] = a.i[i] + b.i[i];
2760     }
2761     return c;
2762 }
2763
2764 /*! \brief Subtract SIMD integers, corresponding to double precision.
2765  *
2766  * \copydetails gmx_simd_sub_fi
2767  */
2768 static gmx_inline gmx_simd_dint32_t
2769 gmx_simd_sub_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2770 {
2771     gmx_simd_dint32_t  c;
2772     int                i;
2773
2774     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2775     {
2776         c.i[i] = a.i[i] - b.i[i];
2777     }
2778     return c;
2779 }
2780
2781 /*! \brief Multiply SIMD integers, corresponding to double precision.
2782  *
2783  * \copydetails gmx_simd_mul_fi
2784  */
2785 static gmx_inline gmx_simd_dint32_t
2786 gmx_simd_mul_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2787 {
2788     gmx_simd_dint32_t  c;
2789     int                i;
2790
2791     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2792     {
2793         c.i[i] = a.i[i]*b.i[i];
2794     }
2795     return c;
2796 }
2797
2798 /*! \}
2799  *
2800  * \name SIMD implementation integer (corresponding to double) comparisons, boolean selection
2801  * \{
2802  */
2803
2804 /*! \brief Equality comparison of two ints corresponding to double SIMD data.
2805  *
2806  * \copydetails gmx_simd_cmpeq_fi
2807  */
2808 static gmx_inline gmx_simd_dibool_t
2809 gmx_simd_cmpeq_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2810 {
2811     gmx_simd_dibool_t  c;
2812     int                i;
2813
2814     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2815     {
2816         c.b[i] = (a.i[i] == b.i[i]);
2817     }
2818     return c;
2819 }
2820
2821 /*! \brief Less-than comparison of two ints corresponding to double SIMD data.
2822  *
2823  * \copydetails gmx_simd_cmplt_fi
2824  */
2825 static gmx_inline gmx_simd_dibool_t
2826 gmx_simd_cmplt_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b)
2827 {
2828     gmx_simd_dibool_t  c;
2829     int                i;
2830
2831     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2832     {
2833         c.b[i] = (a.i[i] < b.i[i]);
2834     }
2835     return c;
2836 }
2837
2838 /*! \brief Logical AND on gmx_simd_dibool_t.
2839  *
2840  * \copydetails gmx_simd_and_fib
2841  */
2842 static gmx_inline gmx_simd_dibool_t
2843 gmx_simd_and_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
2844 {
2845     gmx_simd_dibool_t c;
2846     int               i;
2847
2848     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2849     {
2850         c.b[i] = (a.b[i] && b.b[i]);
2851     }
2852     return c;
2853 }
2854
2855 /*! \brief Logical OR on gmx_simd_dibool_t.
2856  *
2857  * \copydetails gmx_simd_or_fib
2858  */
2859 static gmx_inline gmx_simd_dibool_t
2860 gmx_simd_or_dib(gmx_simd_dibool_t a, gmx_simd_dibool_t b)
2861 {
2862     gmx_simd_dibool_t  c;
2863     int                i;
2864
2865     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2866     {
2867         c.b[i] = (a.b[i] || b.b[i]);
2868     }
2869     return c;
2870 }
2871
2872 /*! \brief Returns non-zero if any of the double-int SIMD booleans in x is True, otherwise 0.
2873  *
2874  * \copydetails gmx_simd_anytrue_fib
2875  */
2876 static gmx_inline int
2877 gmx_simd_anytrue_dib(gmx_simd_dibool_t a)
2878 {
2879     int             anytrue;
2880     int             i;
2881
2882     anytrue = 0;
2883     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2884     {
2885         anytrue = anytrue || a.b[i];
2886     }
2887     return anytrue;
2888 }
2889
2890 /*! \brief Select from SIMD ints (corresponding to double) where boolean is true.
2891  *
2892  * \copydetails gmx_simd_blendzero_fi
2893  */
2894 static gmx_inline gmx_simd_dint32_t
2895 gmx_simd_blendzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
2896 {
2897     gmx_simd_dint32_t  c;
2898     int                i;
2899
2900     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2901     {
2902         c.i[i] = sel.b[i] ? a.i[i] : 0.0;
2903     }
2904     return c;
2905 }
2906
2907 /*! \brief Select from SIMD ints (corresponding to double) where boolean is false.
2908  *
2909  * \copydetails gmx_simd_blendnotzero_fi
2910  */
2911 static gmx_inline gmx_simd_dint32_t
2912 gmx_simd_blendnotzero_di(gmx_simd_dint32_t a, gmx_simd_dibool_t sel)
2913 {
2914     gmx_simd_dint32_t  c;
2915     int                i;
2916
2917     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2918     {
2919         c.i[i] = sel.b[i] ? 0.0 : a.i[i];
2920     }
2921     return c;
2922 }
2923
2924 /*! \brief Vector-blend SIMD selection for double-int SIMD.
2925  *
2926  * \copydetails gmx_simd_blendv_fi
2927  */
2928 static gmx_inline gmx_simd_dint32_t
2929 gmx_simd_blendv_di(gmx_simd_dint32_t a, gmx_simd_dint32_t b, gmx_simd_dibool_t sel)
2930 {
2931     gmx_simd_dint32_t  d;
2932     int                i;
2933
2934     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
2935     {
2936         d.i[i] = sel.b[i] ? b.i[i] : a.i[i];
2937     }
2938     return d;
2939 }
2940
2941 /*! \}
2942  *
2943  * \name SIMD implementation conversion operations
2944  * \{
2945  */
2946
2947 /*! \brief Round single precision floating point to integer.
2948  *
2949  * You should typically call the real-precision \ref gmx_simd_cvt_r2i.
2950  *
2951  * \param a SIMD floating-point
2952  * \return SIMD integer, rounded to nearest integer.
2953  */
2954 static gmx_inline gmx_simd_fint32_t
2955 gmx_simd_cvt_f2i(gmx_simd_float_t a)
2956 {
2957     gmx_simd_fint32_t  b;
2958     int                i;
2959
2960     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2961     {
2962 #ifdef _MSC_VER
2963         b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
2964 #else
2965         b.i[i] = roundf(a.r[i]);
2966 #endif
2967     }
2968     return b;
2969 };
2970
2971 /*! \brief Truncate single precision floating point to integer.
2972  *
2973  * You should typically call the real-precision \ref gmx_simd_cvtt_r2i.
2974  *
2975  * \param a SIMD floating-point
2976  * \return SIMD integer, truncated towards zero.
2977  */
2978 static gmx_inline gmx_simd_fint32_t
2979 gmx_simd_cvtt_f2i(gmx_simd_float_t a)
2980 {
2981     gmx_simd_fint32_t  b;
2982     int                i;
2983
2984     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
2985     {
2986         b.i[i] = a.r[i];
2987     }
2988     return b;
2989 };
2990
2991 /*! \brief Convert integer to single precision floating-point.
2992  *
2993  * You should typically call the real-precision \ref gmx_simd_cvt_i2r.
2994  *
2995  * \param a SIMD integer
2996  * \return SIMD floating-pint
2997  */
2998 static gmx_inline gmx_simd_float_t
2999 gmx_simd_cvt_i2f(gmx_simd_fint32_t a)
3000 {
3001     gmx_simd_float_t   b;
3002     int                i;
3003
3004     for (i = 0; i < GMX_SIMD_FINT32_WIDTH; i++)
3005     {
3006         b.r[i] = a.i[i];
3007     }
3008     return b;
3009 };
3010
3011 /*! \brief Round double precision floating point to integer.
3012  *
3013  * \copydetails gmx_simd_cvt_f2i
3014  */
3015 static gmx_inline gmx_simd_dint32_t
3016 gmx_simd_cvt_d2i(gmx_simd_double_t a)
3017 {
3018     gmx_simd_dint32_t  b;
3019     int                i;
3020
3021     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
3022     {
3023 #ifdef _MSC_VER
3024         b.i[i] = (a.r[i] >= 0.0) ? (a.r[i] + 0.5) : (a.r[i] - 0.5);
3025 #else
3026         b.i[i] = round(a.r[i]);
3027 #endif
3028     }
3029     return b;
3030 };
3031
3032 /*! \brief Truncate double precision floating point to integer.
3033  *
3034  * \copydetails gmx_simd_cvtt_f2i
3035  */
3036 static gmx_inline gmx_simd_dint32_t
3037 gmx_simd_cvtt_d2i(gmx_simd_double_t a)
3038 {
3039     gmx_simd_dint32_t  b;
3040     int                i;
3041
3042     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
3043     {
3044         b.i[i] = a.r[i];
3045     }
3046     return b;
3047 };
3048
3049 /*! \brief Convert integer to single precision floating-point.
3050  *
3051  * \copydetails gmx_simd_cvt_i2f
3052  */
3053 static gmx_inline gmx_simd_double_t
3054 gmx_simd_cvt_i2d(gmx_simd_dint32_t a)
3055 {
3056     gmx_simd_double_t  b;
3057     int                i;
3058
3059     for (i = 0; i < GMX_SIMD_DINT32_WIDTH; i++)
3060     {
3061         b.r[i] = a.i[i];
3062     }
3063     return b;
3064 };
3065
3066 /*! \brief Convert from float boolean to corresponding integer boolean.
3067  *
3068  * You should typically call the real-precision \ref gmx_simd_cvt_b2ib.
3069  *
3070  * \param a Boolean corresponding to SIMD floating-point
3071  * \return Boolean that can be applied to SIMD integer operations.
3072  */
3073 static gmx_inline gmx_simd_fibool_t
3074 gmx_simd_cvt_fb2fib(gmx_simd_fbool_t a)
3075 {
3076     gmx_simd_fibool_t  b;
3077     int                i;
3078
3079     /* Integer width >= float width */
3080     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
3081     {
3082         b.b[i] = a.b[i];
3083     }
3084     return b;
3085 }
3086
3087 /*! \brief Convert from integer boolean (corresponding to float) to float boolean.
3088  *
3089  * You should typically call the real-precision \ref gmx_simd_cvt_ib2b.
3090  *
3091  * \param a Boolean corresponding to SIMD integer
3092  * \return Boolean that can be applied to SIMD floating-point.
3093  */
3094 static gmx_inline gmx_simd_fbool_t
3095 gmx_simd_cvt_fib2fb(gmx_simd_fibool_t a)
3096 {
3097     gmx_simd_fbool_t  b;
3098     int               i;
3099
3100     /* Integer width >= float width */
3101     for (i = 0; i < GMX_SIMD_FLOAT_WIDTH; i++)
3102     {
3103         b.b[i] = a.b[i];
3104     }
3105     return b;
3106 }
3107
3108 /*! \brief Convert from double boolean to corresponding integer boolean.
3109  *
3110  * \copydetails gmx_simd_cvt_fb2fib
3111  */
3112 static gmx_inline gmx_simd_dibool_t
3113 gmx_simd_cvt_db2dib(gmx_simd_dbool_t a)
3114 {
3115     gmx_simd_dibool_t  b;
3116     int                i;
3117
3118     /* Integer width >= double width */
3119     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3120     {
3121         b.b[i] = a.b[i];
3122     }
3123     return b;
3124 }
3125
3126 /*! \brief Convert from integer boolean (corresponding to double) to double boolean.
3127  *
3128  * \copydetails gmx_simd_cvt_fib2fb
3129  */
3130 static gmx_inline gmx_simd_dbool_t
3131 gmx_simd_cvt_dib2db(gmx_simd_dibool_t a)
3132 {
3133     gmx_simd_dbool_t  b;
3134     int               i;
3135
3136     /* Integer width >= double width */
3137     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3138     {
3139         b.b[i] = a.b[i];
3140     }
3141     return b;
3142 }
3143
3144 /*! \brief Convert SIMD float to double.
3145  *
3146  * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
3147  * \ref GMX_SIMD_DOUBLE_WIDTH.
3148  *
3149  * Float/double conversions are complex since the SIMD width could either
3150  * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
3151  * need to check for the width in the code, and have different code paths.
3152  *
3153  * \param f Single-precision SIMD variable
3154  * \return Double-precision SIMD variable of the same width
3155  */
3156 static gmx_inline gmx_simd_double_t
3157 gmx_simd_cvt_f2d(gmx_simd_float_t f)
3158 {
3159     gmx_simd_double_t d;
3160 #if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
3161     int               i;
3162     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3163     {
3164         d.r[i] = f.r[i];
3165     }
3166 #else
3167     gmx_fatal(FARGS, "gmx_simd_cvt_f2d() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
3168     /* Avoid compiler warnings */
3169     d.r[0] = f.r[0];
3170 #endif
3171     return d;
3172 }
3173
3174 /*! \brief Convert SIMD double to float.
3175  *
3176  * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is identical to
3177  * \ref GMX_SIMD_DOUBLE_WIDTH.
3178  *
3179  * Float/double conversions are complex since the SIMD width could either
3180  * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
3181  * need to check for the width in the code, and have different code paths.
3182  *
3183  * \param d Double-precision SIMD variable
3184  * \return Single-precision SIMD variable of the same width
3185  */
3186 static gmx_inline gmx_simd_float_t
3187 gmx_simd_cvt_d2f(gmx_simd_double_t d)
3188 {
3189     gmx_simd_float_t f;
3190 #if (GMX_SIMD_FLOAT_WIDTH == GMX_SIMD_DOUBLE_WIDTH)
3191     int              i;
3192     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3193     {
3194         f.r[i] = d.r[i];
3195     }
3196 #else
3197     gmx_fatal(FARGS, "gmx_simd_cvt_d2f() requires GMX_SIMD_FLOAT_WIDTH==GMX_SIMD_DOUBLE_WIDTH");
3198     /* Avoid compiler warnings */
3199     f.r[0] = d.r[0];
3200 #endif
3201     return f;
3202 }
3203
3204 /*! \brief Convert SIMD float to double.
3205  *
3206  * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
3207  * as \ref GMX_SIMD_DOUBLE_WIDTH.
3208  *
3209  * Float/double conversions are complex since the SIMD width could either
3210  * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
3211  * need to check for the width in the code, and have different code paths.
3212  *
3213  * \param f Single-precision SIMD variable
3214  * \param[out] d0 Double-precision SIMD variable, first half of values from f.
3215  * \param[out] d1 Double-precision SIMD variable, second half of values from f.
3216  */
3217 static gmx_inline void
3218 gmx_simd_cvt_f2dd(gmx_simd_float_t f, gmx_simd_double_t *d0, gmx_simd_double_t *d1)
3219 {
3220 #if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
3221     int i;
3222     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3223     {
3224         d0->r[i] = f.r[i];
3225         d1->r[i] = f.r[GMX_SIMD_DOUBLE_WIDTH+i];
3226     }
3227 #else
3228     gmx_fatal(FARGS, "gmx_simd_cvt_f2dd() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
3229     /* Avoid compiler warnings about unused arguments */
3230     d0->r[0] = f.r[0];
3231     d1->r[0] = f.r[0];
3232 #endif
3233 }
3234
3235 /*! \brief Convert SIMD double to float.
3236  *
3237  * This version is available if \ref GMX_SIMD_FLOAT_WIDTH is twice as large
3238  * as \ref GMX_SIMD_DOUBLE_WIDTH.
3239  *
3240  * Float/double conversions are complex since the SIMD width could either
3241  * be different (e.g. on x86) or identical (e.g. IBM QPX). This means you will
3242  * need to check for the width in the code, and have different code paths.
3243  *
3244  * \param d0 Double-precision SIMD variable, first half of values to put in f.
3245  * \param d1 Double-precision SIMD variable, second half of values to put in f.
3246  * \return Single-precision SIMD variable with all values.
3247  */
3248 static gmx_inline gmx_simd_float_t
3249 gmx_simd_cvt_dd2f(gmx_simd_double_t d0, gmx_simd_double_t d1)
3250 {
3251     gmx_simd_float_t f;
3252 #if (GMX_SIMD_FLOAT_WIDTH == 2*GMX_SIMD_DOUBLE_WIDTH)
3253     int              i;
3254     for (i = 0; i < GMX_SIMD_DOUBLE_WIDTH; i++)
3255     {
3256         f.r[i]                       = d0.r[i];
3257         f.r[GMX_SIMD_DOUBLE_WIDTH+i] = d1.r[i];
3258     }
3259 #else
3260     gmx_fatal(FARGS, "gmx_simd_cvt_dd2f() requires GMX_SIMD_FLOAT_WIDTH==2*GMX_SIMD_DOUBLE_WIDTH");
3261     /* Avoid compiler warnings about unused arguments & uninitialized f */
3262     f.r[0] = d0.r[0] + d1.r[0];
3263 #endif
3264     return f;
3265 }
3266
3267 /*! \} */
3268
3269 /*! \name SIMD4. Constant width-4 SIMD types and instructions
3270  * \{
3271  */
3272
3273 #if (GMX_SIMD_FLOAT_WIDTH == 4) || (defined DOXYGEN)
3274
3275
3276 /*! \brief SIMD4 float type. Available with \ref GMX_SIMD4_HAVE_FLOAT.
3277  *
3278  * Unless you specifically want a single-precision type you should check
3279  * \ref gmx_simd4_real_t instead.
3280  *
3281  * While the SIMD4 datatype is identical to the normal SIMD type in the
3282  * reference implementation, this will often not be the case for
3283  * other architectures.
3284  */
3285 #    define gmx_simd4_float_t    gmx_simd_float_t
3286
3287 /*! \brief Load SIMD4 float from aligned memory.
3288  *  \copydetails gmx_simd_load_f
3289  */
3290 #    define gmx_simd4_load_f     gmx_simd_load_f
3291
3292 /*! \brief Set all elements of SIMD4 float from single pointer.
3293  *  \copydetails gmx_simd_load1_f
3294  */
3295 #    define gmx_simd4_load1_f    gmx_simd_load1_f
3296
3297 /*! \brief Set all SIMD4 float elements to the value r.
3298  *  \copydetails gmx_simd_set1_f
3299  */
3300 #    define gmx_simd4_set1_f     gmx_simd_set1_f
3301
3302 /*! \brief Store the contents of SIMD4 float pr to aligned memory m.
3303  *  \copydetails gmx_simd_store_f
3304  */
3305 #    define gmx_simd4_store_f    gmx_simd_store_f
3306
3307 /*! \brief Load SIMD4 float from unaligned memory.
3308  * \copydetails gmx_simd_loadu_f
3309  */
3310 #    define gmx_simd4_loadu_f    gmx_simd_loadu_f
3311
3312 /*! \brief Store SIMD4 float to unaligned memory.
3313  * \copydetails gmx_simd_storeu_f
3314  */
3315 #    define gmx_simd4_storeu_f   gmx_simd_storeu_f
3316
3317 /*! \brief Set all SIMD4 float elements to 0.
3318  * \copydetails gmx_simd_setzero_f
3319  */
3320 #    define gmx_simd4_setzero_f  gmx_simd_setzero_f
3321
3322 /*! \brief Bitwise and for two SIMD4 float variables.
3323  * \copydetails gmx_simd_and_f
3324  */
3325 #    define gmx_simd4_and_f      gmx_simd_and_f
3326
3327 /*! \brief Bitwise andnot for two SIMD4 float variables. c=(~a) & b.
3328  * \copydetails gmx_simd_andnot_f
3329  */
3330 #    define gmx_simd4_andnot_f   gmx_simd_andnot_f
3331
3332 /*! \brief Bitwise or for two SIMD4 float variables.
3333  * \copydetails gmx_simd_or_f
3334  */
3335 #    define gmx_simd4_or_f       gmx_simd_or_f
3336
3337 /*! \brief Bitwise xor for two SIMD4 float variables.
3338  * \copydetails gmx_simd_xor_f
3339  */
3340 #    define gmx_simd4_xor_f      gmx_simd_xor_f
3341
3342 /*! \brief Add two SIMD4 float variables.
3343  * \copydetails gmx_simd_add_f
3344  */
3345 #    define gmx_simd4_add_f      gmx_simd_add_f
3346
3347 /*! \brief Subtract two SIMD4 float variables.
3348  * \copydetails gmx_simd_sub_f
3349  */
3350 #    define gmx_simd4_sub_f      gmx_simd_sub_f
3351
3352 /*! \brief Multiply two SIMD4 float variables.
3353  * \copydetails gmx_simd_mul_f
3354  */
3355 #    define gmx_simd4_mul_f      gmx_simd_mul_f
3356
3357 /*! \brief Fused-multiply-add for SIMD4 float. Result is a*b+c.
3358  * \copydetails gmx_simd_fmadd_f
3359  */
3360 #    define gmx_simd4_fmadd_f    gmx_simd_fmadd_f
3361
3362 /*! \brief Fused-multiply-subtract for SIMD4 float. Result is a*b-c.
3363  * \copydetails gmx_simd_fmsub_f
3364  */
3365 #    define gmx_simd4_fmsub_f    gmx_simd_fmsub_f
3366
3367 /*! \brief Fused-negated-multiply-add for SIMD4 float. Result is -a*b+c.
3368  * \copydetails gmx_simd_fnmadd_f
3369  */
3370 #    define gmx_simd4_fnmadd_f   gmx_simd_fnmadd_f
3371
3372 /*! \brief Fused-negated-multiply-add for SIMD4 float. Result is -a*b-c.
3373  * \copydetails gmx_simd_fnmsub_f
3374  */
3375 #    define gmx_simd4_fnmsub_f   gmx_simd_fnmsub_f
3376
3377 /*! \brief Lookup of approximate 1/sqrt(x) for SIMD4 float.
3378  * \copydetails gmx_simd_rsqrt_f
3379  */
3380 #    define gmx_simd4_rsqrt_f    gmx_simd_rsqrt_f
3381
3382 /*! \brief Floating-point absolute value for SIMD4 float.
3383  * \copydetails gmx_simd_fabs_f
3384  */
3385 #    define gmx_simd4_fabs_f     gmx_simd_fabs_f
3386
3387 /*! \brief Floating-point negate for SIMD4 float.
3388  * \copydetails gmx_simd_fneg_f
3389  */
3390 #    define gmx_simd4_fneg_f     gmx_simd_fneg_f
3391
3392 /*! \brief Set each SIMD4 float element to the largest from two variables.
3393  * \copydetails gmx_simd_max_f
3394  */
3395 #    define gmx_simd4_max_f      gmx_simd_max_f
3396
3397 /*! \brief Set each SIMD4 float element to the smallest from two variables.
3398  * \copydetails gmx_simd_min_f
3399  */
3400 #    define gmx_simd4_min_f      gmx_simd_min_f
3401
3402 /*! \brief Round to nearest integer value for SIMD4 float.
3403  * \copydetails gmx_simd_round_f
3404  */
3405 #    define gmx_simd4_round_f    gmx_simd_round_f
3406
3407 /*! \brief Round to largest integral value for SIMD4 float.
3408  * \copydetails gmx_simd_trunc_f
3409  */
3410 #    define gmx_simd4_trunc_f    gmx_simd_trunc_f
3411
3412 /*! \brief Return dot product of two single precision SIMD4 variables.
3413  *
3414  * The dot product is calculated between the first three elements in the two
3415  * vectors, while the fourth is ignored. The result is returned as a scalar.
3416  *
3417  * \param a vector1
3418  * \param b vector2
3419  * \result a[0]*b[0]+a[1]*b[1]+a[2]*b[2], returned as scalar. Last element is ignored.
3420  */
3421 static gmx_inline float
3422 gmx_simd4_dotproduct3_f(gmx_simd_float_t a, gmx_simd_float_t b)
3423 {
3424     return a.r[0]*b.r[0]+a.r[1]*b.r[1]+a.r[2]*b.r[2];
3425 }
3426
3427 /*! \brief SIMD4 variable type to use for logical comparisons on floats.
3428  * \copydetails gmx_simd_fbool_t
3429  */
3430 #    define gmx_simd4_fbool_t   gmx_simd_fbool_t
3431
3432 /*! \brief Equality comparison of two single precision SIMD4.
3433  * \copydetails gmx_simd_cmpeq_f
3434  */
3435 #    define gmx_simd4_cmpeq_f   gmx_simd_cmpeq_f
3436
3437 /*! \brief Less-than comparison of two single precision SIMD4.
3438  * \copydetails gmx_simd_cmplt_f
3439  */
3440 #    define gmx_simd4_cmplt_f   gmx_simd_cmplt_f
3441
3442 /*! \brief Less-than comparison of two single precision SIMD4.
3443  * \copydetails gmx_simd_cmple_f
3444  */
3445 #    define gmx_simd4_cmple_f   gmx_simd_cmple_f
3446
3447 /*! \brief Logical AND on float SIMD4 booleans.
3448  * \copydetails gmx_simd_and_fb
3449  */
3450 #    define gmx_simd4_and_fb gmx_simd_and_fb
3451
3452 /*! \brief Logical OR on float SIMD4 booleans.
3453  * \copydetails gmx_simd_or_fb
3454  */
3455 #    define gmx_simd4_or_fb gmx_simd_or_fb
3456
3457 /*! \brief Returns non-zero if any of the SIMD4 boolean in x is True.
3458  * \copydetails gmx_simd_anytrue_fb
3459  */
3460 #    define gmx_simd4_anytrue_fb gmx_simd_anytrue_fb
3461
3462 /*! \brief Select from single precision SIMD4 variable where boolean is true.
3463  * \copydetails gmx_simd_blendzero_f
3464  */
3465 #    define gmx_simd4_blendzero_f gmx_simd_blendzero_f
3466
3467 /*! \brief Select from single precision SIMD4 variable where boolean is false.
3468  * \copydetails gmx_simd_blendnotzero_f
3469  */
3470 #    define gmx_simd4_blendnotzero_f gmx_simd_blendnotzero_f
3471
3472 /*! \brief Vector-blend instruction form SIMD4 float.
3473  * \copydetails gmx_simd_blendv_f
3474  */
3475 #    define gmx_simd4_blendv_f  gmx_simd_blendv_f
3476
3477 /*! \brief Return sum of all elements in SIMD4 float.
3478  * \copydetails gmx_simd_reduce_f
3479  */
3480 #    define gmx_simd4_reduce_f  gmx_simd_reduce_f
3481
3482 #else /* GMX_SIMD_FLOAT_WIDTH!=4 */
3483 #    undef GMX_SIMD4_HAVE_FLOAT
3484 #endif
3485
3486
3487 #if (GMX_SIMD_DOUBLE_WIDTH == 4) || (defined DOXYGEN)
3488
3489 /*! \brief SIMD4 double type. Available with \ref GMX_SIMD4_HAVE_DOUBLE.
3490  *
3491  * Unless you specifically want a double-precision type you should check
3492  * \ref gmx_simd4_real_t instead.
3493  *
3494  * While the SIMD4 datatype is identical to the normal SIMD type in the
3495  * reference implementation, this will often not be the case for
3496  * other architectures.
3497  */
3498 #    define gmx_simd4_double_t   gmx_simd_double_t
3499
3500 /*! \brief Double precision SIMD4 load aligned.
3501  * \copydetails gmx_simd_load_d
3502  */
3503 #    define gmx_simd4_load_d     gmx_simd_load_d
3504
3505 /*! \brief Double precision SIMD4 load single value to all elements.
3506  * \copydetails gmx_simd_load1_d
3507  */
3508 #    define gmx_simd4_load1_d    gmx_simd_load1_d
3509
3510 /*! \brief Double precision SIMD4 set all elements from value.
3511  * \copydetails gmx_simd_set1_d
3512  */
3513 #    define gmx_simd4_set1_d     gmx_simd_set1_d
3514
3515 /*! \brief Double precision SIMD4 store to aligned memory.
3516  * \copydetails gmx_simd_store_d
3517  */
3518 #    define gmx_simd4_store_d   gmx_simd_store_d
3519
3520 /*! \brief Load unaligned SIMD4 double.
3521  * \copydetails gmx_simd_loadu_d
3522  */
3523 #    define gmx_simd4_loadu_d   gmx_simd_loadu_d
3524
3525 /*! \brief Store unaligned SIMD4 double.
3526  * \copydetails gmx_simd_storeu_d
3527  */
3528 #    define gmx_simd4_storeu_d  gmx_simd_storeu_d
3529
3530 /*! \brief Set all elements in SIMD4 double to 0.0.
3531  * \copydetails gmx_simd_setzero_d
3532  */
3533 #    define gmx_simd4_setzero_d gmx_simd_setzero_d
3534
3535 /*! \brief Bitwise and for two SIMD4 double variables.
3536  * \copydetails gmx_simd_and_d
3537  */
3538 #    define gmx_simd4_and_d     gmx_simd_and_d
3539
3540 /*! \brief Bitwise andnot for SIMD4 double. c=(~a) & b.
3541  * \copydetails gmx_simd_andnot_d
3542  */
3543 #    define gmx_simd4_andnot_d  gmx_simd_andnot_d
3544
3545 /*! \brief Bitwise or for SIMD4 double.
3546  * \copydetails gmx_simd_or_d
3547  */
3548 #    define gmx_simd4_or_d      gmx_simd_or_d
3549
3550 /*! \brief Bitwise xor for SIMD4 double.
3551  * \copydetails gmx_simd_xor_d
3552  */
3553 #    define gmx_simd4_xor_d     gmx_simd_xor_d
3554
3555 /*! \brief Add two SIMD4 double values.
3556  * \copydetails gmx_simd_add_d
3557  */
3558 #    define gmx_simd4_add_d     gmx_simd_add_d
3559
3560 /*! \brief Subtract two SIMD4 double values.
3561  * \copydetails gmx_simd_sub_d
3562  */
3563 #    define gmx_simd4_sub_d     gmx_simd_sub_d
3564
3565 /*! \brief Multiply two SIMD4 double values.
3566  * \copydetails gmx_simd_mul_d
3567  */
3568 #    define gmx_simd4_mul_d     gmx_simd_mul_d
3569
3570 /*! \brief Fused-multiply-add for SIMD4 double. Result is a*b+c.
3571  * \copydetails gmx_simd_fmadd_d
3572  */
3573 #    define gmx_simd4_fmadd_d   gmx_simd_fmadd_d
3574
3575 /*! \brief Fused-multiply-subtract for SIMD4 double. Result is a*b-c.
3576  * \copydetails gmx_simd_fmsub_d
3577  */
3578 #    define gmx_simd4_fmsub_d   gmx_simd_fmsub_d
3579
3580 /*! \brief Fused-negated-multiply-add for SIMD4 double. Result is -a*b+c.
3581  * \copydetails gmx_simd_fnmadd_d
3582  */
3583 #    define gmx_simd4_fnmadd_d  gmx_simd_fnmadd_d
3584
3585 /*! \brief Fused-negated-multiply-sub for SIMD4 double. Result is -a*b-c.
3586  * \copydetails gmx_simd_fnmsub_d
3587  */
3588 #    define gmx_simd4_fnmsub_d  gmx_simd_fnmsub_d
3589
3590 /*! \brief SIMD4 double 1.0/sqrt(x) lookup.
3591  * \copydetails gmx_simd_rsqrt_d
3592  */
3593 #    define gmx_simd4_rsqrt_d   gmx_simd_rsqrt_d
3594
3595 /*! \brief SIMD4 double Floating-point fabs().
3596  * \copydetails gmx_simd_fabs_d
3597  */
3598 #    define gmx_simd4_fabs_d    gmx_simd_fabs_d
3599
3600 /*! \brief SIMD4 double floating-point negate.
3601  * \copydetails gmx_simd_fneg_d
3602  */
3603 #    define gmx_simd4_fneg_d    gmx_simd_fneg_d
3604
3605 /*! \brief Set each SIMD4 element to the largest from two variables.
3606  * \copydetails gmx_simd_max_d
3607  */
3608 #    define gmx_simd4_max_d     gmx_simd_max_d
3609
3610 /*! \brief Set each SIMD4 element to the smallest from two variables.
3611  * \copydetails gmx_simd_min_d
3612  */
3613 #    define gmx_simd4_min_d     gmx_simd_min_d
3614
3615 /*!  \brief Round SIMD4 double to nearest integer value (in floating-point format).
3616  * \copydetails gmx_simd_round_d
3617  */
3618 #    define gmx_simd4_round_d   gmx_simd_round_d
3619
3620 /*! \brief Truncate SIMD4 double, i.e. round towards zero.
3621  * \copydetails gmx_simd_trunc_d
3622  */
3623 #    define gmx_simd4_trunc_d   gmx_simd_trunc_d
3624
3625 /*! \brief Return dot product of two double precision SIMD4 variables.
3626  * \copydetails gmx_simd_setzero_f
3627  */
3628 static gmx_inline double
3629 gmx_simd4_dotproduct3_d(gmx_simd_double_t a, gmx_simd_double_t b)
3630 {
3631     return a.r[0]*b.r[0]+a.r[1]*b.r[1]+a.r[2]*b.r[2];
3632 }
3633
3634 /*! \brief SIMD4 variable type to use for logical comparisons on doubles.
3635  * \copydetails gmx_simd_dbool_t
3636  */
3637 #    define gmx_simd4_dbool_t   gmx_simd_dbool_t
3638
3639 /*! \brief Equality comparison of two double precision SIMD4 values.
3640  * \copydetails gmx_simd_cmpeq_d
3641  */
3642 #    define gmx_simd4_cmpeq_d   gmx_simd_cmpeq_d
3643
3644 /*! \brief Less-than comparison of two double precision SIMD4 values.
3645  * \copydetails gmx_simd_cmplt_d
3646  */
3647 #    define gmx_simd4_cmplt_d   gmx_simd_cmplt_d
3648
3649 /*! \brief Less-than comparison of two double precision SIMD4 values.
3650  * \copydetails gmx_simd_cmple_d
3651  */
3652 #    define gmx_simd4_cmple_d   gmx_simd_cmple_d
3653
3654 /*! \brief Logical AND on double SIMD4 booleans.
3655  * \copydetails gmx_simd_and_db
3656  */
3657 #    define gmx_simd4_and_db gmx_simd_and_db
3658
3659 /*! \brief Logical OR on double SIMD4 booleans.
3660  * \copydetails gmx_simd_or_db
3661  */
3662 #    define gmx_simd4_or_db gmx_simd_or_db
3663
3664 /*! \brief Returns non-zero if any of the SIMD4 booleans in x is True.
3665  * \copydetails gmx_simd_anytrue_db
3666  */
3667 #    define gmx_simd4_anytrue_db gmx_simd_anytrue_db
3668
3669 /*! \brief Select from double precision SIMD4 variable where boolean is true.
3670  * \copydetails gmx_simd_blendzero_d
3671  */
3672 #    define gmx_simd4_blendzero_d gmx_simd_blendzero_d
3673
3674 /*! \brief Select from double precision SIMD4 variable where boolean is false.
3675  * \copydetails gmx_simd_blendnotzero_d
3676  */
3677 #    define gmx_simd4_blendnotzero_d gmx_simd_blendnotzero_d
3678
3679 /*! \brief Vector-blend instruction for SIMD4 double.
3680  * \copydetails gmx_simd_blendv_d
3681  */
3682 #    define gmx_simd4_blendv_d  gmx_simd_blendv_d
3683
3684 /*! \brief Return sum of all elements in SIMD4 double.
3685  * \copydetails gmx_simd_reduce_d
3686  */
3687 #    define gmx_simd4_reduce_d  gmx_simd_reduce_d
3688
3689 #else /* GMX_SIMD4_DOUBLE_WIDTH!=4 */
3690 #    undef GMX_SIMD4_HAVE_DOUBLE
3691 #endif
3692
3693 /*! \} */
3694
3695
3696 /*! \brief Return 1 if SIMD floating-point ops have overflowed, and reset check.
3697
3698  * This function to check whether SIMD operations have resulted in overflow,
3699  * and returns 1 if it occured, 0 otherwise.
3700  * For now, this is unfortunately a dummy for all architectures except x86.
3701  */
3702 static int
3703 gmx_simd_check_and_reset_overflow(void)
3704 {
3705     return 0;
3706 }
3707
3708 /*! \} */
3709 /*! \endcond */
3710
3711 #endif /* GMX_SIMD_IMPL_REFERENCE_H */