src/gromacs/simd/impl_reference/impl_reference_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference impl., higher-level single prec. SIMD utility functions
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
  49  * since we want to be able run the low-level SIMD implementations independently
  50  * in simulators for new hardware.
  51  */
  52
  53 #include "config.h"
  54
  55 #include <cassert>
  56 #include <cstddef>
  57 #include <cstdint>
  58
  59 #include <algorithm>
  60
  61 #include "impl_reference_definitions.h"
  62 #include "impl_reference_simd_float.h"
  63
  64 namespace gmx
  65 {
  66
  67 /*! \cond libapi */
  68 /*! \addtogroup module_simd */
  69 /*! \{ */
  70
  71 /* \name Higher-level SIMD utility functions, single precision.
  72  *
  73  * These include generic functions to work with triplets of data, typically
  74  * coordinates, and a few utility functions to load and update data in the
  75  * nonbonded kernels.
  76  * These functions should be available on all implementations, although
  77  * some wide SIMD implementations (width>=8) also provide special optional
  78  * versions to work with half or quarter registers to improve the performance
  79  * in the nonbonded kernels.
  80  *
  81  * \{
  82  */
  83
  84 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
  85  *         and transpose into 4 SIMD float variables.
  86  *
  87  * \tparam     align  Alignment of the memory from which we read, i.e. distance
  88  *                    (measured in elements, not bytes) between index points.
  89  *                    When this is identical to the number of SIMD variables
  90  *                    (i.e., 4 for this routine) the input data is packed without
  91  *                    padding in memory. See the SIMD parameters for exactly
  92  *                    what memory positions are loaded.
  93  * \param      base   Pointer to the start of the memory area
  94  * \param      offset Array with offsets to the start of each data point.
  95  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
  96  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
  97  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
  98  * \param[out] v3     4th component of data, base[align*offset[i] + 3] for each i.
  99  *
 100  * The floating-point memory locations must be aligned, but only to the smaller
 101  * of four elements and the floating-point SIMD width.
 102  *
 103  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 104  *
 105  * \note You should NOT scale offsets before calling this routine; it is
 106  *       done internally by using the alignment template parameter instead.
 107  */
 108 template <int align>
 109 static inline void gmx_simdcall
 110 gatherLoadTranspose(const float  *        base,
 111                     const std::int32_t    offset[],
 112                     SimdFloat *           v0,
 113                     SimdFloat *           v1,
 114                     SimdFloat *           v2,
 115                     SimdFloat *           v3)
 116 {
 117     // Offset list must be aligned for SIMD FINT32
 118     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 119     // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
 120     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
 121     // align parameter must also be a multiple of the above alignment requirement
 122     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
 123
 124     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 125     {
 126         v0->simdInternal_[i] = base[align * offset[i]];
 127         v1->simdInternal_[i] = base[align * offset[i] + 1];
 128         v2->simdInternal_[i] = base[align * offset[i] + 2];
 129         v3->simdInternal_[i] = base[align * offset[i] + 3];
 130     }
 131 }
 132
 133 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
 134  *         and transpose into 2 SIMD float variables.
 135  *
 136  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 137  *                    (measured in elements, not bytes) between index points.
 138  *                    When this is identical to the number of SIMD variables
 139  *                    (i.e., 2 for this routine) the input data is packed without
 140  *                    padding in memory. See the SIMD parameters for exactly
 141  *                    what memory positions are loaded.
 142  * \param      base   Pointer to the start of the memory area
 143  * \param      offset Array with offsets to the start of each data point.
 144  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 145  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 146  *
 147  * The floating-point memory locations must be aligned, but only to the smaller
 148  * of two elements and the floating-point SIMD width.
 149  *
 150  * The offset memory must be aligned to GMX_SIMD_FINT32_WIDTH.
 151  *
 152  * To achieve the best possible performance, you should store your data with
 153  * alignment \ref c_simdBestPairAlignmentFloat in single, or
 154  * \ref c_simdBestPairAlignmentDouble in double.
 155  *
 156  * \note You should NOT scale offsets before calling this routine; it is
 157  *       done internally by using the alignment template parameter instead.
 158  */
 159 template <int align>
 160 static inline void gmx_simdcall
 161 gatherLoadTranspose(const float  *        base,
 162                     const std::int32_t    offset[],
 163                     SimdFloat *           v0,
 164                     SimdFloat *           v1)
 165 {
 166     // Offset list must be aligned for SIMD FINT32
 167     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 168     // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
 169     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
 170     // align parameter must also be a multiple of the above alignment requirement
 171     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 172
 173     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 174     {
 175         v0->simdInternal_[i] = base[align * offset[i]];
 176         v1->simdInternal_[i] = base[align * offset[i] + 1];
 177     }
 178 }
 179
 180
 181 /*! \brief Best alignment to use for aligned pairs of float data.
 182  *
 183  *  The routines to load and transpose data will work with a wide range of
 184  *  alignments, but some might be faster than others, depending on the load
 185  *  instructions available in the hardware. This specifies the best
 186  *  alignment for each implementation when working with pairs of data.
 187  *
 188  *  To allow each architecture to use the most optimal form, we use a constant
 189  *  that code outside the SIMD module should use to store things properly. It
 190  *  must be at least 2. For example, a value of 2 means the two parameters A & B
 191  *  are stored as [A0 B0 A1 B1] while align-4 means [A0 B0 - - A1 B1 - -].
 192  *
 193  *  This alignment depends on the efficiency of partial-register load/store
 194  *  operations, and will depend on the architecture.
 195  */
 196 static const int c_simdBestPairAlignmentFloat = 2;
 197
 198
 199 /*! \brief Load 3 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
 200  *         and transpose into 3 SIMD float variables.
 201  *
 202  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 203  *                    (measured in elements, not bytes) between index points.
 204  *                    When this is identical to the number of SIMD variables
 205  *                    (i.e., 3 for this routine) the input data is packed without
 206  *                    padding in memory. See the SIMD parameters for exactly
 207  *                    what memory positions are loaded.
 208  * \param      base   Pointer to the start of the memory area
 209  * \param      offset Array with offsets to the start of each data point.
 210  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 211  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 212  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
 213  *
 214  * This function can work with both aligned (better performance) and unaligned
 215  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 216  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 217  * we account for this.
 218  * However, in the case where align is a power-of-two, we assume the base pointer
 219  * also has the same alignment, which will enable many platforms to use faster
 220  * aligned memory load operations.
 221  * An easy way to think of this is that each triplet of data in memory must be
 222  * aligned to the align parameter you specify when it's a power-of-two.
 223  *
 224  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 225  * enables us to use SIMD loads and gather operations on platforms that support it.
 226  *
 227  * \note You should NOT scale offsets before calling this routine; it is
 228  *       done internally by using the alignment template parameter instead.
 229  * \note This routine uses a normal array for the offsets, since we typically
 230  *       load this data from memory. On the architectures we have tested this
 231  *       is faster even when a SIMD integer datatype is present.
 232  * \note To improve performance, this function might use full-SIMD-width
 233  *       unaligned loads. This means you need to ensure the memory is padded
 234  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 235  *       starting at the last offset. If you use the Gromacs aligned memory
 236  *       allocation routines this will always be the case.
 237  */
 238 template <int align>
 239 static inline void gmx_simdcall
 240 gatherLoadUTranspose(const float  *        base,
 241                      const std::int32_t    offset[],
 242                      SimdFloat *           v0,
 243                      SimdFloat *           v1,
 244                      SimdFloat *           v2)
 245 {
 246     // Offset list must be aligned for SIMD FINT32
 247     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 248
 249     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 250     {
 251         v0->simdInternal_[i] = base[align * offset[i]];
 252         v1->simdInternal_[i] = base[align * offset[i] + 1];
 253         v2->simdInternal_[i] = base[align * offset[i] + 2];
 254     }
 255 }
 256
 257
 258 /*! \brief Transpose and store 3 SIMD floats to 3 consecutive addresses at
 259  *         GMX_SIMD_FLOAT_WIDTH offsets.
 260  *
 261  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 262  *                    (measured in elements, not bytes) between index points.
 263  *                    When this is identical to the number of SIMD variables
 264  *                    (i.e., 3 for this routine) the output data is packed without
 265  *                    padding in memory. See the SIMD parameters for exactly
 266  *                    what memory positions are written.
 267  * \param[out] base   Pointer to the start of the memory area
 268  * \param      offset Aligned array with offsets to the start of each triplet.
 269  * \param      v0     1st component of triplets, written to base[align*offset[i]].
 270  * \param      v1     2nd component of triplets, written to base[align*offset[i] + 1].
 271  * \param      v2     3rd component of triplets, written to base[align*offset[i] + 2].
 272  *
 273  * This function can work with both aligned (better performance) and unaligned
 274  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 275  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 276  * we account for this.
 277  * However, in the case where align is a power-of-two, we assume the base pointer
 278  * also has the same alignment, which will enable many platforms to use faster
 279  * aligned memory store operations.
 280  * An easy way to think of this is that each triplet of data in memory must be
 281  * aligned to the align parameter you specify when it's a power-of-two.
 282  *
 283  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 284  * enables us to use SIMD loads and gather operations on platforms that support it.
 285  *
 286  * \note You should NOT scale offsets before calling this routine; it is
 287  *       done internally by using the alignment template parameter instead.
 288  * \note This routine uses a normal array for the offsets, since we typically
 289  *       load the data from memory. On the architectures we have tested this
 290  *       is faster even when a SIMD integer datatype is present.
 291  */
 292 template <int align>
 293 static inline void gmx_simdcall
 294 transposeScatterStoreU(float  *              base,
 295                        const std::int32_t    offset[],
 296                        SimdFloat             v0,
 297                        SimdFloat             v1,
 298                        SimdFloat             v2)
 299 {
 300     // Offset list must be aligned for SIMD FINT32
 301     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 302
 303     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 304     {
 305         base[align * offset[i]]     = v0.simdInternal_[i];
 306         base[align * offset[i] + 1] = v1.simdInternal_[i];
 307         base[align * offset[i] + 2] = v2.simdInternal_[i];
 308     }
 309 }
 310
 311
 312 /*! \brief Transpose and add 3 SIMD floats to 3 consecutive addresses at
 313  *         GMX_SIMD_FLOAT_WIDTH offsets.
 314  *
 315  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 316  *                    (measured in elements, not bytes) between index points.
 317  *                    When this is identical to the number of SIMD variables
 318  *                    (i.e., 3 for this routine) the output data is packed without
 319  *                    padding in memory. See the SIMD parameters for exactly
 320  *                    what memory positions are incremented.
 321  * \param[out] base   Pointer to the start of the memory area
 322  * \param      offset Aligned array with offsets to the start of each triplet.
 323  * \param      v0     1st component of triplets, added to base[align*offset[i]].
 324  * \param      v1     2nd component of triplets, added to base[align*offset[i] + 1].
 325  * \param      v2     3rd component of triplets, added to base[align*offset[i] + 2].
 326  *
 327  * This function can work with both aligned (better performance) and unaligned
 328  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 329  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 330  * we account for this.
 331  * However, in the case where align is a power-of-two, we assume the base pointer
 332  * also has the same alignment, which will enable many platforms to use faster
 333  * aligned memory load/store operations.
 334  * An easy way to think of this is that each triplet of data in memory must be
 335  * aligned to the align parameter you specify when it's a power-of-two.
 336  *
 337  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 338  * enables us to use SIMD loads and gather operations on platforms that support it.
 339  *
 340  * \note You should NOT scale offsets before calling this routine; it is
 341  *       done internally by using the alignment template parameter instead.
 342  * \note This routine uses a normal array for the offsets, since we typically
 343  *       load the data from memory. On the architectures we have tested this
 344  *       is faster even when a SIMD integer datatype is present.
 345  * \note To improve performance, this function might use full-SIMD-width
 346  *       unaligned load/store, and add 0.0 to the extra elements.
 347  *       This means you need to ensure the memory is padded
 348  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 349  *       starting at the last offset. If you use the Gromacs aligned memory
 350  *       allocation routines this will always be the case.
 351  */
 352 template <int align>
 353 static inline void gmx_simdcall
 354 transposeScatterIncrU(float  *              base,
 355                       const std::int32_t    offset[],
 356                       SimdFloat             v0,
 357                       SimdFloat             v1,
 358                       SimdFloat             v2)
 359 {
 360     // Offset list must be aligned for SIMD FINT32
 361     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 362
 363     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 364     {
 365         base[align * offset[i]]     += v0.simdInternal_[i];
 366         base[align * offset[i] + 1] += v1.simdInternal_[i];
 367         base[align * offset[i] + 2] += v2.simdInternal_[i];
 368     }
 369 }
 370
 371
 372 /*! \brief Transpose and subtract 3 SIMD floats to 3 consecutive addresses at
 373  *         GMX_SIMD_FLOAT_WIDTH offsets.
 374  *
 375  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 376  *                    (measured in elements, not bytes) between index points.
 377  *                    When this is identical to the number of SIMD variables
 378  *                    (i.e., 3 for this routine) the output data is packed without
 379  *                    padding in memory. See the SIMD parameters for exactly
 380  *                    what memory positions are decremented.
 381  * \param[out] base    Pointer to start of memory.
 382  * \param      offset  Aligned array with offsets to the start of each triplet.
 383  * \param      v0      1st component, subtracted from base[align*offset[i]]
 384  * \param      v1      2nd component, subtracted from base[align*offset[i]+1]
 385  * \param      v2      3rd component, subtracted from base[align*offset[i]+2]
 386  *
 387  * This function can work with both aligned (better performance) and unaligned
 388  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 389  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 390  * we account for this.
 391  * However, in the case where align is a power-of-two, we assume the base pointer
 392  * also has the same alignment, which will enable many platforms to use faster
 393  * aligned memory load/store operations.
 394  * An easy way to think of this is that each triplet of data in memory must be
 395  * aligned to the align parameter you specify when it's a power-of-two.
 396  *
 397  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 398  * enables us to use SIMD loads and gather operations on platforms that support it.
 399  *
 400  * \note You should NOT scale offsets before calling this routine; it is
 401  *       done internally by using the alignment template parameter instead.
 402  * \note This routine uses a normal array for the offsets, since we typically
 403  *       load the data from memory. On the architectures we have tested this
 404  *       is faster even when a SIMD integer datatype is present.
 405  * \note To improve performance, this function might use full-SIMD-width
 406  *       unaligned load/store, and subtract 0.0 from the extra elements.
 407  *       This means you need to ensure the memory is padded
 408  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 409  *       starting at the last offset. If you use the Gromacs aligned memory
 410  *       allocation routines this will always be the case.
 411  */
 412 template <int align>
 413 static inline void gmx_simdcall
 414 transposeScatterDecrU(float  *              base,
 415                       const std::int32_t    offset[],
 416                       SimdFloat             v0,
 417                       SimdFloat             v1,
 418                       SimdFloat             v2)
 419 {
 420     // Offset list must be aligned for SIMD FINT32
 421     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH*sizeof(std::int32_t)) == 0);
 422
 423     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 424     {
 425         base[align * offset[i]]     -= v0.simdInternal_[i];
 426         base[align * offset[i] + 1] -= v1.simdInternal_[i];
 427         base[align * offset[i] + 2] -= v2.simdInternal_[i];
 428     }
 429 }
 430
 431
 432 /*! \brief Expand each element of float SIMD variable into three identical
 433  *         consecutive elements in three SIMD outputs.
 434  *
 435  * \param      scalar    Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
 436  * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
 437  * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
 438  * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
 439  *
 440  * This routine is meant to use for things like scalar-vector multiplication,
 441  * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
 442  * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
 443  * be changed to SIMD-friendly layout.
 444  *
 445  * In this case, load 3 full-width SIMD variables from the vector array (This
 446  * will always correspond to GMX_SIMD_FLOAT_WIDTH triplets),
 447  * load a single full-width variable from the scalar array, and
 448  * call this routine to expand the data. You can then simply multiply the
 449  * first, second and third pair of SIMD variables, and store the three
 450  * results back into a suitable vector-format array.
 451  */
 452 static inline void gmx_simdcall
 453 expandScalarsToTriplets(SimdFloat    scalar,
 454                         SimdFloat *  triplets0,
 455                         SimdFloat *  triplets1,
 456                         SimdFloat *  triplets2)
 457 {
 458     for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
 459     {
 460         triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
 461         triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
 462         triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
 463     }
 464 }
 465
 466 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 467  *         specified by a SIMD integer, transpose into 4 SIMD float variables.
 468  *
 469  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 470  *                    (measured in elements, not bytes) between index points.
 471  *                    When this is identical to the number of SIMD variables
 472  *                    (i.e., 4 for this routine) the input data is packed without
 473  *                    padding in memory. See the SIMD parameters for exactly
 474  *                    what memory positions are loaded.
 475  * \param      base   Aligned pointer to the start of the memory.
 476  * \param      offset SIMD integer type with offsets to the start of each triplet.
 477  * \param[out] v0     First component, base[align*offset[i]] for each i.
 478  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 479  * \param[out] v2     Third component, base[align*offset[i] + 2] for each i.
 480  * \param[out] v3     Fourth component, base[align*offset[i] + 3] for each i.
 481  *
 482  * The floating-point memory locations must be aligned, but only to the smaller
 483  * of four elements and the floating-point SIMD width.
 484  *
 485  * \note You should NOT scale offsets before calling this routine; it is
 486  *       done internally by using the alignment template parameter instead.
 487  * \note This is a special routine primarily intended for loading Gromacs
 488  *       table data as efficiently as possible - this is the reason for using
 489  *       a SIMD offset index, since the result of the  real-to-integer conversion
 490  *       is present in a SIMD register just before calling this routine.
 491  */
 492 template <int align>
 493 static inline void gmx_simdcall
 494 gatherLoadBySimdIntTranspose(const float *       base,
 495                              SimdFInt32          offset,
 496                              SimdFloat *         v0,
 497                              SimdFloat *         v1,
 498                              SimdFloat *         v2,
 499                              SimdFloat *         v3)
 500 {
 501     // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
 502     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
 503     // align parameter must also be a multiple of the above alignment requirement
 504     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
 505
 506     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 507     {
 508         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 509         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 510         v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
 511         v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
 512     }
 513 }
 514
 515
 516 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 517  *         (unaligned) specified by SIMD integer, transpose into 2 SIMD floats.
 518  *
 519  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 520  *                    (measured in elements, not bytes) between index points.
 521  *                    When this is identical to the number of SIMD variables
 522  *                    (i.e., 2 for this routine) the input data is packed without
 523  *                    padding in memory. See the SIMD parameters for exactly
 524  *                    what memory positions are loaded.
 525  * \param      base   Pointer to the start of the memory.
 526  * \param      offset SIMD integer type with offsets to the start of each triplet.
 527  * \param[out] v0     First component, base[align*offset[i]] for each i.
 528  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 529  *
 530  * Since some SIMD architectures cannot handle any unaligned loads, this routine
 531  * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
 532  *
 533  * \note You should NOT scale offsets before calling this routine; it is
 534  *       done internally by using the alignment template parameter instead.
 535  * \note This is a special routine primarily intended for loading Gromacs
 536  *       table data as efficiently as possible - this is the reason for using
 537  *       a SIMD offset index, since the result of the  real-to-integer conversion
 538  *       is present in a SIMD register just before calling this routine.
 539  */
 540 template <int align>
 541 static inline void gmx_simdcall
 542 gatherLoadUBySimdIntTranspose(const float *       base,
 543                               SimdFInt32          offset,
 544                               SimdFloat *         v0,
 545                               SimdFloat *         v1)
 546 {
 547     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 548     {
 549         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 550         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 551     }
 552 }
 553
 554 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 555  *         specified by a SIMD integer, transpose into 2 SIMD float variables.
 556  *
 557  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 558  *                    (measured in elements, not bytes) between index points.
 559  *                    When this is identical to the number of SIMD variables
 560  *                    (i.e., 2 for this routine) the input data is packed without
 561  *                    padding in memory. See the SIMD parameters for exactly
 562  *                    what memory positions are loaded.
 563  * \param      base   Aligned pointer to the start of the memory.
 564  * \param      offset SIMD integer type with offsets to the start of each triplet.
 565  * \param[out] v0     First component, base[align*offset[i]] for each i.
 566  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 567  *
 568  * The floating-point memory locations must be aligned, but only to the smaller
 569  * of two elements and the floating-point SIMD width.
 570  *
 571  * \note You should NOT scale offsets before calling this routine; it is
 572  *       done internally by using the alignment template parameter instead.
 573  * \note This is a special routine primarily intended for loading Gromacs
 574  *       table data as efficiently as possible - this is the reason for using
 575  *       a SIMD offset index, since the result of the  real-to-integer conversion
 576  *       is present in a SIMD register just before calling this routine.
 577  */
 578 template <int align>
 579 static inline void gmx_simdcall
 580 gatherLoadBySimdIntTranspose(const float *       base,
 581                              SimdFInt32          offset,
 582                              SimdFloat *         v0,
 583                              SimdFloat *         v1)
 584 {
 585     // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
 586     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
 587     // align parameter must also be a multiple of the above alignment requirement
 588     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 589
 590     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 591     {
 592         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 593         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 594     }
 595 }
 596
 597
 598 /*! \brief Reduce each of four SIMD floats, add those values to four consecutive
 599  *         floats in memory, return sum.
 600  *
 601  * \param m   Pointer to memory where four floats should be incremented
 602  * \param v0  SIMD variable whose sum should be added to m[0]
 603  * \param v1  SIMD variable whose sum should be added to m[1]
 604  * \param v2  SIMD variable whose sum should be added to m[2]
 605  * \param v3  SIMD variable whose sum should be added to m[3]
 606  *
 607  * \return Sum of all elements in the four SIMD variables.
 608  *
 609  * The pointer m must be aligned to the smaller of four elements and the
 610  * floating-point SIMD width.
 611  *
 612  * \note This is a special routine intended for the Gromacs nonbonded kernels.
 613  * It is used in the epilogue of the outer loop, where the variables will
 614  * contain unrolled forces for one outer-loop-particle each, corresponding to
 615  * a single coordinate (i.e, say, four x-coordinate force variables). These
 616  * should be summed and added to the force array in memory. Since we always work
 617  * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
 618  * When calculating the virial, we also need the total sum of all forces for
 619  * each coordinate. This is provided as the return value. For routines that
 620  * do not need these, this extra code will be optimized away completely if you
 621  * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
 622  */
 623 static inline float gmx_simdcall
 624 reduceIncr4ReturnSum(float *           m,
 625                      SimdFloat         v0,
 626                      SimdFloat         v1,
 627                      SimdFloat         v2,
 628                      SimdFloat         v3)
 629 {
 630     float sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
 631
 632     // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
 633     assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
 634
 635     sum[0] = reduce(v0);
 636     sum[1] = reduce(v1);
 637     sum[2] = reduce(v2);
 638     sum[3] = reduce(v3);
 639
 640     m[0] += sum[0];
 641     m[1] += sum[1];
 642     m[2] += sum[2];
 643     m[3] += sum[3];
 644
 645     return sum[0] + sum[1] + sum[2] + sum[3];
 646 }
 647
 648
 649 /*! \}
 650  *
 651  * \name Higher-level SIMD utilities accessing partial (half-width) SIMD floats.
 652  *
 653  * These functions are optional. The are only useful for SIMD implementation
 654  * where the width is 8 or larger, and where it would be inefficient
 655  * to process 4*8, 8*8, or more, interactions in parallel.
 656  *
 657  * Currently, only Intel provides very wide SIMD implementations, but these
 658  * also come with excellent support for loading, storing, accessing and
 659  * shuffling parts of the register in so-called 'lanes' of 4 bytes each.
 660  * We can use this to load separate parts into the low/high halves of the
 661  * register in the inner loop of the nonbonded kernel, which e.g. makes it
 662  * possible to process 4*4 nonbonded interactions as a pattern of 2*8. We
 663  * can also use implementations with width 16 or greater.
 664  *
 665  * To make this more generic, when \ref GMX_SIMD_HAVE_HSIMD_UTIL_REAL is 1,
 666  * the SIMD implementation provides seven special routines that:
 667  *
 668  * - Load the low/high parts of a SIMD variable from different pointers
 669  * - Load half the SIMD width from one pointer, and duplicate in low/high parts
 670  * - Load two reals, put 1st one in all low elements, and 2nd in all high ones.
 671  * - Store the low/high parts of a SIMD variable to different pointers
 672  * - Subtract both SIMD halves from a single half-SIMD-width memory location.
 673  * - Load aligned pairs (LJ parameters) from two base pointers, with a common
 674  *   offset list, and put these in the low/high SIMD halves.
 675  * - Reduce each half of two SIMD registers (i.e., 4 parts in total), increment
 676  *   four adjacent memory positions, and return the total sum.
 677  *
 678  * Remember: this is ONLY used when the native SIMD width is large. You will
 679  * just waste time if you implement it for normal 16-byte SIMD architectures.
 680  *
 681  * This is part of the new C++ SIMD interface, so these functions are only
 682  * available when using C++. Since some Gromacs code reliying on the SIMD
 683  * module is still C (not C++), we have kept the C-style naming for now - this
 684  * will change once we are entirely C++.
 685  *
 686  * \{
 687  */
 688
 689
 690 /*! \brief Load low & high parts of SIMD float from different locations.
 691  *
 692  * \param m0 Pointer to memory aligned to half SIMD width.
 693  * \param m1 Pointer to memory aligned to half SIMD width.
 694  *
 695  * \return SIMD variable with low part loaded from m0, high from m1.
 696  *
 697  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 698  */
 699 static inline SimdFloat gmx_simdcall
 700 loadDualHsimd(const float *  m0,
 701               const float *  m1)
 702 {
 703     SimdFloat        a;
 704
 705     // Make sure the memory pointers are aligned to half float SIMD width
 706     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 707     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 708
 709     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 710     {
 711         a.simdInternal_[i]                            = m0[i];
 712         a.simdInternal_[a.simdInternal_.size()/2 + i] = m1[i];
 713     }
 714     return a;
 715 }
 716
 717 /*! \brief Load half-SIMD-width float data, spread to both halves.
 718  *
 719  * \param m Pointer to memory aligned to half SIMD width.
 720  *
 721  * \return SIMD variable with both halves loaded from m..
 722  *
 723  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 724  */
 725 static inline SimdFloat gmx_simdcall
 726 loadDuplicateHsimd(const float *  m)
 727 {
 728     SimdFloat        a;
 729
 730     // Make sure the memory pointer is aligned
 731     assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 732
 733     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 734     {
 735         a.simdInternal_[i]                            = m[i];
 736         a.simdInternal_[a.simdInternal_.size()/2 + i] = a.simdInternal_[i];
 737     }
 738     return a;
 739 }
 740
 741 /*! \brief Load two floats, spread 1st in low half, 2nd in high half.
 742  *
 743  * \param m Pointer to two adjacent float values.
 744  *
 745  * \return SIMD variable where all elements in the low half have been set
 746  *         to m[0], and all elements in high half to m[1].
 747  *
 748  * \note This routine always loads two values and sets the halves separately.
 749  *       If you want to set all elements to the same value, simply use
 750  *       the standard (non-half-SIMD) operations.
 751  *
 752  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 753  */
 754 static inline SimdFloat gmx_simdcall
 755 loadU1DualHsimd(const float *  m)
 756 {
 757     SimdFloat        a;
 758
 759     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 760     {
 761         a.simdInternal_[i]                            = m[0];
 762         a.simdInternal_[a.simdInternal_.size()/2 + i] = m[1];
 763     }
 764     return a;
 765 }
 766
 767
 768 /*! \brief Store low & high parts of SIMD float to different locations.
 769  *
 770  * \param m0 Pointer to memory aligned to half SIMD width.
 771  * \param m1 Pointer to memory aligned to half SIMD width.
 772  * \param a  SIMD variable. Low half should be stored to m0, high to m1.
 773  *
 774  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 775  */
 776 static inline void gmx_simdcall
 777 storeDualHsimd(float *           m0,
 778                float *           m1,
 779                SimdFloat         a)
 780 {
 781     // Make sure the memory pointers are aligned to half float SIMD width
 782     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 783     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 784
 785     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 786     {
 787         m0[i] = a.simdInternal_[i];
 788         m1[i] = a.simdInternal_[a.simdInternal_.size()/2 + i];
 789     }
 790 }
 791
 792 /*! \brief Add each half of SIMD variable to separate memory adresses
 793  *
 794  * \param m0 Pointer to memory aligned to half SIMD width.
 795  * \param m1 Pointer to memory aligned to half SIMD width.
 796  * \param a  SIMD variable. Lower half will be added to m0, upper half to m1.
 797  *
 798  * The memory must be aligned to half SIMD width.
 799  *
 800  * \note The updated m0 value is written before m1 is read from memory, so
 801  *       the result will be correct even if the memory regions overlap.
 802  *
 803  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 804  */
 805 static inline void gmx_simdcall
 806 incrDualHsimd(float *           m0,
 807               float *           m1,
 808               SimdFloat         a)
 809 {
 810     // Make sure the memory pointer is aligned to half float SIMD width
 811     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 812     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 813
 814     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 815     {
 816         m0[i] += a.simdInternal_[i];
 817     }
 818     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 819     {
 820         m1[i] += a.simdInternal_[a.simdInternal_.size()/2 + i];
 821     }
 822 }
 823
 824 /*! \brief Add the two halves of a SIMD float, subtract the sum from
 825  *         half-SIMD-width consecutive floats in memory.
 826  *
 827  * \param m  half-width aligned memory, from which sum of the halves will be subtracted.
 828  * \param a  SIMD variable. Upper & lower halves will first be added.
 829  *
 830  * If the SIMD width is 8 and contains [a b c d e f g h], the
 831  * memory will be modified to [m[0]-(a+e) m[1]-(b+f) m[2]-(c+g) m[3]-(d+h)].
 832  *
 833  * The memory must be aligned to half SIMD width.
 834  *
 835  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 836  */
 837 static inline void gmx_simdcall
 838 decrHsimd(float *           m,
 839           SimdFloat         a)
 840 {
 841     // Make sure the memory pointer is aligned to half float SIMD width
 842     assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH/2*sizeof(float)) == 0);
 843
 844     for (std::size_t i = 0; i < a.simdInternal_.size()/2; i++)
 845     {
 846         m[i] -= a.simdInternal_[i] + a.simdInternal_[a.simdInternal_.size()/2 + i];
 847     }
 848 }
 849
 850 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH/2 offsets,
 851  *         transpose into SIMD float (low half from base0, high from base1).
 852  *
 853  * \tparam     align  Alignment of the storage, i.e. the distance
 854  *                    (measured in elements, not bytes) between index points.
 855  *                    When this is identical to the number of output components
 856  *                    the data is packed without padding. This must be a
 857  *                    multiple of the alignment to keep all data aligned.
 858  * \param      base0  Pointer to base of first aligned memory
 859  * \param      base1  Pointer to base of second aligned memory
 860  * \param      offset Offset to the start of each pair
 861  * \param[out] v0     1st element in each pair, base0 in low and base1 in high half.
 862  * \param[out] v1     2nd element in each pair, base0 in low and base1 in high half.
 863  *
 864  * The offset array should be of half the SIMD width length, so it corresponds
 865  * to the half-SIMD-register operations. This also means it must be aligned
 866  * to half the integer SIMD width (i.e., GMX_SIMD_FINT32_WIDTH/2).
 867  *
 868  * The floating-point memory locations must be aligned, but only to the smaller
 869  * of two elements and the floating-point SIMD width.
 870  *
 871  * This routine is primarily designed to load nonbonded parameters in the
 872  * kernels. It is the equivalent of the full-width routine
 873  * gatherLoadTranspose(), but just
 874  * as the other hsimd routines it will pick half-SIMD-width data from base0
 875  * and put in the lower half, while the upper half comes from base1.
 876  *
 877  * For an example, assume the SIMD width is 8, align is 2, that
 878  * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
 879  *
 880  * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
 881  *
 882  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 883  */
 884 template <int align>
 885 static inline void gmx_simdcall
 886 gatherLoadTransposeHsimd(const float  *       base0,
 887                          const float  *       base1,
 888                          const std::int32_t   offset[],
 889                          SimdFloat *          v0,
 890                          SimdFloat *          v1)
 891 {
 892     // Offset list must be aligned for half SIMD FINT32 width
 893     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH/2*sizeof(std::int32_t)) == 0);
 894     // base pointers must be aligned to the smaller of 2 elements and float SIMD width
 895     assert(std::size_t(base0) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
 896     assert(std::size_t(base1) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2)*sizeof(float)) == 0);
 897     // alignment parameter must be also be multiple of the above required alignment
 898     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 899
 900     for (std::size_t i = 0; i < v0->simdInternal_.size()/2; i++)
 901     {
 902         v0->simdInternal_[i] = base0[align * offset[i]];
 903         v1->simdInternal_[i] = base0[align * offset[i] + 1];
 904         v0->simdInternal_[v0->simdInternal_.size()/2 + i] = base1[align * offset[i]];
 905         v1->simdInternal_[v1->simdInternal_.size()/2 + i] = base1[align * offset[i] + 1];
 906     }
 907 }
 908
 909 /*! \brief Reduce the 4 half-SIMD-with floats in 2 SIMD variables (sum halves),
 910  *         increment four consecutive floats in memory, return sum.
 911  *
 912  * \param m    Pointer to memory where the four values should be incremented
 913  * \param v0   Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
 914  * \param v1   Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
 915  *
 916  * \return Sum of all elements in the four SIMD variables.
 917  *
 918  * The pointer m must be aligned, but only to the smaller
 919  * of four elements and the floating-point SIMD width.
 920  *
 921  * \note This is the half-SIMD-width version of
 922  *      reduceIncr4ReturnSum(). The only difference is that the
 923  *      four half-SIMD inputs needed are present in the low/high halves of the
 924  *      two SIMD arguments.
 925  *
 926  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 927  */
 928 static inline float gmx_simdcall
 929 reduceIncr4ReturnSumHsimd(float *            m,
 930                           SimdFloat          v0,
 931                           SimdFloat          v1)
 932 {
 933     // The 4 here corresponds to the 4 elements in memory, not any SIMD width
 934     float sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
 935
 936     for (std::size_t i = 0; i < v0.simdInternal_.size()/2; i++)
 937     {
 938         sum[0] += v0.simdInternal_[i];
 939         sum[1] += v0.simdInternal_[v0.simdInternal_.size()/2 + i];
 940         sum[2] += v1.simdInternal_[i];
 941         sum[3] += v1.simdInternal_[v1.simdInternal_.size()/2 + i];
 942     }
 943
 944     // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
 945     assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4)*sizeof(float)) == 0);
 946
 947     m[0] += sum[0];
 948     m[1] += sum[1];
 949     m[2] += sum[2];
 950     m[3] += sum[3];
 951
 952     return sum[0] + sum[1] + sum[2] + sum[3];
 953 }
 954
 955 #if GMX_SIMD_FLOAT_WIDTH > 8 || defined DOXYGEN
 956 /*! \brief Load N floats and duplicate them 4 times each.
 957  *
 958  * \param m Pointer to unaligned memory
 959  *
 960  * \return SIMD variable with N floats from m duplicated 4x.
 961  *
 962  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
 963  * N is GMX_SIMD_FLOAT_WIDTH/4. Duplicated values are
 964  * contigous and different values are 4 positions in SIMD
 965  * apart.
 966  */
 967 static inline SimdFloat gmx_simdcall
 968 loadUNDuplicate4(const float* m)
 969 {
 970     SimdFloat        a;
 971     for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
 972     {
 973         a.simdInternal_[i*4]   = m[i];
 974         a.simdInternal_[i*4+1] = m[i];
 975         a.simdInternal_[i*4+2] = m[i];
 976         a.simdInternal_[i*4+3] = m[i];
 977     }
 978     return a;
 979 }
 980
 981 /*! \brief Load 4 floats and duplicate them N times each.
 982  *
 983  * \param m Pointer to memory aligned to 4 floats
 984  *
 985  * \return SIMD variable with 4 floats from m duplicated Nx.
 986  *
 987  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
 988  * N is GMX_SIMD_FLOAT_WIDTH/4. Different values are
 989  * contigous and same values are 4 positions in SIMD
 990  * apart.
 991  */
 992 static inline SimdFloat gmx_simdcall
 993 load4DuplicateN(const float* m)
 994 {
 995     SimdFloat        a;
 996     for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
 997     {
 998         a.simdInternal_[i*4]   = m[0];
 999         a.simdInternal_[i*4+1] = m[1];
1000         a.simdInternal_[i*4+2] = m[2];
1001         a.simdInternal_[i*4+3] = m[3];
1002     }
1003     return a;
1004 }
1005 #endif
1006
1007 #if GMX_SIMD_FLOAT_WIDTH >= 8 || defined DOXYGEN
1008 /*! \brief Load floats in blocks of 4 at fixed offsets
1009  *
1010  * \param m Pointer to unaligned memory
1011  * \param offset Offset in memory between input blocks of 4
1012  *
1013  * \return SIMD variable with floats from m.
1014  *
1015  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
1016  * Blocks of 4 floats are loaded from m+n*offset where n
1017  * is the n-th block of 4 floats.
1018  */
1019 static inline SimdFloat gmx_simdcall
1020 loadU4NOffset(const float* m, int offset)
1021 {
1022     SimdFloat        a;
1023     for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
1024     {
1025         a.simdInternal_[i*4]   = m[offset*i + 0];
1026         a.simdInternal_[i*4+1] = m[offset*i + 1];
1027         a.simdInternal_[i*4+2] = m[offset*i + 2];
1028         a.simdInternal_[i*4+3] = m[offset*i + 3];
1029     }
1030     return a;
1031 }
1032 #endif
1033
1034 /*! \} */
1035
1036 /*! \} */
1037 /*! \endcond */
1038
1039 }      // namespace gmx
1040
1041 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H