src/gromacs/simd/impl_reference/impl_reference_util_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
  37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
  38
  39 /*! \libinternal \file
  40  *
  41  * \brief Reference impl., higher-level single prec. SIMD utility functions
  42  *
  43  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
  44  *
  45  * \ingroup module_simd
  46  */
  47
  48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
  49  * since we want to be able run the low-level SIMD implementations independently
  50  * in simulators for new hardware.
  51  */
  52
  53 #include "config.h"
  54
  55 #include <cassert>
  56 #include <cstddef>
  57 #include <cstdint>
  58
  59 #include <algorithm>
  60
  61 #include "impl_reference_definitions.h"
  62 #include "impl_reference_simd_float.h"
  63
  64 namespace gmx
  65 {
  66
  67 /*! \cond libapi */
  68 /*! \addtogroup module_simd */
  69 /*! \{ */
  70
  71 /* \name Higher-level SIMD utility functions, single precision.
  72  *
  73  * These include generic functions to work with triplets of data, typically
  74  * coordinates, and a few utility functions to load and update data in the
  75  * nonbonded kernels.
  76  * These functions should be available on all implementations, although
  77  * some wide SIMD implementations (width>=8) also provide special optional
  78  * versions to work with half or quarter registers to improve the performance
  79  * in the nonbonded kernels.
  80  *
  81  * \{
  82  */
  83
  84 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
  85  *         and transpose into 4 SIMD float variables.
  86  *
  87  * \tparam     align  Alignment of the memory from which we read, i.e. distance
  88  *                    (measured in elements, not bytes) between index points.
  89  *                    When this is identical to the number of SIMD variables
  90  *                    (i.e., 4 for this routine) the input data is packed without
  91  *                    padding in memory. See the SIMD parameters for exactly
  92  *                    what memory positions are loaded.
  93  * \param      base   Pointer to the start of the memory area
  94  * \param      offset Array with offsets to the start of each data point.
  95  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
  96  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
  97  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
  98  * \param[out] v3     4th component of data, base[align*offset[i] + 3] for each i.
  99  *
 100  * The floating-point memory locations must be aligned, but only to the smaller
 101  * of four elements and the floating-point SIMD width.
 102  *
 103  * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
 104  *
 105  * \note You should NOT scale offsets before calling this routine; it is
 106  *       done internally by using the alignment template parameter instead.
 107  */
 108 template<int align>
 109 static inline void gmx_simdcall gatherLoadTranspose(const float*       base,
 110                                                     const std::int32_t offset[],
 111                                                     SimdFloat*         v0,
 112                                                     SimdFloat*         v1,
 113                                                     SimdFloat*         v2,
 114                                                     SimdFloat*         v3)
 115 {
 116     // Offset list must be aligned for SIMD FINT32
 117     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 118     // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
 119     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
 120     // align parameter must also be a multiple of the above alignment requirement
 121     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
 122
 123     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 124     {
 125         v0->simdInternal_[i] = base[align * offset[i]];
 126         v1->simdInternal_[i] = base[align * offset[i] + 1];
 127         v2->simdInternal_[i] = base[align * offset[i] + 2];
 128         v3->simdInternal_[i] = base[align * offset[i] + 3];
 129     }
 130 }
 131
 132 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
 133  *         and transpose into 2 SIMD float variables.
 134  *
 135  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 136  *                    (measured in elements, not bytes) between index points.
 137  *                    When this is identical to the number of SIMD variables
 138  *                    (i.e., 2 for this routine) the input data is packed without
 139  *                    padding in memory. See the SIMD parameters for exactly
 140  *                    what memory positions are loaded.
 141  * \param      base   Pointer to the start of the memory area
 142  * \param      offset Array with offsets to the start of each data point.
 143  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 144  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 145  *
 146  * The floating-point memory locations must be aligned, but only to the smaller
 147  * of two elements and the floating-point SIMD width.
 148  *
 149  * The offset memory must be aligned to GMX_SIMD_FINT32_WIDTH.
 150  *
 151  * To achieve the best possible performance, you should store your data with
 152  * alignment \ref c_simdBestPairAlignmentFloat in single, or
 153  * \ref c_simdBestPairAlignmentDouble in double.
 154  *
 155  * \note You should NOT scale offsets before calling this routine; it is
 156  *       done internally by using the alignment template parameter instead.
 157  */
 158 template<int align>
 159 static inline void gmx_simdcall
 160                    gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
 161 {
 162     // Offset list must be aligned for SIMD FINT32
 163     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 164     // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
 165     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
 166     // align parameter must also be a multiple of the above alignment requirement
 167     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 168
 169     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 170     {
 171         v0->simdInternal_[i] = base[align * offset[i]];
 172         v1->simdInternal_[i] = base[align * offset[i] + 1];
 173     }
 174 }
 175
 176
 177 /*! \brief Best alignment to use for aligned pairs of float data.
 178  *
 179  *  The routines to load and transpose data will work with a wide range of
 180  *  alignments, but some might be faster than others, depending on the load
 181  *  instructions available in the hardware. This specifies the best
 182  *  alignment for each implementation when working with pairs of data.
 183  *
 184  *  To allow each architecture to use the most optimal form, we use a constant
 185  *  that code outside the SIMD module should use to store things properly. It
 186  *  must be at least 2. For example, a value of 2 means the two parameters A & B
 187  *  are stored as [A0 B0 A1 B1] while align-4 means [A0 B0 - - A1 B1 - -].
 188  *
 189  *  This alignment depends on the efficiency of partial-register load/store
 190  *  operations, and will depend on the architecture.
 191  */
 192 static const int c_simdBestPairAlignmentFloat = 2;
 193
 194
 195 /*! \brief Load 3 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
 196  *         and transpose into 3 SIMD float variables.
 197  *
 198  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 199  *                    (measured in elements, not bytes) between index points.
 200  *                    When this is identical to the number of SIMD variables
 201  *                    (i.e., 3 for this routine) the input data is packed without
 202  *                    padding in memory. See the SIMD parameters for exactly
 203  *                    what memory positions are loaded.
 204  * \param      base   Pointer to the start of the memory area
 205  * \param      offset Array with offsets to the start of each data point.
 206  * \param[out] v0     1st component of data, base[align*offset[i]] for each i.
 207  * \param[out] v1     2nd component of data, base[align*offset[i] + 1] for each i.
 208  * \param[out] v2     3rd component of data, base[align*offset[i] + 2] for each i.
 209  *
 210  * This function can work with both aligned (better performance) and unaligned
 211  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 212  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 213  * we account for this.
 214  * However, in the case where align is a power-of-two, we assume the base pointer
 215  * also has the same alignment, which will enable many platforms to use faster
 216  * aligned memory load operations.
 217  * An easy way to think of this is that each triplet of data in memory must be
 218  * aligned to the align parameter you specify when it's a power-of-two.
 219  *
 220  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 221  * enables us to use SIMD loads and gather operations on platforms that support it.
 222  *
 223  * \note You should NOT scale offsets before calling this routine; it is
 224  *       done internally by using the alignment template parameter instead.
 225  * \note This routine uses a normal array for the offsets, since we typically
 226  *       load this data from memory. On the architectures we have tested this
 227  *       is faster even when a SIMD integer datatype is present.
 228  * \note To improve performance, this function might use full-SIMD-width
 229  *       unaligned loads. This means you need to ensure the memory is padded
 230  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 231  *       starting at the last offset. If you use the Gromacs aligned memory
 232  *       allocation routines this will always be the case.
 233  */
 234 template<int align>
 235 static inline void gmx_simdcall gatherLoadUTranspose(const float*       base,
 236                                                      const std::int32_t offset[],
 237                                                      SimdFloat*         v0,
 238                                                      SimdFloat*         v1,
 239                                                      SimdFloat*         v2)
 240 {
 241     // Offset list must be aligned for SIMD FINT32
 242     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 243
 244     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 245     {
 246         v0->simdInternal_[i] = base[align * offset[i]];
 247         v1->simdInternal_[i] = base[align * offset[i] + 1];
 248         v2->simdInternal_[i] = base[align * offset[i] + 2];
 249     }
 250 }
 251
 252
 253 /*! \brief Transpose and store 3 SIMD floats to 3 consecutive addresses at
 254  *         GMX_SIMD_FLOAT_WIDTH offsets.
 255  *
 256  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 257  *                    (measured in elements, not bytes) between index points.
 258  *                    When this is identical to the number of SIMD variables
 259  *                    (i.e., 3 for this routine) the output data is packed without
 260  *                    padding in memory. See the SIMD parameters for exactly
 261  *                    what memory positions are written.
 262  * \param[out] base   Pointer to the start of the memory area
 263  * \param      offset Aligned array with offsets to the start of each triplet.
 264  * \param      v0     1st component of triplets, written to base[align*offset[i]].
 265  * \param      v1     2nd component of triplets, written to base[align*offset[i] + 1].
 266  * \param      v2     3rd component of triplets, written to base[align*offset[i] + 2].
 267  *
 268  * This function can work with both aligned (better performance) and unaligned
 269  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 270  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 271  * we account for this.
 272  * However, in the case where align is a power-of-two, we assume the base pointer
 273  * also has the same alignment, which will enable many platforms to use faster
 274  * aligned memory store operations.
 275  * An easy way to think of this is that each triplet of data in memory must be
 276  * aligned to the align parameter you specify when it's a power-of-two.
 277  *
 278  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 279  * enables us to use SIMD loads and gather operations on platforms that support it.
 280  *
 281  * \note You should NOT scale offsets before calling this routine; it is
 282  *       done internally by using the alignment template parameter instead.
 283  * \note This routine uses a normal array for the offsets, since we typically
 284  *       load the data from memory. On the architectures we have tested this
 285  *       is faster even when a SIMD integer datatype is present.
 286  */
 287 template<int align>
 288 static inline void gmx_simdcall
 289                    transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 290 {
 291     // Offset list must be aligned for SIMD FINT32
 292     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 293
 294     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 295     {
 296         base[align * offset[i]]     = v0.simdInternal_[i];
 297         base[align * offset[i] + 1] = v1.simdInternal_[i];
 298         base[align * offset[i] + 2] = v2.simdInternal_[i];
 299     }
 300 }
 301
 302
 303 /*! \brief Transpose and add 3 SIMD floats to 3 consecutive addresses at
 304  *         GMX_SIMD_FLOAT_WIDTH offsets.
 305  *
 306  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 307  *                    (measured in elements, not bytes) between index points.
 308  *                    When this is identical to the number of SIMD variables
 309  *                    (i.e., 3 for this routine) the output data is packed without
 310  *                    padding in memory. See the SIMD parameters for exactly
 311  *                    what memory positions are incremented.
 312  * \param[out] base   Pointer to the start of the memory area
 313  * \param      offset Aligned array with offsets to the start of each triplet.
 314  * \param      v0     1st component of triplets, added to base[align*offset[i]].
 315  * \param      v1     2nd component of triplets, added to base[align*offset[i] + 1].
 316  * \param      v2     3rd component of triplets, added to base[align*offset[i] + 2].
 317  *
 318  * This function can work with both aligned (better performance) and unaligned
 319  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 320  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 321  * we account for this.
 322  * However, in the case where align is a power-of-two, we assume the base pointer
 323  * also has the same alignment, which will enable many platforms to use faster
 324  * aligned memory load/store operations.
 325  * An easy way to think of this is that each triplet of data in memory must be
 326  * aligned to the align parameter you specify when it's a power-of-two.
 327  *
 328  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 329  * enables us to use SIMD loads and gather operations on platforms that support it.
 330  *
 331  * \note You should NOT scale offsets before calling this routine; it is
 332  *       done internally by using the alignment template parameter instead.
 333  * \note This routine uses a normal array for the offsets, since we typically
 334  *       load the data from memory. On the architectures we have tested this
 335  *       is faster even when a SIMD integer datatype is present.
 336  * \note To improve performance, this function might use full-SIMD-width
 337  *       unaligned load/store, and add 0.0 to the extra elements.
 338  *       This means you need to ensure the memory is padded
 339  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 340  *       starting at the last offset. If you use the Gromacs aligned memory
 341  *       allocation routines this will always be the case.
 342  */
 343 template<int align>
 344 static inline void gmx_simdcall
 345                    transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 346 {
 347     // Offset list must be aligned for SIMD FINT32
 348     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 349
 350     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 351     {
 352         base[align * offset[i]] += v0.simdInternal_[i];
 353         base[align * offset[i] + 1] += v1.simdInternal_[i];
 354         base[align * offset[i] + 2] += v2.simdInternal_[i];
 355     }
 356 }
 357
 358
 359 /*! \brief Transpose and subtract 3 SIMD floats to 3 consecutive addresses at
 360  *         GMX_SIMD_FLOAT_WIDTH offsets.
 361  *
 362  * \tparam     align  Alignment of the memory to which we write, i.e. distance
 363  *                    (measured in elements, not bytes) between index points.
 364  *                    When this is identical to the number of SIMD variables
 365  *                    (i.e., 3 for this routine) the output data is packed without
 366  *                    padding in memory. See the SIMD parameters for exactly
 367  *                    what memory positions are decremented.
 368  * \param[out] base    Pointer to start of memory.
 369  * \param      offset  Aligned array with offsets to the start of each triplet.
 370  * \param      v0      1st component, subtracted from base[align*offset[i]]
 371  * \param      v1      2nd component, subtracted from base[align*offset[i]+1]
 372  * \param      v2      3rd component, subtracted from base[align*offset[i]+2]
 373  *
 374  * This function can work with both aligned (better performance) and unaligned
 375  * memory. When the align parameter is not a power-of-two (align==3 would be normal
 376  * for packed atomic coordinates) the memory obviously cannot be aligned, and
 377  * we account for this.
 378  * However, in the case where align is a power-of-two, we assume the base pointer
 379  * also has the same alignment, which will enable many platforms to use faster
 380  * aligned memory load/store operations.
 381  * An easy way to think of this is that each triplet of data in memory must be
 382  * aligned to the align parameter you specify when it's a power-of-two.
 383  *
 384  * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
 385  * enables us to use SIMD loads and gather operations on platforms that support it.
 386  *
 387  * \note You should NOT scale offsets before calling this routine; it is
 388  *       done internally by using the alignment template parameter instead.
 389  * \note This routine uses a normal array for the offsets, since we typically
 390  *       load the data from memory. On the architectures we have tested this
 391  *       is faster even when a SIMD integer datatype is present.
 392  * \note To improve performance, this function might use full-SIMD-width
 393  *       unaligned load/store, and subtract 0.0 from the extra elements.
 394  *       This means you need to ensure the memory is padded
 395  *       at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
 396  *       starting at the last offset. If you use the Gromacs aligned memory
 397  *       allocation routines this will always be the case.
 398  */
 399 template<int align>
 400 static inline void gmx_simdcall
 401                    transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
 402 {
 403     // Offset list must be aligned for SIMD FINT32
 404     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
 405
 406     for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
 407     {
 408         base[align * offset[i]] -= v0.simdInternal_[i];
 409         base[align * offset[i] + 1] -= v1.simdInternal_[i];
 410         base[align * offset[i] + 2] -= v2.simdInternal_[i];
 411     }
 412 }
 413
 414
 415 /*! \brief Expand each element of float SIMD variable into three identical
 416  *         consecutive elements in three SIMD outputs.
 417  *
 418  * \param      scalar    Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
 419  * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
 420  * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
 421  * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
 422  *
 423  * This routine is meant to use for things like scalar-vector multiplication,
 424  * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
 425  * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
 426  * be changed to SIMD-friendly layout.
 427  *
 428  * In this case, load 3 full-width SIMD variables from the vector array (This
 429  * will always correspond to GMX_SIMD_FLOAT_WIDTH triplets),
 430  * load a single full-width variable from the scalar array, and
 431  * call this routine to expand the data. You can then simply multiply the
 432  * first, second and third pair of SIMD variables, and store the three
 433  * results back into a suitable vector-format array.
 434  */
 435 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat  scalar,
 436                                                         SimdFloat* triplets0,
 437                                                         SimdFloat* triplets1,
 438                                                         SimdFloat* triplets2)
 439 {
 440     for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
 441     {
 442         triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
 443         triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
 444         triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
 445     }
 446 }
 447
 448 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 449  *         specified by a SIMD integer, transpose into 4 SIMD float variables.
 450  *
 451  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 452  *                    (measured in elements, not bytes) between index points.
 453  *                    When this is identical to the number of SIMD variables
 454  *                    (i.e., 4 for this routine) the input data is packed without
 455  *                    padding in memory. See the SIMD parameters for exactly
 456  *                    what memory positions are loaded.
 457  * \param      base   Aligned pointer to the start of the memory.
 458  * \param      offset SIMD integer type with offsets to the start of each triplet.
 459  * \param[out] v0     First component, base[align*offset[i]] for each i.
 460  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 461  * \param[out] v2     Third component, base[align*offset[i] + 2] for each i.
 462  * \param[out] v3     Fourth component, base[align*offset[i] + 3] for each i.
 463  *
 464  * The floating-point memory locations must be aligned, but only to the smaller
 465  * of four elements and the floating-point SIMD width.
 466  *
 467  * \note You should NOT scale offsets before calling this routine; it is
 468  *       done internally by using the alignment template parameter instead.
 469  * \note This is a special routine primarily intended for loading Gromacs
 470  *       table data as efficiently as possible - this is the reason for using
 471  *       a SIMD offset index, since the result of the  real-to-integer conversion
 472  *       is present in a SIMD register just before calling this routine.
 473  */
 474 template<int align>
 475 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
 476                                                              SimdFInt32   offset,
 477                                                              SimdFloat*   v0,
 478                                                              SimdFloat*   v1,
 479                                                              SimdFloat*   v2,
 480                                                              SimdFloat*   v3)
 481 {
 482     // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
 483     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
 484     // align parameter must also be a multiple of the above alignment requirement
 485     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
 486
 487     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 488     {
 489         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 490         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 491         v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
 492         v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
 493     }
 494 }
 495
 496
 497 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 498  *         (unaligned) specified by SIMD integer, transpose into 2 SIMD floats.
 499  *
 500  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 501  *                    (measured in elements, not bytes) between index points.
 502  *                    When this is identical to the number of SIMD variables
 503  *                    (i.e., 2 for this routine) the input data is packed without
 504  *                    padding in memory. See the SIMD parameters for exactly
 505  *                    what memory positions are loaded.
 506  * \param      base   Pointer to the start of the memory.
 507  * \param      offset SIMD integer type with offsets to the start of each triplet.
 508  * \param[out] v0     First component, base[align*offset[i]] for each i.
 509  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 510  *
 511  * Since some SIMD architectures cannot handle any unaligned loads, this routine
 512  * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
 513  *
 514  * \note You should NOT scale offsets before calling this routine; it is
 515  *       done internally by using the alignment template parameter instead.
 516  * \note This is a special routine primarily intended for loading Gromacs
 517  *       table data as efficiently as possible - this is the reason for using
 518  *       a SIMD offset index, since the result of the  real-to-integer conversion
 519  *       is present in a SIMD register just before calling this routine.
 520  */
 521 template<int align>
 522 static inline void gmx_simdcall
 523                    gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 524 {
 525     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 526     {
 527         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 528         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 529     }
 530 }
 531
 532 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
 533  *         specified by a SIMD integer, transpose into 2 SIMD float variables.
 534  *
 535  * \tparam     align  Alignment of the memory from which we read, i.e. distance
 536  *                    (measured in elements, not bytes) between index points.
 537  *                    When this is identical to the number of SIMD variables
 538  *                    (i.e., 2 for this routine) the input data is packed without
 539  *                    padding in memory. See the SIMD parameters for exactly
 540  *                    what memory positions are loaded.
 541  * \param      base   Aligned pointer to the start of the memory.
 542  * \param      offset SIMD integer type with offsets to the start of each triplet.
 543  * \param[out] v0     First component, base[align*offset[i]] for each i.
 544  * \param[out] v1     Second component, base[align*offset[i] + 1] for each i.
 545  *
 546  * The floating-point memory locations must be aligned, but only to the smaller
 547  * of two elements and the floating-point SIMD width.
 548  *
 549  * \note You should NOT scale offsets before calling this routine; it is
 550  *       done internally by using the alignment template parameter instead.
 551  * \note This is a special routine primarily intended for loading Gromacs
 552  *       table data as efficiently as possible - this is the reason for using
 553  *       a SIMD offset index, since the result of the  real-to-integer conversion
 554  *       is present in a SIMD register just before calling this routine.
 555  */
 556 template<int align>
 557 static inline void gmx_simdcall
 558                    gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
 559 {
 560     // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
 561     assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
 562     // align parameter must also be a multiple of the above alignment requirement
 563     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 564
 565     for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
 566     {
 567         v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
 568         v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
 569     }
 570 }
 571
 572
 573 /*! \brief Reduce each of four SIMD floats, add those values to four consecutive
 574  *         floats in memory, return sum.
 575  *
 576  * \param m   Pointer to memory where four floats should be incremented
 577  * \param v0  SIMD variable whose sum should be added to m[0]
 578  * \param v1  SIMD variable whose sum should be added to m[1]
 579  * \param v2  SIMD variable whose sum should be added to m[2]
 580  * \param v3  SIMD variable whose sum should be added to m[3]
 581  *
 582  * \return Sum of all elements in the four SIMD variables.
 583  *
 584  * The pointer m must be aligned to the smaller of four elements and the
 585  * floating-point SIMD width.
 586  *
 587  * \note This is a special routine intended for the Gromacs nonbonded kernels.
 588  * It is used in the epilogue of the outer loop, where the variables will
 589  * contain unrolled forces for one outer-loop-particle each, corresponding to
 590  * a single coordinate (i.e, say, four x-coordinate force variables). These
 591  * should be summed and added to the force array in memory. Since we always work
 592  * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
 593  * When calculating the virial, we also need the total sum of all forces for
 594  * each coordinate. This is provided as the return value. For routines that
 595  * do not need these, this extra code will be optimized away completely if you
 596  * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
 597  */
 598 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
 599 {
 600     float sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
 601
 602     // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
 603     assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
 604
 605     sum[0] = reduce(v0);
 606     sum[1] = reduce(v1);
 607     sum[2] = reduce(v2);
 608     sum[3] = reduce(v3);
 609
 610     m[0] += sum[0];
 611     m[1] += sum[1];
 612     m[2] += sum[2];
 613     m[3] += sum[3];
 614
 615     return sum[0] + sum[1] + sum[2] + sum[3];
 616 }
 617
 618
 619 /*! \}
 620  *
 621  * \name Higher-level SIMD utilities accessing partial (half-width) SIMD floats.
 622  *
 623  * These functions are optional. The are only useful for SIMD implementation
 624  * where the width is 8 or larger, and where it would be inefficient
 625  * to process 4*8, 8*8, or more, interactions in parallel.
 626  *
 627  * Currently, only Intel provides very wide SIMD implementations, but these
 628  * also come with excellent support for loading, storing, accessing and
 629  * shuffling parts of the register in so-called 'lanes' of 4 bytes each.
 630  * We can use this to load separate parts into the low/high halves of the
 631  * register in the inner loop of the nonbonded kernel, which e.g. makes it
 632  * possible to process 4*4 nonbonded interactions as a pattern of 2*8. We
 633  * can also use implementations with width 16 or greater.
 634  *
 635  * To make this more generic, when \ref GMX_SIMD_HAVE_HSIMD_UTIL_REAL is 1,
 636  * the SIMD implementation provides seven special routines that:
 637  *
 638  * - Load the low/high parts of a SIMD variable from different pointers
 639  * - Load half the SIMD width from one pointer, and duplicate in low/high parts
 640  * - Load two reals, put 1st one in all low elements, and 2nd in all high ones.
 641  * - Store the low/high parts of a SIMD variable to different pointers
 642  * - Subtract both SIMD halves from a single half-SIMD-width memory location.
 643  * - Load aligned pairs (LJ parameters) from two base pointers, with a common
 644  *   offset list, and put these in the low/high SIMD halves.
 645  * - Reduce each half of two SIMD registers (i.e., 4 parts in total), increment
 646  *   four adjacent memory positions, and return the total sum.
 647  *
 648  * Remember: this is ONLY used when the native SIMD width is large. You will
 649  * just waste time if you implement it for normal 16-byte SIMD architectures.
 650  *
 651  * This is part of the new C++ SIMD interface, so these functions are only
 652  * available when using C++. Since some Gromacs code reliying on the SIMD
 653  * module is still C (not C++), we have kept the C-style naming for now - this
 654  * will change once we are entirely C++.
 655  *
 656  * \{
 657  */
 658
 659
 660 /*! \brief Load low & high parts of SIMD float from different locations.
 661  *
 662  * \param m0 Pointer to memory aligned to half SIMD width.
 663  * \param m1 Pointer to memory aligned to half SIMD width.
 664  *
 665  * \return SIMD variable with low part loaded from m0, high from m1.
 666  *
 667  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 668  */
 669 static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
 670 {
 671     SimdFloat a;
 672
 673     // Make sure the memory pointers are aligned to half float SIMD width
 674     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 675     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 676
 677     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 678     {
 679         a.simdInternal_[i]                              = m0[i];
 680         a.simdInternal_[a.simdInternal_.size() / 2 + i] = m1[i];
 681     }
 682     return a;
 683 }
 684
 685 /*! \brief Load half-SIMD-width float data, spread to both halves.
 686  *
 687  * \param m Pointer to memory aligned to half SIMD width.
 688  *
 689  * \return SIMD variable with both halves loaded from m..
 690  *
 691  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 692  */
 693 static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
 694 {
 695     SimdFloat a;
 696
 697     // Make sure the memory pointer is aligned
 698     assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 699
 700     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 701     {
 702         a.simdInternal_[i]                              = m[i];
 703         a.simdInternal_[a.simdInternal_.size() / 2 + i] = a.simdInternal_[i];
 704     }
 705     return a;
 706 }
 707
 708 /*! \brief Load two floats, spread 1st in low half, 2nd in high half.
 709  *
 710  * \param m Pointer to two adjacent float values.
 711  *
 712  * \return SIMD variable where all elements in the low half have been set
 713  *         to m[0], and all elements in high half to m[1].
 714  *
 715  * \note This routine always loads two values and sets the halves separately.
 716  *       If you want to set all elements to the same value, simply use
 717  *       the standard (non-half-SIMD) operations.
 718  *
 719  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 720  */
 721 static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
 722 {
 723     SimdFloat a;
 724
 725     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 726     {
 727         a.simdInternal_[i]                              = m[0];
 728         a.simdInternal_[a.simdInternal_.size() / 2 + i] = m[1];
 729     }
 730     return a;
 731 }
 732
 733
 734 /*! \brief Store low & high parts of SIMD float to different locations.
 735  *
 736  * \param m0 Pointer to memory aligned to half SIMD width.
 737  * \param m1 Pointer to memory aligned to half SIMD width.
 738  * \param a  SIMD variable. Low half should be stored to m0, high to m1.
 739  *
 740  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 741  */
 742 static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
 743 {
 744     // Make sure the memory pointers are aligned to half float SIMD width
 745     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 746     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 747
 748     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 749     {
 750         m0[i] = a.simdInternal_[i];
 751         m1[i] = a.simdInternal_[a.simdInternal_.size() / 2 + i];
 752     }
 753 }
 754
 755 /*! \brief Add each half of SIMD variable to separate memory adresses
 756  *
 757  * \param m0 Pointer to memory aligned to half SIMD width.
 758  * \param m1 Pointer to memory aligned to half SIMD width.
 759  * \param a  SIMD variable. Lower half will be added to m0, upper half to m1.
 760  *
 761  * The memory must be aligned to half SIMD width.
 762  *
 763  * \note The updated m0 value is written before m1 is read from memory, so
 764  *       the result will be correct even if the memory regions overlap.
 765  *
 766  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 767  */
 768 static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
 769 {
 770     // Make sure the memory pointer is aligned to half float SIMD width
 771     assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 772     assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 773
 774     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 775     {
 776         m0[i] += a.simdInternal_[i];
 777     }
 778     for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
 779     {
 780         m1[i] += a.simdInternal_[a.simdInternal_.size() / 2 + i];
 781     }
 782 }
 783
 784 /*! \brief Add the two halves of three SIMD floats, subtract the sum from
 785  *         three half-SIMD-width consecutive floats in memory.
 786  *
 787  * \param m  half-width aligned memory, from which sum of the halves will be subtracted.
 788  * \param a0 SIMD variable. Upper & lower halves will first be added.
 789  * \param a1 SIMD variable. Upper & lower halves will second be added.
 790  * \param a2 SIMD variable. Upper & lower halves will third be added.
 791  *
 792  * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
 793  * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
 794  * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
 795  *                             m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
 796  *                             m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
 797  *
 798  * The memory must be aligned to half SIMD width.
 799  *
 800  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 801  */
 802 static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
 803 {
 804     assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
 805     for (std::size_t i = 0; i < a0.simdInternal_.size() / 2; i++)
 806     {
 807         m[i] -= a0.simdInternal_[i] + a0.simdInternal_[a0.simdInternal_.size() / 2 + i];
 808     }
 809     for (std::size_t i = 0; i < a1.simdInternal_.size() / 2; i++)
 810     {
 811         m[a1.simdInternal_.size() / 2 + i] -=
 812                 a1.simdInternal_[i] + a1.simdInternal_[a1.simdInternal_.size() / 2 + i];
 813     }
 814     for (std::size_t i = 0; i < a2.simdInternal_.size() / 2; i++)
 815     {
 816         m[a2.simdInternal_.size() + i] -=
 817                 a2.simdInternal_[i] + a2.simdInternal_[a2.simdInternal_.size() / 2 + i];
 818     }
 819 }
 820
 821 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH/2 offsets,
 822  *         transpose into SIMD float (low half from base0, high from base1).
 823  *
 824  * \tparam     align  Alignment of the storage, i.e. the distance
 825  *                    (measured in elements, not bytes) between index points.
 826  *                    When this is identical to the number of output components
 827  *                    the data is packed without padding. This must be a
 828  *                    multiple of the alignment to keep all data aligned.
 829  * \param      base0  Pointer to base of first aligned memory
 830  * \param      base1  Pointer to base of second aligned memory
 831  * \param      offset Offset to the start of each pair
 832  * \param[out] v0     1st element in each pair, base0 in low and base1 in high half.
 833  * \param[out] v1     2nd element in each pair, base0 in low and base1 in high half.
 834  *
 835  * The offset array should be of half the SIMD width length, so it corresponds
 836  * to the half-SIMD-register operations. This also means it must be aligned
 837  * to half the integer SIMD width (i.e., GMX_SIMD_FINT32_WIDTH/2).
 838  *
 839  * The floating-point memory locations must be aligned, but only to the smaller
 840  * of two elements and the floating-point SIMD width.
 841  *
 842  * This routine is primarily designed to load nonbonded parameters in the
 843  * kernels. It is the equivalent of the full-width routine
 844  * gatherLoadTranspose(), but just
 845  * as the other hsimd routines it will pick half-SIMD-width data from base0
 846  * and put in the lower half, while the upper half comes from base1.
 847  *
 848  * For an example, assume the SIMD width is 8, align is 2, that
 849  * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
 850  *
 851  * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
 852  *
 853  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 854  */
 855 template<int align>
 856 static inline void gmx_simdcall gatherLoadTransposeHsimd(const float*       base0,
 857                                                          const float*       base1,
 858                                                          const std::int32_t offset[],
 859                                                          SimdFloat*         v0,
 860                                                          SimdFloat*         v1)
 861 {
 862     // Offset list must be aligned for half SIMD FINT32 width
 863     assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH / 2 * sizeof(std::int32_t)) == 0);
 864     // base pointers must be aligned to the smaller of 2 elements and float SIMD width
 865     assert(std::size_t(base0) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
 866     assert(std::size_t(base1) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
 867     // alignment parameter must be also be multiple of the above required alignment
 868     assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
 869
 870     for (std::size_t i = 0; i < v0->simdInternal_.size() / 2; i++)
 871     {
 872         v0->simdInternal_[i]                                = base0[align * offset[i]];
 873         v1->simdInternal_[i]                                = base0[align * offset[i] + 1];
 874         v0->simdInternal_[v0->simdInternal_.size() / 2 + i] = base1[align * offset[i]];
 875         v1->simdInternal_[v1->simdInternal_.size() / 2 + i] = base1[align * offset[i] + 1];
 876     }
 877 }
 878
 879 /*! \brief Reduce the 4 half-SIMD-with floats in 2 SIMD variables (sum halves),
 880  *         increment four consecutive floats in memory, return sum.
 881  *
 882  * \param m    Pointer to memory where the four values should be incremented
 883  * \param v0   Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
 884  * \param v1   Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
 885  *
 886  * \return Sum of all elements in the four SIMD variables.
 887  *
 888  * The pointer m must be aligned, but only to the smaller
 889  * of four elements and the floating-point SIMD width.
 890  *
 891  * \note This is the half-SIMD-width version of
 892  *      reduceIncr4ReturnSum(). The only difference is that the
 893  *      four half-SIMD inputs needed are present in the low/high halves of the
 894  *      two SIMD arguments.
 895  *
 896  * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
 897  */
 898 static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
 899 {
 900     // The 4 here corresponds to the 4 elements in memory, not any SIMD width
 901     float sum[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 902
 903     for (std::size_t i = 0; i < v0.simdInternal_.size() / 2; i++)
 904     {
 905         sum[0] += v0.simdInternal_[i];
 906         sum[1] += v0.simdInternal_[v0.simdInternal_.size() / 2 + i];
 907         sum[2] += v1.simdInternal_[i];
 908         sum[3] += v1.simdInternal_[v1.simdInternal_.size() / 2 + i];
 909     }
 910
 911     // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
 912     assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
 913
 914     m[0] += sum[0];
 915     m[1] += sum[1];
 916     m[2] += sum[2];
 917     m[3] += sum[3];
 918
 919     return sum[0] + sum[1] + sum[2] + sum[3];
 920 }
 921
 922 #if GMX_SIMD_FLOAT_WIDTH > 8 || defined DOXYGEN
 923 /*! \brief Load N floats and duplicate them 4 times each.
 924  *
 925  * \param m Pointer to unaligned memory
 926  *
 927  * \return SIMD variable with N floats from m duplicated 4x.
 928  *
 929  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
 930  * N is GMX_SIMD_FLOAT_WIDTH/4. Duplicated values are
 931  * contigous and different values are 4 positions in SIMD
 932  * apart.
 933  */
 934 static inline SimdFloat gmx_simdcall loadUNDuplicate4(const float* m)
 935 {
 936     SimdFloat a;
 937     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 938     {
 939         a.simdInternal_[i * 4]     = m[i];
 940         a.simdInternal_[i * 4 + 1] = m[i];
 941         a.simdInternal_[i * 4 + 2] = m[i];
 942         a.simdInternal_[i * 4 + 3] = m[i];
 943     }
 944     return a;
 945 }
 946
 947 /*! \brief Load 4 floats and duplicate them N times each.
 948  *
 949  * \param m Pointer to memory aligned to 4 floats
 950  *
 951  * \return SIMD variable with 4 floats from m duplicated Nx.
 952  *
 953  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
 954  * N is GMX_SIMD_FLOAT_WIDTH/4. Different values are
 955  * contigous and same values are 4 positions in SIMD
 956  * apart.
 957  */
 958 static inline SimdFloat gmx_simdcall load4DuplicateN(const float* m)
 959 {
 960     SimdFloat a;
 961     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 962     {
 963         a.simdInternal_[i * 4]     = m[0];
 964         a.simdInternal_[i * 4 + 1] = m[1];
 965         a.simdInternal_[i * 4 + 2] = m[2];
 966         a.simdInternal_[i * 4 + 3] = m[3];
 967     }
 968     return a;
 969 }
 970 #endif
 971
 972 #if GMX_SIMD_FLOAT_WIDTH >= 8 || defined DOXYGEN
 973 /*! \brief Load floats in blocks of 4 at fixed offsets
 974  *
 975  * \param m Pointer to unaligned memory
 976  * \param offset Offset in memory between input blocks of 4
 977  *
 978  * \return SIMD variable with floats from m.
 979  *
 980  * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
 981  * Blocks of 4 floats are loaded from m+n*offset where n
 982  * is the n-th block of 4 floats.
 983  */
 984 static inline SimdFloat gmx_simdcall loadU4NOffset(const float* m, int offset)
 985 {
 986     SimdFloat a;
 987     for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
 988     {
 989         a.simdInternal_[i * 4]     = m[offset * i + 0];
 990         a.simdInternal_[i * 4 + 1] = m[offset * i + 1];
 991         a.simdInternal_[i * 4 + 2] = m[offset * i + 2];
 992         a.simdInternal_[i * 4 + 3] = m[offset * i + 3];
 993     }
 994     return a;
 995 }
 996 #endif
 997
 998 /*! \} */
 999
1000 /*! \} */
1001 /*! \endcond */
1002
1003 } // namespace gmx
1004
1005 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H