2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015,2017,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
37 #define GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H
39 /*! \libinternal \file
41 * \brief Reference impl., higher-level single prec. SIMD utility functions
43 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
45 * \ingroup module_simd
48 /* Avoid adding dependencies on the rest of GROMACS here (e.g. gmxassert.h)
49 * since we want to be able run the low-level SIMD implementations independently
50 * in simulators for new hardware.
61 #include "impl_reference_definitions.h"
62 #include "impl_reference_simd_float.h"
68 /*! \addtogroup module_simd */
71 /* \name Higher-level SIMD utility functions, single precision.
73 * These include generic functions to work with triplets of data, typically
74 * coordinates, and a few utility functions to load and update data in the
76 * These functions should be available on all implementations, although
77 * some wide SIMD implementations (width>=8) also provide special optional
78 * versions to work with half or quarter registers to improve the performance
79 * in the nonbonded kernels.
84 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
85 * and transpose into 4 SIMD float variables.
87 * \tparam align Alignment of the memory from which we read, i.e. distance
88 * (measured in elements, not bytes) between index points.
89 * When this is identical to the number of SIMD variables
90 * (i.e., 4 for this routine) the input data is packed without
91 * padding in memory. See the SIMD parameters for exactly
92 * what memory positions are loaded.
93 * \param base Pointer to the start of the memory area
94 * \param offset Array with offsets to the start of each data point.
95 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
96 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
97 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
98 * \param[out] v3 4th component of data, base[align*offset[i] + 3] for each i.
100 * The floating-point memory locations must be aligned, but only to the smaller
101 * of four elements and the floating-point SIMD width.
103 * The offset memory must be aligned to GMX_SIMD_DINT32_WIDTH.
105 * \note You should NOT scale offsets before calling this routine; it is
106 * done internally by using the alignment template parameter instead.
109 static inline void gmx_simdcall gatherLoadTranspose(const float* base,
110 const std::int32_t offset[],
116 // Offset list must be aligned for SIMD FINT32
117 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
118 // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
119 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
120 // align parameter must also be a multiple of the above alignment requirement
121 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
123 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
125 v0->simdInternal_[i] = base[align * offset[i]];
126 v1->simdInternal_[i] = base[align * offset[i] + 1];
127 v2->simdInternal_[i] = base[align * offset[i] + 2];
128 v3->simdInternal_[i] = base[align * offset[i] + 3];
132 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
133 * and transpose into 2 SIMD float variables.
135 * \tparam align Alignment of the memory from which we read, i.e. distance
136 * (measured in elements, not bytes) between index points.
137 * When this is identical to the number of SIMD variables
138 * (i.e., 2 for this routine) the input data is packed without
139 * padding in memory. See the SIMD parameters for exactly
140 * what memory positions are loaded.
141 * \param base Pointer to the start of the memory area
142 * \param offset Array with offsets to the start of each data point.
143 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
144 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
146 * The floating-point memory locations must be aligned, but only to the smaller
147 * of two elements and the floating-point SIMD width.
149 * The offset memory must be aligned to GMX_SIMD_FINT32_WIDTH.
151 * To achieve the best possible performance, you should store your data with
152 * alignment \ref c_simdBestPairAlignmentFloat in single, or
153 * \ref c_simdBestPairAlignmentDouble in double.
155 * \note You should NOT scale offsets before calling this routine; it is
156 * done internally by using the alignment template parameter instead.
159 static inline void gmx_simdcall
160 gatherLoadTranspose(const float* base, const std::int32_t offset[], SimdFloat* v0, SimdFloat* v1)
162 // Offset list must be aligned for SIMD FINT32
163 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
164 // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
165 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
166 // align parameter must also be a multiple of the above alignment requirement
167 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
169 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
171 v0->simdInternal_[i] = base[align * offset[i]];
172 v1->simdInternal_[i] = base[align * offset[i] + 1];
177 /*! \brief Best alignment to use for aligned pairs of float data.
179 * The routines to load and transpose data will work with a wide range of
180 * alignments, but some might be faster than others, depending on the load
181 * instructions available in the hardware. This specifies the best
182 * alignment for each implementation when working with pairs of data.
184 * To allow each architecture to use the most optimal form, we use a constant
185 * that code outside the SIMD module should use to store things properly. It
186 * must be at least 2. For example, a value of 2 means the two parameters A & B
187 * are stored as [A0 B0 A1 B1] while align-4 means [A0 B0 - - A1 B1 - -].
189 * This alignment depends on the efficiency of partial-register load/store
190 * operations, and will depend on the architecture.
192 static const int c_simdBestPairAlignmentFloat = 2;
195 /*! \brief Load 3 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets,
196 * and transpose into 3 SIMD float variables.
198 * \tparam align Alignment of the memory from which we read, i.e. distance
199 * (measured in elements, not bytes) between index points.
200 * When this is identical to the number of SIMD variables
201 * (i.e., 3 for this routine) the input data is packed without
202 * padding in memory. See the SIMD parameters for exactly
203 * what memory positions are loaded.
204 * \param base Pointer to the start of the memory area
205 * \param offset Array with offsets to the start of each data point.
206 * \param[out] v0 1st component of data, base[align*offset[i]] for each i.
207 * \param[out] v1 2nd component of data, base[align*offset[i] + 1] for each i.
208 * \param[out] v2 3rd component of data, base[align*offset[i] + 2] for each i.
210 * This function can work with both aligned (better performance) and unaligned
211 * memory. When the align parameter is not a power-of-two (align==3 would be normal
212 * for packed atomic coordinates) the memory obviously cannot be aligned, and
213 * we account for this.
214 * However, in the case where align is a power-of-two, we assume the base pointer
215 * also has the same alignment, which will enable many platforms to use faster
216 * aligned memory load operations.
217 * An easy way to think of this is that each triplet of data in memory must be
218 * aligned to the align parameter you specify when it's a power-of-two.
220 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
221 * enables us to use SIMD loads and gather operations on platforms that support it.
223 * \note You should NOT scale offsets before calling this routine; it is
224 * done internally by using the alignment template parameter instead.
225 * \note This routine uses a normal array for the offsets, since we typically
226 * load this data from memory. On the architectures we have tested this
227 * is faster even when a SIMD integer datatype is present.
228 * \note To improve performance, this function might use full-SIMD-width
229 * unaligned loads. This means you need to ensure the memory is padded
230 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
231 * starting at the last offset. If you use the Gromacs aligned memory
232 * allocation routines this will always be the case.
235 static inline void gmx_simdcall gatherLoadUTranspose(const float* base,
236 const std::int32_t offset[],
241 // Offset list must be aligned for SIMD FINT32
242 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
244 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
246 v0->simdInternal_[i] = base[align * offset[i]];
247 v1->simdInternal_[i] = base[align * offset[i] + 1];
248 v2->simdInternal_[i] = base[align * offset[i] + 2];
253 /*! \brief Transpose and store 3 SIMD floats to 3 consecutive addresses at
254 * GMX_SIMD_FLOAT_WIDTH offsets.
256 * \tparam align Alignment of the memory to which we write, i.e. distance
257 * (measured in elements, not bytes) between index points.
258 * When this is identical to the number of SIMD variables
259 * (i.e., 3 for this routine) the output data is packed without
260 * padding in memory. See the SIMD parameters for exactly
261 * what memory positions are written.
262 * \param[out] base Pointer to the start of the memory area
263 * \param offset Aligned array with offsets to the start of each triplet.
264 * \param v0 1st component of triplets, written to base[align*offset[i]].
265 * \param v1 2nd component of triplets, written to base[align*offset[i] + 1].
266 * \param v2 3rd component of triplets, written to base[align*offset[i] + 2].
268 * This function can work with both aligned (better performance) and unaligned
269 * memory. When the align parameter is not a power-of-two (align==3 would be normal
270 * for packed atomic coordinates) the memory obviously cannot be aligned, and
271 * we account for this.
272 * However, in the case where align is a power-of-two, we assume the base pointer
273 * also has the same alignment, which will enable many platforms to use faster
274 * aligned memory store operations.
275 * An easy way to think of this is that each triplet of data in memory must be
276 * aligned to the align parameter you specify when it's a power-of-two.
278 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
279 * enables us to use SIMD loads and gather operations on platforms that support it.
281 * \note You should NOT scale offsets before calling this routine; it is
282 * done internally by using the alignment template parameter instead.
283 * \note This routine uses a normal array for the offsets, since we typically
284 * load the data from memory. On the architectures we have tested this
285 * is faster even when a SIMD integer datatype is present.
288 static inline void gmx_simdcall
289 transposeScatterStoreU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
291 // Offset list must be aligned for SIMD FINT32
292 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
294 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
296 base[align * offset[i]] = v0.simdInternal_[i];
297 base[align * offset[i] + 1] = v1.simdInternal_[i];
298 base[align * offset[i] + 2] = v2.simdInternal_[i];
303 /*! \brief Transpose and add 3 SIMD floats to 3 consecutive addresses at
304 * GMX_SIMD_FLOAT_WIDTH offsets.
306 * \tparam align Alignment of the memory to which we write, i.e. distance
307 * (measured in elements, not bytes) between index points.
308 * When this is identical to the number of SIMD variables
309 * (i.e., 3 for this routine) the output data is packed without
310 * padding in memory. See the SIMD parameters for exactly
311 * what memory positions are incremented.
312 * \param[out] base Pointer to the start of the memory area
313 * \param offset Aligned array with offsets to the start of each triplet.
314 * \param v0 1st component of triplets, added to base[align*offset[i]].
315 * \param v1 2nd component of triplets, added to base[align*offset[i] + 1].
316 * \param v2 3rd component of triplets, added to base[align*offset[i] + 2].
318 * This function can work with both aligned (better performance) and unaligned
319 * memory. When the align parameter is not a power-of-two (align==3 would be normal
320 * for packed atomic coordinates) the memory obviously cannot be aligned, and
321 * we account for this.
322 * However, in the case where align is a power-of-two, we assume the base pointer
323 * also has the same alignment, which will enable many platforms to use faster
324 * aligned memory load/store operations.
325 * An easy way to think of this is that each triplet of data in memory must be
326 * aligned to the align parameter you specify when it's a power-of-two.
328 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
329 * enables us to use SIMD loads and gather operations on platforms that support it.
331 * \note You should NOT scale offsets before calling this routine; it is
332 * done internally by using the alignment template parameter instead.
333 * \note This routine uses a normal array for the offsets, since we typically
334 * load the data from memory. On the architectures we have tested this
335 * is faster even when a SIMD integer datatype is present.
336 * \note To improve performance, this function might use full-SIMD-width
337 * unaligned load/store, and add 0.0 to the extra elements.
338 * This means you need to ensure the memory is padded
339 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
340 * starting at the last offset. If you use the Gromacs aligned memory
341 * allocation routines this will always be the case.
344 static inline void gmx_simdcall
345 transposeScatterIncrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
347 // Offset list must be aligned for SIMD FINT32
348 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
350 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
352 base[align * offset[i]] += v0.simdInternal_[i];
353 base[align * offset[i] + 1] += v1.simdInternal_[i];
354 base[align * offset[i] + 2] += v2.simdInternal_[i];
359 /*! \brief Transpose and subtract 3 SIMD floats to 3 consecutive addresses at
360 * GMX_SIMD_FLOAT_WIDTH offsets.
362 * \tparam align Alignment of the memory to which we write, i.e. distance
363 * (measured in elements, not bytes) between index points.
364 * When this is identical to the number of SIMD variables
365 * (i.e., 3 for this routine) the output data is packed without
366 * padding in memory. See the SIMD parameters for exactly
367 * what memory positions are decremented.
368 * \param[out] base Pointer to start of memory.
369 * \param offset Aligned array with offsets to the start of each triplet.
370 * \param v0 1st component, subtracted from base[align*offset[i]]
371 * \param v1 2nd component, subtracted from base[align*offset[i]+1]
372 * \param v2 3rd component, subtracted from base[align*offset[i]+2]
374 * This function can work with both aligned (better performance) and unaligned
375 * memory. When the align parameter is not a power-of-two (align==3 would be normal
376 * for packed atomic coordinates) the memory obviously cannot be aligned, and
377 * we account for this.
378 * However, in the case where align is a power-of-two, we assume the base pointer
379 * also has the same alignment, which will enable many platforms to use faster
380 * aligned memory load/store operations.
381 * An easy way to think of this is that each triplet of data in memory must be
382 * aligned to the align parameter you specify when it's a power-of-two.
384 * The offset memory must always be aligned to GMX_SIMD_FINT32_WIDTH, since this
385 * enables us to use SIMD loads and gather operations on platforms that support it.
387 * \note You should NOT scale offsets before calling this routine; it is
388 * done internally by using the alignment template parameter instead.
389 * \note This routine uses a normal array for the offsets, since we typically
390 * load the data from memory. On the architectures we have tested this
391 * is faster even when a SIMD integer datatype is present.
392 * \note To improve performance, this function might use full-SIMD-width
393 * unaligned load/store, and subtract 0.0 from the extra elements.
394 * This means you need to ensure the memory is padded
395 * at the end, so we always can load GMX_SIMD_REAL_WIDTH elements
396 * starting at the last offset. If you use the Gromacs aligned memory
397 * allocation routines this will always be the case.
400 static inline void gmx_simdcall
401 transposeScatterDecrU(float* base, const std::int32_t offset[], SimdFloat v0, SimdFloat v1, SimdFloat v2)
403 // Offset list must be aligned for SIMD FINT32
404 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH * sizeof(std::int32_t)) == 0);
406 for (std::size_t i = 0; i < v0.simdInternal_.size(); i++)
408 base[align * offset[i]] -= v0.simdInternal_[i];
409 base[align * offset[i] + 1] -= v1.simdInternal_[i];
410 base[align * offset[i] + 2] -= v2.simdInternal_[i];
415 /*! \brief Expand each element of float SIMD variable into three identical
416 * consecutive elements in three SIMD outputs.
418 * \param scalar Floating-point input, e.g. [s0 s1 s2 s3] if width=4.
419 * \param[out] triplets0 First output, e.g. [s0 s0 s0 s1] if width=4.
420 * \param[out] triplets1 Second output, e.g. [s1 s1 s2 s2] if width=4.
421 * \param[out] triplets2 Third output, e.g. [s2 s3 s3 s3] if width=4.
423 * This routine is meant to use for things like scalar-vector multiplication,
424 * where the vectors are stored in a merged format like [x0 y0 z0 x1 y1 z1 ...],
425 * while the scalars are stored as [s0 s1 s2...], and the data cannot easily
426 * be changed to SIMD-friendly layout.
428 * In this case, load 3 full-width SIMD variables from the vector array (This
429 * will always correspond to GMX_SIMD_FLOAT_WIDTH triplets),
430 * load a single full-width variable from the scalar array, and
431 * call this routine to expand the data. You can then simply multiply the
432 * first, second and third pair of SIMD variables, and store the three
433 * results back into a suitable vector-format array.
435 static inline void gmx_simdcall expandScalarsToTriplets(SimdFloat scalar,
436 SimdFloat* triplets0,
437 SimdFloat* triplets1,
438 SimdFloat* triplets2)
440 for (std::size_t i = 0; i < scalar.simdInternal_.size(); i++)
442 triplets0->simdInternal_[i] = scalar.simdInternal_[i / 3];
443 triplets1->simdInternal_[i] = scalar.simdInternal_[(i + scalar.simdInternal_.size()) / 3];
444 triplets2->simdInternal_[i] = scalar.simdInternal_[(i + 2 * scalar.simdInternal_.size()) / 3];
448 /*! \brief Load 4 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
449 * specified by a SIMD integer, transpose into 4 SIMD float variables.
451 * \tparam align Alignment of the memory from which we read, i.e. distance
452 * (measured in elements, not bytes) between index points.
453 * When this is identical to the number of SIMD variables
454 * (i.e., 4 for this routine) the input data is packed without
455 * padding in memory. See the SIMD parameters for exactly
456 * what memory positions are loaded.
457 * \param base Aligned pointer to the start of the memory.
458 * \param offset SIMD integer type with offsets to the start of each triplet.
459 * \param[out] v0 First component, base[align*offset[i]] for each i.
460 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
461 * \param[out] v2 Third component, base[align*offset[i] + 2] for each i.
462 * \param[out] v3 Fourth component, base[align*offset[i] + 3] for each i.
464 * The floating-point memory locations must be aligned, but only to the smaller
465 * of four elements and the floating-point SIMD width.
467 * \note You should NOT scale offsets before calling this routine; it is
468 * done internally by using the alignment template parameter instead.
469 * \note This is a special routine primarily intended for loading Gromacs
470 * table data as efficiently as possible - this is the reason for using
471 * a SIMD offset index, since the result of the real-to-integer conversion
472 * is present in a SIMD register just before calling this routine.
475 static inline void gmx_simdcall gatherLoadBySimdIntTranspose(const float* base,
482 // Base pointer must be aligned to the smaller of 4 elements and float SIMD width
483 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
484 // align parameter must also be a multiple of the above alignment requirement
485 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 4) == 0);
487 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
489 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
490 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
491 v2->simdInternal_[i] = base[align * offset.simdInternal_[i] + 2];
492 v3->simdInternal_[i] = base[align * offset.simdInternal_[i] + 3];
497 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
498 * (unaligned) specified by SIMD integer, transpose into 2 SIMD floats.
500 * \tparam align Alignment of the memory from which we read, i.e. distance
501 * (measured in elements, not bytes) between index points.
502 * When this is identical to the number of SIMD variables
503 * (i.e., 2 for this routine) the input data is packed without
504 * padding in memory. See the SIMD parameters for exactly
505 * what memory positions are loaded.
506 * \param base Pointer to the start of the memory.
507 * \param offset SIMD integer type with offsets to the start of each triplet.
508 * \param[out] v0 First component, base[align*offset[i]] for each i.
509 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
511 * Since some SIMD architectures cannot handle any unaligned loads, this routine
512 * is only available if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE is 1.
514 * \note You should NOT scale offsets before calling this routine; it is
515 * done internally by using the alignment template parameter instead.
516 * \note This is a special routine primarily intended for loading Gromacs
517 * table data as efficiently as possible - this is the reason for using
518 * a SIMD offset index, since the result of the real-to-integer conversion
519 * is present in a SIMD register just before calling this routine.
522 static inline void gmx_simdcall
523 gatherLoadUBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
525 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
527 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
528 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
532 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH offsets
533 * specified by a SIMD integer, transpose into 2 SIMD float variables.
535 * \tparam align Alignment of the memory from which we read, i.e. distance
536 * (measured in elements, not bytes) between index points.
537 * When this is identical to the number of SIMD variables
538 * (i.e., 2 for this routine) the input data is packed without
539 * padding in memory. See the SIMD parameters for exactly
540 * what memory positions are loaded.
541 * \param base Aligned pointer to the start of the memory.
542 * \param offset SIMD integer type with offsets to the start of each triplet.
543 * \param[out] v0 First component, base[align*offset[i]] for each i.
544 * \param[out] v1 Second component, base[align*offset[i] + 1] for each i.
546 * The floating-point memory locations must be aligned, but only to the smaller
547 * of two elements and the floating-point SIMD width.
549 * \note You should NOT scale offsets before calling this routine; it is
550 * done internally by using the alignment template parameter instead.
551 * \note This is a special routine primarily intended for loading Gromacs
552 * table data as efficiently as possible - this is the reason for using
553 * a SIMD offset index, since the result of the real-to-integer conversion
554 * is present in a SIMD register just before calling this routine.
557 static inline void gmx_simdcall
558 gatherLoadBySimdIntTranspose(const float* base, SimdFInt32 offset, SimdFloat* v0, SimdFloat* v1)
560 // Base pointer must be aligned to the smaller of 2 elements and float SIMD width
561 assert(std::size_t(base) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
562 // align parameter must also be a multiple of the above alignment requirement
563 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
565 for (std::size_t i = 0; i < v0->simdInternal_.size(); i++)
567 v0->simdInternal_[i] = base[align * offset.simdInternal_[i]];
568 v1->simdInternal_[i] = base[align * offset.simdInternal_[i] + 1];
573 /*! \brief Reduce each of four SIMD floats, add those values to four consecutive
574 * floats in memory, return sum.
576 * \param m Pointer to memory where four floats should be incremented
577 * \param v0 SIMD variable whose sum should be added to m[0]
578 * \param v1 SIMD variable whose sum should be added to m[1]
579 * \param v2 SIMD variable whose sum should be added to m[2]
580 * \param v3 SIMD variable whose sum should be added to m[3]
582 * \return Sum of all elements in the four SIMD variables.
584 * The pointer m must be aligned to the smaller of four elements and the
585 * floating-point SIMD width.
587 * \note This is a special routine intended for the Gromacs nonbonded kernels.
588 * It is used in the epilogue of the outer loop, where the variables will
589 * contain unrolled forces for one outer-loop-particle each, corresponding to
590 * a single coordinate (i.e, say, four x-coordinate force variables). These
591 * should be summed and added to the force array in memory. Since we always work
592 * with contiguous SIMD-layout , we can use efficient aligned loads/stores.
593 * When calculating the virial, we also need the total sum of all forces for
594 * each coordinate. This is provided as the return value. For routines that
595 * do not need these, this extra code will be optimized away completely if you
596 * just ignore the return value (Checked with gcc-4.9.1 and clang-3.6 for AVX).
598 static inline float gmx_simdcall reduceIncr4ReturnSum(float* m, SimdFloat v0, SimdFloat v1, SimdFloat v2, SimdFloat v3)
600 float sum[4]; // Note that the 4 here corresponds to the 4 m-elements, not any SIMD width
602 // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
603 assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
615 return sum[0] + sum[1] + sum[2] + sum[3];
621 * \name Higher-level SIMD utilities accessing partial (half-width) SIMD floats.
623 * These functions are optional. The are only useful for SIMD implementation
624 * where the width is 8 or larger, and where it would be inefficient
625 * to process 4*8, 8*8, or more, interactions in parallel.
627 * Currently, only Intel provides very wide SIMD implementations, but these
628 * also come with excellent support for loading, storing, accessing and
629 * shuffling parts of the register in so-called 'lanes' of 4 bytes each.
630 * We can use this to load separate parts into the low/high halves of the
631 * register in the inner loop of the nonbonded kernel, which e.g. makes it
632 * possible to process 4*4 nonbonded interactions as a pattern of 2*8. We
633 * can also use implementations with width 16 or greater.
635 * To make this more generic, when \ref GMX_SIMD_HAVE_HSIMD_UTIL_REAL is 1,
636 * the SIMD implementation provides seven special routines that:
638 * - Load the low/high parts of a SIMD variable from different pointers
639 * - Load half the SIMD width from one pointer, and duplicate in low/high parts
640 * - Load two reals, put 1st one in all low elements, and 2nd in all high ones.
641 * - Store the low/high parts of a SIMD variable to different pointers
642 * - Subtract both SIMD halves from a single half-SIMD-width memory location.
643 * - Load aligned pairs (LJ parameters) from two base pointers, with a common
644 * offset list, and put these in the low/high SIMD halves.
645 * - Reduce each half of two SIMD registers (i.e., 4 parts in total), increment
646 * four adjacent memory positions, and return the total sum.
648 * Remember: this is ONLY used when the native SIMD width is large. You will
649 * just waste time if you implement it for normal 16-byte SIMD architectures.
651 * This is part of the new C++ SIMD interface, so these functions are only
652 * available when using C++. Since some Gromacs code reliying on the SIMD
653 * module is still C (not C++), we have kept the C-style naming for now - this
654 * will change once we are entirely C++.
660 /*! \brief Load low & high parts of SIMD float from different locations.
662 * \param m0 Pointer to memory aligned to half SIMD width.
663 * \param m1 Pointer to memory aligned to half SIMD width.
665 * \return SIMD variable with low part loaded from m0, high from m1.
667 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
669 static inline SimdFloat gmx_simdcall loadDualHsimd(const float* m0, const float* m1)
673 // Make sure the memory pointers are aligned to half float SIMD width
674 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
675 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
677 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
679 a.simdInternal_[i] = m0[i];
680 a.simdInternal_[a.simdInternal_.size() / 2 + i] = m1[i];
685 /*! \brief Load half-SIMD-width float data, spread to both halves.
687 * \param m Pointer to memory aligned to half SIMD width.
689 * \return SIMD variable with both halves loaded from m..
691 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
693 static inline SimdFloat gmx_simdcall loadDuplicateHsimd(const float* m)
697 // Make sure the memory pointer is aligned
698 assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
700 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
702 a.simdInternal_[i] = m[i];
703 a.simdInternal_[a.simdInternal_.size() / 2 + i] = a.simdInternal_[i];
708 /*! \brief Load two floats, spread 1st in low half, 2nd in high half.
710 * \param m Pointer to two adjacent float values.
712 * \return SIMD variable where all elements in the low half have been set
713 * to m[0], and all elements in high half to m[1].
715 * \note This routine always loads two values and sets the halves separately.
716 * If you want to set all elements to the same value, simply use
717 * the standard (non-half-SIMD) operations.
719 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
721 static inline SimdFloat gmx_simdcall loadU1DualHsimd(const float* m)
725 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
727 a.simdInternal_[i] = m[0];
728 a.simdInternal_[a.simdInternal_.size() / 2 + i] = m[1];
734 /*! \brief Store low & high parts of SIMD float to different locations.
736 * \param m0 Pointer to memory aligned to half SIMD width.
737 * \param m1 Pointer to memory aligned to half SIMD width.
738 * \param a SIMD variable. Low half should be stored to m0, high to m1.
740 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
742 static inline void gmx_simdcall storeDualHsimd(float* m0, float* m1, SimdFloat a)
744 // Make sure the memory pointers are aligned to half float SIMD width
745 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
746 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
748 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
750 m0[i] = a.simdInternal_[i];
751 m1[i] = a.simdInternal_[a.simdInternal_.size() / 2 + i];
755 /*! \brief Add each half of SIMD variable to separate memory adresses
757 * \param m0 Pointer to memory aligned to half SIMD width.
758 * \param m1 Pointer to memory aligned to half SIMD width.
759 * \param a SIMD variable. Lower half will be added to m0, upper half to m1.
761 * The memory must be aligned to half SIMD width.
763 * \note The updated m0 value is written before m1 is read from memory, so
764 * the result will be correct even if the memory regions overlap.
766 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
768 static inline void gmx_simdcall incrDualHsimd(float* m0, float* m1, SimdFloat a)
770 // Make sure the memory pointer is aligned to half float SIMD width
771 assert(std::size_t(m0) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
772 assert(std::size_t(m1) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
774 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
776 m0[i] += a.simdInternal_[i];
778 for (std::size_t i = 0; i < a.simdInternal_.size() / 2; i++)
780 m1[i] += a.simdInternal_[a.simdInternal_.size() / 2 + i];
784 /*! \brief Add the two halves of three SIMD floats, subtract the sum from
785 * three half-SIMD-width consecutive floats in memory.
787 * \param m half-width aligned memory, from which sum of the halves will be subtracted.
788 * \param a0 SIMD variable. Upper & lower halves will first be added.
789 * \param a1 SIMD variable. Upper & lower halves will second be added.
790 * \param a2 SIMD variable. Upper & lower halves will third be added.
792 * If the SIMD width is 8 and the vectors contain [a0 b0 c0 d0 e0 f0 g0 h0],
793 * [a1 b1 c1 d1 e1 f1 g1 g1] and [a2 b2 c2 d2 e2 f2 g2 h2], the
794 * memory will be modified to [m[0]-(a0+e0) m[1]-(b0+f0) m[2]-(c0+g0) m[3]-(d0+h0)
795 * m[4]-(a1+e1) m[5]-(b1+f1) m[6]-(c1+g1) m[7]-(d1+h1)
796 * m[8]-(a2+e2) m[9]-(b2+f2) m[10]-(c2+g2) m[11]-(d2+h2)].
798 * The memory must be aligned to half SIMD width.
800 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
802 static inline void gmx_simdcall decr3Hsimd(float* m, SimdFloat a0, SimdFloat a1, SimdFloat a2)
804 assert(std::size_t(m) % (GMX_SIMD_FLOAT_WIDTH / 2 * sizeof(float)) == 0);
805 for (std::size_t i = 0; i < a0.simdInternal_.size() / 2; i++)
807 m[i] -= a0.simdInternal_[i] + a0.simdInternal_[a0.simdInternal_.size() / 2 + i];
809 for (std::size_t i = 0; i < a1.simdInternal_.size() / 2; i++)
811 m[a1.simdInternal_.size() / 2 + i] -=
812 a1.simdInternal_[i] + a1.simdInternal_[a1.simdInternal_.size() / 2 + i];
814 for (std::size_t i = 0; i < a2.simdInternal_.size() / 2; i++)
816 m[a2.simdInternal_.size() + i] -=
817 a2.simdInternal_[i] + a2.simdInternal_[a2.simdInternal_.size() / 2 + i];
821 /*! \brief Load 2 consecutive floats from each of GMX_SIMD_FLOAT_WIDTH/2 offsets,
822 * transpose into SIMD float (low half from base0, high from base1).
824 * \tparam align Alignment of the storage, i.e. the distance
825 * (measured in elements, not bytes) between index points.
826 * When this is identical to the number of output components
827 * the data is packed without padding. This must be a
828 * multiple of the alignment to keep all data aligned.
829 * \param base0 Pointer to base of first aligned memory
830 * \param base1 Pointer to base of second aligned memory
831 * \param offset Offset to the start of each pair
832 * \param[out] v0 1st element in each pair, base0 in low and base1 in high half.
833 * \param[out] v1 2nd element in each pair, base0 in low and base1 in high half.
835 * The offset array should be of half the SIMD width length, so it corresponds
836 * to the half-SIMD-register operations. This also means it must be aligned
837 * to half the integer SIMD width (i.e., GMX_SIMD_FINT32_WIDTH/2).
839 * The floating-point memory locations must be aligned, but only to the smaller
840 * of two elements and the floating-point SIMD width.
842 * This routine is primarily designed to load nonbonded parameters in the
843 * kernels. It is the equivalent of the full-width routine
844 * gatherLoadTranspose(), but just
845 * as the other hsimd routines it will pick half-SIMD-width data from base0
846 * and put in the lower half, while the upper half comes from base1.
848 * For an example, assume the SIMD width is 8, align is 2, that
849 * base0 is [A0 A1 B0 B1 C0 C1 D0 D1 ...], and base1 [E0 E1 F0 F1 G0 G1 H0 H1...].
851 * Then we will get v0 as [A0 B0 C0 D0 E0 F0 G0 H0] and v1 as [A1 B1 C1 D1 E1 F1 G1 H1].
853 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
856 static inline void gmx_simdcall gatherLoadTransposeHsimd(const float* base0,
858 const std::int32_t offset[],
862 // Offset list must be aligned for half SIMD FINT32 width
863 assert(std::size_t(offset) % (GMX_SIMD_FINT32_WIDTH / 2 * sizeof(std::int32_t)) == 0);
864 // base pointers must be aligned to the smaller of 2 elements and float SIMD width
865 assert(std::size_t(base0) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
866 assert(std::size_t(base1) % (std::min(GMX_SIMD_FLOAT_WIDTH, 2) * sizeof(float)) == 0);
867 // alignment parameter must be also be multiple of the above required alignment
868 assert(align % std::min(GMX_SIMD_FLOAT_WIDTH, 2) == 0);
870 for (std::size_t i = 0; i < v0->simdInternal_.size() / 2; i++)
872 v0->simdInternal_[i] = base0[align * offset[i]];
873 v1->simdInternal_[i] = base0[align * offset[i] + 1];
874 v0->simdInternal_[v0->simdInternal_.size() / 2 + i] = base1[align * offset[i]];
875 v1->simdInternal_[v1->simdInternal_.size() / 2 + i] = base1[align * offset[i] + 1];
879 /*! \brief Reduce the 4 half-SIMD-with floats in 2 SIMD variables (sum halves),
880 * increment four consecutive floats in memory, return sum.
882 * \param m Pointer to memory where the four values should be incremented
883 * \param v0 Variable whose half-SIMD sums should be added to m[0]/m[1], respectively.
884 * \param v1 Variable whose half-SIMD sums should be added to m[2]/m[3], respectively.
886 * \return Sum of all elements in the four SIMD variables.
888 * The pointer m must be aligned, but only to the smaller
889 * of four elements and the floating-point SIMD width.
891 * \note This is the half-SIMD-width version of
892 * reduceIncr4ReturnSum(). The only difference is that the
893 * four half-SIMD inputs needed are present in the low/high halves of the
894 * two SIMD arguments.
896 * Available if \ref GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT is 1.
898 static inline float gmx_simdcall reduceIncr4ReturnSumHsimd(float* m, SimdFloat v0, SimdFloat v1)
900 // The 4 here corresponds to the 4 elements in memory, not any SIMD width
901 float sum[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
903 for (std::size_t i = 0; i < v0.simdInternal_.size() / 2; i++)
905 sum[0] += v0.simdInternal_[i];
906 sum[1] += v0.simdInternal_[v0.simdInternal_.size() / 2 + i];
907 sum[2] += v1.simdInternal_[i];
908 sum[3] += v1.simdInternal_[v1.simdInternal_.size() / 2 + i];
911 // Make sure the memory pointer is aligned to the smaller of 4 elements and float SIMD width
912 assert(std::size_t(m) % (std::min(GMX_SIMD_FLOAT_WIDTH, 4) * sizeof(float)) == 0);
919 return sum[0] + sum[1] + sum[2] + sum[3];
922 #if GMX_SIMD_FLOAT_WIDTH > 8 || defined DOXYGEN
923 /*! \brief Load N floats and duplicate them 4 times each.
925 * \param m Pointer to unaligned memory
927 * \return SIMD variable with N floats from m duplicated 4x.
929 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
930 * N is GMX_SIMD_FLOAT_WIDTH/4. Duplicated values are
931 * contigous and different values are 4 positions in SIMD
934 static inline SimdFloat gmx_simdcall loadUNDuplicate4(const float* m)
937 for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
939 a.simdInternal_[i * 4] = m[i];
940 a.simdInternal_[i * 4 + 1] = m[i];
941 a.simdInternal_[i * 4 + 2] = m[i];
942 a.simdInternal_[i * 4 + 3] = m[i];
947 /*! \brief Load 4 floats and duplicate them N times each.
949 * \param m Pointer to memory aligned to 4 floats
951 * \return SIMD variable with 4 floats from m duplicated Nx.
953 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
954 * N is GMX_SIMD_FLOAT_WIDTH/4. Different values are
955 * contigous and same values are 4 positions in SIMD
958 static inline SimdFloat gmx_simdcall load4DuplicateN(const float* m)
961 for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
963 a.simdInternal_[i * 4] = m[0];
964 a.simdInternal_[i * 4 + 1] = m[1];
965 a.simdInternal_[i * 4 + 2] = m[2];
966 a.simdInternal_[i * 4 + 3] = m[3];
972 #if GMX_SIMD_FLOAT_WIDTH >= 8 || defined DOXYGEN
973 /*! \brief Load floats in blocks of 4 at fixed offsets
975 * \param m Pointer to unaligned memory
976 * \param offset Offset in memory between input blocks of 4
978 * \return SIMD variable with floats from m.
980 * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
981 * Blocks of 4 floats are loaded from m+n*offset where n
982 * is the n-th block of 4 floats.
984 static inline SimdFloat gmx_simdcall loadU4NOffset(const float* m, int offset)
987 for (std::size_t i = 0; i < a.simdInternal_.size() / 4; i++)
989 a.simdInternal_[i * 4] = m[offset * i + 0];
990 a.simdInternal_[i * 4 + 1] = m[offset * i + 1];
991 a.simdInternal_[i * 4 + 2] = m[offset * i + 2];
992 a.simdInternal_[i * 4 + 3] = m[offset * i + 3];
1005 #endif // GMX_SIMD_IMPL_REFERENCE_UTIL_FLOAT_H