2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "gromacs/simd/simd.h"
40 #include "gromacs/utility/alignedallocator.h"
41 #include "gromacs/utility/basedefinitions.h"
43 #include "testutils/testasserts.h"
55 /*! \addtogroup module_simd */
58 #if GMX_SIMD_HAVE_REAL
60 /*! \brief Test fixture for higher-level floating-point utility functions.
62 * Inherit from main SimdTest, add code to generate aligned memory and data.
64 class SimdFloatingpointUtilTest : public SimdTest
// Constructor: sizes the two aligned vectors, then carves realMemory_ into
// val0_..val3_ (GMX_SIMD_REAL_WIDTH reals each) followed by the two work
// buffers mem0_ and mem1_ (s_workMemSize_ reals each).
67 SimdFloatingpointUtilTest()
69 // Resize vectors to get the amount of memory we need
70 integerMemory_.resize(GMX_SIMD_REAL_WIDTH);
72 // The total memory we allocate corresponds to two work arrays
73 // and 4 values each of GMX_SIMD_REAL_WIDTH.
74 realMemory_.resize(2 * s_workMemSize_ + 4 * GMX_SIMD_REAL_WIDTH);
// All pointers below alias storage owned by the two vectors above, so they
// stay valid only as long as those vectors are not resized again.
76 offset_ = integerMemory_.data();
77 val0_ = realMemory_.data();
78 val1_ = val0_ + GMX_SIMD_REAL_WIDTH;
79 val2_ = val1_ + GMX_SIMD_REAL_WIDTH;
80 val3_ = val2_ + GMX_SIMD_REAL_WIDTH;
81 mem0_ = val3_ + GMX_SIMD_REAL_WIDTH;
82 mem1_ = mem0_ + s_workMemSize_;
84 // Set default values for offset and variables val0_ through val3_
85 // We cannot fill mem_ here since those values depend on the test.
86 for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
88 // Use every third point to avoid a contiguous access pattern
90 // Multiplying numbers by 1+100*GMX_REAL_EPS ensures some low bits are
91 // set too, so the tests make sure we read all bits correctly.
92 val0_[i] = (i) * (1.0 + 100 * GMX_REAL_EPS);
93 val1_[i] = (i + 0.1) * (1.0 + 100 * GMX_REAL_EPS);
94 val2_[i] = (i + 0.2) * (1.0 + 100 * GMX_REAL_EPS);
95 val3_[i] = (i + 0.3) * (1.0 + 100 * GMX_REAL_EPS);
100 //! \brief Size of memory work buffers
102 // To have a somewhat odd access pattern, we use every
103 // third entry, so the largest value of offset_[i] is 3*GMX_SIMD_REAL_WIDTH.
104 // Then we also allow alignments up to 16, which means the largest index in mem0_[]
105 // that we might access is 16*3*GMX_SIMD_REAL_WIDTH+3.
106 static const std::size_t s_workMemSize_ = 16 * 3 * GMX_SIMD_REAL_WIDTH + 4;
108 std::vector<int, AlignedAllocator<int>> integerMemory_; //!< Aligned integer memory
109 std::vector<real, AlignedAllocator<real>> realMemory_; //!< Aligned real memory
111 int* offset_; //!< Pointer to offset indices, aligned memory
112 real* val0_; //!< Pointer to GMX_SIMD_REAL_WIDTH values, aligned
113 real* val1_; //!< Pointer to GMX_SIMD_REAL_WIDTH values, aligned
114 real* val2_; //!< Pointer to GMX_SIMD_REAL_WIDTH values, aligned
115 real* val3_; //!< Pointer to GMX_SIMD_REAL_WIDTH values, aligned
117 real* mem0_; //!< Pointer to aligned memory, s_workMemSize_ real values
118 real* mem1_; //!< Pointer to aligned memory, s_workMemSize_ real values
// For each alignment in {4, 8, 12}: scatter val0_..val3_ into mem0_ at
// align * offset_[j] + {0, 1, 2, 3}, then expect gatherLoadTranspose<align>()
// to return SIMD variables equal to plain loads of val0_..val3_.
122 TEST_F(SimdFloatingpointUtilTest, gatherLoadTranspose4)
124 SimdReal v0, v1, v2, v3;
125 SimdReal ref0, ref1, ref2, ref3;
126 const int nalign = 3;
127 int alignmentList[nalign] = { 4, 8, 12 };
130 for (i = 0; i < nalign; i++)
132 align = alignmentList[i];
// Place the four values for SIMD element j next to each other in mem0_
133 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
135 mem0_[align * offset_[j]] = val0_[j];
136 mem0_[align * offset_[j] + 1] = val1_[j];
137 mem0_[align * offset_[j] + 2] = val2_[j];
138 mem0_[align * offset_[j] + 3] = val3_[j];
// Reference results are the untransposed source arrays
141 ref0 = load<SimdReal>(val0_);
142 ref1 = load<SimdReal>(val1_);
143 ref2 = load<SimdReal>(val2_);
144 ref3 = load<SimdReal>(val3_);
// The alignment is a template parameter, so each value needs its own call
148 gatherLoadTranspose<4>(mem0_, offset_, &v0, &v1, &v2, &v3);
152 gatherLoadTranspose<8>(mem0_, offset_, &v0, &v1, &v2, &v3);
154 else if (align == 12)
156 gatherLoadTranspose<12>(mem0_, offset_, &v0, &v1, &v2, &v3);
163 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
164 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
165 GMX_EXPECT_SIMD_REAL_EQ(ref2, v2);
166 GMX_EXPECT_SIMD_REAL_EQ(ref3, v3);
// Two-variable version of the gather-load test: for each alignment in
// {2, 4, c_simdBestPairAlignment}, pairs (val0_[j], val1_[j]) are written to
// mem0_ at align * offset_[j] and read back with gatherLoadTranspose<align>().
170 TEST_F(SimdFloatingpointUtilTest, gatherLoadTranspose2)
174 const int nalign = 3;
175 int alignmentList[nalign] = { 2, 4, c_simdBestPairAlignment };
// Checked up front; a larger pair alignment is not expected by this test
178 EXPECT_TRUE(c_simdBestPairAlignment <= GMX_SIMD_REAL_WIDTH);
180 for (i = 0; i < nalign; i++)
182 align = alignmentList[i];
183 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
185 mem0_[align * offset_[j]] = val0_[j];
186 mem0_[align * offset_[j] + 1] = val1_[j];
// Reference results are the untransposed source arrays
189 ref0 = load<SimdReal>(val0_);
190 ref1 = load<SimdReal>(val1_);
// The alignment is a template parameter, so each value needs its own call
194 gatherLoadTranspose<2>(mem0_, offset_, &v0, &v1);
198 gatherLoadTranspose<4>(mem0_, offset_, &v0, &v1);
200 else if (align == c_simdBestPairAlignment)
202 gatherLoadTranspose<c_simdBestPairAlignment>(mem0_, offset_, &v0, &v1);
209 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
210 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
// For each alignment in {3, 4}: triplets (val0_[j], val1_[j], val2_[j]) are
// written to mem0_ at align * offset_[j], and gatherLoadUTranspose<align>()
// is expected to return SIMD variables equal to plain loads of val0_..val2_.
214 TEST_F(SimdFloatingpointUtilTest, gatherLoadUTranspose3)
217 SimdReal ref0, ref1, ref2;
218 const int nalign = 2;
219 int alignmentList[nalign] = { 3, 4 };
222 for (i = 0; i < nalign; i++)
224 align = alignmentList[i];
225 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
227 mem0_[align * offset_[j]] = val0_[j];
228 mem0_[align * offset_[j] + 1] = val1_[j];
229 mem0_[align * offset_[j] + 2] = val2_[j];
// Reference results are the untransposed source arrays
232 ref0 = load<SimdReal>(val0_);
233 ref1 = load<SimdReal>(val1_);
234 ref2 = load<SimdReal>(val2_);
// The alignment is a template parameter, so each value needs its own call
238 gatherLoadUTranspose<3>(mem0_, offset_, &v0, &v1, &v2);
242 gatherLoadUTranspose<4>(mem0_, offset_, &v0, &v1, &v2);
249 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
250 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
251 GMX_EXPECT_SIMD_REAL_EQ(ref2, v2);
// For each alignment in {3, 4}: mem0_ and a local reference array start from
// the same background values; the reference gets val0_..val2_ written at
// align * offset_[j] + {0, 1, 2} with scalar code, mem0_ gets them through
// transposeScatterStoreU<align>(), and all s_workMemSize_ entries are compared.
255 TEST_F(SimdFloatingpointUtilTest, transposeScatterStoreU3)
258 real refmem[s_workMemSize_];
259 const int nalign = 2;
260 int alignmentList[nalign] = { 3, 4 };
262 FloatingPointTolerance tolerance(defaultRealTolerance());
264 for (i = 0; i < nalign; i++)
266 align = alignmentList[i];
268 // Set test and reference memory to background value
269 for (std::size_t j = 0; j < s_workMemSize_; j++)
271 // Multiply by 1+100*eps to make sure low bits are also used
272 mem0_[j] = refmem[j] = (1000.0 + j) * (1.0 + 100 * GMX_REAL_EPS);
275 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
277 // set values in _reference_ memory (we will then test with mem0_, and compare)
278 refmem[align * offset_[j]] = val0_[j];
279 refmem[align * offset_[j] + 1] = val1_[j];
280 refmem[align * offset_[j] + 2] = val2_[j];
283 v0 = load<SimdReal>(val0_);
284 v1 = load<SimdReal>(val1_);
285 v2 = load<SimdReal>(val2_);
// The alignment is a template parameter, so each value needs its own call
289 transposeScatterStoreU<3>(mem0_, offset_, v0, v1, v2);
293 transposeScatterStoreU<4>(mem0_, offset_, v0, v1, v2);
// Compare the whole buffer, not only the entries that were meant to change
300 for (std::size_t j = 0; j < s_workMemSize_; j++)
302 EXPECT_REAL_EQ_TOL(refmem[j], mem0_[j], tolerance);
// For each alignment in {3, 4}: mem0_ and a local reference array start from
// the same background values; val0_..val2_ are added to the reference at
// align * offset_[j] + {0, 1, 2} with scalar code and to mem0_ through
// transposeScatterIncrU<align>(), then all s_workMemSize_ entries are compared.
307 TEST_F(SimdFloatingpointUtilTest, transposeScatterIncrU3)
310 real refmem[s_workMemSize_];
311 const int nalign = 2;
312 int alignmentList[nalign] = { 3, 4 };
314 FloatingPointTolerance tolerance(defaultRealTolerance());
316 for (i = 0; i < nalign; i++)
318 align = alignmentList[i];
320 // Set test and reference memory to background value
321 for (std::size_t j = 0; j < s_workMemSize_; j++)
323 // Multiply by 1+100*eps to make sure low bits are also used
324 mem0_[j] = refmem[j] = (1000.0 + j) * (1.0 + 100 * GMX_REAL_EPS);
327 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
329 // Add values to _reference_ memory (we will then test with mem0_, and compare)
330 refmem[align * offset_[j]] += val0_[j];
331 refmem[align * offset_[j] + 1] += val1_[j];
332 refmem[align * offset_[j] + 2] += val2_[j];
335 v0 = load<SimdReal>(val0_);
336 v1 = load<SimdReal>(val1_);
337 v2 = load<SimdReal>(val2_);
// The alignment is a template parameter, so each value needs its own call
341 transposeScatterIncrU<3>(mem0_, offset_, v0, v1, v2);
345 transposeScatterIncrU<4>(mem0_, offset_, v0, v1, v2);
// Compare the whole buffer, not only the entries that were meant to change
352 for (std::size_t j = 0; j < s_workMemSize_; j++)
354 EXPECT_REAL_EQ_TOL(refmem[j], mem0_[j], tolerance);
// Variant of the increment test with overlapping destinations: as described by
// the inline comment below, offset_ is altered so that all SIMD elements target
// the same triplet. The scalar reference accumulates every contribution, and
// transposeScatterIncrU<3>() on mem0_ is expected to match it.
359 TEST_F(SimdFloatingpointUtilTest, transposeScatterIncrU3Overlapping)
362 real refmem[s_workMemSize_];
363 FloatingPointTolerance tolerance(defaultRealTolerance());
365 // Alter offset_ to make all entries point to the same (first) value, so all entries will overlap
366 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
371 // Set test and reference memory to background value
372 for (std::size_t j = 0; j < s_workMemSize_; j++)
374 // Multiply by 1+100*eps to make sure low bits are also used
375 mem0_[j] = refmem[j] = (1000.0 + j) * (1.0 + 100 * GMX_REAL_EPS);
378 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
380 // Add values to _reference_ memory (we will then test with mem0_, and compare)
381 refmem[3 * offset_[j]] += val0_[j];
382 refmem[3 * offset_[j] + 1] += val1_[j];
383 refmem[3 * offset_[j] + 2] += val2_[j];
386 v0 = load<SimdReal>(val0_);
387 v1 = load<SimdReal>(val1_);
388 v2 = load<SimdReal>(val2_);
390 transposeScatterIncrU<3>(mem0_, offset_, v0, v1, v2);
// Compare the whole buffer, not only the entries that were meant to change
392 for (std::size_t j = 0; j < s_workMemSize_; j++)
394 EXPECT_REAL_EQ_TOL(refmem[j], mem0_[j], tolerance);
// For each alignment in {3, 4}: mem0_ and a local reference array start from
// the same background values; val0_..val2_ are subtracted from the reference at
// align * offset_[j] + {0, 1, 2} with scalar code and from mem0_ through
// transposeScatterDecrU<align>(), then all s_workMemSize_ entries are compared.
398 TEST_F(SimdFloatingpointUtilTest, transposeScatterDecrU3)
401 real refmem[s_workMemSize_];
402 const int nalign = 2;
403 int alignmentList[nalign] = { 3, 4 };
405 FloatingPointTolerance tolerance(defaultRealTolerance());
407 for (i = 0; i < nalign; i++)
409 align = alignmentList[i];
411 // Set test and reference memory to background value
412 for (std::size_t j = 0; j < s_workMemSize_; j++)
414 // Multiply by 1+100*eps to make sure low bits are also used
415 mem0_[j] = refmem[j] = (1000.0 + j) * (1.0 + 100 * GMX_REAL_EPS);
418 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
420 // Subtract values from _reference_ memory (we will then test with mem0_, and compare)
421 refmem[align * offset_[j]] -= val0_[j];
422 refmem[align * offset_[j] + 1] -= val1_[j];
423 refmem[align * offset_[j] + 2] -= val2_[j];
426 v0 = load<SimdReal>(val0_);
427 v1 = load<SimdReal>(val1_);
428 v2 = load<SimdReal>(val2_);
// The alignment is a template parameter, so each value needs its own call
432 transposeScatterDecrU<3>(mem0_, offset_, v0, v1, v2);
436 transposeScatterDecrU<4>(mem0_, offset_, v0, v1, v2);
// Compare the whole buffer, not only the entries that were meant to change
443 for (std::size_t j = 0; j < s_workMemSize_; j++)
445 EXPECT_REAL_EQ_TOL(refmem[j], mem0_[j], tolerance);
// Variant of the decrement test with overlapping destinations: as described by
// the inline comment below, offset_ is altered so that all SIMD elements target
// the same triplet. The scalar reference subtracts every contribution, and
// transposeScatterDecrU<3>() on mem0_ is expected to match it.
450 TEST_F(SimdFloatingpointUtilTest, transposeScatterDecrU3Overlapping)
453 real refmem[s_workMemSize_];
454 FloatingPointTolerance tolerance(defaultRealTolerance());
456 // Alter offset_ to make all entries point to the same (first) value, so all entries will overlap
457 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
462 // Set test and reference memory to background value
463 for (std::size_t j = 0; j < s_workMemSize_; j++)
465 // Multiply by 1+100*eps to make sure low bits are also used
466 mem0_[j] = refmem[j] = (1000.0 + j) * (1.0 + 100 * GMX_REAL_EPS);
// Compiler-specific workaround guarding the scalar reference loop that follows
469 # if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2021) // Bug in (at least) 19u1 and 18u5 (03424712)
472 for (std::size_t j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
474 // Subtract values from _reference_ memory (we will then test with mem0_, and compare)
475 refmem[3 * offset_[j]] -= val0_[j];
476 refmem[3 * offset_[j] + 1] -= val1_[j];
477 refmem[3 * offset_[j] + 2] -= val2_[j];
480 v0 = load<SimdReal>(val0_);
481 v1 = load<SimdReal>(val1_);
482 v2 = load<SimdReal>(val2_);
484 transposeScatterDecrU<3>(mem0_, offset_, v0, v1, v2);
// Compare the whole buffer, not only the entries that were meant to change
486 for (std::size_t j = 0; j < s_workMemSize_; j++)
488 EXPECT_REAL_EQ_TOL(refmem[j], mem0_[j], tolerance);
// Feeds a SIMD variable loaded from mem0_ through expandScalarsToTriplets()
// and checks val0_, val1_ and val2_ against (i + k * GMX_SIMD_REAL_WIDTH) / 3
// for k = 0, 1, 2.
// NOTE(review): presumably v0..v2 are stored to val0_..val2_ before the
// comparison loop - confirm against the full test body.
492 TEST_F(SimdFloatingpointUtilTest, expandScalarsToTriplets)
494 SimdReal vs, v0, v1, v2;
497 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
502 vs = load<SimdReal>(mem0_);
504 expandScalarsToTriplets(vs, &v0, &v1, &v2);
510 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
512 EXPECT_EQ(i / 3, val0_[i]);
513 EXPECT_EQ((i + GMX_SIMD_REAL_WIDTH) / 3, val1_[i]);
514 EXPECT_EQ((i + 2 * GMX_SIMD_REAL_WIDTH) / 3, val2_[i]);
// Same scheme as the gatherLoadTranspose test with four variables, but the
// offsets are passed as a SimdInt32 loaded from offset_ instead of a pointer.
// Alignments tested: {4, 8, 12}.
519 TEST_F(SimdFloatingpointUtilTest, gatherLoadBySimdIntTranspose4)
521 SimdReal v0, v1, v2, v3;
522 SimdReal ref0, ref1, ref2, ref3;
523 SimdInt32 simdoffset;
524 const int nalign = 3;
525 int alignmentList[nalign] = { 4, 8, 12 };
528 for (i = 0; i < nalign; i++)
530 align = alignmentList[i];
// Place the four values for SIMD element j next to each other in mem0_
531 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
533 mem0_[align * offset_[j]] = val0_[j];
534 mem0_[align * offset_[j] + 1] = val1_[j];
535 mem0_[align * offset_[j] + 2] = val2_[j];
536 mem0_[align * offset_[j] + 3] = val3_[j];
// Offsets go into a SIMD integer; references are the untransposed arrays
539 simdoffset = load<SimdInt32>(offset_);
540 ref0 = load<SimdReal>(val0_);
541 ref1 = load<SimdReal>(val1_);
542 ref2 = load<SimdReal>(val2_);
543 ref3 = load<SimdReal>(val3_);
// The alignment is a template parameter, so each value needs its own call
547 gatherLoadBySimdIntTranspose<4>(mem0_, simdoffset, &v0, &v1, &v2, &v3);
551 gatherLoadBySimdIntTranspose<8>(mem0_, simdoffset, &v0, &v1, &v2, &v3);
553 else if (align == 12)
555 gatherLoadBySimdIntTranspose<12>(mem0_, simdoffset, &v0, &v1, &v2, &v3);
562 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
563 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
564 GMX_EXPECT_SIMD_REAL_EQ(ref2, v2);
565 GMX_EXPECT_SIMD_REAL_EQ(ref3, v3);
// Two-variable version of the SimdInt32-offset gather-load test: pairs
// (val0_[j], val1_[j]) are written to mem0_ at align * offset_[j] for each
// alignment in {4, 8, 12} and read back with gatherLoadBySimdIntTranspose<align>().
570 TEST_F(SimdFloatingpointUtilTest, gatherLoadBySimdIntTranspose2)
574 SimdInt32 simdoffset;
575 const int nalign = 3;
576 int alignmentList[nalign] = { 4, 8, 12 };
579 for (i = 0; i < nalign; i++)
581 align = alignmentList[i];
582 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
584 mem0_[align * offset_[j]] = val0_[j];
585 mem0_[align * offset_[j] + 1] = val1_[j];
// Offsets go into a SIMD integer; references are the untransposed arrays
588 simdoffset = load<SimdInt32>(offset_);
589 ref0 = load<SimdReal>(val0_);
590 ref1 = load<SimdReal>(val1_);
// The alignment is a template parameter, so each value needs its own call
594 gatherLoadBySimdIntTranspose<4>(mem0_, simdoffset, &v0, &v1);
598 gatherLoadBySimdIntTranspose<8>(mem0_, simdoffset, &v0, &v1);
600 else if (align == 12)
602 gatherLoadBySimdIntTranspose<12>(mem0_, simdoffset, &v0, &v1);
609 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
610 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
614 # if GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_REAL
// Unaligned variant of the two-variable SimdInt32-offset gather-load test,
// exercised with the odd alignments {1, 3, 5}: pairs (val0_[j], val1_[j]) are
// written to mem0_ at align * offset_[j] and read back with
// gatherLoadUBySimdIntTranspose<align>().
615 TEST_F(SimdFloatingpointUtilTest, gatherLoadUBySimdIntTranspose2)
619 SimdInt32 simdoffset;
620 const int nalign = 3;
621 int alignmentList[nalign] = { 1, 3, 5 };
624 for (i = 0; i < nalign; i++)
626 align = alignmentList[i];
627 for (j = 0; j < GMX_SIMD_REAL_WIDTH; j++)
629 mem0_[align * offset_[j]] = val0_[j];
630 mem0_[align * offset_[j] + 1] = val1_[j];
// Offsets go into a SIMD integer; references are the untransposed arrays
633 simdoffset = load<SimdInt32>(offset_);
634 ref0 = load<SimdReal>(val0_);
635 ref1 = load<SimdReal>(val1_);
// The alignment is a template parameter, so each value needs its own call
639 gatherLoadUBySimdIntTranspose<1>(mem0_, simdoffset, &v0, &v1);
643 gatherLoadUBySimdIntTranspose<3>(mem0_, simdoffset, &v0, &v1);
647 gatherLoadUBySimdIntTranspose<5>(mem0_, simdoffset, &v0, &v1);
654 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
655 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
658 # endif // GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_REAL
// Checks reduceIncr4ReturnSum(): after the call mem0_[k] is expected to hold
// c<k> + sum<k> for k = 0..3, and the return value is expected to equal
// sum0 + sum1 + sum2 + sum3, all within the default real tolerance.
660 TEST_F(SimdFloatingpointUtilTest, reduceIncr4Sum)
663 SimdReal v0, v1, v2, v3;
664 real sum0, sum1, sum2, sum3, tstsum;
665 FloatingPointTolerance tolerance(defaultRealTolerance());
667 v0 = load<SimdReal>(val0_);
668 v1 = load<SimdReal>(val1_);
669 v2 = load<SimdReal>(val2_);
670 v3 = load<SimdReal>(val3_);
// Scalar reference sums, accumulated over the SIMD width
672 sum0 = sum1 = sum2 = sum3 = 0;
673 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
681 // Just put some numbers in memory so we check the addition is correct
687 tstsum = reduceIncr4ReturnSum(mem0_, v0, v1, v2, v3);
689 EXPECT_REAL_EQ_TOL(c0 + sum0, mem0_[0], tolerance);
690 EXPECT_REAL_EQ_TOL(c1 + sum1, mem0_[1], tolerance);
691 EXPECT_REAL_EQ_TOL(c2 + sum2, mem0_[2], tolerance);
692 EXPECT_REAL_EQ_TOL(c3 + sum3, mem0_[3], tolerance);
694 EXPECT_REAL_EQ_TOL(sum0 + sum1 + sum2 + sum3, tstsum, tolerance);
697 # if GMX_SIMD_HAVE_HSIMD_UTIL_REAL
// Expects loadDualHsimd(val0_, p), with p pointing at the upper half of val0_,
// to produce the same SIMD variable as a plain full-width load of val0_.
699 TEST_F(SimdFloatingpointUtilTest, loadDualHsimd)
703 // Point p to the upper half of val0_
704 real* p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
706 v0 = load<SimdReal>(val0_);
707 v1 = loadDualHsimd(val0_, p);
709 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
// Expects loadDuplicateHsimd(val0_) to equal a full-width load of val0_ once
// val0_ has been prepared so that its upper half is identical to its lower
// half (see the inline comment on the copy loop).
712 TEST_F(SimdFloatingpointUtilTest, loadDuplicateHsimd)
716 // Point p to the upper half of val0_
717 real* p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
718 // Copy data so upper half is identical to lower
719 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
724 v0 = load<SimdReal>(val0_);
725 v1 = loadDuplicateHsimd(val0_);
727 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
// Expects loadU1DualHsimd(data), with data = {1, 2}, to equal a full-width
// load of val0_ once val0_ has been prepared as described by the inline
// comment on the fill loop.
731 TEST_F(SimdFloatingpointUtilTest, loadU1DualHsimd)
735 real data[2] = { 1, 2 };
737 // Point p to the upper half of val0_
738 real* p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
739 // Set all low elements to data[0], and high to data[1]
740 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
746 v0 = load<SimdReal>(val0_);
747 v1 = loadU1DualHsimd(data);
749 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
// Loads val2_ into a SIMD variable, writes it with storeDualHsimd() to val0_
// and to p (the upper half of val0_), and expects val0_ to then equal val2_
// exactly for every element.
753 TEST_F(SimdFloatingpointUtilTest, storeDualHsimd)
758 // Point p to the upper half of val0_
759 real* p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
761 v0 = load<SimdReal>(val2_);
762 storeDualHsimd(val0_, p, v0);
764 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
766 EXPECT_EQ(val2_[i], val0_[i]);
// Expects incrDualHsimd(val0_, p, v0), with v0 loaded from val2_ and p pointing
// at the upper half of val0_, to leave val0_[i] + val2_[i] in every element of
// val0_. The reference is computed before the call because val0_ is modified.
770 TEST_F(SimdFloatingpointUtilTest, incrDualHsimd)
772 real reference[GMX_SIMD_REAL_WIDTH];
775 // Create reference values
776 for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
778 reference[i] = val0_[i] + val2_[i];
781 // Point p to the upper half of val0_
782 real* p = val0_ + GMX_SIMD_REAL_WIDTH / 2;
784 v0 = load<SimdReal>(val2_);
785 incrDualHsimd(val0_, p, v0);
// Exact comparison: the reference performs the same single addition
787 for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
789 EXPECT_EQ(reference[i], val0_[i]);
// Overlapping variant: both destination pointers passed to incrDualHsimd() are
// val0_, so the first GMX_SIMD_REAL_WIDTH / 2 elements of val0_ are expected
// to receive the contributions from both halves of val2_.
793 TEST_F(SimdFloatingpointUtilTest, incrDualHsimdOverlapping)
795 real reference[GMX_SIMD_REAL_WIDTH / 2];
798 // Create reference values
799 for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
801 reference[i] = val0_[i] + val2_[i] + val2_[GMX_SIMD_REAL_WIDTH / 2 + i];
804 v0 = load<SimdReal>(val2_);
805 incrDualHsimd(val0_, val0_, v0);
// Only the first half of val0_ is checked
807 for (std::size_t i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
809 EXPECT_EQ(reference[i], val0_[i]);
// Checks decr3Hsimd(val0_, v0, v1, v2) with v0..v2 loaded from val1_..val3_.
// The reference has three consecutive segments of GMX_SIMD_REAL_WIDTH / 2
// entries; in segment k each entry is val0_[i] minus the sum of the matching
// lower-half and upper-half elements of the k-th source array.
813 TEST_F(SimdFloatingpointUtilTest, decr3Hsimd)
816 real ref[3 * GMX_SIMD_REAL_WIDTH / 2];
818 FloatingPointTolerance tolerance(defaultRealTolerance());
820 // Point p to the upper half of val1_
821 real* p = val1_ + GMX_SIMD_REAL_WIDTH / 2;
822 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
824 ref[i] = val0_[i] - (val1_[i] + p[i]);
// Point p to the upper half of val2_. The loop does not reset i, so it keeps
// indexing ref[] and val0_[] where the previous loop stopped.
826 p = val2_ + GMX_SIMD_REAL_WIDTH / 2;
827 for (j = 0; j < GMX_SIMD_REAL_WIDTH / 2; i++, j++)
829 ref[i] = val0_[i] - (val2_[j] + p[j]);
// Point p to the upper half of val3_. Here i reaches beyond GMX_SIMD_REAL_WIDTH,
// so val0_[i] reads the storage that follows val0_ in realMemory_.
831 p = val3_ + GMX_SIMD_REAL_WIDTH / 2;
832 for (j = 0; j < GMX_SIMD_REAL_WIDTH / 2; i++, j++)
834 ref[i] = val0_[i] - (val3_[j] + p[j]);
// Load the inputs before the call, since decr3Hsimd() modifies memory at val0_
837 v0 = load<SimdReal>(val1_);
838 v1 = load<SimdReal>(val2_);
839 v2 = load<SimdReal>(val3_);
840 decr3Hsimd(val0_, v0, v1, v2);
842 for (i = 0; i < 3 * GMX_SIMD_REAL_WIDTH / 2; i++)
844 EXPECT_REAL_EQ_TOL(ref[i], val0_[i], tolerance);
// Half-SIMD gather-load test with two base pointers: for each alignment in
// {2, 4, c_simdBestPairAlignment}, the lower halves of val0_/val1_ are written
// relative to mem0_ and the upper halves relative to mem1_, using the same
// offsets. gatherLoadTransposeHsimd<align>() is then expected to return
// variables equal to full-width loads of val0_ and val1_.
849 TEST_F(SimdFloatingpointUtilTest, gatherLoadTranspose2Hsimd)
854 const int nalign = 3;
855 int alignmentList[nalign] = { 2, 4, c_simdBestPairAlignment };
858 for (i = 0; i < nalign; i++)
860 align = alignmentList[i];
861 for (j = 0; j < GMX_SIMD_REAL_WIDTH / 2; j++)
863 // Use mem0_ as base for lower half
864 mem0_[align * offset_[j]] = val0_[j];
865 mem0_[align * offset_[j] + 1] = val1_[j];
866 // Use mem1_ as base for upper half
867 mem1_[align * offset_[j]] = val0_[GMX_SIMD_REAL_WIDTH / 2 + j];
868 mem1_[align * offset_[j] + 1] = val1_[GMX_SIMD_REAL_WIDTH / 2 + j];
// Reference results are the untransposed source arrays
871 ref0 = load<SimdReal>(val0_);
872 ref1 = load<SimdReal>(val1_);
// The alignment is a template parameter, so each value needs its own call
876 gatherLoadTransposeHsimd<2>(mem0_, mem1_, offset_, &v0, &v1);
880 gatherLoadTransposeHsimd<4>(mem0_, mem1_, offset_, &v0, &v1);
882 else if (align == c_simdBestPairAlignment)
884 gatherLoadTransposeHsimd<c_simdBestPairAlignment>(mem0_, mem1_, offset_, &v0, &v1);
891 GMX_EXPECT_SIMD_REAL_EQ(ref0, v0);
892 GMX_EXPECT_SIMD_REAL_EQ(ref1, v1);
// Half-SIMD version of the reduce-and-increment test: v0 and v1 are loaded
// from val0_ and val1_. After reduceIncr4ReturnSumHsimd(), mem0_[k] is expected
// to hold c<k> + sum<k> for k = 0..3, and the return value is expected to
// equal sum0 + sum1 + sum2 + sum3, within the default real tolerance.
897 TEST_F(SimdFloatingpointUtilTest, reduceIncr4SumHsimd)
901 real sum0, sum1, sum2, sum3, tstsum;
902 FloatingPointTolerance tolerance(defaultRealTolerance());
904 // Use the half-SIMD storage in memory val0_ and val1_.
905 v0 = load<SimdReal>(val0_);
906 v1 = load<SimdReal>(val1_);
// Scalar reference sums: sum1 and sum3 use the upper halves of val0_ and val1_
908 sum0 = sum1 = sum2 = sum3 = 0;
909 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 2; i++)
912 sum1 += val0_[GMX_SIMD_REAL_WIDTH / 2 + i];
914 sum3 += val1_[GMX_SIMD_REAL_WIDTH / 2 + i];
917 // Just put some numbers in memory so we check the addition is correct
923 tstsum = reduceIncr4ReturnSumHsimd(mem0_, v0, v1);
925 EXPECT_REAL_EQ_TOL(c0 + sum0, mem0_[0], tolerance);
926 EXPECT_REAL_EQ_TOL(c1 + sum1, mem0_[1], tolerance);
927 EXPECT_REAL_EQ_TOL(c2 + sum2, mem0_[2], tolerance);
928 EXPECT_REAL_EQ_TOL(c3 + sum3, mem0_[3], tolerance);
930 EXPECT_REAL_EQ_TOL(sum0 + sum1 + sum2 + sum3, tstsum, tolerance);
933 # endif // GMX_SIMD_HAVE_HSIMD_UTIL_REAL
935 // Test currently doesn't work for GMX_SIMD_REAL_WIDTH < 4. This should be fixed by having a GMX_EXPECT_SIMD_REAL_EQ that works for both Simd and Simd4
936 # if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL && GMX_SIMD_REAL_WIDTH >= 4
// data holds 1, 2, ... (GMX_SIMD_REAL_WIDTH / 4 values); val0_ is filled with
// each data[i] repeated four times in a row. loadUNDuplicate4(data) is expected
// to equal a Simd4NReal load of that expanded array.
938 TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
942 real data[GMX_SIMD_REAL_WIDTH / 4];
943 std::iota(data, data + GMX_SIMD_REAL_WIDTH / 4, 1);
945 # if defined __ICC && __ICC == 1800 || defined __ICL && __ICL == 1800
946 # pragma novector /* Work-around for incorrect vectorization for AVX_512(_KNL) */
948 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
950 val0_[i * 4] = val0_[i * 4 + 1] = val0_[i * 4 + 2] = val0_[i * 4 + 3] = data[i];
953 v0 = load<Simd4NReal>(val0_);
954 v1 = loadUNDuplicate4(data);
956 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
// val0_ is filled with the four values {1, 2, 3, 4} repeated
// GMX_SIMD_REAL_WIDTH / 4 times. load4DuplicateN(val0_) is expected to equal a
// Simd4NReal load of the same array.
959 TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
963 real data[4] = { 1, 2, 3, 4 };
965 for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
967 val0_[i * 4] = data[0];
968 val0_[i * 4 + 1] = data[1];
969 val0_[i * 4 + 2] = data[2];
970 val0_[i * 4 + 3] = data[3];
973 v0 = load<Simd4NReal>(val0_);
974 v1 = load4DuplicateN(val0_);
976 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
// data holds 1, 2, ..., dataLen. val0_ is filled with groups of four
// consecutive values of data, where group i starts at data[offset * i].
// loadU4NOffset(data, offset) is expected to equal a Simd4NReal load of val0_.
979 TEST_F(SimdFloatingpointUtilTest, loadU4NOffset)
981 constexpr int offset = 6; // non power of 2
// Four values for the last group plus `offset` values for each earlier group
982 constexpr int dataLen = 4 + offset * (GMX_SIMD_REAL_WIDTH / 4 - 1);
984 std::iota(data, data + dataLen, 1);
986 for (int i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
988 val0_[i * 4] = data[0 + offset * i];
989 val0_[i * 4 + 1] = data[1 + offset * i];
990 val0_[i * 4 + 2] = data[2 + offset * i];
991 val0_[i * 4 + 3] = data[3 + offset * i];
994 const Simd4NReal v0 = load<Simd4NReal>(val0_);
995 const Simd4NReal v1 = loadU4NOffset(data, offset);
997 GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
1000 # endif // GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
1002 #endif // GMX_SIMD_HAVE_REAL