# Making the right #includes should be the responsibility of the source files that use these.
# TODO: Stop using the preprocessor for meta-programming!
src/gromacs/ewald/pme-simd4.h: warning: should include "pme-simd.h"
-src/gromacs/ewald/pme-gather.cpp: warning: includes "simd.h" unnecessarily
src/gromacs/ewald/pme-spline-work.cpp: warning: includes "simd.h" unnecessarily
src/gromacs/ewald/pme-spline-work.h: warning: includes "simd.h" unnecessarily
src/gromacs/ewald/pme-spread.cpp: warning: includes "simd.h" unnecessarily
return f;
}
-#ifdef PME_SIMD4_UNALIGNED //TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
+//TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL && GMX_SIMD_REAL_WIDTH <= 16
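+/* Note: the cap at width 16 keeps N = GMX_SIMD_REAL_WIDTH/4 at most 4, so
+ * one register never spans more than the 4 y-entries of an order-4 spline. */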
/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
+ * Uses 4N SIMD, where N is GMX_SIMD_REAL_WIDTH/4, to operate on all 4 z-entries and N y-entries at once.
* This code does not assume any memory alignment for the grid.
*/
RVec
const real *const gmx_restrict dthy = spline->dtheta[YY] + norder;
const real *const gmx_restrict dthz = spline->dtheta[ZZ] + norder;
- Simd4Real fx_S = setZero();
- Simd4Real fy_S = setZero();
- Simd4Real fz_S = setZero();
+ SimdReal fx_S = setZero();
+ SimdReal fy_S = setZero();
+ SimdReal fz_S = setZero();
/* With order 4 the z-spline is actually aligned */
- const Simd4Real tz_S = load4(thz);
- const Simd4Real dz_S = load4(dthz);
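+    /* Load thz[0..3] once and repeat the 4 values for each of the N y-rows */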
+ const SimdReal tz_S = load4DuplicateN(thz);
+ const SimdReal dz_S = load4DuplicateN(dthz);
- for (int ithx = 0; (ithx < 4); ithx++)
+ for (int ithx = 0; ithx < 4; ithx++)
{
- const int index_x = (idxX + ithx)*gridNY*gridNZ;
- const Simd4Real tx_S = Simd4Real(thx[ithx]);
- const Simd4Real dx_S = Simd4Real(dthx[ithx]);
+ const int index_x = (idxX + ithx)*gridNY*gridNZ;
+ const SimdReal tx_S = SimdReal(thx[ithx]);
+ const SimdReal dx_S = SimdReal(dthx[ithx]);
- for (int ithy = 0; (ithy < 4); ithy++)
+ for (int ithy = 0; ithy < 4; ithy += GMX_SIMD_REAL_WIDTH/4)
{
- const int index_xy = index_x + (idxY + ithy)*gridNZ;
- const Simd4Real ty_S = Simd4Real(thy[ithy]);
- const Simd4Real dy_S = Simd4Real(dthy[ithy]);
+ const int index_xy = index_x + (idxY+ithy)*gridNZ;
+
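+            // Duplicate each of the N y-spline values across its 4 z-lanes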
+ const SimdReal ty_S = loadUNDuplicate4(thy +ithy);
+ const SimdReal dy_S = loadUNDuplicate4(dthy+ithy);
+
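+            // Gather the 4 z-entries of each of the N y-rows, gridNZ apart in memory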
+ const SimdReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
- const Simd4Real gval_S = load4U(grid + index_xy + idxZ);
- const Simd4Real fxy1_S = tz_S * gval_S;
- const Simd4Real fz1_S = dz_S * gval_S;
+ const SimdReal fxy1_S = tz_S * gval_S;
+ const SimdReal fz1_S = dz_S * gval_S;
fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
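
For reference, a scalar sketch of what one inner iteration accumulates (illustration only; fx/fy/fz stand for the horizontal sums of fx_S/fy_S/fz_S, and the fz_S update, not shown above, follows the same fma pattern):

    for (int j = 0; j < GMX_SIMD_REAL_WIDTH/4; j++)     // the N y-rows
    {
        for (int k = 0; k < 4; k++)                     // the 4 z-entries per row
        {
            const real g = grid[index_x + (idxY + ithy + j)*gridNZ + idxZ + k];
            fx += dthx[ithx] *  thy[ithy + j] *  thz[k] * g;
            fy +=  thx[ithx] * dthy[ithy + j] *  thz[k] * g;
            fz +=  thx[ithx] *  thy[ithy + j] * dthz[k] * g;
        }
    }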
static inline void loadOrderU(const real* data, std::integral_constant<int, order>,
int offset, Simd4Real* S0, Simd4Real* S1)
{
-#ifdef PME_SIMD4_UNALIGNED //TODO: Extract into helper function
+#ifdef PME_SIMD4_UNALIGNED
*S0 = load4U(data-offset);
*S1 = load4U(data-offset+4);
#else
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# define GMX_SIMD_DOUBLE_WIDTH 4
#endif
+#if GMX_SIMD_FLOAT_WIDTH >= 8 || defined DOXYGEN //set in simd.h for GMX_SIMD_FLOAT_WIDTH<=4
+//! \brief 1 if the float 4xN load utilities are present, otherwise 0.
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+#endif
+
+#if GMX_SIMD_DOUBLE_WIDTH >= 8 || defined DOXYGEN //set in simd.h for GMX_SIMD_DOUBLE_WIDTH<=4
+//! \brief 1 if the double 4xN load utilities are present, otherwise 0.
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
+#endif
+
//! \brief 1 if implementation provides \ref gmx::Simd4Float, otherwise 0.
#define GMX_SIMD4_HAVE_FLOAT 1
return sum[0] + sum[1] + sum[2] + sum[3];
}
+#if GMX_SIMD_DOUBLE_WIDTH > 8 || defined DOXYGEN // simd.h forwards these for widths 4 and 8
+/*! \brief Load N doubles and duplicate them 4 times each.
+ *
+ * \param m Pointer to unaligned memory
+ *
+ * \return SIMD variable with N doubles from m duplicated 4x.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
+ * N is GMX_SIMD_DOUBLE_WIDTH/4. The duplicates of each value
+ * are contiguous, and different values are 4 SIMD positions
+ * apart.
+ */
+static inline SimdDouble gmx_simdcall
+loadUNDuplicate4(const double* m)
+{
+ SimdDouble a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[i];
+ a.simdInternal_[i*4+1] = m[i];
+ a.simdInternal_[i*4+2] = m[i];
+ a.simdInternal_[i*4+3] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Load 4 doubles and duplicate them N times each.
+ *
+ * \param m Pointer to memory aligned to 4 doubles
+ *
+ * \return SIMD variable with 4 doubles from m duplicated Nx.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
+ * N is GMX_SIMD_DOUBLE_WIDTH/4. The 4 different values are
+ * contiguous, and the duplicates of each value are 4 SIMD
+ * positions apart.
+ */
+static inline SimdDouble gmx_simdcall
+load4DuplicateN(const double* m)
+{
+ SimdDouble a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[0];
+ a.simdInternal_[i*4+1] = m[1];
+ a.simdInternal_[i*4+2] = m[2];
+ a.simdInternal_[i*4+3] = m[3];
+ }
+ return a;
+}
+#endif
+
+#if GMX_SIMD_DOUBLE_WIDTH >= 8 || defined DOXYGEN // simd.h forwards this for width 4 only
+/*! \brief Load doubles in blocks of 4 at fixed offsets
+ *
+ * \param m Pointer to unaligned memory
+ * \param offset Offset in memory between input blocks of 4
+ *
+ * \return SIMD variable with doubles from m.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE is 1.
+ * The n-th block of 4 doubles is loaded from m+n*offset.
+ */
+static inline SimdDouble gmx_simdcall
+loadU4NOffset(const double* m, int offset)
+{
+ SimdDouble a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[offset*i + 0];
+ a.simdInternal_[i*4+1] = m[offset*i + 1];
+ a.simdInternal_[i*4+2] = m[offset*i + 2];
+ a.simdInternal_[i*4+3] = m[offset*i + 3];
+ }
+ return a;
+}
+#endif
+
+
/*! \} */
/*! \} */
return sum[0] + sum[1] + sum[2] + sum[3];
}
+#if GMX_SIMD_FLOAT_WIDTH > 8 || defined DOXYGEN // simd.h forwards these for widths 4 and 8
+/*! \brief Load N floats and duplicate them 4 times each.
+ *
+ * \param m Pointer to unaligned memory
+ *
+ * \return SIMD variable with N floats from m duplicated 4x.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
+ * N is GMX_SIMD_FLOAT_WIDTH/4. The duplicates of each value
+ * are contiguous, and different values are 4 SIMD positions
+ * apart.
+ */
+static inline SimdFloat gmx_simdcall
+loadUNDuplicate4(const float* m)
+{
+ SimdFloat a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[i];
+ a.simdInternal_[i*4+1] = m[i];
+ a.simdInternal_[i*4+2] = m[i];
+ a.simdInternal_[i*4+3] = m[i];
+ }
+ return a;
+}
+
+/*! \brief Load 4 floats and duplicate them N times each.
+ *
+ * \param m Pointer to memory aligned to 4 floats
+ *
+ * \return SIMD variable with 4 floats from m duplicated Nx.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
+ * N is GMX_SIMD_FLOAT_WIDTH/4. The 4 different values are
+ * contiguous, and the duplicates of each value are 4 SIMD
+ * positions apart.
+ */
+static inline SimdFloat gmx_simdcall
+load4DuplicateN(const float* m)
+{
+ SimdFloat a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[0];
+ a.simdInternal_[i*4+1] = m[1];
+ a.simdInternal_[i*4+2] = m[2];
+ a.simdInternal_[i*4+3] = m[3];
+ }
+ return a;
+}
+#endif
+
+#if GMX_SIMD_FLOAT_WIDTH >= 8 || defined DOXYGEN // simd.h forwards this for width 4 only
+/*! \brief Load floats in blocks of 4 at fixed offsets
+ *
+ * \param m Pointer to unaligned memory
+ * \param offset Offset in memory between input blocks of 4
+ *
+ * \return SIMD variable with floats from m.
+ *
+ * Available if \ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT is 1.
+ * The n-th block of 4 floats is loaded from m+n*offset.
+ */
+static inline SimdFloat gmx_simdcall
+loadU4NOffset(const float* m, int offset)
+{
+ SimdFloat a;
+ for (std::size_t i = 0; i < a.simdInternal_.size()/4; i++)
+ {
+ a.simdInternal_[i*4] = m[offset*i + 0];
+ a.simdInternal_[i*4+1] = m[offset*i + 1];
+ a.simdInternal_[i*4+2] = m[offset*i + 2];
+ a.simdInternal_[i*4+3] = m[offset*i + 3];
+ }
+ return a;
+}
+#endif
+
/*! \} */
/*! \} */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // Not needed for width 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 0 // Not needed for width 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
return *reinterpret_cast<float *>(&t0);
}
+static inline SimdFloat gmx_simdcall
+loadU4NOffset(const float *m, int offset)
+{
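+    // Width 8 gives N = 2: the low 128 bits take the 4 floats at m,
+    // the high 128 bits take the 4 floats at m+offset.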
+ return {
+ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(m)), _mm_loadu_ps(m+offset), 0x1)
+ };
+}
+
} // namespace gmx
#endif // GMX_SIMD_IMPL_X86_AVX_256_UTIL_FLOAT_H
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
}
+static inline SimdDouble gmx_simdcall
+loadU4NOffset(const double *m, int offset)
+{
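+    // Width 8 gives N = 2: the low 256 bits take the 4 doubles at m,
+    // the high 256 bits take the 4 doubles at m+offset.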
+ return {
+ _mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_loadu_pd(m)),
+ _mm256_loadu_pd(m+offset), 1)
+ };
+}
+
} // namespace gmx
#endif // GMX_SIMD_IMPL_X86_AVX_512_UTIL_DOUBLE_H
return _mm_cvtss_f32(t3);
}
+static inline SimdFloat gmx_simdcall
+loadUNDuplicate4(const float* f)
+{
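+    // The masked expand-load places f[0..3] at elements 0, 4, 8 and 12;
+    // permuting with imm8 0 then broadcasts element 0 of each 128-bit
+    // lane across that lane, duplicating each value 4 times.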
+ return {
+ _mm512_permute_ps(_mm512_maskz_expandloadu_ps(0x1111, f), 0)
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+load4DuplicateN(const float* f)
+{
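+    // m is aligned to 4 floats, so the aligned 128-bit load is safe;
+    // broadcast that block into all four 128-bit lanes.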
+ return {
+ _mm512_broadcast_f32x4(_mm_load_ps(f))
+ };
+}
+
+static inline SimdFloat gmx_simdcall
+loadU4NOffset(const float* f, int offset)
+{
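+    // Gather 64-bit elements, i.e. pairs of floats: block i contributes the
+    // pairs at float offsets offset*i and offset*i+2, which together are the
+    // 4 consecutive floats at f+offset*i.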
+ const __m256i idx = _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3);
+ const __m256i gdx = _mm256_add_epi32(_mm256_setr_epi32(0, 2, 0, 2, 0, 2, 0, 2),
+ _mm256_mullo_epi32(idx, _mm256_set1_epi32(offset)));
+ return {
+ _mm512_castpd_ps(_mm512_i32gather_pd(gdx, f, sizeof(float)))
+ };
+}
+
} // namespace gmx
#endif // GMX_SIMD_IMPL_X86_AVX_512_UTIL_FLOAT_H
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define GMX_SIMD_HAVE_GATHER_LOADU_BYSIMDINT_TRANSPOSE_DOUBLE 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT 1
#define GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE 1
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
#define GMX_SIMD4_HAVE_FLOAT 1
#define GMX_SIMD4_HAVE_DOUBLE 1
{
return {};
}
+
+/* Implement most of the 4xN functions by forwarding them to other functions where possible.
+ * Functions forwarded here do not need to be provided by each SIMD implementation.
+ * For width 4 all functions are forwarded; for width 8 all but loadU4NOffset are forwarded.
+ */
+#if GMX_SIMD_HAVE_FLOAT
+#if GMX_SIMD_FLOAT_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
+#elif GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+//For GMX_SIMD_FLOAT_WIDTH>4 it is the responsibility of the implementation to set
+//GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#endif
+
+#if GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
+static inline SimdFloat gmx_simdcall
+loadUNDuplicate4(const float* f)
+{
+ return SimdFloat(*f);
+}
+static inline SimdFloat gmx_simdcall
+load4DuplicateN(const float* f)
+{
+ return load<SimdFloat>(f);
+}
+static inline SimdFloat gmx_simdcall
+loadU4NOffset(const float* f, int)
+{
+ return loadU<SimdFloat>(f);
+}
+#elif GMX_SIMD_FLOAT_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT && GMX_SIMD_HAVE_LOADU
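+// For width 8 (N = 2) the half-SIMD utilities match exactly: loadU1DualHsimd
+// duplicates each of 2 values across a 4-element half register and
+// loadDuplicateHsimd repeats the same 4 values in both halves. loadU4NOffset
+// cannot be expressed with half-SIMD loads, so width-8 implementations provide it directly.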
+static inline SimdFloat gmx_simdcall
+loadUNDuplicate4(const float* f)
+{
+ return loadU1DualHsimd(f);
+}
+static inline SimdFloat gmx_simdcall
+load4DuplicateN(const float* f)
+{
+ return loadDuplicateHsimd(f);
+}
+#endif
+#else //GMX_SIMD_HAVE_FLOAT
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
+#endif
+
+#if GMX_SIMD_HAVE_DOUBLE
+#if GMX_SIMD_DOUBLE_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
+#elif GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
+//For GMX_SIMD_DOUBLE_WIDTH>4 it is the responsibility of the implementation to set
+//GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#endif
+
+#if GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
+static inline SimdDouble gmx_simdcall
+loadUNDuplicate4(const double* f)
+{
+ return SimdDouble(*f);
+}
+static inline SimdDouble gmx_simdcall
+load4DuplicateN(const double* f)
+{
+ return load<SimdDouble>(f);
+}
+static inline SimdDouble gmx_simdcall
+loadU4NOffset(const double* f, int)
+{
+ return loadU<SimdDouble>(f);
+}
+#elif GMX_SIMD_DOUBLE_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE && GMX_SIMD_HAVE_LOADU
+static inline SimdDouble gmx_simdcall
+loadUNDuplicate4(const double* f)
+{
+ return loadU1DualHsimd(f);
+}
+static inline SimdDouble gmx_simdcall
+load4DuplicateN(const double* f)
+{
+ return loadDuplicateHsimd(f);
+}
+#endif
+#else //GMX_SIMD_HAVE_DOUBLE
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
+#endif
+
+#if GMX_DOUBLE
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#else
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#endif
+
//! \} end of name-group proxy objects
} // namespace gmx
*/
#include "gmxpre.h"
+#include <numeric>
+
#include "gromacs/simd/simd.h"
#include "gromacs/utility/alignedallocator.h"
#include "gromacs/utility/basedefinitions.h"
EXPECT_REAL_EQ_TOL(sum0 + sum1 + sum2 + sum3, tstsum, tolerance);
}
-
#endif // GMX_SIMD_HAVE_HSIMD_UTIL_REAL
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+
+TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
+{
+ SimdReal v0, v1;
+ int i;
+ real data[GMX_SIMD_REAL_WIDTH/4];
+ std::iota(data, data+GMX_SIMD_REAL_WIDTH/4, 1);
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
+ {
+ val0_[i*4] = val0_[i*4+1] = val0_[i*4+2] = val0_[i*4+3] = data[i];
+ }
+
+ v0 = load<SimdReal>(val0_);
+ v1 = loadUNDuplicate4(data);
+
+ GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
+}
+
+TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
+{
+ SimdReal v0, v1;
+ int i;
+ real data[4] = { 1, 2, 3, 4};
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
+ {
+ val0_[i*4] = data[0];
+ val0_[i*4+1] = data[1];
+ val0_[i*4+2] = data[2];
+ val0_[i*4+3] = data[3];
+ }
+
+ v0 = load<SimdReal>(val0_);
+ v1 = load4DuplicateN(val0_);
+
+ GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
+}
+
+TEST_F(SimdFloatingpointUtilTest, loadU4NOffset)
+{
+    constexpr int offset = 6; // deliberately not a power of 2
+ constexpr int dataLen = 4+offset*(GMX_SIMD_REAL_WIDTH/4-1);
+ real data[dataLen];
+ std::iota(data, data+dataLen, 1);
+
+ for (int i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
+ {
+ val0_[i*4] = data[0+offset*i];
+ val0_[i*4+1] = data[1+offset*i];
+ val0_[i*4+2] = data[2+offset*i];
+ val0_[i*4+3] = data[3+offset*i];
+ }
+
+ const SimdReal v0 = load<SimdReal>(val0_);
+ const SimdReal v1 = loadU4NOffset(data, offset);
+
+ GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
+}
+
+#endif // GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+
#endif // GMX_SIMD_HAVE_REAL
/*! \} */