/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
+#include "config.h"
+
#include <immintrin.h>
#include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h"
+namespace gmx
+{
+
+// Gather-based load of three consecutive floats for each index in offset.
+//
+// The indices read from offset are multiplied by the template parameter
+// align, and then used with the AVX2 gather instruction to fetch
+// base[align*offset[i] + 0], base[align*offset[i] + 1] and
+// base[align*offset[i] + 2] into lane i of *v0, *v1 and *v2, respectively.
+// The offset array must start on a 32-byte boundary (checked by the assert).
+//
+// Performance note: this version is marginally slower than the AVX 4-wide
+// component load version on Intel Skylake, and significantly slower on
+// older Intel architectures.
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const float *        base,
+                         const std::int32_t   offset[],
+                         SimdFloat *          v0,
+                         SimdFloat *          v1,
+                         SimdFloat *          v2)
+{
+    // Reject offset arrays that do not start on a 32-byte boundary.
+    assert(reinterpret_cast<std::size_t>(offset) % 32 == 0);
+
+    // Raw indices as stored by the caller, and the stride they are scaled with.
+    const SimdFInt32 rawIndex    = simdLoad(offset, SimdFInt32Tag());
+    const SimdFInt32 stride      = SimdFInt32(align);
+    const SimdFInt32 scaledIndex = rawIndex*stride;
+
+    // One hardware gather per component; the scale argument converts the
+    // element index into a byte offset.
+    *v0 = _mm256_i32gather_ps(base + 0, scaledIndex.simdInternal_, sizeof(float));
+    *v1 = _mm256_i32gather_ps(base + 1, scaledIndex.simdInternal_, sizeof(float));
+    *v2 = _mm256_i32gather_ps(base + 2, scaledIndex.simdInternal_, sizeof(float));
+}
+
+} // namespace gmx
+
#endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H