#include "bonded.h"
+#include "config.h"
+
#include <assert.h>
#include <cmath>
#include "pairs.h"
#include "restcbt.h"
+
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+
+// This was originally work-in-progress for augmenting the SIMD module with
+// masked load/store operations. Instead, that turned into an extended SIMD
+// interface that supports gather/scatter on all platforms, which will be
+// part of a future Gromacs version. However, since the code for bonded
+// interactions and LINCS was already written, it would be a pity not to get
+// the performance gains in Gromacs-5.1. For this reason we have added it as
+// a bit of a hack in the two files that use it. It will be replaced with the
+// new generic functionality after version 5.1.
+
+# ifdef GMX_DOUBLE
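+/*! \brief In-place 4x4 transpose of four double-precision SIMD registers. */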
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
+ gmx_simd_double_t *row1,
+ gmx_simd_double_t *row2,
+ gmx_simd_double_t *row3)
+{
+ __m256d tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = _mm256_unpacklo_pd(*row0, *row1);
+ tmp2 = _mm256_unpacklo_pd(*row2, *row3);
+ tmp1 = _mm256_unpackhi_pd(*row0, *row1);
+ tmp3 = _mm256_unpackhi_pd(*row2, *row3);
+ *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
+ *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
+ *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
+ *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
+}
+
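+/*! \brief Transpose four SIMD4 doubles into x/y/z/w component registers.
+ *
+ * In double precision AVX the SIMD4 and full SIMD widths coincide, so this
+ * is a plain 4x4 transpose of the input variables.
+ */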
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
+ gmx_simd_double_t *row0,
+ gmx_simd_double_t *row1,
+ gmx_simd_double_t *row2,
+ gmx_simd_double_t *row3)
+{
+ *row0 = a[0];
+ *row1 = a[1];
+ *row2 = a[2];
+ *row3 = a[3];
+
+ gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
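+/* Masked load of the three reals of an rvec; the fourth element is zeroed.
+ * The cast works around a GCC bug where the mask argument of maskload is
+ * declared as a floating-point instead of an integer vector.
+ */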
+# ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
+# else
+# define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
+# endif
+
+# else /* single instead of double */
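+/*! \brief In-place transpose of 4x4 blocks within four single-precision SIMD registers.
+ *
+ * The unpack/shuffle operations act independently on the low and high
+ * 128-bit lanes, so this transposes the two 4x4 sub-matrices.
+ */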
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
+ gmx_simd_float_t *row1,
+ gmx_simd_float_t *row2,
+ gmx_simd_float_t *row3)
+{
+ __m256 tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = _mm256_unpacklo_ps(*row0, *row1);
+ tmp2 = _mm256_unpacklo_ps(*row2, *row3);
+ tmp1 = _mm256_unpackhi_ps(*row0, *row1);
+ tmp3 = _mm256_unpackhi_ps(*row2, *row3);
+ *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0x44);
+ *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0xEE);
+ *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0x44);
+ *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0xEE);
+}
+
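+/*! \brief Transpose eight SIMD4 floats into x/y/z/w component registers.
+ *
+ * Example: with a[i] = (x_i, y_i, z_i, w_i) for i = 0..7, the result is
+ * row0 = (x_0, ..., x_7), row1 = (y_0, ..., y_7), and so on.
+ */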
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
+ gmx_simd_float_t *row0,
+ gmx_simd_float_t *row1,
+ gmx_simd_float_t *row2,
+ gmx_simd_float_t *row3)
+{
+ *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
+ *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
+ *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
+ *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
+
+ gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
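+/* Masked load of the three reals of an rvec into a SIMD4 float; see the
+ * double-precision variant above for the GCC maskload workaround.
+ */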
+# ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))
+# else
+# define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
+# endif
+
+#endif /* double */
+
+#endif /* AVX */
+
+
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Store differences between indexed rvecs in SIMD registers.
+ *
+ * Returns SIMD registers with the difference vectors:
+ * v[index0[i]] - v[index1[i]]
+ *
+ * \param[in] v Array of rvecs
+ * \param[in] index0 Index into the vector array
+ * \param[in] index1 Index into the vector array
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out] dx SIMD register with x difference
+ * \param[out] dy SIMD register with y difference
+ * \param[out] dz SIMD register with z difference
+ */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_gather_rvec_dist_two_index(const rvec *v,
+ const int *index0,
+ const int *index1,
+ real gmx_unused *buf_aligned,
+ gmx_simd_real_t *dx,
+ gmx_simd_real_t *dy,
+ gmx_simd_real_t *dz)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+ int i;
+ gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
+ gmx_simd_real_t tmp;
+
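+ /* Load both rvecs of each pair with masked SIMD4 loads and subtract;
+ * the transpose converts the packed differences to SoA form in dx/dy/dz,
+ * with tmp receiving the unused, zeroed fourth components.
+ */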
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[index0[i]][0])),
+ gmx_hack_simd4_load3_r(&(v[index1[i]][0])));
+
+ }
+ gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
+#else /* generic SIMD */
+ int i, m;
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ /* Store the distances packed and aligned */
+ for (m = 0; m < DIM; m++)
+ {
+ buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
+ v[index0[i]][m] - v[index1[i]][m];
+ }
+ }
+ *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
+ *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
+ *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
+#endif
+}
+#endif /* GMX_SIMD_HAVE_REAL */
+
+
/*! \brief Mysterious CMAP coefficient matrix */
const int cmap_coeff_matrix[] = {
1, 0, -3, 2, 0, 0, 0, 0, -3, 0, 9, -6, 2, 0, -6, 4,
coeff[s] = forceparams[type].harmonic.krA;
coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
- /* Store the non PBC corrected distances packed and aligned */
- for (m = 0; m < DIM; m++)
- {
- dr[s + m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
- dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
- }
-
/* At the end fill the arrays with identical entries */
if (iu + nfa1 < nbonds)
{
}
}
+ /* Gather the non-PBC-corrected distances in SIMD registers */
+ gmx_hack_simd_gather_rvec_dist_two_index(x, ai, aj, dr,
+ &rijx_S, &rijy_S, &rijz_S);
+ gmx_hack_simd_gather_rvec_dist_two_index(x, ak, aj, dr + 3*GMX_SIMD_REAL_WIDTH,
+ &rkjx_S, &rkjy_S, &rkjz_S);
+
k_S = gmx_simd_load_r(coeff);
theta0_S = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH);
- rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
- rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
- rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
- rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
- rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
- rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
-
pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
real *p,
real *q)
{
- int s, m;
gmx_simd_real_t rijx_S, rijy_S, rijz_S;
gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S;
gmx_simd_real_t rklx_S, rkly_S, rklz_S;
/* The value of the last significant bit (GMX_REAL_EPS is half of that) */
real_eps_S = gmx_simd_set1_r(2*GMX_REAL_EPS);
- for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
- {
- for (m = 0; m < DIM; m++)
- {
- dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
- dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
- dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m];
- }
- }
-
- rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
- rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
- rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
- rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
- rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
- rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
- rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH);
- rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
- rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
+ /* Gather the non-PBC-corrected distances in SIMD registers */
+ gmx_hack_simd_gather_rvec_dist_two_index(x, ai, aj, dr,
+ &rijx_S, &rijy_S, &rijz_S);
+ gmx_hack_simd_gather_rvec_dist_two_index(x, ak, aj, dr + 3*GMX_SIMD_REAL_WIDTH,
+ &rkjx_S, &rkjy_S, &rkjz_S);
+ gmx_hack_simd_gather_rvec_dist_two_index(x, ak, al, dr + 6*GMX_SIMD_REAL_WIDTH,
+ &rklx_S, &rkly_S, &rklz_S);
pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
#define LINCS_SIMD
#endif
+
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+
+// This was originally work-in-progress for augmenting the SIMD module with
+// masked load/store operations. Instead, that turned into an extended SIMD
+// interface that supports gather/scatter on all platforms, which will be
+// part of a future Gromacs version. However, since the code for bonded
+// interactions and LINCS was already written, it would be a pity not to get
+// the performance gains in Gromacs-5.1. For this reason we have added it as
+// a bit of a hack in the two files that use it. It will be replaced with the
+// new generic functionality after version 5.1.
+
+# ifdef GMX_DOUBLE
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
+ gmx_simd_double_t *row1,
+ gmx_simd_double_t *row2,
+ gmx_simd_double_t *row3)
+{
+ __m256d tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = _mm256_unpacklo_pd(*row0, *row1);
+ tmp2 = _mm256_unpacklo_pd(*row2, *row3);
+ tmp1 = _mm256_unpackhi_pd(*row0, *row1);
+ tmp3 = _mm256_unpackhi_pd(*row2, *row3);
+ *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
+ *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
+ *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
+ *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
+ gmx_simd_double_t *row0,
+ gmx_simd_double_t *row1,
+ gmx_simd_double_t *row2,
+ gmx_simd_double_t *row3)
+{
+ *row0 = a[0];
+ *row1 = a[1];
+ *row2 = a[2];
+ *row3 = a[3];
+
+ gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
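+/*! \brief Transpose x/y/z/w component registers into four SIMD4 doubles.
+ *
+ * This is the inverse of gmx_hack_simd4_transpose_to_simd_r above and is
+ * used for storing SIMD results back to rvecs.
+ */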
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t row0,
+ gmx_simd_double_t row1,
+ gmx_simd_double_t row2,
+ gmx_simd_double_t row3,
+ gmx_simd4_double_t *a)
+{
+ a[0] = row0;
+ a[1] = row1;
+ a[2] = row2;
+ a[3] = row3;
+
+ gmx_hack_simd_transpose4_r(&a[0], &a[1], &a[2], &a[3]);
+}
+
+
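+/* Masked load/store of the three reals of an rvec; the fourth element is
+ * zeroed on load and the corresponding memory is left untouched on store.
+ * The casts work around a GCC bug in the declared mask argument type.
+ */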
+# ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
+# define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
+# else
+# define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
+# define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
+# endif
+
+# else /* single instead of double */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
+ gmx_simd_float_t *row1,
+ gmx_simd_float_t *row2,
+ gmx_simd_float_t *row3)
+{
+ __m256 tmp0, tmp1, tmp2, tmp3;
+
+ tmp0 = _mm256_unpacklo_ps(*row0, *row1);
+ tmp2 = _mm256_unpacklo_ps(*row2, *row3);
+ tmp1 = _mm256_unpackhi_ps(*row0, *row1);
+ tmp3 = _mm256_unpackhi_ps(*row2, *row3);
+ *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0x44);
+ *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0xEE);
+ *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0x44);
+ *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0xEE);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
+ gmx_simd_float_t *row0,
+ gmx_simd_float_t *row1,
+ gmx_simd_float_t *row2,
+ gmx_simd_float_t *row3)
+{
+ *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
+ *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
+ *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
+ *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
+
+ gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
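+/*! \brief Transpose x/y/z/w component registers into eight SIMD4 floats,
+ * the inverse of gmx_hack_simd4_transpose_to_simd_r above.
+ */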
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t row0,
+ gmx_simd_float_t row1,
+ gmx_simd_float_t row2,
+ gmx_simd_float_t row3,
+ gmx_simd4_float_t *a)
+{
+ gmx_hack_simd_transpose4_r(&row0, &row1, &row2, &row3);
+
+ a[0] = _mm256_extractf128_ps(row0, 0);
+ a[1] = _mm256_extractf128_ps(row1, 0);
+ a[2] = _mm256_extractf128_ps(row2, 0);
+ a[3] = _mm256_extractf128_ps(row3, 0);
+ a[4] = _mm256_extractf128_ps(row0, 1);
+ a[5] = _mm256_extractf128_ps(row1, 1);
+ a[6] = _mm256_extractf128_ps(row2, 1);
+ a[7] = _mm256_extractf128_ps(row3, 1);
+}
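+/* Masked load/store of the three reals of an rvec as SIMD4 float; see the
+ * double-precision variants above for the GCC maskload/maskstore workaround.
+ */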
+# ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))
+# define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)), (x))
+# else
+# define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
+# define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
+# endif
+
+#endif /* double */
+
+#endif /* AVX */
+
+
+
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Store differences between indexed rvecs in SIMD registers.
+ *
+ * Returns SIMD registers with the difference vectors:
+ * v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
+ *
+ * \param[in] v Array of rvecs
+ * \param[in] pair_index Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out] dx SIMD register with x difference
+ * \param[out] dy SIMD register with y difference
+ * \param[out] dz SIMD register with z difference
+ */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_gather_rvec_dist_pair_index(const rvec *v,
+ const int *pair_index,
+ real gmx_unused *buf_aligned,
+ gmx_simd_real_t *dx,
+ gmx_simd_real_t *dy,
+ gmx_simd_real_t *dz)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+ int i;
+ gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
+ gmx_simd_real_t tmp;
+
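+ /* Load both rvecs of each pair with masked SIMD4 loads and subtract;
+ * the transpose converts the packed differences to SoA form in dx/dy/dz.
+ */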
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 0]][0])),
+ gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 1]][0])));
+ }
+
+ gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
+#else
+ int i, m;
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ /* Store the distances packed and aligned */
+ for (m = 0; m < DIM; m++)
+ {
+ buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
+ v[pair_index[i*2]][m] - v[pair_index[i*2 + 1]][m];
+ }
+ }
+ *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
+ *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
+ *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
+#endif
+}
+
+
+/*! \brief Store a SIMD vector into multiple rvecs.
+ *
+ * \param[in] x SIMD register with x-components of the vectors
+ * \param[in] y SIMD register with y-components of the vectors
+ * \param[in] z SIMD register with z-components of the vectors
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out] v Array of GMX_SIMD_REAL_WIDTH rvecs
+ */
+static gmx_inline void gmx_simdcall
+gmx_simd_store_vec_to_rvec(gmx_simd_real_t x,
+ gmx_simd_real_t y,
+ gmx_simd_real_t z,
+ real gmx_unused *buf_aligned,
+ rvec *v)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+ int i;
+ gmx_simd4_real_t s4[GMX_SIMD_REAL_WIDTH];
+ gmx_simd_real_t zero = gmx_simd_setzero_r();
+
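+ /* Transpose x/y/z plus a zero row back to AoS form; the masked store
+ * then writes only the three components of each rvec, leaving the
+ * following memory untouched.
+ */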
+ gmx_hack_simd_transpose_to_simd4_r(x, y, z, zero, s4);
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ gmx_hack_simd4_store3_r(v[i], s4[i]);
+ }
+#else
+ int i, m;
+
+ gmx_simd_store_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH, x);
+ gmx_simd_store_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH, y);
+ gmx_simd_store_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH, z);
+
+ for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ v[i][m] = buf_aligned[m*GMX_SIMD_REAL_WIDTH + i];
+ }
+ }
+#endif
+}
+#endif /* GMX_SIMD_HAVE_REAL */
+
+
typedef struct {
int b0; /* first constraint for this task */
int b1; /* b1-1 is the last constraint for this task */
}
#ifdef LINCS_SIMD
-/* Calculate distances between indexed atom coordinate pairs
- * and store the result in 3 SIMD registers through an aligned buffer.
- * Start from index is and load GMX_SIMD_REAL_WIDTH elements.
- * Note that pair_index must contain valid indices up to is+GMX_SIMD_REAL_WIDTH.
- */
-static gmx_inline void gmx_simdcall
-rvec_dist_to_simd(const rvec *x,
- int is,
- const int *pair_index,
- real *buf,
- gmx_simd_real_t *dx,
- gmx_simd_real_t *dy,
- gmx_simd_real_t *dz)
-{
- int s, m;
-
- for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
- {
- /* Store the non PBC corrected distances packed and aligned */
- for (m = 0; m < DIM; m++)
- {
- buf[m*GMX_SIMD_REAL_WIDTH + s] =
- x[pair_index[2*(is+s)]][m] - x[pair_index[2*(is+s) + 1]][m];
- }
- }
- *dx = gmx_simd_load_r(buf + 0*GMX_SIMD_REAL_WIDTH);
- *dy = gmx_simd_load_r(buf + 1*GMX_SIMD_REAL_WIDTH);
- *dz = gmx_simd_load_r(buf + 2*GMX_SIMD_REAL_WIDTH);
-}
-
-/* Store a SIMD vector in GMX_SIMD_REAL_WIDTH rvecs through an aligned buffer */
-static gmx_inline void gmx_simdcall
-copy_simd_vec_to_rvec(gmx_simd_real_t x,
- gmx_simd_real_t y,
- gmx_simd_real_t z,
- real *buf,
- rvec *v,
- int is)
-{
- int s, m;
-
- gmx_simd_store_r(buf + 0*GMX_SIMD_REAL_WIDTH, x);
- gmx_simd_store_r(buf + 1*GMX_SIMD_REAL_WIDTH, y);
- gmx_simd_store_r(buf + 2*GMX_SIMD_REAL_WIDTH, z);
-
- for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
- {
- for (m = 0; m < DIM; m++)
- {
- v[is + s][m] = buf[m*GMX_SIMD_REAL_WIDTH + s];
- }
- }
-}
-
/* Calculate the constraint distance vectors r to project on from x.
* Determine the right-hand side of the matrix equation using quantity f.
* This function only differs from calc_dr_x_xp_simd below in that
gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
- rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+ gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
+ &rx_S, &ry_S, &rz_S);
pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
ry_S = gmx_simd_mul_r(ry_S, il_S);
rz_S = gmx_simd_mul_r(rz_S, il_S);
- copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+ gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
- rvec_dist_to_simd(f, bs, bla, vbuf2, &fx_S, &fy_S, &fz_S);
+ gmx_hack_simd_gather_rvec_dist_pair_index(f, bla + bs*2, vbuf2,
+ &fx_S, &fy_S, &fz_S);
ip_S = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
fx_S, fy_S, fz_S);
gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
- rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+ gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
+ &rx_S, &ry_S, &rz_S);
pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
ry_S = gmx_simd_mul_r(ry_S, il_S);
rz_S = gmx_simd_mul_r(rz_S, il_S);
- copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+ gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
- rvec_dist_to_simd(xp, bs, bla, vbuf2, &rxp_S, &ryp_S, &rzp_S);
+ gmx_hack_simd_gather_rvec_dist_pair_index(xp, bla + bs*2, vbuf2,
+ &rxp_S, &ryp_S, &rzp_S);
pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
- rvec_dist_to_simd(x, bs, bla, vbuf, &rx_S, &ry_S, &rz_S);
+ gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf,
+ &rx_S, &ry_S, &rz_S);
pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);