Added hacks for SIMD rvec/load store in lincs & bondeds
authorErik Lindahl <erik@kth.se>
Sat, 16 May 2015 10:45:22 +0000 (12:45 +0200)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Sun, 17 May 2015 22:12:10 +0000 (00:12 +0200)
We have added proper gather/scatter operations to work on
rvecs for all SIMD architectures, but that will not make it into
Gromacs-5.1. Since Berk already wrote a few routines to use
maskloads at least for AVX & AVX2, this is a bit of a hack to
get the performance benefits of that code already in Gromacs-5.1
(for AVX/AVX2), without altering the SIMD module. This is definitely
a hack, and the code will be replaced once the extended SIMD
module is in place.

Change-Id: I385acb5f989b2ecf463948be84947fe1f6dfd19b

src/gromacs/listed-forces/bonded.cpp
src/gromacs/mdlib/clincs.cpp

index f34528ded50b51981e10ad0688df7e3854389f69..2e2c8598922ec49cd7cc7929dec1759bb98d0e95 100644 (file)
@@ -47,6 +47,8 @@
 
 #include "bonded.h"
 
+#include "config.h"
+
 #include <assert.h>
 
 #include <cmath>
 #include "pairs.h"
 #include "restcbt.h"
 
+
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+
+// This was originally work-in-progress for augmenting the SIMD module with
+// masked load/store operations. Instead, that turned into and extended SIMD
+// interface that supports gather/scatter in all platforms, which will be
+// part of a future Gromacs version. However, since the code for bonded
+// interactions and LINCS was already written it would be a pity not to get
+// the performance gains in Gromacs-5.1. For this reason we have added it as
+// a bit of a hack in the two files that use it. It will be replaced with the
+// new generic functionality after version 5.1
+
+#    ifdef GMX_DOUBLE
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
+                           gmx_simd_double_t *row1,
+                           gmx_simd_double_t *row2,
+                           gmx_simd_double_t *row3)
+{
+    __m256d tmp0, tmp1, tmp2, tmp3;
+
+    tmp0  = _mm256_unpacklo_pd(*row0, *row1);
+    tmp2  = _mm256_unpacklo_pd(*row2, *row3);
+    tmp1  = _mm256_unpackhi_pd(*row0, *row1);
+    tmp3  = _mm256_unpackhi_pd(*row2, *row3);
+    *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0b00100000);
+    *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0b00100000);
+    *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0b00110001);
+    *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0b00110001);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
+                                   gmx_simd_double_t        *row0,
+                                   gmx_simd_double_t        *row1,
+                                   gmx_simd_double_t        *row2,
+                                   gmx_simd_double_t        *row3)
+{
+    *row0 = a[0];
+    *row1 = a[1];
+    *row2 = a[2];
+    *row3 = a[3];
+
+    gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
+#    ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#        define gmx_hack_simd4_load3_r(mem)    _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
+#    else
+#        define gmx_hack_simd4_load3_r(mem)    _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
+#    endif
+
+#    else /* single instead of double */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
+                           gmx_simd_float_t *row1,
+                           gmx_simd_float_t *row2,
+                           gmx_simd_float_t *row3)
+{
+    __m256 tmp0, tmp1, tmp2, tmp3;
+
+    tmp0  = _mm256_unpacklo_ps(*row0, *row1);
+    tmp2  = _mm256_unpacklo_ps(*row2, *row3);
+    tmp1  = _mm256_unpackhi_ps(*row0, *row1);
+    tmp3  = _mm256_unpackhi_ps(*row2, *row3);
+    *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0b0100010001000100);
+    *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0b1110111011101110);
+    *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0b0100010001000100);
+    *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0b1110111011101110);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
+                                   gmx_simd_float_t        *row0,
+                                   gmx_simd_float_t        *row1,
+                                   gmx_simd_float_t        *row2,
+                                   gmx_simd_float_t        *row3)
+{
+    *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
+    *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
+    *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
+    *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
+
+    gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#        define gmx_hack_simd4_load3_r(mem)    _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
+#else
+#        define gmx_hack_simd4_load3_r(mem)    _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
+#endif
+
+#endif
+
+#endif /* AVX */
+
+
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Store differences between indexed rvecs in SIMD registers.
+ *
+ * Returns SIMD register with the difference vectors:
+ *     v[index0[i]] - v[index1[i]]
+ *
+ * \param[in]     v           Array of rvecs
+ * \param[in]     index0      Index into the vector array
+ * \param[in]     index1      Index into the vector array
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out]    dx          SIMD register with x difference
+ * \param[out]    dy          SIMD register with y difference
+ * \param[out]    dz          SIMD register with z difference
+ */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_gather_rvec_dist_two_index(const rvec      *v,
+                                         const int       *index0,
+                                         const int       *index1,
+                                         real gmx_unused *buf_aligned,
+                                         gmx_simd_real_t *dx,
+                                         gmx_simd_real_t *dy,
+                                         gmx_simd_real_t *dz)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+    int              i;
+    gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
+    gmx_simd_real_t  tmp;
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[index0[i]][0])),
+                               gmx_hack_simd4_load3_r(&(v[index1[i]][0])));
+
+    }
+    gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
+#else /* generic SIMD */
+    int i, m;
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        /* Store the distances packed and aligned */
+        for (m = 0; m < DIM; m++)
+        {
+            buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
+                v[index0[i]][m] - v[index1[i]][m];
+        }
+    }
+    *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
+    *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
+    *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
+#endif
+}
+#endif /* GMX_SIMD_HAVE_REAL */
+
+
 /*! \brief Mysterious CMAP coefficient matrix */
 const int cmap_coeff_matrix[] = {
     1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4,
@@ -1025,13 +1179,6 @@ angles_noener_simd(int nbonds,
             coeff[s]                     = forceparams[type].harmonic.krA;
             coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA*DEG2RAD;
 
-            /* Store the non PBC corrected distances packed and aligned */
-            for (m = 0; m < DIM; m++)
-            {
-                dr[s +      m *GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
-                dr[s + (DIM+m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
-            }
-
             /* At the end fill the arrays with identical entries */
             if (iu + nfa1 < nbonds)
             {
@@ -1039,16 +1186,15 @@ angles_noener_simd(int nbonds,
             }
         }
 
+        /* Store the non PBC corrected distances packed and aligned */
+        gmx_hack_simd_gather_rvec_dist_two_index(x, ai, aj, dr,
+                                                 &rijx_S, &rijy_S, &rijz_S);
+        gmx_hack_simd_gather_rvec_dist_two_index(x, ak, aj, dr + 3*GMX_SIMD_REAL_WIDTH,
+                                                 &rkjx_S, &rkjy_S, &rkjz_S);
+
         k_S       = gmx_simd_load_r(coeff);
         theta0_S  = gmx_simd_load_r(coeff+GMX_SIMD_REAL_WIDTH);
 
-        rijx_S    = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
-        rijy_S    = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
-        rijz_S    = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
-        rkjx_S    = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
-        rkjy_S    = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
-        rkjz_S    = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
-
         pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, &pbc_simd);
         pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, &pbc_simd);
 
@@ -1440,7 +1586,6 @@ dih_angle_simd(const rvec *x,
                real *p,
                real *q)
 {
-    int             s, m;
     gmx_simd_real_t rijx_S, rijy_S, rijz_S;
     gmx_simd_real_t rkjx_S, rkjy_S, rkjz_S;
     gmx_simd_real_t rklx_S, rkly_S, rklz_S;
@@ -1463,25 +1608,13 @@ dih_angle_simd(const rvec *x,
     /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
     real_eps_S  = gmx_simd_set1_r(2*GMX_REAL_EPS);
 
-    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
-    {
-        for (m = 0; m < DIM; m++)
-        {
-            dr[s + (0*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ai[s]][m] - x[aj[s]][m];
-            dr[s + (1*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[aj[s]][m];
-            dr[s + (2*DIM + m)*GMX_SIMD_REAL_WIDTH] = x[ak[s]][m] - x[al[s]][m];
-        }
-    }
-
-    rijx_S = gmx_simd_load_r(dr + 0*GMX_SIMD_REAL_WIDTH);
-    rijy_S = gmx_simd_load_r(dr + 1*GMX_SIMD_REAL_WIDTH);
-    rijz_S = gmx_simd_load_r(dr + 2*GMX_SIMD_REAL_WIDTH);
-    rkjx_S = gmx_simd_load_r(dr + 3*GMX_SIMD_REAL_WIDTH);
-    rkjy_S = gmx_simd_load_r(dr + 4*GMX_SIMD_REAL_WIDTH);
-    rkjz_S = gmx_simd_load_r(dr + 5*GMX_SIMD_REAL_WIDTH);
-    rklx_S = gmx_simd_load_r(dr + 6*GMX_SIMD_REAL_WIDTH);
-    rkly_S = gmx_simd_load_r(dr + 7*GMX_SIMD_REAL_WIDTH);
-    rklz_S = gmx_simd_load_r(dr + 8*GMX_SIMD_REAL_WIDTH);
+    /* Store the non PBC corrected distances packed and aligned */
+    gmx_hack_simd_gather_rvec_dist_two_index(x, ai, aj, dr,
+                                             &rijx_S, &rijy_S, &rijz_S);
+    gmx_hack_simd_gather_rvec_dist_two_index(x, ak, aj, dr + 3*GMX_SIMD_REAL_WIDTH,
+                                             &rkjx_S, &rkjy_S, &rkjz_S);
+    gmx_hack_simd_gather_rvec_dist_two_index(x, ak, al, dr + 6*GMX_SIMD_REAL_WIDTH,
+                                             &rklx_S, &rkly_S, &rklz_S);
 
     pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc);
     pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc);
index e4de5798422317f6d8e53f325ca2003ce8233044..a1b9ed893cdd3bcdbef0f6210b456a1c08a00571 100644 (file)
 #define LINCS_SIMD
 #endif
 
+
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+
+// This was originally work-in-progress for augmenting the SIMD module with
+// masked load/store operations. Instead, that turned into and extended SIMD
+// interface that supports gather/scatter in all platforms, which will be
+// part of a future Gromacs version. However, since the code for bonded
+// interactions and LINCS was already written it would be a pity not to get
+// the performance gains in Gromacs-5.1. For this reason we have added it as
+// a bit of a hack in the two files that use it. It will be replaced with the
+// new generic functionality after version 5.1
+
+#    ifdef GMX_DOUBLE
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_double_t *row0,
+                           gmx_simd_double_t *row1,
+                           gmx_simd_double_t *row2,
+                           gmx_simd_double_t *row3)
+{
+    __m256d tmp0, tmp1, tmp2, tmp3;
+
+    tmp0  = _mm256_unpacklo_pd(*row0, *row1);
+    tmp2  = _mm256_unpacklo_pd(*row2, *row3);
+    tmp1  = _mm256_unpackhi_pd(*row0, *row1);
+    tmp3  = _mm256_unpackhi_pd(*row2, *row3);
+    *row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0b00100000);
+    *row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0b00100000);
+    *row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0b00110001);
+    *row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0b00110001);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_double_t *a,
+                                   gmx_simd_double_t        *row0,
+                                   gmx_simd_double_t        *row1,
+                                   gmx_simd_double_t        *row2,
+                                   gmx_simd_double_t        *row3)
+{
+    *row0 = a[0];
+    *row1 = a[1];
+    *row2 = a[2];
+    *row3 = a[3];
+
+    gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose_to_simd4_r(gmx_simd_double_t   row0,
+                                   gmx_simd_double_t   row1,
+                                   gmx_simd_double_t   row2,
+                                   gmx_simd_double_t   row3,
+                                   gmx_simd4_double_t *a)
+{
+    a[0] = row0;
+    a[1] = row1;
+    a[2] = row2;
+    a[3] = row3;
+
+    gmx_hack_simd_transpose4_r(&a[0], &a[1], &a[2], &a[3]);
+}
+
+
+#    ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#        define gmx_hack_simd4_load3_r(mem)      _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
+#        define gmx_hack_simd4_store3_r(mem, x)   _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
+#    else
+#        define gmx_hack_simd4_load3_r(mem)      _mm256_maskload_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1))
+#        define gmx_hack_simd4_store3_r(mem, x)   _mm256_maskstore_pd((mem), _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), (x))
+#    endif
+
+#    else /* single instead of double */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose4_r(gmx_simd_float_t *row0,
+                           gmx_simd_float_t *row1,
+                           gmx_simd_float_t *row2,
+                           gmx_simd_float_t *row3)
+{
+    __m256 tmp0, tmp1, tmp2, tmp3;
+
+    tmp0  = _mm256_unpacklo_ps(*row0, *row1);
+    tmp2  = _mm256_unpacklo_ps(*row2, *row3);
+    tmp1  = _mm256_unpackhi_ps(*row0, *row1);
+    tmp3  = _mm256_unpackhi_ps(*row2, *row3);
+    *row0 = _mm256_shuffle_ps(tmp0, tmp2, 0b0100010001000100);
+    *row1 = _mm256_shuffle_ps(tmp0, tmp2, 0b1110111011101110);
+    *row2 = _mm256_shuffle_ps(tmp1, tmp3, 0b0100010001000100);
+    *row3 = _mm256_shuffle_ps(tmp1, tmp3, 0b1110111011101110);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd4_transpose_to_simd_r(const gmx_simd4_float_t *a,
+                                   gmx_simd_float_t        *row0,
+                                   gmx_simd_float_t        *row1,
+                                   gmx_simd_float_t        *row2,
+                                   gmx_simd_float_t        *row3)
+{
+    *row0 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[0]), a[4], 1);
+    *row1 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[1]), a[5], 1);
+    *row2 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[2]), a[6], 1);
+    *row3 = _mm256_insertf128_ps(_mm256_castps128_ps256(a[3]), a[7], 1);
+
+    gmx_hack_simd_transpose4_r(row0, row1, row2, row3);
+}
+
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_transpose_to_simd4_r(gmx_simd_float_t   row0,
+                                   gmx_simd_float_t   row1,
+                                   gmx_simd_float_t   row2,
+                                   gmx_simd_float_t   row3,
+                                   gmx_simd4_float_t *a)
+{
+    gmx_hack_simd_transpose4_r(&row0, &row1, &row2, &row3);
+
+    a[0] = _mm256_extractf128_ps(row0, 0);
+    a[1] = _mm256_extractf128_ps(row1, 0);
+    a[2] = _mm256_extractf128_ps(row2, 0);
+    a[3] = _mm256_extractf128_ps(row3, 0);
+    a[4] = _mm256_extractf128_ps(row0, 1);
+    a[5] = _mm256_extractf128_ps(row1, 1);
+    a[6] = _mm256_extractf128_ps(row2, 1);
+    a[7] = _mm256_extractf128_ps(row3, 1);
+}
+#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#        define gmx_hack_simd4_load3_r(mem)      _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
+#        define gmx_hack_simd4_store3_r(mem, x)   _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
+#else
+#        define gmx_hack_simd4_load3_r(mem)      _mm_maskload_ps((mem), _mm_set_epi32(0, -1, -1, -1))
+#        define gmx_hack_simd4_store3_r(mem, x)   _mm_maskstore_ps((mem), _mm_set_epi32(0, -1, -1, -1), (x))
+#endif
+
+#endif /* double */
+
+#endif /* AVX */
+
+
+
+
+#ifdef GMX_SIMD_HAVE_REAL
+/*! \brief Store differences between indexed rvecs in SIMD registers.
+ *
+ * Returns SIMD register with the difference vectors:
+ *     v[pair_index[i*2]] - v[pair_index[i*2 + 1]]
+ *
+ * \param[in]     v           Array of rvecs
+ * \param[in]     pair_index  Index pairs for GMX_SIMD_REAL_WIDTH vector pairs
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out]    dx          SIMD register with x difference
+ * \param[out]    dy          SIMD register with y difference
+ * \param[out]    dz          SIMD register with z difference
+ */
+static gmx_inline void gmx_simdcall
+gmx_hack_simd_gather_rvec_dist_pair_index(const rvec      *v,
+                                          const int       *pair_index,
+                                          real gmx_unused *buf_aligned,
+                                          gmx_simd_real_t *dx,
+                                          gmx_simd_real_t *dy,
+                                          gmx_simd_real_t *dz)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+    int              i;
+    gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
+    gmx_simd_real_t  tmp;
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        d[i] = gmx_simd4_sub_r(gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 0]][0])),
+                               gmx_hack_simd4_load3_r(&(v[pair_index[i*2 + 1]][0])));
+    }
+
+    gmx_hack_simd4_transpose_to_simd_r(d, dx, dy, dz, &tmp);
+#else
+    int i, m;
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        /* Store the distances packed and aligned */
+        for (m = 0; m < DIM; m++)
+        {
+            buf_aligned[m*GMX_SIMD_REAL_WIDTH + i] =
+                v[pair_index[i*2]][m] - v[pair_index[i*2 + 1]][m];
+        }
+    }
+    *dx = gmx_simd_load_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH);
+    *dy = gmx_simd_load_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH);
+    *dz = gmx_simd_load_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH);
+#endif
+}
+
+
+/*! \brief Stores SIMD vector into multiple rvecs.
+ *
+ * \param[in]     x           SIMD register with x-components of the vectors
+ * \param[in]     y           SIMD register with y-components of the vectors
+ * \param[in]     z           SIMD register with z-components of the vectors
+ * \param[in,out] buf_aligned Aligned tmp buffer of size 3*GMX_SIMD_REAL_WIDTH
+ * \param[out]    v           Array of GMX_SIMD_REAL_WIDTH rvecs
+ */
+static gmx_inline void gmx_simdcall
+gmx_simd_store_vec_to_rvec(gmx_simd_real_t  x,
+                           gmx_simd_real_t  y,
+                           gmx_simd_real_t  z,
+                           real gmx_unused *buf_aligned,
+                           rvec            *v)
+{
+#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+    int              i;
+    gmx_simd4_real_t s4[GMX_SIMD_REAL_WIDTH];
+    gmx_simd_real_t  zero = gmx_simd_setzero_r();
+
+    gmx_hack_simd_transpose_to_simd4_r(x, y, z, zero, s4);
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        gmx_hack_simd4_store3_r(v[i], s4[i]);
+    }
+#else
+    int i, m;
+
+    gmx_simd_store_r(buf_aligned + 0*GMX_SIMD_REAL_WIDTH, x);
+    gmx_simd_store_r(buf_aligned + 1*GMX_SIMD_REAL_WIDTH, y);
+    gmx_simd_store_r(buf_aligned + 2*GMX_SIMD_REAL_WIDTH, z);
+
+    for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
+    {
+        for (m = 0; m < DIM; m++)
+        {
+            v[i][m] = buf_aligned[m*GMX_SIMD_REAL_WIDTH + i];
+        }
+    }
+#endif
+}
+#endif /* GMX_SIMD_HAVE_REAL */
+
+
 typedef struct {
     int    b0;         /* first constraint for this task */
     int    b1;         /* b1-1 is the last constraint for this task */
@@ -407,60 +641,6 @@ static void lincs_update_atoms(struct gmx_lincsdata *li, int th,
 }
 
 #ifdef LINCS_SIMD
-/* Calculate distances between indexed atom coordinate pairs
- * and store the result in 3 SIMD registers through an aligned buffer.
- * Start from index is and load GMX_SIMD_REAL_WIDTH elements.
- * Note that pair_index must contain valid indices up to is+GMX_SIMD_REAL_WIDTH.
- */
-static gmx_inline void gmx_simdcall
-rvec_dist_to_simd(const rvec      *x,
-                  int              is,
-                  const int       *pair_index,
-                  real            *buf,
-                  gmx_simd_real_t *dx,
-                  gmx_simd_real_t *dy,
-                  gmx_simd_real_t *dz)
-{
-    int s, m;
-
-    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
-    {
-        /* Store the non PBC corrected distances packed and aligned */
-        for (m = 0; m < DIM; m++)
-        {
-            buf[m*GMX_SIMD_REAL_WIDTH + s] =
-                x[pair_index[2*(is+s)]][m] - x[pair_index[2*(is+s) + 1]][m];
-        }
-    }
-    *dx = gmx_simd_load_r(buf + 0*GMX_SIMD_REAL_WIDTH);
-    *dy = gmx_simd_load_r(buf + 1*GMX_SIMD_REAL_WIDTH);
-    *dz = gmx_simd_load_r(buf + 2*GMX_SIMD_REAL_WIDTH);
-}
-
-/* Store a SIMD vector in GMX_SIMD_REAL_WIDTH rvecs through an aligned buffer */
-static gmx_inline void gmx_simdcall
-copy_simd_vec_to_rvec(gmx_simd_real_t  x,
-                      gmx_simd_real_t  y,
-                      gmx_simd_real_t  z,
-                      real            *buf,
-                      rvec            *v,
-                      int              is)
-{
-    int s, m;
-
-    gmx_simd_store_r(buf + 0*GMX_SIMD_REAL_WIDTH, x);
-    gmx_simd_store_r(buf + 1*GMX_SIMD_REAL_WIDTH, y);
-    gmx_simd_store_r(buf + 2*GMX_SIMD_REAL_WIDTH, z);
-
-    for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
-    {
-        for (m = 0; m < DIM; m++)
-        {
-            v[is + s][m] = buf[m*GMX_SIMD_REAL_WIDTH + s];
-        }
-    }
-}
-
 /* Calculate the constraint distance vectors r to project on from x.
  * Determine the right-hand side of the matrix equation using quantity f.
  * This function only differs from calc_dr_x_xp_simd below in that
@@ -489,7 +669,8 @@ calc_dr_x_f_simd(int                       b0,
         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
         gmx_simd_real_t fx_S, fy_S, fz_S, ip_S, rhs_S;
 
-        rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+        gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
+                                                  &rx_S, &ry_S, &rz_S);
 
         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 
@@ -500,9 +681,10 @@ calc_dr_x_f_simd(int                       b0,
         ry_S  = gmx_simd_mul_r(ry_S, il_S);
         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 
-        copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+        gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
 
-        rvec_dist_to_simd(f, bs, bla, vbuf2, &fx_S, &fy_S, &fz_S);
+        gmx_hack_simd_gather_rvec_dist_pair_index(f, bla + bs*2, vbuf2,
+                                                  &fx_S, &fy_S, &fz_S);
 
         ip_S  = gmx_simd_iprod_r(rx_S, ry_S, rz_S,
                                  fx_S, fy_S, fz_S);
@@ -727,7 +909,8 @@ calc_dr_x_xp_simd(int                       b0,
         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S, il_S;
         gmx_simd_real_t rxp_S, ryp_S, rzp_S, ip_S, rhs_S;
 
-        rvec_dist_to_simd(x, bs, bla, vbuf1, &rx_S, &ry_S, &rz_S);
+        gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf1,
+                                                  &rx_S, &ry_S, &rz_S);
 
         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);
 
@@ -738,9 +921,10 @@ calc_dr_x_xp_simd(int                       b0,
         ry_S  = gmx_simd_mul_r(ry_S, il_S);
         rz_S  = gmx_simd_mul_r(rz_S, il_S);
 
-        copy_simd_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r, bs);
+        gmx_simd_store_vec_to_rvec(rx_S, ry_S, rz_S, vbuf1, r + bs);
 
-        rvec_dist_to_simd(xp, bs, bla, vbuf2, &rxp_S, &ryp_S, &rzp_S);
+        gmx_hack_simd_gather_rvec_dist_pair_index(xp, bla + bs*2, vbuf2,
+                                                  &rxp_S, &ryp_S, &rzp_S);
 
         pbc_correct_dx_simd(&rxp_S, &ryp_S, &rzp_S, pbc_simd);
 
@@ -838,7 +1022,8 @@ calc_dist_iter_simd(int                       b0,
         gmx_simd_real_t rx_S, ry_S, rz_S, n2_S;
         gmx_simd_real_t len_S, len2_S, dlen2_S, lc_S, blc_S;
 
-        rvec_dist_to_simd(x, bs, bla, vbuf, &rx_S, &ry_S, &rz_S);
+        gmx_hack_simd_gather_rvec_dist_pair_index(x, bla + bs*2, vbuf,
+                                                  &rx_S, &ry_S, &rz_S);
 
         pbc_correct_dx_simd(&rx_S, &ry_S, &rz_S, pbc_simd);