/* Extract 32-bit integer element "imm" (compile-time constant, 0-3) from x:
 * shift the wanted element down into bits 31:0, then move it to a GPR. */
#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), (imm) * 4))
/* Build the 4-bit immediate for 256-bit double blends from four 0/1 flags;
 * bit i of the result selects the source for element i. */
#define _GMX_MM_BLEND256D(b3,b2,b1,b0) (((b0)) | ((b1) << 1) | ((b2) << 2) | ((b3) << 3))
/* Build an 8-bit shuffle immediate from four 2-bit element selectors
 * (same bit layout as _MM_SHUFFLE: fp3 in the top two bits). */
#define _GMX_MM_PERMUTE(fp3,fp2,fp1,fp0) (((fp0)) | ((fp1) << 2) | ((fp2) << 4) | ((fp3) << 6))
/* Build the 4-bit immediate for 256-bit double permutes from four
 * single-bit selectors; bit i controls output element i. */
#define _GMX_MM_PERMUTE256D(fp3,fp2,fp1,fp0) (((fp0)) | ((fp1) << 1) | ((fp2) << 2) | ((fp3) << 3))
/* Build the 2-bit immediate for 128-bit double permutes from two
 * single-bit selectors; bit i controls output element i. */
#define _GMX_MM_PERMUTE128D(fp1,fp0) (((fp0)) | ((fp1) << 1))
row1 = _mm_unpackhi_pd(__gmx_t1,row1); \
}
/* Transpose, in place, a 4x4 matrix of doubles held in four __m256d rows.
 * Strategy: four unpacks interleave 64-bit elements within each 128-bit
 * lane, then four cross-lane permutes recombine the halves.
 * NOTE: this is a plain brace block (not do { } while(0)), so do not use
 * it as the body of an un-braced if/else. */
#define GMX_MM256_FULLTRANSPOSE4_PD(row0,row1,row2,row3) \
{ \
    __m256d _t0, _t1, _t2, _t3; \
    /* Interleave lo/hi 64-bit elements within each 128-bit lane */ \
    _t0 = _mm256_unpacklo_pd((row0), (row1)); \
    _t1 = _mm256_unpackhi_pd((row0), (row1)); \
    _t2 = _mm256_unpacklo_pd((row2), (row3)); \
    _t3 = _mm256_unpackhi_pd((row2), (row3)); \
    /* 0x20 concatenates the low 128-bit lanes, 0x31 the high lanes */ \
    row0 = _mm256_permute2f128_pd(_t0, _t2, 0x20); \
    row1 = _mm256_permute2f128_pd(_t1, _t3, 0x20); \
    row2 = _mm256_permute2f128_pd(_t0, _t2, 0x31); \
    row3 = _mm256_permute2f128_pd(_t1, _t3, 0x31); \
}
#if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
# define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
}
#endif
+static gmx_inline __m256
+gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
+{
+ return _mm256_permute2f128_ps(xmm1,xmm2,0x20);
+}
+
+static gmx_inline __m256
+gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
+{
+ return _mm256_permute2f128_ps(xmm1,xmm2,0x31);
+}
+
+static gmx_inline __m256
+gmx_mm256_set_m128(__m128 hi, __m128 lo)
+{
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
+}
+
+
+static __m256d
+gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
+{
+ return _mm256_permute2f128_pd(xmm1,xmm2,0x20);
+}
+
+static __m256d
+gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
+{
+ return _mm256_permute2f128_pd(xmm1,xmm2,0x31);
+}
+
+static __m256d
+gmx_mm256_set_m128d(__m128d hi, __m128d lo)
+{
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
+}
+
+
static void