src/gromacs/legacyheaders/gmx_x86_avx_256.h

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of GROMACS.
   5  * Copyright (c) 2012-
   6  *
   7  * Written by the Gromacs development team under coordination of
   8  * David van der Spoel, Berk Hess, and Erik Lindahl.
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2
  13  * of the License, or (at your option) any later version.
  14  *
  15  * To help us fund GROMACS development, we humbly ask that you cite
  16  * the research papers on the package. Check out http://www.gromacs.org
  17  *
  18  * And Hey:
  19  * Gnomes, ROck Monsters And Chili Sauce
  20  */
  21 #ifndef _gmx_x86_avx_256_h_
  22 #define _gmx_x86_avx_256_h_
  23
  24
  25 #include <immintrin.h>
  26 #ifdef HAVE_X86INTRIN_H
  27 #include <x86intrin.h> /* FMA */
  28 #endif
  29
  30
  31 #include <stdio.h>
  32
  33 #include "types/simple.h"
  34
  35
  36 #define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
  37
  38 #define _GMX_MM_PERMUTE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
  39 #define _GMX_MM_PERMUTE256D(fp3,fp2,fp1,fp0) (((fp3) << 3) | ((fp2) << 2) | ((fp1) << 1) | ((fp0)))
  40 #define _GMX_MM_PERMUTE128D(fp1,fp0)         (((fp1) << 1) | ((fp0)))
  41
  42
  43 #define GMX_MM_TRANSPOSE2_PD(row0, row1) {           \
  44     __m128d __gmx_t1 = row0;                         \
  45     row0           = _mm_unpacklo_pd(row0,row1);     \
  46     row1           = _mm_unpackhi_pd(__gmx_t1,row1); \
  47 }
  48
  49
  50 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
  51 #  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  52 #  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
  53 #  define gmx_mm_castps_ps128(a) (a)
  54 #  define gmx_mm_castsi128_pd(a) _mm_castsi128_pd(a)
  55 #  define gmx_mm_castpd_si128(a) _mm_castpd_si128(a)
  56 #elif defined(__GNUC__)
  57 #  define gmx_mm_castsi128_ps(a) ((__m128)(a))
  58 #  define gmx_mm_castps_si128(a) ((__m128i)(a))
  59 #  define gmx_mm_castps_ps128(a) ((__m128)(a))
  60 #  define gmx_mm_castsi128_pd(a) ((__m128d)(a))
  61 #  define gmx_mm_castpd_si128(a) ((__m128i)(a))
  62 #else
  63 static __m128  gmx_mm_castsi128_ps(__m128i a)
  64 {
  65     return *(__m128 *) &a;
  66 }
  67 static __m128i gmx_mm_castps_si128(__m128 a)
  68 {
  69     return *(__m128i *) &a;
  70 }
  71 static __m128  gmx_mm_castps_ps128(__m128 a)
  72 {
  73     return *(__m128 *) &a;
  74 }
  75 static __m128d gmx_mm_castsi128_pd(__m128i a)
  76 {
  77     return *(__m128d *) &a;
  78 }
  79 static __m128i gmx_mm_castpd_si128(__m128d a)
  80 {
  81     return *(__m128i *) &a;
  82 }
  83 #endif
  84
  85
  86
  87 static void
  88 gmx_mm_printxmm_ps(const char *s,__m128 xmm)
  89 {
  90     float f[4];
  91
  92     _mm_storeu_ps(f,xmm);
  93     printf("%s: %15.10e %15.10e %15.10e %15.10e\n",s,f[0],f[1],f[2],f[3]);
  94 }
  95
  96
  97 static void
  98 gmx_mm_printxmmsum_ps(const char *s,__m128 xmm)
  99 {
 100     float f[4];
 101
 102     _mm_storeu_ps(f,xmm);
 103     printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 104 }
 105
 106
 107 static void
 108 gmx_mm_printxmm_pd(const char *s,__m128d xmm)
 109 {
 110     double f[2];
 111
 112     _mm_storeu_pd(f,xmm);
 113     printf("%s: %30.20e %30.20e\n",s,f[0],f[1]);
 114 }
 115
 116 static void
 117 gmx_mm_printxmmsum_pd(const char *s,__m128d xmm)
 118 {
 119     double f[2];
 120
 121     _mm_storeu_pd(f,xmm);
 122     printf("%s (sum): %15.10g\n",s,f[0]+f[1]);
 123 }
 124
 125
 126 static void
 127 gmx_mm_printxmm_epi32(const char *s,__m128i xmmi)
 128 {
 129     int i[4];
 130
 131     _mm_storeu_si128((__m128i *)i,xmmi);
 132     printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
 133 }
 134
 135 static void
 136 gmx_mm256_printymm_ps(const char *s,__m256 ymm)
 137 {
 138     float f[8];
 139
 140     _mm256_storeu_ps(f,ymm);
 141     printf("%s: %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f %12.7f\n",s,f[0],f[1],f[2],f[3],f[4],f[5],f[6],f[7]);
 142 }
 143
 144 static void
 145 gmx_mm256_printymmsum_ps(const char *s,__m256 ymm)
 146 {
 147     float f[8];
 148
 149     _mm256_storeu_ps(f,ymm);
 150     printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]+f[4]+f[5]+f[6]+f[7]);
 151 }
 152
 153
 154 static void
 155 gmx_mm256_printymm_pd(const char *s,__m256d ymm)
 156 {
 157     double f[4];
 158
 159     _mm256_storeu_pd(f,ymm);
 160     printf("%s: %16.12f %16.12f %16.12f %16.12f\n",s,f[0],f[1],f[2],f[3]);
 161 }
 162
 163 static void
 164 gmx_mm256_printymmsum_pd(const char *s,__m256d ymm)
 165 {
 166     double f[4];
 167
 168     _mm256_storeu_pd(f,ymm);
 169     printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 170 }
 171
 172
 173
 174 static void
 175 gmx_mm256_printymm_epi32(const char *s,__m256i ymmi)
 176 {
 177     int i[8];
 178
 179     _mm256_storeu_si256((__m256i *)i,ymmi);
 180     printf("%10s: %2d %2d %2d %2d %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3],i[4],i[5],i[6],i[7]);
 181 }
 182
 183
 184
 185 static int gmx_mm_check_and_reset_overflow(void)
 186 {
 187     int MXCSR;
 188     int sse_overflow;
 189
 190     MXCSR = _mm_getcsr();
 191     /* The overflow flag is bit 3 in the register */
 192     if (MXCSR & 0x0008)
 193     {
 194         sse_overflow = 1;
 195         /* Set the overflow flag to zero */
 196         MXCSR = MXCSR & 0xFFF7;
 197         _mm_setcsr(MXCSR);
 198     }
 199     else
 200     {
 201         sse_overflow = 0;
 202     }
 203
 204     return sse_overflow;
 205 }
 206
 207
 208
 209 #endif /* _gmx_x86_avx_256_h_ */