src/gromacs/simd/impl_x86_sse4_1/impl_x86_sse4_1.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_SSE4_1_H
  37 #define GMX_SIMD_IMPL_X86_SSE4_1_H
  38
  39 #include <math.h>
  40 #include <smmintrin.h>
  41
  42
  43 /* x86 SSE4.1 SIMD instruction wrappers
  44  *
  45  * Please see documentation in gromacs/simd/simd.h for the available
  46  * defines.
  47  */
  48
  49 /* Inherit most of SSE4.1 from SSE2 */
  50 #include "gromacs/simd/impl_x86_sse2/impl_x86_sse2.h"
  51 /* Increment over SSE2 capabilities */
  52 #define GMX_SIMD_X86_SSE4_1_OR_HIGHER
  53
  54
  55 /* Override capability definitions from SSE2 */
  56 #define  GMX_SIMD4_HAVE_FLOAT_DOTPRODUCT3
  57
  58 /* Almost all SSE4.1 instructions already exist in SSE2, but a few of them
  59  * can be implemented more efficiently in SSE4.1.
  60  */
  61 #undef  gmx_simd_round_f
  62 #define gmx_simd_round_f(x)       _mm_round_ps(x, _MM_FROUND_NINT)
  63 #undef  gmx_simd_trunc_f
  64 #define gmx_simd_trunc_f(x)       _mm_round_ps(x, _MM_FROUND_TRUNC)
  65 #undef  gmx_simd_round_d
  66 #define gmx_simd_round_d(x)       _mm_round_pd(x, _MM_FROUND_NINT)
  67 #undef  gmx_simd_trunc_d
  68 #define gmx_simd_trunc_d(x)       _mm_round_pd(x, _MM_FROUND_TRUNC)
  69
  70 #undef  gmx_simd_extract_fi
  71 #define gmx_simd_extract_fi       _mm_extract_epi32
  72 #undef  gmx_simd_mul_fi
  73 #define gmx_simd_mul_fi           _mm_mullo_epi32
  74
  75 #undef  gmx_simd_extract_di
  76 #define gmx_simd_extract_di       _mm_extract_epi32
  77 #undef  gmx_simd_mul_di
  78 #define gmx_simd_mul_di           _mm_mullo_epi32
  79
  80 #undef  gmx_simd_blendv_f
  81 #define gmx_simd_blendv_f         _mm_blendv_ps
  82 #undef  gmx_simd_blendv_d
  83 #define gmx_simd_blendv_d         _mm_blendv_pd
  84
  85 #undef  gmx_simd_reduce_f
  86 #define gmx_simd_reduce_f(a)      gmx_simd_reduce_f_sse4_1(a)
  87 #undef  gmx_simd_reduce_d
  88 #define gmx_simd_reduce_d(a)      gmx_simd_reduce_d_sse4_1(a)
  89
  90 #undef  gmx_simd_blendv_fi
  91 #define gmx_simd_blendv_fi        _mm_blendv_epi8
  92 #undef  gmx_simd_blendv_di
  93 #define gmx_simd_blendv_di        _mm_blendv_epi8
  94
  95 #undef  gmx_simd4_dotproduct3_f
  96 #define gmx_simd4_dotproduct3_f   gmx_simd4_dotproduct3_f_sse4_1
  97
  98 /* SIMD reduction function */
  99 static gmx_inline float gmx_simdcall
 100 gmx_simd_reduce_f_sse4_1(__m128 a)
 101 {
 102     float  f;
 103
 104     a = _mm_hadd_ps(a, a);
 105     a = _mm_hadd_ps(a, a);
 106     _mm_store_ss(&f, a);
 107     return f;
 108 }
 109
 110 /* SIMD4 Dotproduct helper function */
 111 static gmx_inline float gmx_simdcall
 112 gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
 113 {
 114     float f;
 115     _MM_EXTRACT_FLOAT(f, _mm_dp_ps(a, b, 0x71), 0);
 116     return f;
 117 }
 118
 119 static gmx_inline double gmx_simdcall
 120 gmx_simd_reduce_d_sse4_1(__m128d a)
 121 {
 122     double  f;
 123
 124     a = _mm_hadd_pd(a, a);
 125     _mm_store_sd(&f, a);
 126     return f;
 127 }
 128
 129 #endif /* GMX_SIMD_IMPL_X86_SSE4_1_H */