2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS Development Team
6 * Copyright (c) 2012, by the GROMACS development team, led by
7 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
8 * others, as listed in the AUTHORS file in the top-level source
9 * directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
38 /* Undefine all defines used below so we can include this file multiple times
39 * with different settings from the same source file.
42 /* NOTE: floor and blend are NOT available with SSE2-only acceleration (the underlying intrinsics require SSE4.1) */
/* Drop definitions left over from an earlier inclusion of this file, so the
 * width- and precision-specific #defines further down can be made afresh.
 * NOTE(review): for some macros defined below (e.g. gmx_cmpneq_pr and
 * gmx_mm_castsi256_pr) no #undef is visible in this list - confirm that the
 * complete list covers every macro this file defines.
 */
44 #undef GMX_X86_SIMD_WIDTH_HERE
69 #undef gmx_movemask_pr
71 #undef gmx_mm_castsi128_pr
73 #undef gmx_cvttpr_epi32
74 #undef gmx_cvtepi32_pr
77 #undef gmx_calc_rsq_pr
80 #undef gmx_pmecorrF_pr
81 #undef gmx_pmecorrV_pr
84 /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
85 * the same intrinsics, with defines, can be compiled for either 128 or 256
86 * bit wide SSE or AVX instructions.
87 * The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
88 * The _pr suffix is replaced by _ps or _pd (single or double precision).
89 * Note that compiler settings will decide if 128-bit intrinsics will
90 * be translated into SSE or AVX instructions.
// Sanity checks on the width selection made by the including file: exactly
// one of GMX_MM128_HERE and GMX_MM256_HERE is expected to be defined.
// NOTE(review): neither an #error directive nor a closing #endif is visible
// for these two conditionals; presumably the bare string literal is what
// stops compilation - confirm against the complete file.
// Case 1: no width was selected.
93 #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
94 "You should define GMX_MM128_HERE or GMX_MM256_HERE"
// Case 2: both widths were selected at once.
97 #if defined GMX_MM128_HERE && defined GMX_MM256_HERE
98 "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
// 128-bit section: maps the generic gmx_*_pr names onto 128-bit wide _mm_*
// intrinsics and gmx_mm_* helpers.
101 #ifdef GMX_MM128_HERE
// Integer SIMD type that accompanies the 128-bit floating-point types.
103 #define gmx_epi32 __m128i
// Single-precision branch (_ps names).
// NOTE(review): the "#ifndef GMX_DOUBLE" that the "#else" after this branch
// refers to is not visible here - confirm it precedes this include.
107 #include "gmx_x86_simd_single.h"
// Elements per SIMD register: four floats in a 128-bit __m128.
109 #define GMX_X86_SIMD_WIDTH_HERE 4
// Floating-point SIMD type for this width and precision.
111 #define gmx_mm_pr __m128
// Loads, broadcasts, zeroing and stores (storeu is the unaligned store).
113 #define gmx_load_pr _mm_load_ps
114 #define gmx_load1_pr _mm_load1_ps
115 #define gmx_set1_pr _mm_set1_ps
116 #define gmx_setzero_pr _mm_setzero_ps
117 #define gmx_store_pr _mm_store_ps
118 #define gmx_storeu_pr _mm_storeu_ps
// Arithmetic, comparison and bitwise operations.
120 #define gmx_add_pr _mm_add_ps
121 #define gmx_sub_pr _mm_sub_ps
122 #define gmx_mul_pr _mm_mul_ps
123 #define gmx_max_pr _mm_max_ps
124 #define gmx_cmplt_pr _mm_cmplt_ps
125 #define gmx_and_pr _mm_and_ps
126 #define gmx_or_pr _mm_or_ps
127 #define gmx_andnot_pr _mm_andnot_ps
// Per the note near the top of this file, floor and blend are not available
// with SSE2-only acceleration.
129 #define gmx_floor_pr _mm_floor_ps
130 #define gmx_blendv_pr _mm_blendv_ps
// Mask extraction, integer-to-float register cast and float<->int32
// conversions.
132 #define gmx_movemask_pr _mm_movemask_ps
134 #define gmx_mm_castsi128_pr gmx_mm_castsi128_ps
136 #define gmx_cvttpr_epi32 _mm_cvttps_epi32
137 #define gmx_cvtepi32_pr _mm_cvtepi32_ps
// The gmx_mm_* targets below are not defined in the visible part of this
// file; presumably they are supplied by the header included above - confirm.
139 #define gmx_invsqrt_pr gmx_mm_invsqrt_ps
140 #define gmx_calc_rsq_pr gmx_mm_calc_rsq_ps
141 #define gmx_sum4_pr gmx_mm_sum4_ps
// PME correction helpers; presumably F is the force and V the potential
// variant - confirm.
143 #define gmx_pmecorrF_pr gmx_mm_pmecorrF_ps
144 #define gmx_pmecorrV_pr gmx_mm_pmecorrV_ps
146 #else /* ifndef GMX_DOUBLE */
/* Double-precision branch of the 128-bit section (_pd names). It defines the
 * same set of gmx_*_pr macros as the single-precision branch.
 */
148 #include "gmx_x86_simd_double.h"
/* Elements per SIMD register: two doubles in a 128-bit __m128d. */
150 #define GMX_X86_SIMD_WIDTH_HERE 2
/* Floating-point SIMD type for this width and precision. */
152 #define gmx_mm_pr __m128d
/* Loads, broadcasts, zeroing and stores (storeu is the unaligned store). */
154 #define gmx_load_pr _mm_load_pd
155 #define gmx_load1_pr _mm_load1_pd
156 #define gmx_set1_pr _mm_set1_pd
157 #define gmx_setzero_pr _mm_setzero_pd
158 #define gmx_store_pr _mm_store_pd
159 #define gmx_storeu_pr _mm_storeu_pd
/* Arithmetic, comparison and bitwise operations. */
161 #define gmx_add_pr _mm_add_pd
162 #define gmx_sub_pr _mm_sub_pd
163 #define gmx_mul_pr _mm_mul_pd
164 #define gmx_max_pr _mm_max_pd
165 #define gmx_cmplt_pr _mm_cmplt_pd
166 #define gmx_and_pr _mm_and_pd
167 #define gmx_or_pr _mm_or_pd
168 #define gmx_andnot_pr _mm_andnot_pd
/* Per the note near the top of this file, floor and blend are not available
 * with SSE2-only acceleration.
 */
170 #define gmx_floor_pr _mm_floor_pd
171 #define gmx_blendv_pr _mm_blendv_pd
/* Mask extraction, integer-to-double register cast and double<->int32
 * conversions.
 * NOTE(review): a 128-bit register holds two doubles but four int32 values,
 * so these conversions only involve two of the integer elements - confirm
 * callers of gmx_cvttpr_epi32 / gmx_cvtepi32_pr account for that.
 */
173 #define gmx_movemask_pr _mm_movemask_pd
175 #define gmx_mm_castsi128_pr gmx_mm_castsi128_pd
177 #define gmx_cvttpr_epi32 _mm_cvttpd_epi32
178 #define gmx_cvtepi32_pr _mm_cvtepi32_pd
/* The gmx_mm_* targets below are not defined in the visible part of this
 * file; presumably they are supplied by the header included above - confirm.
 */
180 #define gmx_invsqrt_pr gmx_mm_invsqrt_pd
181 #define gmx_calc_rsq_pr gmx_mm_calc_rsq_pd
182 #define gmx_sum4_pr gmx_mm_sum4_pd
/* PME correction helpers; presumably F is the force and V the potential
 * variant - confirm.
 */
184 #define gmx_pmecorrF_pr gmx_mm_pmecorrF_pd
185 #define gmx_pmecorrV_pr gmx_mm_pmecorrV_pd
/* Closes the single/double precision selection, then the 128-bit section. */
187 #endif /* ifndef GMX_DOUBLE */
189 #endif /* GMX_MM128_HERE */
/* 256-bit section: maps the generic gmx_*_pr names onto 256-bit wide
 * _mm256_* intrinsics and gmx_mm256_* helpers.
 */
191 #ifdef GMX_MM256_HERE
/* Integer SIMD type that accompanies the 256-bit floating-point types. */
193 #define gmx_epi32 __m256i
/* Single-precision branch (_ps names).
 * NOTE(review): no "#ifndef GMX_DOUBLE" / "#else" pair separating this branch
 * from the double-precision one is visible in this section - confirm it is
 * present in the complete file.
 */
197 #include "gmx_x86_simd_single.h"
/* Elements per SIMD register: eight floats in a 256-bit __m256. */
199 #define GMX_X86_SIMD_WIDTH_HERE 8
/* Floating-point SIMD type for this width and precision. */
201 #define gmx_mm_pr __m256
/* Loads, broadcasts, zeroing and stores (storeu is the unaligned store).
 * gmx_load1_pr takes a pointer, like the 128-bit version, and is implemented
 * by broadcasting the first element it points to with set1.
 */
203 #define gmx_load_pr _mm256_load_ps
204 #define gmx_load1_pr(x) _mm256_set1_ps((x)[0])
205 #define gmx_set1_pr _mm256_set1_ps
206 #define gmx_setzero_pr _mm256_setzero_ps
207 #define gmx_store_pr _mm256_store_ps
208 #define gmx_storeu_pr _mm256_storeu_ps
/* Arithmetic operations. */
210 #define gmx_add_pr _mm256_add_ps
211 #define gmx_sub_pr _mm256_sub_ps
212 #define gmx_mul_pr _mm256_mul_ps
213 #define gmx_max_pr _mm256_max_ps
/* Comparisons go through the generic _mm256_cmp_ps with a predicate
 * immediate; 0x0c and 0x11 are the values the AVX intrinsics header names
 * _CMP_NEQ_OQ and _CMP_LT_OQ.
 */
214 /* Not-equal (ordered, non-signaling) */
215 #define gmx_cmpneq_pr(x,y) _mm256_cmp_ps(x,y,0x0c)
216 /* Less-than (ordered, non-signaling) */
217 #define gmx_cmplt_pr(x,y) _mm256_cmp_ps(x,y,0x11)
/* Bitwise operations. */
218 #define gmx_and_pr _mm256_and_ps
219 #define gmx_or_pr _mm256_or_ps
220 #define gmx_andnot_pr _mm256_andnot_ps
/* Rounding down and mask-controlled blend. */
222 #define gmx_floor_pr _mm256_floor_ps
223 #define gmx_blendv_pr _mm256_blendv_ps
/* Mask extraction, integer-to-float register cast and float->int32
 * truncating conversion. The cast macro is named gmx_mm_castsi256_pr here,
 * not gmx_mm_castsi128_pr as in the 128-bit section.
 * NOTE(review): no gmx_cvtepi32_pr definition is visible in this branch,
 * unlike the 128-bit one - confirm against the complete file.
 */
225 #define gmx_movemask_pr _mm256_movemask_ps
227 #define gmx_mm_castsi256_pr _mm256_castsi256_ps
229 #define gmx_cvttpr_epi32 _mm256_cvttps_epi32
/* The gmx_mm256_* targets below are not defined in the visible part of this
 * file; presumably they are supplied by the header included above - confirm.
 */
231 #define gmx_invsqrt_pr gmx_mm256_invsqrt_ps
232 #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_ps
233 #define gmx_sum4_pr gmx_mm256_sum4_ps
/* PME correction helpers; presumably F is the force and V the potential
 * variant - confirm.
 */
235 #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
236 #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
/* Double-precision mapping of the 256-bit section (_pd names).
 * NOTE(review): the "#else" that should separate this from the
 * single-precision definitions, and the matching "#endif", are not visible
 * here - confirm they are present in the complete file.
 */
240 #include "gmx_x86_simd_double.h"
/* Elements per SIMD register: four doubles in a 256-bit __m256d. */
242 #define GMX_X86_SIMD_WIDTH_HERE 4
/* Floating-point SIMD type for this width and precision. */
244 #define gmx_mm_pr __m256d
/* Loads, broadcasts, zeroing and stores (storeu is the unaligned store).
 * gmx_load1_pr takes a pointer and broadcasts the first element it points to
 * with set1.
 */
246 #define gmx_load_pr _mm256_load_pd
247 #define gmx_load1_pr(x) _mm256_set1_pd((x)[0])
248 #define gmx_set1_pr _mm256_set1_pd
249 #define gmx_setzero_pr _mm256_setzero_pd
250 #define gmx_store_pr _mm256_store_pd
251 #define gmx_storeu_pr _mm256_storeu_pd
/* Arithmetic operations. */
253 #define gmx_add_pr _mm256_add_pd
254 #define gmx_sub_pr _mm256_sub_pd
255 #define gmx_mul_pr _mm256_mul_pd
256 #define gmx_max_pr _mm256_max_pd
/* Comparisons go through the generic _mm256_cmp_pd with a predicate
 * immediate; 0x0c and 0x11 are the values the AVX intrinsics header names
 * _CMP_NEQ_OQ and _CMP_LT_OQ.
 */
257 /* Not-equal (ordered, non-signaling) */
258 #define gmx_cmpneq_pr(x,y) _mm256_cmp_pd(x,y,0x0c)
259 /* Less-than (ordered, non-signaling) */
260 #define gmx_cmplt_pr(x,y) _mm256_cmp_pd(x,y,0x11)
/* Bitwise operations. */
261 #define gmx_and_pr _mm256_and_pd
262 #define gmx_or_pr _mm256_or_pd
263 #define gmx_andnot_pr _mm256_andnot_pd
/* Rounding down and mask-controlled blend. */
265 #define gmx_floor_pr _mm256_floor_pd
266 #define gmx_blendv_pr _mm256_blendv_pd
/* Mask extraction, integer-to-double register cast and double->int32
 * truncating conversion.
 * NOTE(review): _mm256_cvttpd_epi32 returns a 128-bit __m128i (four int32),
 * whereas gmx_epi32 is __m256i in this section - confirm callers do not
 * store the result in a gmx_epi32.
 * NOTE(review): no gmx_cvtepi32_pr definition is visible in this branch,
 * unlike the 128-bit one - confirm against the complete file.
 */
268 #define gmx_movemask_pr _mm256_movemask_pd
270 #define gmx_mm_castsi256_pr _mm256_castsi256_pd
272 #define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
/* The gmx_mm256_* targets below are not defined in the visible part of this
 * file; presumably they are supplied by the header included above - confirm.
 */
274 #define gmx_invsqrt_pr gmx_mm256_invsqrt_pd
275 #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_pd
276 #define gmx_sum4_pr gmx_mm256_sum4_pd
/* PME correction helpers; presumably F is the force and V the potential
 * variant - confirm.
 */
278 #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_pd
279 #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_pd
/* Closes the 256-bit section. */
283 #endif /* GMX_MM256_HERE */