2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS Development Team
6 * Copyright (c) 2012, by the GROMACS development team, led by
7 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
8 * others, as listed in the AUTHORS file in the top-level source
9 * directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
38 /* The macros in this file are intended to be used for writing
39 * architecture-independent SIMD intrinsics code.
40 * To support a new architecture, adding macros here should be (nearly)
 * all that is needed.
 */
44 /* Undefine all defines used below so we can include this file multiple times
45 * with different settings from the same source file.
 */
48 /* NOTE: SSE2 acceleration does not include floor or blendv */
/* Reset section: every name #undef'ed here is (re)defined further down in
 * this file, so a repeated inclusion with a different width/precision
 * setting starts from a clean slate instead of redefining live macros.
 * NOTE(review): several group comments below have no #undef line visible
 * after them in this excerpt; confirm against the complete file that each
 * macro defined below is also reset here.
 */
50 #undef GMX_SIMD_WIDTH_HERE
54 /* float/double SIMD register type */
62 /* Only used for debugging */
74 /* Only used to speed up the nbnxn tabulated PME kernels */
76 /* Only used with x86 when blendv is faster than comparison */
79 #undef gmx_movemask_pr
81 /* Integer casts are only used for nbnxn x86 exclusion masks */
/* One cast name per register width: 128-bit and 256-bit. */
82 #undef gmx_mm_castsi128_pr
83 #undef gmx_mm_castsi256_pr
85 /* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
/* cvttpr = real -> int32 (maps to the cvtt* intrinsics below),
 * cvtepi32 = int32 -> real. */
86 #undef gmx_cvttpr_epi32
87 #undef gmx_cvtepi32_pr
90 #undef gmx_calc_rsq_pr
93 /* Only required for nbnxn analytical PME kernels */
94 #undef gmx_pmecorrF_pr
95 #undef gmx_pmecorrV_pr
98 /* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
/* Combines two half-width registers into one full-width register; only the
 * 256-bit single-precision section below defines it. */
109 #undef gmx_2hpr_to_pr
112 /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
113 * the same intrinsics, with defines, can be compiled for either 128 or 256
114 * bit wide SSE or AVX instructions.
115 * The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
116 * The _pr suffix is replaced by _ps or _pd (single or double precision).
117 * Note that compiler settings will decide if 128-bit intrinsics will
118 * be translated into SSE or AVX instructions.
 */
/* Configuration sanity checks: the including source must define exactly one
 * of GMX_MM128_HERE and GMX_MM256_HERE. Defining neither, or both, stops the
 * build with an explicit #error message.
 * NOTE(review): the #endif lines that close these two #if blocks are not
 * visible in this excerpt; confirm they are present in the complete file.
 */
121 #if !defined GMX_MM128_HERE && !defined GMX_MM256_HERE
122 #error "You should define GMX_MM128_HERE or GMX_MM256_HERE"
125 #if defined GMX_MM128_HERE && defined GMX_MM256_HERE
126 #error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
/* 128-bit variant, active when the includer defines GMX_MM128_HERE.
 * The generic gmx_*_pr names are mapped onto _mm_*_ps intrinsics in the
 * first half and onto _mm_*_pd intrinsics after the #else.
 * Names on the right-hand side that start with gmx_mm_ are not defined in
 * this file; presumably they come from the gmx_x86_simd_single.h /
 * gmx_x86_simd_double.h headers included below (verify).
 * NOTE(review): the #else and #endif markers below mention GMX_DOUBLE, but
 * the directive that opens that conditional is not visible in this excerpt.
 */
132 #ifdef GMX_MM128_HERE
/* Integer vector type used together with the 128-bit real types. */
134 #define gmx_epi32 __m128i
/* ---- _ps (float) mappings ---- */
138 #include "gmx_x86_simd_single.h"
/* 4 matches the number of 32-bit floats held by one __m128. */
140 #define GMX_SIMD_WIDTH_HERE 4
/* Real-valued SIMD register type. */
142 #define gmx_mm_pr __m128
/* Load, broadcast, zero and store. */
144 #define gmx_load_pr _mm_load_ps
145 #define gmx_load1_pr _mm_load1_ps
146 #define gmx_set1_pr _mm_set1_ps
147 #define gmx_setzero_pr _mm_setzero_ps
148 #define gmx_store_pr _mm_store_ps
149 #define gmx_storeu_pr _mm_storeu_ps
/* Arithmetic, less-than comparison and bitwise logic. */
151 #define gmx_add_pr _mm_add_ps
152 #define gmx_sub_pr _mm_sub_ps
153 #define gmx_mul_pr _mm_mul_ps
154 #define gmx_max_pr _mm_max_ps
155 #define gmx_cmplt_pr _mm_cmplt_ps
156 #define gmx_and_pr _mm_and_ps
157 #define gmx_or_pr _mm_or_ps
158 #define gmx_andnot_pr _mm_andnot_ps
/* floor and blendv: see the NOTE near the top of this file stating that
 * SSE2 acceleration does not include these two. */
160 #define gmx_floor_pr _mm_floor_ps
161 #define gmx_blendv_pr _mm_blendv_ps
163 #define gmx_movemask_pr _mm_movemask_ps
/* Integer-to-real bit cast; maps to a gmx_ helper, not a raw intrinsic. */
165 #define gmx_mm_castsi128_pr gmx_mm_castsi128_ps
/* Conversions between real and 32-bit integer vectors (cvtt = truncating). */
167 #define gmx_cvttpr_epi32 _mm_cvttps_epi32
168 #define gmx_cvtepi32_pr _mm_cvtepi32_ps
/* Helper routines defined outside this file. */
170 #define gmx_invsqrt_pr gmx_mm_invsqrt_ps
171 #define gmx_calc_rsq_pr gmx_mm_calc_rsq_ps
172 #define gmx_sum4_pr gmx_mm_sum4_ps
174 #define gmx_pmecorrF_pr gmx_mm_pmecorrF_ps
175 #define gmx_pmecorrV_pr gmx_mm_pmecorrV_ps
/* ---- _pd (double) mappings ---- */
177 #else /* ifndef GMX_DOUBLE */
179 #include "gmx_x86_simd_double.h"
/* 2 matches the number of 64-bit doubles held by one __m128d. */
181 #define GMX_SIMD_WIDTH_HERE 2
/* Real-valued SIMD register type. */
183 #define gmx_mm_pr __m128d
/* Load, broadcast, zero and store. */
185 #define gmx_load_pr _mm_load_pd
186 #define gmx_load1_pr _mm_load1_pd
187 #define gmx_set1_pr _mm_set1_pd
188 #define gmx_setzero_pr _mm_setzero_pd
189 #define gmx_store_pr _mm_store_pd
190 #define gmx_storeu_pr _mm_storeu_pd
/* Arithmetic, less-than comparison and bitwise logic. */
192 #define gmx_add_pr _mm_add_pd
193 #define gmx_sub_pr _mm_sub_pd
194 #define gmx_mul_pr _mm_mul_pd
195 #define gmx_max_pr _mm_max_pd
196 #define gmx_cmplt_pr _mm_cmplt_pd
197 #define gmx_and_pr _mm_and_pd
198 #define gmx_or_pr _mm_or_pd
199 #define gmx_andnot_pr _mm_andnot_pd
/* floor and blendv: see the NOTE near the top of this file stating that
 * SSE2 acceleration does not include these two. */
201 #define gmx_floor_pr _mm_floor_pd
202 #define gmx_blendv_pr _mm_blendv_pd
204 #define gmx_movemask_pr _mm_movemask_pd
/* Integer-to-real bit cast; maps to a gmx_ helper, not a raw intrinsic. */
206 #define gmx_mm_castsi128_pr gmx_mm_castsi128_pd
/* Conversions between real and 32-bit integer vectors (cvtt = truncating). */
208 #define gmx_cvttpr_epi32 _mm_cvttpd_epi32
209 #define gmx_cvtepi32_pr _mm_cvtepi32_pd
/* Helper routines defined outside this file. */
211 #define gmx_invsqrt_pr gmx_mm_invsqrt_pd
212 #define gmx_calc_rsq_pr gmx_mm_calc_rsq_pd
213 #define gmx_sum4_pr gmx_mm_sum4_pd
215 #define gmx_pmecorrF_pr gmx_mm_pmecorrF_pd
216 #define gmx_pmecorrV_pr gmx_mm_pmecorrV_pd
218 #endif /* ifndef GMX_DOUBLE */
220 #endif /* GMX_MM128_HERE */
/* 256-bit variant, active when the includer defines GMX_MM256_HERE.
 * The generic gmx_*_pr names are mapped onto _mm256_*_ps intrinsics first
 * and onto _mm256_*_pd intrinsics after the second #include below.
 * Names on the right-hand side that start with gmx_mm256_ are not defined
 * in this file; presumably they come from the included gmx_x86_simd_*.h
 * headers (verify).
 * NOTE(review): the directives that separate the _ps part from the _pd part
 * are not visible in this excerpt; confirm them in the complete file.
 */
222 #ifdef GMX_MM256_HERE
/* Integer vector type used together with the 256-bit real types. */
224 #define gmx_epi32 __m256i
/* ---- _ps (float) mappings ---- */
228 #include "gmx_x86_simd_single.h"
/* 8 matches the number of 32-bit floats held by one __m256. */
230 #define GMX_SIMD_WIDTH_HERE 8
/* Real-valued SIMD register type. */
232 #define gmx_mm_pr __m256
/* Load, broadcast, zero and store. gmx_load1_pr is a function-like macro
 * here: it passes the first element addressed by x to the set1 intrinsic. */
234 #define gmx_load_pr _mm256_load_ps
235 #define gmx_load1_pr(x) _mm256_set1_ps((x)[0])
236 #define gmx_set1_pr _mm256_set1_ps
237 #define gmx_setzero_pr _mm256_setzero_ps
238 #define gmx_store_pr _mm256_store_ps
239 #define gmx_storeu_pr _mm256_storeu_ps
/* Arithmetic, comparisons and bitwise logic. Comparisons go through the
 * generic _mm256_cmp_ps intrinsic with an immediate predicate code. */
241 #define gmx_add_pr _mm256_add_ps
242 #define gmx_sub_pr _mm256_sub_ps
243 #define gmx_mul_pr _mm256_mul_ps
244 #define gmx_max_pr _mm256_max_ps
245 /* Not-equal (ordered, non-signaling); 0x0c is the value of _CMP_NEQ_OQ */
246 #define gmx_cmpneq_pr(x, y) _mm256_cmp_ps(x, y, 0x0c)
247 /* Less-than (ordered, non-signaling); 0x11 is the value of _CMP_LT_OQ */
248 #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
249 #define gmx_and_pr _mm256_and_ps
250 #define gmx_or_pr _mm256_or_ps
251 #define gmx_andnot_pr _mm256_andnot_ps
253 #define gmx_floor_pr _mm256_floor_ps
254 #define gmx_blendv_pr _mm256_blendv_ps
256 #define gmx_movemask_pr _mm256_movemask_ps
/* Integer-to-real bit cast, mapped directly to the AVX intrinsic. */
258 #define gmx_mm_castsi256_pr _mm256_castsi256_ps
/* Truncating real -> 32-bit integer conversion. */
260 #define gmx_cvttpr_epi32 _mm256_cvttps_epi32
/* Helper routines defined outside this file. */
262 #define gmx_invsqrt_pr gmx_mm256_invsqrt_ps
263 #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_ps
264 #define gmx_sum4_pr gmx_mm256_sum4_ps
266 #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_ps
267 #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_ps
/* Maps to the helper gmx_mm256_load4_ps, which is defined elsewhere. */
269 #define gmx_loaddh_pr gmx_mm256_load4_ps
271 /* Half SIMD-width type */
272 #define gmx_mm_hpr __m128
274 /* Half SIMD-width macros: these reuse the 128-bit _mm_*_ps intrinsics */
275 #define gmx_load_hpr _mm_load_ps
276 #define gmx_load1_hpr(x) _mm_set1_ps((x)[0])
277 #define gmx_store_hpr _mm_store_ps
278 #define gmx_add_hpr _mm_add_ps
279 #define gmx_sub_hpr _mm_sub_ps
281 #define gmx_sum4_hpr gmx_mm256_sum4h_m128
283 /* Conversion between half and full SIMD-width */
284 #define gmx_2hpr_to_pr gmx_mm256_set_m128
/* ---- _pd (double) mappings ---- */
288 #include "gmx_x86_simd_double.h"
/* 4 matches the number of 64-bit doubles held by one __m256d. */
290 #define GMX_SIMD_WIDTH_HERE 4
/* Real-valued SIMD register type. */
292 #define gmx_mm_pr __m256d
/* Load, broadcast, zero and store. gmx_load1_pr is a function-like macro
 * here: it passes the first element addressed by x to the set1 intrinsic. */
294 #define gmx_load_pr _mm256_load_pd
295 #define gmx_load1_pr(x) _mm256_set1_pd((x)[0])
296 #define gmx_set1_pr _mm256_set1_pd
297 #define gmx_setzero_pr _mm256_setzero_pd
298 #define gmx_store_pr _mm256_store_pd
299 #define gmx_storeu_pr _mm256_storeu_pd
/* Arithmetic, comparisons and bitwise logic. Comparisons go through the
 * generic _mm256_cmp_pd intrinsic with an immediate predicate code. */
301 #define gmx_add_pr _mm256_add_pd
302 #define gmx_sub_pr _mm256_sub_pd
303 #define gmx_mul_pr _mm256_mul_pd
304 #define gmx_max_pr _mm256_max_pd
305 /* Not-equal (ordered, non-signaling); 0x0c is the value of _CMP_NEQ_OQ */
306 #define gmx_cmpneq_pr(x, y) _mm256_cmp_pd(x, y, 0x0c)
307 /* Less-than (ordered, non-signaling); 0x11 is the value of _CMP_LT_OQ */
308 #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
309 #define gmx_and_pr _mm256_and_pd
310 #define gmx_or_pr _mm256_or_pd
311 #define gmx_andnot_pr _mm256_andnot_pd
313 #define gmx_floor_pr _mm256_floor_pd
314 #define gmx_blendv_pr _mm256_blendv_pd
316 #define gmx_movemask_pr _mm256_movemask_pd
/* Integer-to-real bit cast, mapped directly to the AVX intrinsic. */
318 #define gmx_mm_castsi256_pr _mm256_castsi256_pd
/* Truncating real -> 32-bit integer conversion.
 * NOTE(review): per Intel's documentation _mm256_cvttpd_epi32 returns a
 * 128-bit integer vector, while gmx_epi32 is __m256i in this section;
 * confirm that callers do not store this result in a gmx_epi32. */
320 #define gmx_cvttpr_epi32 _mm256_cvttpd_epi32
/* Helper routines defined outside this file. */
322 #define gmx_invsqrt_pr gmx_mm256_invsqrt_pd
323 #define gmx_calc_rsq_pr gmx_mm256_calc_rsq_pd
324 #define gmx_sum4_pr gmx_mm256_sum4_pd
326 #define gmx_pmecorrF_pr gmx_mm256_pmecorrF_pd
327 #define gmx_pmecorrV_pr gmx_mm256_pmecorrV_pd
331 #endif /* GMX_MM256_HERE */
333 #endif /* GMX_X86_SSE2 */