/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* The macros in this file are intended to be used for writing
 * architecture-independent SIMD intrinsics code with a SIMD width of 4.
 * To support a new architecture, adding macros here should be all
 * that is needed.
 *
 * Note that this file is intended only for SIMD operations that require
 * a SIMD width of 4. In general gmx_simd_macros.h provides wider hardware
 * support, more functionality and higher performance, but the SIMD width
 * is not necessarily equal to 4.
 */

#ifdef _gmx_simd4_macros_h_
#error "gmx_simd4_macros.h included twice"
#else
#define _gmx_simd4_macros_h_


/* The SIMD width here is always 4, since that is the whole point */
#define GMX_SIMD4_WIDTH  4
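
/* Usage sketch (illustrative only; the function and variable names here
 * are hypothetical, not part of this header): with the macros defined
 * below, a 4-wide y += a*x kernel can be written once and compiled for
 * any supported architecture:
 *
 *     static void axpy4(gmx_simd4_real a, const gmx_simd4_real *x,
 *                       gmx_simd4_real *y)
 *     {
 *         gmx_simd4_pr ax = gmx_simd4_load_pr(x);
 *         gmx_simd4_pr ay = gmx_simd4_load_pr(y);
 *
 *         ay = gmx_simd4_madd_pr(gmx_simd4_set1_pr(a), ax, ay);
 *         gmx_simd4_store_pr(y, ay);
 *     }
 *
 * x and y are assumed to be SIMD4-aligned; see gmx_simd4_align_real at
 * the end of this file.
 */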

#if defined GMX_SIMD4_SINGLE || defined GMX_SIMD4_DOUBLE
/* Precision set before inclusion, honour that request */
#else
/* Match precision to the Gromacs real precision */
#ifdef GMX_DOUBLE
#define GMX_SIMD4_DOUBLE
#else
#define GMX_SIMD4_SINGLE
#endif
#endif

#ifdef GMX_SIMD4_DOUBLE
typedef double  gmx_simd4_real;
#endif
#ifdef GMX_SIMD4_SINGLE
typedef float   gmx_simd4_real;
#endif

/* Uncomment the next line, without other SIMD active, for testing plain-C */
/* #define GMX_SIMD4_REFERENCE_PLAIN_C */
#ifdef GMX_SIMD4_REFERENCE_PLAIN_C
/* Plain-C SIMD reference implementation, also serves as documentation */
#define GMX_HAVE_SIMD4_MACROS

/* Include the plain-C reference implementation */
#include "gmx_simd4_ref.h"

/* float/double SIMD register type */
#define gmx_simd4_pr  gmx_simd4_ref_pr

/* boolean SIMD register type */
#define gmx_simd4_pb  gmx_simd4_ref_pb

#define gmx_simd4_load_pr       gmx_simd4_ref_load_pr
#define gmx_simd4_load_bb_pr    gmx_simd4_ref_load_pr
#define gmx_simd4_set1_pr       gmx_simd4_ref_set1_pr
#define gmx_simd4_setzero_pr    gmx_simd4_ref_setzero_pr
#define gmx_simd4_store_pr      gmx_simd4_ref_store_pr

/* Unaligned load+store are not required,
 * but they can speed up the PME spread+gather operations.
 */
#define GMX_SIMD4_HAVE_UNALIGNED
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      gmx_simd4_ref_load_pr
#define gmx_simd4_storeu_pr     gmx_simd4_ref_store_pr
#endif

#define gmx_simd4_add_pr        gmx_simd4_ref_add_pr
#define gmx_simd4_sub_pr        gmx_simd4_ref_sub_pr
#define gmx_simd4_mul_pr        gmx_simd4_ref_mul_pr
/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
#define gmx_simd4_madd_pr       gmx_simd4_ref_madd_pr
#define gmx_simd4_nmsub_pr      gmx_simd4_ref_nmsub_pr
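
/* Illustration of the c=d advice above (hypothetical accumulator name):
 *
 *     sum = gmx_simd4_madd_pr(a, b, sum);
 *
 * Here the accumulator is both the input c and the output d, which lets
 * a destructive FMA3 instruction set implement the whole operation in a
 * single instruction instead of a multiply plus a separate add or move.
 */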

#define gmx_simd4_dotproduct3   gmx_simd4_ref_dotproduct3

#define gmx_simd4_min_pr        gmx_simd4_ref_min_pr
#define gmx_simd4_max_pr        gmx_simd4_ref_max_pr

#define gmx_simd4_blendzero_pr  gmx_simd4_ref_blendzero_pr

/* Comparison */
#define gmx_simd4_cmplt_pr      gmx_simd4_ref_cmplt_pr

/* Logical operations on SIMD booleans */
#define gmx_simd4_and_pb        gmx_simd4_ref_and_pb
#define gmx_simd4_or_pb         gmx_simd4_ref_or_pb

/* Returns a single int (0/1) which tells if any of the 4 booleans is True */
#define gmx_simd4_anytrue_pb    gmx_simd4_ref_anytrue_pb
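
/* Typical pattern (sketch with hypothetical register names): compute a
 * boolean register with cmplt and skip work when nothing is in range:
 *
 *     gmx_simd4_pb mask = gmx_simd4_cmplt_pr(r2, rcut2);
 *     if (gmx_simd4_anytrue_pb(mask))
 *     {
 *         ...
 *     }
 */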

#endif /* GMX_SIMD4_REFERENCE_PLAIN_C */


/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 * into instructions for) different SIMD widths and float precisions.
 *
 * On x86: The gmx_simd4 prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 * Compiler settings will decide if 128-bit intrinsics will
 * be translated into SSE or AVX instructions.
 */


#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */

#ifdef GMX_SIMD4_SINGLE
#define GMX_HAVE_SIMD4_MACROS
#endif

#ifdef GMX_SIMD4_DOUBLE
/* Note that here we will use 256-bit SIMD with GMX_X86_AVX_128_FMA.
 * This is inconsistent naming-wise, but should give the best performance.
 */
#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256
#define GMX_HAVE_SIMD4_MACROS
#endif
#endif

#ifdef GMX_HAVE_SIMD4_MACROS

#if defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256

#include <immintrin.h>
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h>  /* FMA */
#endif
#ifdef HAVE_INTRIN_H
#include <intrin.h>  /* FMA MSVC */
#endif

#else
#ifdef GMX_X86_SSE4_1
#include <smmintrin.h>
#else
/* We only have SSE2 */
#include <emmintrin.h>
#endif
#endif

#ifdef GMX_SIMD4_SINGLE

#define gmx_simd4_pr  __m128

#define gmx_simd4_pb  __m128

#define gmx_simd4_load_pr       _mm_load_ps
#define gmx_simd4_load_bb_pr    _mm_load_ps
#define gmx_simd4_set1_pr       _mm_set1_ps
#define gmx_simd4_setzero_pr    _mm_setzero_ps
#define gmx_simd4_store_pr      _mm_store_ps

/* Some old AMD processors could have problems with unaligned loads+stores */
#ifndef GMX_FAHCORE
#define GMX_SIMD4_HAVE_UNALIGNED
#endif
#ifdef GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      _mm_loadu_ps
#define gmx_simd4_storeu_pr     _mm_storeu_ps
#endif

#define gmx_simd4_add_pr        _mm_add_ps
#define gmx_simd4_sub_pr        _mm_sub_ps
#define gmx_simd4_mul_pr        _mm_mul_ps

#ifdef GMX_X86_AVX_128_FMA
#define gmx_simd4_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
#else
#define gmx_simd4_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
#define gmx_simd4_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
#endif
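
/* Note that the fallback madd/nmsub macros above use a separate multiply
 * and add, so the intermediate product is rounded, unlike with a true
 * fused multiply-add; results can differ in the last bit from FMA hardware.
 */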

static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b)
#ifdef GMX_X86_SSE4_1
{
    float dp;

    /* SSE4.1 dot product of components 0,1,2, stored in component 0 */
    _mm_store_ss(&dp, _mm_dp_ps(a, b, 0x71));

    return dp;
}
#else
{
    float dp_array[7], *dp;

    /* Generate a 16-byte aligned pointer into dp_array; the 3 extra
     * elements guarantee that 4 aligned floats fit within the array.
     */
    dp = (float *)(((size_t)(dp_array+3)) & (~((size_t)15)));

    _mm_store_ps(dp, _mm_mul_ps(a, b));

    return dp[0] + dp[1] + dp[2];
}
#endif
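
/* Usage sketch (hypothetical values): the dot product of the first three
 * components, e.g. for two 3D vectors padded to SIMD4 width:
 *
 *     __m128 x = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);  (lanes {1,2,3,0})
 *     __m128 y = _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f);  (lanes {1,1,1,0})
 *     float  d = gmx_simd4_dotproduct3(x, y);         (1+2+3 = 6.0f)
 */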

#define gmx_simd4_min_pr        _mm_min_ps
#define gmx_simd4_max_pr        _mm_max_ps

#define gmx_simd4_blendzero_pr  _mm_and_ps

#define gmx_simd4_cmplt_pr      _mm_cmplt_ps
#define gmx_simd4_and_pb        _mm_and_ps
#define gmx_simd4_or_pb         _mm_or_ps

#define gmx_simd4_anytrue_pb    _mm_movemask_ps
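
/* SSE comparisons return all-ones or all-zeros per lane, so a bitwise
 * AND with a boolean register implements blendzero, and _mm_movemask_ps
 * collects the four sign bits into an int that is non-zero iff any of
 * the 4 SIMD booleans is true.
 */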

#endif /* GMX_SIMD4_SINGLE */


#ifdef GMX_SIMD4_DOUBLE

#define gmx_simd4_pr  __m256d

#define gmx_simd4_pb  __m256d

#define gmx_simd4_load_pr       _mm256_load_pd
#define gmx_simd4_load_bb_pr    _mm256_load_pd
#define gmx_simd4_set1_pr       _mm256_set1_pd
#define gmx_simd4_setzero_pr    _mm256_setzero_pd
#define gmx_simd4_store_pr      _mm256_store_pd

#define GMX_SIMD4_HAVE_UNALIGNED
#define gmx_simd4_loadu_pr      _mm256_loadu_pd
#define gmx_simd4_storeu_pr     _mm256_storeu_pd

#define gmx_simd4_add_pr        _mm256_add_pd
#define gmx_simd4_sub_pr        _mm256_sub_pd
#define gmx_simd4_mul_pr        _mm256_mul_pd
#ifdef GMX_X86_AVX_128_FMA
#define gmx_simd4_madd_pr(a, b, c)   _mm256_macc_pd(a, b, c)
#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_nmacc_pd(a, b, c)
#else
#define gmx_simd4_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
#define gmx_simd4_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
#endif
#define gmx_simd4_min_pr        _mm256_min_pd
#define gmx_simd4_max_pr        _mm256_max_pd

#define gmx_simd4_blendzero_pr  _mm256_and_pd

/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_simd4_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
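/* 0x11 is the immintrin.h comparison predicate _CMP_LT_OQ
 * (less-than, ordered, quiet/non-signaling).
 */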
#define gmx_simd4_and_pb        _mm256_and_pd
#define gmx_simd4_or_pb         _mm256_or_pd

#define gmx_simd4_anytrue_pb    _mm256_movemask_pd

#endif /* GMX_SIMD4_DOUBLE */


#endif /* GMX_HAVE_SIMD4_MACROS */

#endif /* GMX_X86_SSE2 */

#ifdef GMX_CPU_ACCELERATION_IBM_QPX
/* i.e. BlueGene/Q */

/* This hack works on the compilers that can reach this code. A real
   solution with broader scope will be proposed in the master branch. */
#define gmx_always_inline __attribute__((always_inline))

#ifdef GMX_SIMD4_SINGLE
#define GMX_HAVE_SIMD4_MACROS
#endif

typedef vector4double gmx_simd4_pr;
typedef vector4double gmx_simd4_pb;

/* The declarations of vec_ld* use non-const pointers, and IBM
   can't/won't fix this any time soon. So GROMACS has to cast away the
   const-ness of its pointers before loads. Four-wide SIMD loads
   sometimes occur from variables of type real, and sometimes from
   variables of type float (even at double precision), so the correct
   cast cannot be done easily. The correct cast is necessary because
   the resulting type determines the alignment assumption of vec_ld*,
   which is different for float and double. So the loads of
   always-float variables have to be done with a function that does
   the correct cast. Since functions cannot be overloaded by type in
   C, they have to have different names. Thus we have
   gmx_simd4_load_pr and gmx_simd4_load_bb_pr. */

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a)
{
#ifdef NDEBUG
    return vec_ld(0, (real *) a);
#else
    return vec_lda(0, (real *) a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
{
#ifdef NDEBUG
    return vec_ld(0, (float *) a);
#else
    return vec_lda(0, (float *) a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a)
{
    return vec_splats(a);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr()
{
    return vec_splats(0.0);
}

/* TODO this will not yet work, because the function might be passed a
   pointer to a float when running in double precision.
 */
static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b)
{
#ifdef NDEBUG
    vec_st(b, 0, a);
#else
    vec_sta(b, 0, a);
#endif
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_add(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sub(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_mul(a, b);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
{
    return vec_madd(a, b, c);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
{
    return vec_nmsub(a, b, c);
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    /* Implemented the same way as max, but with the subtraction
       operands swapped. */
    return vec_sel(b, a, vec_sub(b, a));
}

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sel(b, a, vec_sub(a, b));
}
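
/* How the selection trick above works (implementation note): vec_sel
 * picks its second value operand in lanes where the selector is
 * non-negative, and its first elsewhere. For max, a - b >= 0 selects a;
 * for min the subtraction is reversed so b - a >= 0 selects a.
 * Behaviour with NaN inputs is not specified here.
 */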

static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_sel(gmx_simd4_setzero_pr(), a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b)
{
    return vec_cmplt(a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b)
{
    return vec_and(a, b);
}

static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b)
{
    return vec_or(a, b);
}

static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b)
{
    /* The dot product is done solely on the QPX AXU (which is the
       only available FPU). This is awkward, because pretty much no
       "horizontal" SIMD-vector operations exist, unlike x86 where
       SSE4.1 added various kinds of horizontal operations. So we have
       to make do with shifting vector elements and operating on the
       results. This makes for lots of data dependency, but the main
       alternative of storing to memory and reloading is not going to
       help, either. OpenMP over 2 or 4 hardware threads per core will
       hide much of the latency from the data dependency. The
       vec_extract() lets the compiler correctly use a floating-point
       comparison on the zeroth vector element, which avoids needing
       a round trip through memory to get at the scalar result. */

    gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b);
    gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
    gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
    gmx_simd4_pr dp                = vec_add(dp_shifted_left_2,
                                             vec_add(dp_shifted_left_0, dp_shifted_left_1));

    /* See comment in nbnxn_make_pairlist_part() about how this should
       be able to return a double on PowerPC. */
    return (float) vec_extract(dp, 0);
}
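
/* Lane-by-lane view of the reduction above (illustrative): with
 * p = a*b = {p0, p1, p2, p3}, vec_sldw with identical operands rotates
 * the words left, giving {p1, p2, p3, p0} and {p2, p3, p0, p1}. Lane 0
 * of the sum is then p0 + p1 + p2, exactly the 3-component dot product
 * that vec_extract(dp, 0) returns.
 */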

static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a)
{
    /* gmx_anytrue_pb comes from gmx_simd_macros.h */
    return gmx_anytrue_pb(a);
}

#undef gmx_always_inline

#endif /* GMX_CPU_ACCELERATION_IBM_QPX */


#ifdef GMX_HAVE_SIMD4_MACROS
/* Generic function to extract a SIMD4-aligned pointer from a pointer x.
 * x should have at least GMX_SIMD4_WIDTH=4 elements extra compared
 * to how many you want to use, to avoid indexing outside the aligned region.
 */

static gmx_inline gmx_simd4_real *
gmx_simd4_align_real(const gmx_simd4_real *x)
{
    return (gmx_simd4_real *)(((size_t)((x)+GMX_SIMD4_WIDTH)) & (~((size_t)(GMX_SIMD4_WIDTH*sizeof(gmx_simd4_real)-1))));
}
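
/* Usage sketch (hypothetical buffer and size): over-allocate by
 * GMX_SIMD4_WIDTH elements, then operate on the aligned interior:
 *
 *     gmx_simd4_real  buf[64 + GMX_SIMD4_WIDTH];
 *     gmx_simd4_real *abuf = gmx_simd4_align_real(buf);
 *
 *     gmx_simd4_store_pr(abuf, gmx_simd4_setzero_pr());
 */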
#endif /* GMX_HAVE_SIMD4_MACROS */


#endif /* _gmx_simd4_macros_h_ */