/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
38 #ifndef _gmx_simd_ref_h_
39 #define _gmx_simd_ref_h_
41 /* This file contains a reference plain-C implementation of arbitrary width.
42 * This code is only useful for testing and documentation.
43 * The SIMD width is set by defining GMX_SIMD_REF_WIDTH before including.
47 #ifndef GMX_SIMD_REF_WIDTH
48 #error "GMX_SIMD_REF_WIDTH should be defined before including gmx_simd_ref.h"
53 /* float/double SIMD register type */
55 real r[GMX_SIMD_REF_WIDTH];
58 /* boolean SIMD register type */
60 char r[GMX_SIMD_REF_WIDTH];
63 /* integer SIMD register type, only for table indexing and exclusion masks */
65 int r[GMX_SIMD_REF_WIDTH];
67 #define GMX_SIMD_REF_EPI32_WIDTH GMX_SIMD_REF_WIDTH
69 /* Load GMX_SIMD_REF_WIDTH reals for memory starting at r */
70 static gmx_inline gmx_simd_ref_pr
71 gmx_simd_ref_load_pr(const real *r)
76 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
84 /* Set all SIMD register elements to *r */
85 static gmx_inline gmx_simd_ref_pr
86 gmx_simd_ref_load1_pr(const real *r)
91 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
99 /* Set all SIMD register elements to r */
100 static gmx_inline gmx_simd_ref_pr
101 gmx_simd_ref_set1_pr(real r)
106 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
114 /* Set all SIMD register elements to 0 */
115 static gmx_inline gmx_simd_ref_pr
116 gmx_simd_ref_setzero_pr()
121 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
129 static gmx_inline void
130 gmx_simd_ref_store_pr(real *dest, gmx_simd_ref_pr src)
134 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
140 static gmx_inline gmx_simd_ref_pr
141 gmx_simd_ref_add_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
146 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
148 c.r[i] = a.r[i] + b.r[i];
154 static gmx_inline gmx_simd_ref_pr
155 gmx_simd_ref_sub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
160 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
162 c.r[i] = a.r[i] - b.r[i];
168 static gmx_inline gmx_simd_ref_pr
169 gmx_simd_ref_mul_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
174 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
176 c.r[i] = a.r[i]*b.r[i];
182 static gmx_inline gmx_simd_ref_pr
183 gmx_simd_ref_madd_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
188 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
190 d.r[i] = a.r[i]*b.r[i] + c.r[i];
196 static gmx_inline gmx_simd_ref_pr
197 gmx_simd_ref_nmsub_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
202 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
204 d.r[i] = -a.r[i]*b.r[i] + c.r[i];
210 static gmx_inline gmx_simd_ref_pr
211 gmx_simd_ref_max_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
216 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
218 c.r[i] = (a.r[i] >= b.r[i] ? a.r[i] : b.r[i]);
224 static gmx_inline gmx_simd_ref_pr
225 gmx_simd_ref_blendzero_pr(gmx_simd_ref_pr a, gmx_simd_ref_pb b)
230 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
232 c.r[i] = (b.r[i] ? a.r[i] : 0.0);
238 /* Note that this reference implementation rounds away from zero,
239 * whereas most SIMD intrinsics will round to nearest even.
240 * Since this function is only used for periodic image calculations,
241 * the rounding of mantissas close to 0.5 is irrelevant.
243 static gmx_inline gmx_simd_ref_pr
244 gmx_simd_ref_round_pr(gmx_simd_ref_pr a)
249 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
252 b.r[i] = round(a.r[i]);
254 b.r[i] = roundf(a.r[i]);
261 /* Not required, only used to speed up the nbnxn tabulated PME kernels */
262 static gmx_inline gmx_simd_ref_pr
263 gmx_simd_ref_floor_pr(gmx_simd_ref_pr a)
268 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
271 b.r[i] = floor(a.r[i]);
273 b.r[i] = floorf(a.r[i]);
280 /* Not required, only used when blendv is faster than comparison */
281 static gmx_inline gmx_simd_ref_pr
282 gmx_simd_ref_blendv_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
287 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
289 d.r[i] = (c.r[i] >= 0) ? a.r[i] : b.r[i];
295 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
296 static gmx_inline gmx_simd_ref_pr
297 gmx_simd_ref_cpsgn_nonneg_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
302 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
304 c.r[i] = (a.r[i] >= 0) ? b.r[i] : -b.r[i];
310 /* Very specific operation required in the non-bonded kernels */
311 static gmx_inline gmx_simd_ref_pr
312 gmx_simd_ref_masknot_add_pr(gmx_simd_ref_pb a, gmx_simd_ref_pr b, gmx_simd_ref_pr c)
317 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
319 d.r[i] = a.r[i] ? b.r[i] : b.r[i] + c.r[i];
326 static gmx_inline gmx_simd_ref_pb
327 gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
332 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
334 c.r[i] = (a.r[i] < b.r[i]);
340 /* Logical AND on SIMD booleans */
341 static gmx_inline gmx_simd_ref_pb
342 gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
347 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
349 c.r[i] = (a.r[i] && b.r[i]);
355 /* Logical OR on SIMD booleans */
356 static gmx_inline gmx_simd_ref_pb
357 gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
362 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
364 c.r[i] = (a.r[i] || b.r[i]);
370 /* Not required, gmx_anytrue_pb(x) returns if any of the boolean is x is True.
371 * If this is not present, define GMX_SIMD_IS_TRUE(real x),
372 * which should return x==True, where True is True as defined in SIMD.
374 static gmx_inline int
375 gmx_simd_ref_anytrue_pb(gmx_simd_ref_pb a)
381 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
392 /* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
393 static gmx_inline void
394 gmx_simd_ref_store_pb(real *dest, gmx_simd_ref_pb src)
398 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
405 /* For topology exclusion pair checking we need: ((a & b) ? True : False)
406 * when we do a bit-wise and between a and b.
407 * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
408 * Otherwise we do all operations, except for the set1, in reals.
411 /* Integer set and cast are only used for nbnxn exclusion masks */
412 static gmx_inline gmx_simd_ref_epi32
413 gmx_simd_ref_set1_epi32(int src)
415 gmx_simd_ref_epi32 a;
418 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
426 static gmx_inline gmx_simd_ref_epi32
427 gmx_simd_ref_load_si(const int *src)
429 gmx_simd_ref_epi32 a;
432 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
440 /* If the same bit is set in both input masks, return TRUE, else FALSE.
441 * This function is only called with a single bit set in b.
443 static gmx_inline gmx_simd_ref_pb
444 gmx_simd_ref_checkbitmask_epi32(gmx_simd_ref_epi32 a, gmx_simd_ref_epi32 b)
449 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
451 c.r[i] = ((a.r[i] & b.r[i]) != 0);
458 /* Conversions only used for PME table lookup */
459 static gmx_inline gmx_simd_ref_epi32
460 gmx_simd_ref_cvttpr_epi32(gmx_simd_ref_pr a)
462 gmx_simd_ref_epi32 b;
465 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
467 b.r[i] = (int)a.r[i];
473 /* These two function only need to be approximate, Newton-Raphson iteration
474 * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
476 static gmx_inline gmx_simd_ref_pr
477 gmx_simd_ref_rsqrt_pr(gmx_simd_ref_pr a)
482 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
485 b.r[i] = 1.0/sqrt(a.r[i]);
487 b.r[i] = 1.0/sqrtf(a.r[i]);
494 static gmx_inline gmx_simd_ref_pr
495 gmx_simd_ref_rcp_pr(gmx_simd_ref_pr a)
500 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
508 static gmx_inline gmx_simd_ref_pr
509 gmx_simd_ref_exp_pr(gmx_simd_ref_pr a)
514 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
517 b.r[i] = exp(a.r[i]);
519 b.r[i] = expf(a.r[i]);
526 static gmx_inline gmx_simd_ref_pr
527 gmx_simd_ref_sqrt_pr(gmx_simd_ref_pr a)
532 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
535 b.r[i] = sqrt(a.r[i]);
537 b.r[i] = sqrtf(a.r[i]);
544 static gmx_inline int
545 gmx_simd_ref_sincos_pr(gmx_simd_ref_pr a,
546 gmx_simd_ref_pr *s, gmx_simd_ref_pr *c)
550 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
552 s->r[i] = sin(a.r[i]);
553 c->r[i] = cos(a.r[i]);
559 static gmx_inline gmx_simd_ref_pr
560 gmx_simd_ref_acos_pr(gmx_simd_ref_pr a)
565 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
567 b.r[i] = acos(a.r[i]);
573 static gmx_inline gmx_simd_ref_pr
574 gmx_simd_ref_atan2_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
579 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
581 c.r[i] = atan2(a.r[i], b.r[i]);
587 #endif /* _gmx_simd_ref_h_ */