2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _nbnxn_kernel_simd_utils_ref_h_
36 #define _nbnxn_kernel_simd_utils_ref_h_
39 #include "gromacs/simd/simd_math.h"
41 typedef gmx_simd_int32_t gmx_simd_ref_exclfilter;
42 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
43 static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
45 /* Set the stride for the lookup of the two LJ parameters from their
46 (padded) array. Only strides of 2 and 4 are currently supported. */
47 #if defined GMX_NBNXN_SIMD_2XNN
48 static const int nbfp_stride = 4;
49 #elif defined GMX_DOUBLE
50 static const int nbfp_stride = 2;
52 static const int nbfp_stride = 4;
55 #if GMX_SIMD_REAL_WIDTH > 4
56 /* The 4xn kernel operates on 4-wide i-force registers */
58 /* float/double SIMD register type */
63 static gmx_inline gmx_simd4_real_t
64 gmx_simd4_load_r(const real *r)
69 for (i = 0; i < 4; i++)
77 static gmx_inline void
78 gmx_simd4_store_r(real *dest, gmx_simd4_real_t src)
83 for (i = 0; i < 4; i++)
89 static gmx_inline gmx_simd4_real_t
90 gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
95 for (i = 0; i < 4; i++)
97 c.r[i] = a.r[i] + b.r[i];
103 static gmx_inline real
104 gmx_simd4_reduce_r(gmx_simd4_real_t a)
106 return a.r[0] + a.r[1] + a.r[2] + a.r[3];
112 #ifdef GMX_NBNXN_SIMD_2XNN
114 /* Half-width operations are required for the 2xnn kernels */
116 /* Half-width SIMD real type */
117 /* float/double SIMD register type */
119 real r[GMX_SIMD_REAL_WIDTH/2];
122 /* Half-width SIMD operations */
124 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
125 static gmx_inline void
126 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
130 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
136 /* Set all entries in half-width SIMD register *a to b */
137 static gmx_inline void
138 gmx_set1_hpr(gmx_mm_hpr *a, real b)
142 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
148 /* Load one real at b and one real at b+1 into halves of a, respectively */
149 static gmx_inline void
150 gmx_load1p1_pr(gmx_simd_real_t *a, const real *b)
154 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
157 a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[1];
161 /* Load reals at half-width aligned pointer b into two halves of a */
162 static gmx_inline void
163 gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
167 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
170 a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[i];
174 /* Store half-width SIMD register b into half width aligned memory a */
175 static gmx_inline void
176 gmx_store_hpr(real *a, gmx_mm_hpr b)
180 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
186 static gmx_inline gmx_mm_hpr
187 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
192 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
194 c.r[i] = a.r[i] + b.r[i];
200 static gmx_inline gmx_mm_hpr
201 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
206 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
208 c.r[i] = a.r[i] - b.r[i];
214 /* Sum over 4 half SIMD registers */
215 static gmx_inline gmx_mm_hpr
216 gmx_sum4_hpr(gmx_simd_real_t a, gmx_simd_real_t b)
221 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
225 a.r[GMX_SIMD_REAL_WIDTH/2+i] +
227 b.r[GMX_SIMD_REAL_WIDTH/2+i];
233 #ifdef GMX_NBNXN_SIMD_2XNN
234 /* Sum the elements of halfs of each input register and store sums in out */
235 static gmx_inline gmx_simd4_real_t
236 gmx_mm_transpose_sum4h_pr(gmx_simd_real_t a, gmx_simd_real_t b)
238 gmx_simd4_real_t sum;
246 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
249 sum.r[1] += a.r[GMX_SIMD_REAL_WIDTH/2+i];
251 sum.r[3] += b.r[GMX_SIMD_REAL_WIDTH/2+i];
258 static gmx_inline void
259 gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
263 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
266 c->r[i] = a.r[GMX_SIMD_REAL_WIDTH/2 + i];
269 static gmx_inline void
270 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
274 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
277 c->r[GMX_SIMD_REAL_WIDTH/2 + i] = b.r[i];
281 #endif /* GMX_NBNXN_SIMD_2XNN */
285 static gmx_inline void
286 load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S,
288 gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
292 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
294 ctab0_S->r[i] = tab_coul_F[ti_S.i[i]];
295 ctab1_S->r[i] = tab_coul_F[ti_S.i[i]+1];
298 *ctab1_S = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
301 static gmx_inline void
302 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
303 gmx_simd_int32_t ti_S, int *ti,
304 gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
305 gmx_simd_real_t *ctabv_S)
309 load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
311 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
313 ctabv_S->r[i] = tab_coul_V[ti_S.i[i]];
319 static gmx_inline void
320 load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
321 gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
325 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
327 ctab0_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4];
328 ctab1_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+1];
332 static gmx_inline void
333 load_table_f_v(const real *tab_coul_FDV0,
334 gmx_simd_int32_t ti_S, int *ti,
335 gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
336 gmx_simd_real_t *ctabv_S)
340 load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
342 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
344 ctabv_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+2];
349 /* Sum the elements within each input register and store the sums in out.
350 * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
352 #if GMX_SIMD_REAL_WIDTH == 2
353 static gmx_inline gmx_simd_real_t
354 gmx_mm_transpose_sum2_pr(gmx_simd_real_t in0, gmx_simd_real_t in1)
358 sum.r[0] = in0.r[0] + in0.r[1];
359 sum.r[1] = in1.r[0] + in1.r[1];
365 #if GMX_SIMD_REAL_WIDTH >= 4
366 #if GMX_SIMD_REAL_WIDTH == 4
367 static gmx_inline gmx_simd_real_t
369 static gmx_inline gmx_simd4_real_t
371 gmx_mm_transpose_sum4_pr(gmx_simd_real_t in0, gmx_simd_real_t in1,
372 gmx_simd_real_t in2, gmx_simd_real_t in3)
374 #if GMX_SIMD_REAL_WIDTH == 4
377 gmx_simd4_real_t sum;
386 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
388 sum.r[0] += in0.r[i];
389 sum.r[1] += in1.r[i];
390 sum.r[2] += in2.r[i];
391 sum.r[3] += in3.r[i];
399 /* In double precision it can be faster to first calculate single precision
400 * square roots for two double precision registers at once and then use
401 * double precision Newton-Raphson iteration to reach full double precision.
402 * For this reference code we just use a plain-C sqrt.
404 static gmx_inline void
405 gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
406 gmx_simd_real_t *out0, gmx_simd_real_t *out1)
408 *out0 = gmx_simd_invsqrt_r(in0);
409 *out1 = gmx_simd_invsqrt_r(in1);
413 static gmx_inline void
414 load_lj_pair_params(const real *nbfp, const int *type, int aj,
415 gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
419 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
421 c6_S->r[i] = nbfp[type[aj+i]*nbfp_stride];
422 c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
426 #ifdef GMX_NBNXN_SIMD_2XNN
427 static gmx_inline void
428 load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
429 const int *type, int aj,
430 gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
434 for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
436 c6_S->r[i] = nbfp0[type[aj+i]*nbfp_stride];
437 c6_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride];
438 c12_S->r[i] = nbfp0[type[aj+i]*nbfp_stride+1];
439 c12_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
444 /* Code for handling loading exclusions and converting them into
445 interactions. The x86 code might use either integer- or real-type
446 SIMD, but the reference code does not need to know. */
448 #define gmx_load1_exclfilter(e) gmx_simd_ref_load1_exclfilter(e)
449 #define gmx_load_exclusion_filter(e) gmx_simd_ref_load_exclusion_filter((int *) e)
450 #define gmx_checkbitmask_pb(m0, m1) gmx_simd_ref_checkbitmask_pb(m0, m1)
452 static gmx_inline gmx_simd_ref_exclfilter
453 gmx_simd_ref_load1_exclfilter(int src)
455 gmx_simd_ref_exclfilter a;
458 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
466 static gmx_inline gmx_simd_ref_exclfilter
467 gmx_simd_ref_load_exclusion_filter(const int *src)
469 gmx_simd_ref_exclfilter a;
472 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
480 /* For topology exclusion-pair checking we need: ((a & b) ? True :
481 * False). The x86 implementations use hardware-suitable integer-
482 * and/or real-valued SIMD operations and a bit-wise "and" to do
483 * this. The reference implementation normally uses logical operations
484 * for logic, but in this case the i- and j-atom exclusion masks
485 * computed during searching expect to be combined with bit-wise
488 * If the same bit is set in both input masks, return TRUE, else
489 * FALSE. This function is only called with a single bit set in b.
491 static gmx_inline gmx_simd_bool_t
492 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
497 for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
499 c.b[i] = ((a.i[i] & b.i[i]) != 0);
505 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */