/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
37 #ifndef _nbnxn_kernel_simd_utils_ref_h_
38 #define _nbnxn_kernel_simd_utils_ref_h_
40 typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter;
41 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
42 static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
/* Set the stride for the lookup of the two LJ parameters from their
 * (padded) array. Only strides of 2 and 4 are currently supported.
 */
#if defined GMX_NBNXN_SIMD_2XNN
static const int nbfp_stride = 4;
#elif defined GMX_DOUBLE
static const int nbfp_stride = 2;
#else
static const int nbfp_stride = 4;
#endif
54 #if GMX_SIMD_WIDTH_HERE > 4
55 /* The 4xn kernel operates on 4-wide i-force registers */
57 /* float/double SIMD register type */
62 static gmx_inline gmx_mm_pr4
63 gmx_load_pr4(const real *r)
68 for (i = 0; i < 4; i++)
76 static gmx_inline void
77 gmx_store_pr4(real *dest, gmx_mm_pr4 src)
82 for (i = 0; i < 4; i++)
88 static gmx_inline gmx_mm_pr4
89 gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
94 for (i = 0; i < 4; i++)
96 c.r[i] = a.r[i] + b.r[i];
104 typedef gmx_simd_ref_pr gmx_simd_ref_pr4;
109 #ifdef GMX_NBNXN_SIMD_2XNN
111 /* Half-width operations are required for the 2xnn kernels */
113 /* Half-width SIMD real type */
114 /* float/double SIMD register type */
116 real r[GMX_SIMD_WIDTH_HERE/2];
119 /* Half-width SIMD operations */
121 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
122 static gmx_inline void
123 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
127 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
133 /* Set all entries in half-width SIMD register *a to b */
134 static gmx_inline void
135 gmx_set1_hpr(gmx_mm_hpr *a, real b)
139 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
145 /* Load one real at b and one real at b+1 into halves of a, respectively */
146 static gmx_inline void
147 gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
151 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
154 a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
158 /* Load reals at half-width aligned pointer b into two halves of a */
159 static gmx_inline void
160 gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
164 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
167 a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
171 /* Store half-width SIMD register b into half width aligned memory a */
172 static gmx_inline void
173 gmx_store_hpr(real *a, gmx_mm_hpr b)
177 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
183 static gmx_inline gmx_mm_hpr
184 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
189 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
191 c.r[i] = a.r[i] + b.r[i];
197 static gmx_inline gmx_mm_hpr
198 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
203 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
205 c.r[i] = a.r[i] - b.r[i];
211 /* Sum over 4 half SIMD registers */
212 static gmx_inline gmx_mm_hpr
213 gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
218 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
222 a.r[GMX_SIMD_WIDTH_HERE/2+i] +
224 b.r[GMX_SIMD_WIDTH_HERE/2+i];
230 /* Sum the elements of halfs of each input register and store sums in out */
231 static gmx_inline gmx_mm_pr4
232 gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
242 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
245 sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
247 sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
253 static gmx_inline void
254 gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
258 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
261 c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
264 static gmx_inline void
265 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
269 for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
272 c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
276 #endif /* GMX_NBNXN_SIMD_2XNN */
280 static gmx_inline void
281 load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S, int *ti,
282 gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
286 for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
288 ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
289 ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
292 *ctab1_S = gmx_sub_pr(*ctab1_S, *ctab0_S);
295 static gmx_inline void
296 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
297 gmx_simd_ref_epi32 ti_S, int *ti,
298 gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
299 gmx_simd_ref_pr *ctabv_S)
303 load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
305 for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
307 ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
313 static gmx_inline void
314 load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
315 gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
319 for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
321 ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
322 ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
326 static gmx_inline void
327 load_table_f_v(const real *tab_coul_FDV0,
328 gmx_simd_ref_epi32 ti_S, int *ti,
329 gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
330 gmx_simd_ref_pr *ctabv_S)
334 load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
336 for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
338 ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
/* Sum the elements within each input register and store the sums in out.
 * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
 */
#if GMX_SIMD_WIDTH_HERE == 2
static gmx_inline gmx_simd_ref_pr
gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
{
    gmx_simd_ref_pr sum;

    sum.r[0] = in0.r[0] + in0.r[1];
    sum.r[1] = in1.r[0] + in1.r[1];

    return sum;
}
#endif
#if GMX_SIMD_WIDTH_HERE >= 4
/* Sum the elements within each of the four input registers and store
 * the four sums in out. At width 4 the result fits a full-width
 * register, at larger widths a 4-wide register is returned.
 */
#if GMX_SIMD_WIDTH_HERE == 4
static gmx_inline gmx_simd_ref_pr
#else
static gmx_inline gmx_mm_pr4
#endif
gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
                         gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
{
#if GMX_SIMD_WIDTH_HERE == 4
    gmx_simd_ref_pr sum;
#else
    gmx_mm_pr4      sum;
#endif
    int             i;

    sum.r[0] = 0;
    sum.r[1] = 0;
    sum.r[2] = 0;
    sum.r[3] = 0;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        sum.r[0] += in0.r[i];
        sum.r[1] += in1.r[i];
        sum.r[2] += in2.r[i];
        sum.r[3] += in3.r[i];
    }

    return sum;
}
#endif
393 /* In double precision it can be faster to first calculate single precision
394 * square roots for two double precision registers at once and then use
395 * double precision Newton-Raphson iteration to reach full double precision.
396 * For this reference code we just use a plain-C sqrt.
398 static gmx_inline void
399 gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
400 gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
402 out0 = gmx_invsqrt_pr(in0);
403 out1 = gmx_invsqrt_pr(in1);
407 static gmx_inline void
408 load_lj_pair_params(const real *nbfp, const int *type, int aj,
409 gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
413 for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
415 c6_S->r[i] = nbfp[type[aj+i]*nbfp_stride];
416 c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
#ifdef GMX_NBNXN_SIMD_2XNN
/* Gather LJ parameters for the 2xnn kernels: the low half of each output
 * register is filled from table nbfp0, the high half from table nbfp1,
 * both indexed by the same half-width set of j-atom types.
 */
static gmx_inline void
load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
                     const int *type, int aj,
                     gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
{
    int i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
    {
        c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
        c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
        c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
        c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
    }
}
#endif
/* Code for handling loading exclusions and converting them into
   interactions. The x86 code might use either integer- or real-type
   SIMD, but the reference code does not need to know. */
#define gmx_load1_exclfilter(e)      gmx_simd_ref_load1_exclfilter(e)
#define gmx_load_exclusion_filter(e) gmx_simd_ref_load_exclusion_filter((int *) e)
#define gmx_checkbitmask_pb(m0, m1)  gmx_simd_ref_checkbitmask_pb(m0, m1)
446 static gmx_inline gmx_simd_ref_exclfilter
447 gmx_simd_ref_load1_exclfilter(int src)
449 gmx_simd_ref_exclfilter a;
452 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
460 static gmx_inline gmx_simd_ref_exclfilter
461 gmx_simd_ref_load_exclusion_filter(const int *src)
463 gmx_simd_ref_exclfilter a;
466 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
474 /* For topology exclusion-pair checking we need: ((a & b) ? True :
475 * False). The x86 implementations use hardware-suitable integer-
476 * and/or real-valued SIMD operations and a bit-wise "and" to do
477 * this. The reference implementation normally uses logical operations
478 * for logic, but in this case the i- and j-atom exclusion masks
479 * computed during searching expect to be combined with bit-wise
482 * If the same bit is set in both input masks, return TRUE, else
483 * FALSE. This function is only called with a single bit set in b.
485 static gmx_inline gmx_simd_ref_pb
486 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
491 for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
493 c.r[i] = ((a.r[i] & b.r[i]) != 0);
499 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */