2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 #if GMX_NBNXN_SIMD_BITWIDTH != 256
40 #error "unsupported SIMD width"
43 #include "gmx_simd_macros.h"
45 /* Define a few macros for half-width SIMD */
46 #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
47 /* Half-width SIMD real type */
48 #define gmx_mm_hpr __m128
49 /* Half-width SIMD operations */
50 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
51 #define gmx_load_hpr(a,b) a = _mm_load_ps(b)
52 #define gmx_set1_hpr _mm_set1_ps
53 /* Load reals at half-width aligned pointer b into two halves of a */
54 #define gmx_loaddh_pr(a, b) a = gmx_mm256_load4_ps(b)
55 /* Store half width SIMD registers b and c in ful width register a */
56 #define gmx_2hpr_to_pr(a, b, c) a = _mm256_insertf128_ps(_mm256_castps128_ps256(b), c, 0x1)
58 #error "Half-width SIMD macros are not yet defined"
62 #if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
63 #define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
65 #define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
68 static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
75 gmx_2hpr_to_pr(a_a_S, a_S, a_S);
80 static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
82 gmx_mm_hpr a0_S, a1_S;
85 a0_S = gmx_set1_hpr(a[0] + shift);
86 a1_S = gmx_set1_hpr(a[1] + shift);
88 gmx_2hpr_to_pr(a0_a1_S, a0_S, a1_S);
93 /* Copies PBC shifted i-cell packed atom coordinates to working array */
94 static gmx_inline void
95 icell_set_x_simd_2xnn(int ci,
96 real shx, real shy, real shz,
98 int stride, const real *x,
99 nbnxn_list_work_t *work)
102 nbnxn_x_ci_simd_2xnn_t *x_ci;
104 x_ci = work->x_ci_simd_2xnn;
106 ia = X_IND_CI_SIMD_2XNN(ci);
108 x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
109 x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
110 x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
111 x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
112 x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
113 x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
116 #ifndef GMX_HAVE_SIMD_ANYTRUE
117 /* Fallback function in case gmx_anytrue_pr is not present */
118 static gmx_inline gmx_bool
119 gmx_anytrue_2xn_pr(gmx_mm_pr bool_S)
121 real bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
125 bools = gmx_simd_align_real(bools_array);
127 gmx_store_pr(bools, bool_S);
130 for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
132 if (GMX_SIMD_IS_TRUE(s))
142 /* SIMD code for making a pair list of cell ci vs cell cjf-cjl
143 * for coordinates in packed format.
144 * Checks bouding box distances and possibly atom pair distances.
145 * This is an accelerated version of make_cluster_list_simple.
147 static gmx_inline void
148 make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
149 nbnxn_pairlist_t *nbl,
150 int ci, int cjf, int cjl,
151 gmx_bool remove_sub_diag,
153 real rl2, float rbb2,
156 const nbnxn_x_ci_simd_2xnn_t *work;
159 gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
161 gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
162 gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
169 gmx_mm_pr wco_any_SSE;
175 int xind_f, xind_l, cj;
177 cjf = CI_TO_CJ_SIMD_2XNN(cjf);
178 cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
180 work = nbl->work->x_ci_simd_2xnn;
182 bb_ci = nbl->work->bb_ci;
184 rc2_SSE = gmx_set1_pr(rl2);
187 while (!InRange && cjf <= cjl)
189 d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjf, gridj->bbj);
192 /* Check if the distance is within the distance where
193 * we use only the bounding box distance rbb,
194 * or within the cut-off and there is at least one atom pair
195 * within the cut-off.
203 xind_f = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
205 jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
206 jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
207 jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
209 /* Calculate distance */
210 dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
211 dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
212 dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
213 dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
214 dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
215 dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
217 /* rsq = dx*dx+dy*dy+dz*dz */
218 rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
219 rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
221 wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
222 wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
224 wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
226 #ifdef GMX_HAVE_SIMD_ANYTRUE
227 InRange = gmx_anytrue_pr(wco_any_SSE);
229 InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
232 *ndistc += 2*GMX_SIMD_WIDTH_HERE;
245 while (!InRange && cjl > cjf)
247 d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjl, gridj->bbj);
250 /* Check if the distance is within the distance where
251 * we use only the bounding box distance rbb,
252 * or within the cut-off and there is at least one atom pair
253 * within the cut-off.
261 xind_l = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
263 jx_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
264 jy_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
265 jz_SSE = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
267 /* Calculate distance */
268 dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
269 dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
270 dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
271 dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
272 dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
273 dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
275 /* rsq = dx*dx+dy*dy+dz*dz */
276 rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
277 rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
279 wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
280 wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
282 wco_any_SSE = gmx_or_pr(wco_SSE0, wco_SSE2);
284 #ifdef GMX_HAVE_SIMD_ANYTRUE
285 InRange = gmx_anytrue_pr(wco_any_SSE);
287 InRange = gmx_anytrue_2xn_pr(wco_any_SSE);
290 *ndistc += 2*GMX_SIMD_WIDTH_HERE;
300 for (cj = cjf; cj <= cjl; cj++)
302 /* Store cj and the interaction mask */
303 nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
304 nbl->cj[nbl->ncj].excl = get_imask_simd_2xnn(remove_sub_diag, ci, cj);
307 /* Increase the closing index in i super-cell list */
308 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
317 #undef gmx_2hpr_to_pr