2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 #if GMX_NBNXN_SIMD_BITWIDTH == 128
40 #define GMX_MM128_HERE
42 #if GMX_NBNXN_SIMD_BITWIDTH == 256
43 #define GMX_MM256_HERE
45 #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
48 #include "gmx_simd_macros.h"
50 #if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
51 #define STRIDE_S (GMX_SIMD_WIDTH_HERE)
53 #define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
56 /* Copies PBC shifted i-cell packed atom coordinates to working array */
57 static gmx_inline void
58 icell_set_x_simd_4xn(int ci,
59 real shx, real shy, real shz,
61 int stride, const real *x,
62 nbnxn_list_work_t *work)
65 nbnxn_x_ci_simd_4xn_t *x_ci;
67 x_ci = work->x_ci_simd_4xn;
69 ia = X_IND_CI_SIMD_4XN(ci);
71 x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S ] + shx);
72 x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S ] + shy);
73 x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S ] + shz);
74 x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
75 x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
76 x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
77 x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
78 x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
79 x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
80 x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
81 x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
82 x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
85 /* SIMD code for making a pair list of cell ci vs cell cjf-cjl
86 * for coordinates in packed format.
87 * Checks bouding box distances and possibly atom pair distances.
88 * This is an accelerated version of make_cluster_list_simple.
90 static gmx_inline void
91 make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
92 nbnxn_pairlist_t *nbl,
93 int ci, int cjf, int cjl,
94 gmx_bool remove_sub_diag,
99 const nbnxn_x_ci_simd_4xn_t *work;
102 gmx_mm_pr jx_SSE, jy_SSE, jz_SSE;
104 gmx_mm_pr dx_SSE0, dy_SSE0, dz_SSE0;
105 gmx_mm_pr dx_SSE1, dy_SSE1, dz_SSE1;
106 gmx_mm_pr dx_SSE2, dy_SSE2, dz_SSE2;
107 gmx_mm_pr dx_SSE3, dy_SSE3, dz_SSE3;
118 gmx_mm_pr wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
124 int xind_f, xind_l, cj;
126 cjf = CI_TO_CJ_SIMD_4XN(cjf);
127 cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
129 work = nbl->work->x_ci_simd_4xn;
131 bb_ci = nbl->work->bb_ci;
133 rc2_SSE = gmx_set1_pr(rl2);
136 while (!InRange && cjf <= cjl)
138 d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjf, gridj->bbj);
141 /* Check if the distance is within the distance where
142 * we use only the bounding box distance rbb,
143 * or within the cut-off and there is at least one atom pair
144 * within the cut-off.
152 xind_f = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
154 jx_SSE = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
155 jy_SSE = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
156 jz_SSE = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
159 /* Calculate distance */
160 dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
161 dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
162 dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
163 dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
164 dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
165 dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
166 dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
167 dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
168 dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
169 dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
170 dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
171 dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
173 /* rsq = dx*dx+dy*dy+dz*dz */
174 rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
175 rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
176 rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
177 rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
179 wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
180 wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
181 wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
182 wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
184 wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
185 wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
186 wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
188 InRange = gmx_movemask_pr(wco_any_SSE);
190 *ndistc += 4*GMX_SIMD_WIDTH_HERE;
203 while (!InRange && cjl > cjf)
205 d2 = subc_bb_dist2_sse(4, 0, bb_ci, cjl, gridj->bbj);
208 /* Check if the distance is within the distance where
209 * we use only the bounding box distance rbb,
210 * or within the cut-off and there is at least one atom pair
211 * within the cut-off.
219 xind_l = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
221 jx_SSE = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
222 jy_SSE = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
223 jz_SSE = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
225 /* Calculate distance */
226 dx_SSE0 = gmx_sub_pr(work->ix_SSE0, jx_SSE);
227 dy_SSE0 = gmx_sub_pr(work->iy_SSE0, jy_SSE);
228 dz_SSE0 = gmx_sub_pr(work->iz_SSE0, jz_SSE);
229 dx_SSE1 = gmx_sub_pr(work->ix_SSE1, jx_SSE);
230 dy_SSE1 = gmx_sub_pr(work->iy_SSE1, jy_SSE);
231 dz_SSE1 = gmx_sub_pr(work->iz_SSE1, jz_SSE);
232 dx_SSE2 = gmx_sub_pr(work->ix_SSE2, jx_SSE);
233 dy_SSE2 = gmx_sub_pr(work->iy_SSE2, jy_SSE);
234 dz_SSE2 = gmx_sub_pr(work->iz_SSE2, jz_SSE);
235 dx_SSE3 = gmx_sub_pr(work->ix_SSE3, jx_SSE);
236 dy_SSE3 = gmx_sub_pr(work->iy_SSE3, jy_SSE);
237 dz_SSE3 = gmx_sub_pr(work->iz_SSE3, jz_SSE);
239 /* rsq = dx*dx+dy*dy+dz*dz */
240 rsq_SSE0 = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
241 rsq_SSE1 = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
242 rsq_SSE2 = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
243 rsq_SSE3 = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
245 wco_SSE0 = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
246 wco_SSE1 = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
247 wco_SSE2 = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
248 wco_SSE3 = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
250 wco_any_SSE01 = gmx_or_pr(wco_SSE0, wco_SSE1);
251 wco_any_SSE23 = gmx_or_pr(wco_SSE2, wco_SSE3);
252 wco_any_SSE = gmx_or_pr(wco_any_SSE01, wco_any_SSE23);
254 InRange = gmx_movemask_pr(wco_any_SSE);
256 *ndistc += 4*GMX_SIMD_WIDTH_HERE;
266 for (cj = cjf; cj <= cjl; cj++)
268 /* Store cj and the interaction mask */
269 nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
270 nbl->cj[nbl->ncj].excl = get_imask_x86_simd_4xn(remove_sub_diag, ci, cj);
273 /* Increase the closing index in i super-cell list */
274 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
279 #undef GMX_MM128_HERE
280 #undef GMX_MM256_HERE