1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 | |
28 | |
29 | |
30 | |
31 | |
32 | |
33 | |
34 | |
35 | |
36 | |
37 | { |
38 | const nbnxn_ci_t *nbln; |
39 | const nbnxn_cj_t *l_cj; |
40 | const int * type; |
41 | const real * q; |
42 | const real *shiftvec; |
43 | const real *x; |
44 | const real *nbfp0, *nbfp1, *nbfp2 = NULL((void*)0), *nbfp3 = NULL((void*)0); |
45 | real facel; |
46 | real *nbfp_ptr; |
47 | int n, ci, ci_sh; |
48 | int ish, ish3; |
49 | gmx_bool do_LJ, half_LJ, do_coul, do_self; |
50 | int sci, scix, sciy, sciz, sci2; |
51 | int cjind0, cjind1, cjind; |
52 | int ip, jp; |
53 | |
54 | #ifdef ENERGY_GROUPS |
55 | int Vstride_i; |
56 | int egps_ishift, egps_imask; |
57 | int egps_jshift, egps_jmask, egps_jstride; |
58 | int egps_i; |
59 | real *vvdwtp[UNROLLI4]; |
60 | real *vctp[UNROLLI4]; |
61 | #endif |
62 | |
63 | gmx_simd_real_t__m128 shX_S; |
64 | gmx_simd_real_t__m128 shY_S; |
65 | gmx_simd_real_t__m128 shZ_S; |
66 | gmx_simd_real_t__m128 ix_S0, iy_S0, iz_S0; |
67 | gmx_simd_real_t__m128 ix_S1, iy_S1, iz_S1; |
68 | gmx_simd_real_t__m128 ix_S2, iy_S2, iz_S2; |
69 | gmx_simd_real_t__m128 ix_S3, iy_S3, iz_S3; |
70 | gmx_simd_real_t__m128 fix_S0, fiy_S0, fiz_S0; |
71 | gmx_simd_real_t__m128 fix_S1, fiy_S1, fiz_S1; |
72 | gmx_simd_real_t__m128 fix_S2, fiy_S2, fiz_S2; |
73 | gmx_simd_real_t__m128 fix_S3, fiy_S3, fiz_S3; |
74 | #if UNROLLJ(4/1) >= 4 |
75 | |
76 | gmx_simd4_real_t__m128 fix_S, fiy_S, fiz_S; |
77 | #else |
78 | |
79 | gmx_simd_real_t__m128 fix0_S, fiy0_S, fiz0_S; |
80 | gmx_simd_real_t__m128 fix2_S, fiy2_S, fiz2_S; |
81 | #endif |
82 | |
83 | gmx_simd_real_t__m128 diagonal_jmi_S; |
84 | #if UNROLLI4 == UNROLLJ(4/1) |
85 | gmx_simd_bool_t__m128 diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3; |
86 | #else |
87 | gmx_simd_bool_t__m128 diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3; |
88 | gmx_simd_bool_t__m128 diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3; |
89 | #endif |
90 | |
91 | unsigned *exclusion_filter; |
92 | gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3; |
93 | |
94 | gmx_simd_real_t__m128 zero_S = gmx_simd_set1_r_mm_set1_ps(0.0); |
95 | |
96 | gmx_simd_real_t__m128 one_S = gmx_simd_set1_r_mm_set1_ps(1.0); |
97 | gmx_simd_real_t__m128 iq_S0 = gmx_simd_setzero_r_mm_setzero_ps(); |
98 | gmx_simd_real_t__m128 iq_S1 = gmx_simd_setzero_r_mm_setzero_ps(); |
99 | gmx_simd_real_t__m128 iq_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
100 | gmx_simd_real_t__m128 iq_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
101 | |
102 | #ifdef CALC_COUL_RF |
103 | gmx_simd_real_t__m128 mrc_3_S; |
104 | #ifdef CALC_ENERGIES |
105 | gmx_simd_real_t__m128 hrc_3_S, moh_rc_S; |
106 | #endif |
107 | #endif |
108 | |
109 | #ifdef CALC_COUL_TAB |
110 | |
111 | gmx_simd_real_t__m128 invtsp_S; |
112 | const real * tab_coul_F; |
113 | #ifndef TAB_FDV0 |
114 | const real * tab_coul_V; |
115 | #endif |
116 | |
117 | int ti0_array[2*GMX_SIMD_REAL_WIDTH4], *ti0 = NULL((void*)0); |
118 | int ti1_array[2*GMX_SIMD_REAL_WIDTH4], *ti1 = NULL((void*)0); |
119 | int ti2_array[2*GMX_SIMD_REAL_WIDTH4], *ti2 = NULL((void*)0); |
120 | int ti3_array[2*GMX_SIMD_REAL_WIDTH4], *ti3 = NULL((void*)0); |
121 | #ifdef CALC_ENERGIES |
122 | gmx_simd_real_t__m128 mhalfsp_S; |
123 | #endif |
124 | #endif |
125 | |
126 | #ifdef CALC_COUL_EWALD |
127 | gmx_simd_real_t__m128 beta2_S, beta_S; |
128 | #endif |
129 | |
130 | #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) |
131 | gmx_simd_real_t__m128 sh_ewald_S; |
132 | #endif |
133 | |
134 | #if defined LJ_CUT && defined CALC_ENERGIES |
135 | gmx_simd_real_t__m128 p6_cpot_S, p12_cpot_S; |
136 | #endif |
137 | #ifdef LJ_POT_SWITCH |
138 | gmx_simd_real_t__m128 rswitch_S; |
139 | gmx_simd_real_t__m128 swV3_S, swV4_S, swV5_S; |
140 | gmx_simd_real_t__m128 swF2_S, swF3_S, swF4_S; |
141 | #endif |
142 | #ifdef LJ_FORCE_SWITCH |
143 | gmx_simd_real_t__m128 rswitch_S; |
144 | gmx_simd_real_t__m128 p6_fc2_S, p6_fc3_S; |
145 | gmx_simd_real_t__m128 p12_fc2_S, p12_fc3_S; |
146 | #ifdef CALC_ENERGIES |
147 | gmx_simd_real_t__m128 p6_vc3_S, p6_vc4_S; |
148 | gmx_simd_real_t__m128 p12_vc3_S, p12_vc4_S; |
149 | gmx_simd_real_t__m128 p6_6cpot_S, p12_12cpot_S; |
150 | #endif |
151 | #endif |
152 | #ifdef LJ_EWALD_GEOM |
153 | real lj_ewaldcoeff2, lj_ewaldcoeff6_6; |
154 | gmx_simd_real_t__m128 mone_S, half_S, lje_c2_S, lje_c6_6_S, lje_vc_S; |
155 | #endif |
156 | |
157 | #ifdef LJ_COMB_LB |
158 | const real *ljc; |
159 | |
160 | gmx_simd_real_t__m128 hsig_i_S0, seps_i_S0; |
161 | gmx_simd_real_t__m128 hsig_i_S1, seps_i_S1; |
162 | gmx_simd_real_t__m128 hsig_i_S2, seps_i_S2; |
163 | gmx_simd_real_t__m128 hsig_i_S3, seps_i_S3; |
164 | #else |
165 | #ifdef FIX_LJ_C |
166 | real pvdw_array[2*UNROLLI4*UNROLLJ(4/1)+3]; |
167 | real *pvdw_c6, *pvdw_c12; |
168 | gmx_simd_real_t__m128 c6_S0, c12_S0; |
169 | gmx_simd_real_t__m128 c6_S1, c12_S1; |
170 | gmx_simd_real_t__m128 c6_S2, c12_S2; |
171 | gmx_simd_real_t__m128 c6_S3, c12_S3; |
172 | #endif |
173 | |
174 | #if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM |
175 | const real *ljc; |
176 | |
177 | gmx_simd_real_t__m128 c6s_S0, c12s_S0; |
178 | gmx_simd_real_t__m128 c6s_S1, c12s_S1; |
179 | gmx_simd_real_t__m128 c6s_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
180 | gmx_simd_real_t__m128 c12s_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
181 | gmx_simd_real_t__m128 c6s_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
182 | gmx_simd_real_t__m128 c12s_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
183 | #endif |
184 | #endif /* LJ_COMB_LB */ |
185 | |
186 | gmx_simd_real_t__m128 vctot_S, Vvdwtot_S; |
187 | gmx_simd_real_t__m128 sixth_S, twelveth_S; |
188 | |
189 | gmx_simd_real_t__m128 avoid_sing_S; |
190 | gmx_simd_real_t__m128 rc2_S; |
191 | #ifdef VDW_CUTOFF_CHECK |
192 | gmx_simd_real_t__m128 rcvdw2_S; |
193 | #endif |
194 | |
195 | int ninner; |
196 | |
197 | #ifdef COUNT_PAIRS |
198 | int npair = 0; |
199 | #endif |
200 | |
201 | #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM |
202 | ljc = nbat->lj_comb; |
203 | #endif |
204 | #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB) |
205 | |
206 | nbfp_ptr = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp; |
207 | #endif |
208 | |
209 | |
210 | diagonal_jmi_S = gmx_simd_load_r_mm_load_ps(nbat->simd_4xn_diagonal_j_minus_i); |
211 | |
212 | #if UNROLLI4 == UNROLLJ(4/1) |
213 | diagonal_mask_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
214 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
215 | diagonal_mask_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
216 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
217 | diagonal_mask_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
218 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
219 | diagonal_mask_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
220 | #else |
221 | #if UNROLLI4 == 2*UNROLLJ(4/1) || 2*UNROLLI4 == UNROLLJ(4/1) |
222 | diagonal_mask0_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
223 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
224 | diagonal_mask0_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
225 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
226 | diagonal_mask0_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
227 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
228 | diagonal_mask0_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
229 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
230 | |
231 | #if UNROLLI4 == 2*UNROLLJ(4/1) |
232 | |
233 | diagonal_jmi_S = gmx_simd_load_r_mm_load_ps(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ(4/1)); |
234 | #endif |
235 | |
236 | diagonal_mask1_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
237 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
238 | diagonal_mask1_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
239 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
240 | diagonal_mask1_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
241 | diagonal_jmi_S = gmx_simd_sub_r_mm_sub_ps(diagonal_jmi_S, one_S); |
242 | diagonal_mask1_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(zero_S, diagonal_jmi_S); |
243 | #endif |
244 | #endif |
245 | |
246 | |
247 | |
248 | if (1 == filter_stride) |
249 | { |
250 | exclusion_filter = nbat->simd_exclusion_filter1; |
251 | } |
252 | else |
253 | { |
254 | exclusion_filter = nbat->simd_exclusion_filter2; |
255 | } |
256 | |
257 | |
258 | |
259 | |
260 | |
261 | filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ(4/1)*filter_stride); |
262 | filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ(4/1)*filter_stride); |
263 | filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ(4/1)*filter_stride); |
264 | filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ(4/1)*filter_stride); |
265 | |
266 | #ifdef CALC_COUL_RF |
267 | |
268 | mrc_3_S = gmx_simd_set1_r_mm_set1_ps(-2*ic->k_rf); |
269 | #ifdef CALC_ENERGIES |
270 | hrc_3_S = gmx_simd_set1_r_mm_set1_ps(ic->k_rf); |
271 | moh_rc_S = gmx_simd_set1_r_mm_set1_ps(-ic->c_rf); |
272 | #endif |
273 | #endif |
274 | |
275 | #ifdef CALC_COUL_TAB |
276 | |
277 | ti0 = prepare_table_load_buffer(ti0_array); |
278 | ti1 = prepare_table_load_buffer(ti1_array); |
279 | ti2 = prepare_table_load_buffer(ti2_array); |
280 | ti3 = prepare_table_load_buffer(ti3_array); |
281 | |
282 | invtsp_S = gmx_simd_set1_r_mm_set1_ps(ic->tabq_scale); |
283 | #ifdef CALC_ENERGIES |
284 | mhalfsp_S = gmx_simd_set1_r_mm_set1_ps(-0.5/ic->tabq_scale); |
285 | #endif |
286 | |
287 | #ifdef TAB_FDV0 |
288 | tab_coul_F = ic->tabq_coul_FDV0; |
289 | #else |
290 | tab_coul_F = ic->tabq_coul_F; |
291 | tab_coul_V = ic->tabq_coul_V; |
292 | #endif |
293 | #endif /* CALC_COUL_TAB */ |
294 | |
295 | #ifdef CALC_COUL_EWALD |
296 | beta2_S = gmx_simd_set1_r_mm_set1_ps(ic->ewaldcoeff_q*ic->ewaldcoeff_q); |
297 | beta_S = gmx_simd_set1_r_mm_set1_ps(ic->ewaldcoeff_q); |
298 | #endif |
299 | |
300 | #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES |
301 | sh_ewald_S = gmx_simd_set1_r_mm_set1_ps(ic->sh_ewald); |
302 | #endif |
303 | |
304 | |
305 | #if defined CALC_ENERGIES || defined LJ_POT_SWITCH |
306 | sixth_S = gmx_simd_set1_r_mm_set1_ps(1.0/6.0); |
307 | twelveth_S = gmx_simd_set1_r_mm_set1_ps(1.0/12.0); |
308 | #endif |
309 | |
310 | #if defined LJ_CUT && defined CALC_ENERGIES |
311 | |
312 | p6_cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.cpot); |
313 | p12_cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.cpot); |
314 | #endif |
315 | #ifdef LJ_POT_SWITCH |
316 | rswitch_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw_switch); |
317 | swV3_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c3); |
318 | swV4_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c4); |
319 | swV5_S = gmx_simd_set1_r_mm_set1_ps(ic->vdw_switch.c5); |
320 | swF2_S = gmx_simd_set1_r_mm_set1_ps(3*ic->vdw_switch.c3); |
321 | swF3_S = gmx_simd_set1_r_mm_set1_ps(4*ic->vdw_switch.c4); |
322 | swF4_S = gmx_simd_set1_r_mm_set1_ps(5*ic->vdw_switch.c5); |
323 | #endif |
324 | #ifdef LJ_FORCE_SWITCH |
325 | rswitch_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw_switch); |
326 | p6_fc2_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.c2); |
327 | p6_fc3_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.c3); |
328 | p12_fc2_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.c2); |
329 | p12_fc3_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.c3); |
330 | #ifdef CALC_ENERGIES |
331 | { |
332 | gmx_simd_real_t__m128 mthird_S = gmx_simd_set1_r_mm_set1_ps(-1.0/3.0); |
333 | gmx_simd_real_t__m128 mfourth_S = gmx_simd_set1_r_mm_set1_ps(-1.0/4.0); |
334 | |
335 | p6_vc3_S = gmx_simd_mul_r_mm_mul_ps(mthird_S, p6_fc2_S); |
336 | p6_vc4_S = gmx_simd_mul_r_mm_mul_ps(mfourth_S, p6_fc3_S); |
337 | p6_6cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->dispersion_shift.cpot/6); |
338 | p12_vc3_S = gmx_simd_mul_r_mm_mul_ps(mthird_S, p12_fc2_S); |
339 | p12_vc4_S = gmx_simd_mul_r_mm_mul_ps(mfourth_S, p12_fc3_S); |
340 | p12_12cpot_S = gmx_simd_set1_r_mm_set1_ps(ic->repulsion_shift.cpot/12); |
341 | } |
342 | #endif |
343 | #endif |
344 | #ifdef LJ_EWALD_GEOM |
345 | mone_S = gmx_simd_set1_r_mm_set1_ps(-1.0); |
346 | half_S = gmx_simd_set1_r_mm_set1_ps(0.5); |
347 | lj_ewaldcoeff2 = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj; |
348 | lj_ewaldcoeff6_6 = lj_ewaldcoeff2*lj_ewaldcoeff2*lj_ewaldcoeff2/6; |
349 | lje_c2_S = gmx_simd_set1_r_mm_set1_ps(lj_ewaldcoeff2); |
350 | lje_c6_6_S = gmx_simd_set1_r_mm_set1_ps(lj_ewaldcoeff6_6); |
351 | |
352 | lje_vc_S = gmx_simd_set1_r_mm_set1_ps(ic->sh_lj_ewald); |
353 | #endif |
354 | |
355 | |
356 | rc2_S = gmx_simd_set1_r_mm_set1_ps(ic->rcoulomb*ic->rcoulomb); |
357 | #ifdef VDW_CUTOFF_CHECK |
358 | rcvdw2_S = gmx_simd_set1_r_mm_set1_ps(ic->rvdw*ic->rvdw); |
359 | #endif |
360 | |
361 | avoid_sing_S = gmx_simd_set1_r_mm_set1_ps(NBNXN_AVOID_SING_R2_INC1.0e-12f); |
362 | |
363 | q = nbat->q; |
364 | type = nbat->type; |
| Value stored to 'type' is never read |
365 | facel = ic->epsfac; |
366 | shiftvec = shift_vec[0]; |
367 | x = nbat->x; |
368 | |
369 | #ifdef FIX_LJ_C |
370 | pvdw_c6 = gmx_simd_align_real(pvdw_array); |
371 | pvdw_c12 = pvdw_c6 + UNROLLI4*UNROLLJ(4/1); |
372 | |
373 | for (jp = 0; jp < UNROLLJ(4/1); jp++) |
374 | { |
375 | pvdw_c6 [0*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2]; |
376 | pvdw_c6 [1*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2]; |
377 | pvdw_c6 [2*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2]; |
378 | pvdw_c6 [3*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2]; |
379 | |
380 | pvdw_c12[0*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2+1]; |
381 | pvdw_c12[1*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2+1]; |
382 | pvdw_c12[2*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2+1]; |
383 | pvdw_c12[3*UNROLLJ(4/1)+jp] = nbat->nbfp[0*2+1]; |
384 | } |
385 | c6_S0 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +0*UNROLLJ(4/1)); |
386 | c6_S1 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +1*UNROLLJ(4/1)); |
387 | c6_S2 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +2*UNROLLJ(4/1)); |
388 | c6_S3 = gmx_simd_load_r_mm_load_ps(pvdw_c6 +3*UNROLLJ(4/1)); |
389 | |
390 | c12_S0 = gmx_simd_load_r_mm_load_ps(pvdw_c12+0*UNROLLJ(4/1)); |
391 | c12_S1 = gmx_simd_load_r_mm_load_ps(pvdw_c12+1*UNROLLJ(4/1)); |
392 | c12_S2 = gmx_simd_load_r_mm_load_ps(pvdw_c12+2*UNROLLJ(4/1)); |
393 | c12_S3 = gmx_simd_load_r_mm_load_ps(pvdw_c12+3*UNROLLJ(4/1)); |
394 | #endif /* FIX_LJ_C */ |
395 | |
396 | #ifdef ENERGY_GROUPS |
397 | egps_ishift = nbat->neg_2log; |
398 | egps_imask = (1<<egps_ishift) - 1; |
399 | egps_jshift = 2*nbat->neg_2log; |
400 | egps_jmask = (1<<egps_jshift) - 1; |
401 | egps_jstride = (UNROLLJ(4/1)>>1)*UNROLLJ(4/1); |
402 | |
403 | Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride; |
404 | #endif |
405 | |
406 | l_cj = nbl->cj; |
407 | |
408 | ninner = 0; |
409 | for (n = 0; n < nbl->nci; n++) |
410 | { |
411 | nbln = &nbl->ci[n]; |
412 | |
413 | ish = (nbln->shift & NBNXN_CI_SHIFT127); |
414 | ish3 = ish*3; |
415 | cjind0 = nbln->cj_ind_start; |
416 | cjind1 = nbln->cj_ind_end; |
417 | ci = nbln->ci; |
418 | ci_sh = (ish == CENTRAL(((2*1 +1)*(2*1 +1)*(2*2 +1))/2) ? ci : -1); |
419 | |
420 | shX_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3); |
421 | shY_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3+1); |
422 | shZ_S = gmx_simd_load1_r_mm_load1_ps(shiftvec+ish3+2); |
423 | |
424 | #if UNROLLJ(4/1) <= 4 |
425 | sci = ci*STRIDE(4/1); |
426 | scix = sci*DIM3; |
427 | sci2 = sci*2; |
428 | #else |
429 | sci = (ci>>1)*STRIDE(4/1); |
430 | scix = sci*DIM3 + (ci & 1)*(STRIDE(4/1)>>1); |
431 | sci2 = sci*2 + (ci & 1)*(STRIDE(4/1)>>1); |
432 | sci += (ci & 1)*(STRIDE(4/1)>>1); |
433 | #endif |
434 | |
435 | |
436 | |
437 | |
438 | |
439 | |
440 | |
441 | do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0)(1<<(7+3*(0)))); |
442 | do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0)(1<<(9+3*(0)))); |
443 | half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)(1<<(8+3*(0)))) || !do_LJ) && do_coul; |
444 | #ifdef LJ_EWALD_GEOM |
445 | do_self = TRUE1; |
446 | #else |
447 | do_self = do_coul; |
448 | #endif |
449 | |
450 | #ifdef ENERGY_GROUPS |
451 | egps_i = nbat->energrp[ci]; |
452 | { |
453 | int ia, egp_ia; |
454 | |
455 | for (ia = 0; ia < UNROLLI4; ia++) |
456 | { |
457 | egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask; |
458 | vvdwtp[ia] = Vvdw + egp_ia*Vstride_i; |
459 | vctp[ia] = Vc + egp_ia*Vstride_i; |
460 | } |
461 | } |
462 | #endif |
463 | |
464 | #ifdef CALC_ENERGIES |
465 | #if UNROLLJ(4/1) == 4 |
466 | if (do_self && l_cj[nbln->cj_ind_start].cj == ci_sh) |
467 | #endif |
468 | #if UNROLLJ(4/1) == 2 |
469 | if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1)) |
470 | #endif |
471 | #if UNROLLJ(4/1) == 8 |
472 | if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1)) |
473 | #endif |
474 | { |
475 | if (do_coul) |
476 | { |
477 | real Vc_sub_self; |
478 | int ia; |
479 | |
480 | #ifdef CALC_COUL_RF |
481 | Vc_sub_self = 0.5*ic->c_rf; |
482 | #endif |
483 | #ifdef CALC_COUL_TAB |
484 | #ifdef TAB_FDV0 |
485 | Vc_sub_self = 0.5*tab_coul_F[2]; |
486 | #else |
487 | Vc_sub_self = 0.5*tab_coul_V[0]; |
488 | #endif |
489 | #endif |
490 | #ifdef CALC_COUL_EWALD |
491 | |
492 | Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI1.12837916709551257390; |
493 | #endif |
494 | |
495 | for (ia = 0; ia < UNROLLI4; ia++) |
496 | { |
497 | real qi; |
498 | |
499 | qi = q[sci+ia]; |
500 | #ifdef ENERGY_GROUPS |
501 | vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride] |
502 | #else |
503 | Vc[0] |
504 | #endif |
505 | -= facel*qi*qi*Vc_sub_self; |
506 | } |
507 | } |
508 | |
509 | #ifdef LJ_EWALD_GEOM |
510 | { |
511 | int ia; |
512 | |
513 | for (ia = 0; ia < UNROLLI4; ia++) |
514 | { |
515 | real c6_i; |
516 | |
517 | c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6; |
518 | #ifdef ENERGY_GROUPS |
519 | vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride] |
520 | #else |
521 | Vvdw[0] |
522 | #endif |
523 | += 0.5*c6_i*lj_ewaldcoeff6_6; |
524 | } |
525 | } |
526 | #endif /* LJ_EWALD_GEOM */ |
527 | } |
528 | #endif |
529 | |
530 | |
531 | sciy = scix + STRIDE(4/1); |
532 | sciz = sciy + STRIDE(4/1); |
533 | ix_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix), shX_S); |
534 | ix_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+1), shX_S); |
535 | ix_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+2), shX_S); |
536 | ix_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+scix+3), shX_S); |
537 | iy_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy), shY_S); |
538 | iy_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+1), shY_S); |
539 | iy_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+2), shY_S); |
540 | iy_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciy+3), shY_S); |
541 | iz_S0 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz), shZ_S); |
542 | iz_S1 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+1), shZ_S); |
543 | iz_S2 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+2), shZ_S); |
544 | iz_S3 = gmx_simd_add_r_mm_add_ps(gmx_simd_load1_r_mm_load1_ps(x+sciz+3), shZ_S); |
545 | |
546 | if (do_coul) |
547 | { |
548 | iq_S0 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci]); |
549 | iq_S1 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+1]); |
550 | iq_S2 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+2]); |
551 | iq_S3 = gmx_simd_set1_r_mm_set1_ps(facel*q[sci+3]); |
552 | } |
553 | |
554 | #ifdef LJ_COMB_LB |
555 | hsig_i_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0); |
556 | hsig_i_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1); |
557 | hsig_i_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2); |
558 | hsig_i_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3); |
559 | seps_i_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+0); |
560 | seps_i_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+1); |
561 | seps_i_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+2); |
562 | seps_i_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+3); |
563 | #else |
564 | #ifdef LJ_COMB_GEOM |
565 | c6s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0); |
566 | c6s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1); |
567 | if (!half_LJ) |
568 | { |
569 | c6s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2); |
570 | c6s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3); |
571 | } |
572 | c12s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+0); |
573 | c12s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+1); |
574 | if (!half_LJ) |
575 | { |
576 | c12s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+2); |
577 | c12s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+STRIDE(4/1)+3); |
578 | } |
579 | #else |
580 | nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride; |
581 | nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride; |
582 | if (!half_LJ) |
583 | { |
584 | nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride; |
585 | nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride; |
586 | } |
587 | #endif |
588 | #endif |
589 | #ifdef LJ_EWALD_GEOM |
590 | |
591 | c6s_S0 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+0); |
592 | c6s_S1 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+1); |
593 | if (!half_LJ) |
594 | { |
595 | c6s_S2 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+2); |
596 | c6s_S3 = gmx_simd_load1_r_mm_load1_ps(ljc+sci2+3); |
597 | } |
598 | #endif |
599 | |
600 | |
601 | Vvdwtot_S = gmx_simd_setzero_r_mm_setzero_ps(); |
602 | vctot_S = gmx_simd_setzero_r_mm_setzero_ps(); |
603 | |
604 | |
605 | fix_S0 = gmx_simd_setzero_r_mm_setzero_ps(); |
606 | fix_S1 = gmx_simd_setzero_r_mm_setzero_ps(); |
607 | fix_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
608 | fix_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
609 | fiy_S0 = gmx_simd_setzero_r_mm_setzero_ps(); |
610 | fiy_S1 = gmx_simd_setzero_r_mm_setzero_ps(); |
611 | fiy_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
612 | fiy_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
613 | fiz_S0 = gmx_simd_setzero_r_mm_setzero_ps(); |
614 | fiz_S1 = gmx_simd_setzero_r_mm_setzero_ps(); |
615 | fiz_S2 = gmx_simd_setzero_r_mm_setzero_ps(); |
616 | fiz_S3 = gmx_simd_setzero_r_mm_setzero_ps(); |
617 | |
618 | cjind = cjind0; |
619 | |
620 | |
621 | #define CALC_LJ |
622 | if (half_LJ) |
623 | { |
624 | |
625 | #define CALC_COULOMB |
626 | #define HALF_LJ |
627 | #define CHECK_EXCLS |
628 | while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL) |
629 | { |
630 | #include "nbnxn_kernel_simd_4xn_inner.h" |
631 | cjind++; |
632 | } |
633 | #undef CHECK_EXCLS |
634 | for (; (cjind < cjind1); cjind++) |
635 | { |
636 | #include "nbnxn_kernel_simd_4xn_inner.h" |
637 | } |
638 | #undef HALF_LJ |
639 | #undef CALC_COULOMB |
640 | } |
641 | else if (do_coul) |
642 | { |
643 | |
644 | #define CALC_COULOMB |
645 | #define CHECK_EXCLS |
646 | while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL) |
647 | { |
648 | #include "nbnxn_kernel_simd_4xn_inner.h" |
649 | cjind++; |
650 | } |
651 | #undef CHECK_EXCLS |
652 | for (; (cjind < cjind1); cjind++) |
653 | { |
654 | #include "nbnxn_kernel_simd_4xn_inner.h" |
655 | } |
656 | #undef CALC_COULOMB |
657 | } |
658 | else |
659 | { |
660 | |
661 | #define CHECK_EXCLS |
662 | while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL) |
663 | { |
664 | #include "nbnxn_kernel_simd_4xn_inner.h" |
665 | cjind++; |
666 | } |
667 | #undef CHECK_EXCLS |
668 | for (; (cjind < cjind1); cjind++) |
669 | { |
670 | #include "nbnxn_kernel_simd_4xn_inner.h" |
671 | } |
672 | } |
673 | #undef CALC_LJ |
674 | ninner += cjind1 - cjind0; |
675 | |
676 | |
677 | #if UNROLLJ(4/1) >= 4 |
678 | fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3); |
679 | gmx_simd4_store_r_mm_store_ps(f+scix, gmx_simd4_add_r_mm_add_ps(fix_S, gmx_simd4_load_r_mm_load_ps(f+scix))); |
680 | |
681 | fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3); |
682 | gmx_simd4_store_r_mm_store_ps(f+sciy, gmx_simd4_add_r_mm_add_ps(fiy_S, gmx_simd4_load_r_mm_load_ps(f+sciy))); |
683 | |
684 | fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3); |
685 | gmx_simd4_store_r_mm_store_ps(f+sciz, gmx_simd4_add_r_mm_add_ps(fiz_S, gmx_simd4_load_r_mm_load_ps(f+sciz))); |
686 | |
687 | #ifdef CALC_SHIFTFORCES |
688 | fshift[ish3+0] += gmx_simd4_reduce_r(fix_S)gmx_simd_reduce_f_sse4_1(fix_S); |
689 | fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S)gmx_simd_reduce_f_sse4_1(fiy_S); |
690 | fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S)gmx_simd_reduce_f_sse4_1(fiz_S); |
691 | #endif |
692 | #else |
693 | fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1); |
694 | gmx_simd_store_r_mm_store_ps(f+scix, gmx_simd_add_r_mm_add_ps(fix0_S, gmx_simd_load_r_mm_load_ps(f+scix))); |
695 | fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3); |
696 | gmx_simd_store_r_mm_store_ps(f+scix+2, gmx_simd_add_r_mm_add_ps(fix2_S, gmx_simd_load_r_mm_load_ps(f+scix+2))); |
697 | |
698 | fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1); |
699 | gmx_simd_store_r_mm_store_ps(f+sciy, gmx_simd_add_r_mm_add_ps(fiy0_S, gmx_simd_load_r_mm_load_ps(f+sciy))); |
700 | fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3); |
701 | gmx_simd_store_r_mm_store_ps(f+sciy+2, gmx_simd_add_r_mm_add_ps(fiy2_S, gmx_simd_load_r_mm_load_ps(f+sciy+2))); |
702 | |
703 | fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1); |
704 | gmx_simd_store_r_mm_store_ps(f+sciz, gmx_simd_add_r_mm_add_ps(fiz0_S, gmx_simd_load_r_mm_load_ps(f+sciz))); |
705 | fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3); |
706 | gmx_simd_store_r_mm_store_ps(f+sciz+2, gmx_simd_add_r_mm_add_ps(fiz2_S, gmx_simd_load_r_mm_load_ps(f+sciz+2))); |
707 | |
708 | #ifdef CALC_SHIFTFORCES |
709 | fshift[ish3+0] += gmx_simd_reduce_r(gmx_simd_add_r(fix0_S, fix2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fix0_S, fix2_S)); |
710 | fshift[ish3+1] += gmx_simd_reduce_r(gmx_simd_add_r(fiy0_S, fiy2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fiy0_S, fiy2_S)); |
711 | fshift[ish3+2] += gmx_simd_reduce_r(gmx_simd_add_r(fiz0_S, fiz2_S))gmx_simd_reduce_f_sse4_1(_mm_add_ps(fiz0_S, fiz2_S)); |
712 | #endif |
713 | #endif |
714 | |
715 | #ifdef CALC_ENERGIES |
716 | if (do_coul) |
717 | { |
718 | *Vc += gmx_simd_reduce_r(vctot_S)gmx_simd_reduce_f_sse4_1(vctot_S); |
719 | } |
720 | |
721 | *Vvdw += gmx_simd_reduce_r(Vvdwtot_S)gmx_simd_reduce_f_sse4_1(Vvdwtot_S); |
722 | #endif |
723 | |
724 | |
725 | } |
726 | |
727 | #ifdef COUNT_PAIRS |
728 | printf("atom pairs %d\n", npair); |
729 | #endif |
730 | } |