File: | gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h |
Location: | line 748, column 5 |
Description: | Value stored to 'rsw2_r_S1' is never read |
1 | /* |
2 | * This file is part of the GROMACS molecular simulation package. |
3 | * |
4 | * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by |
5 | * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, |
6 | * and including many others, as listed in the AUTHORS file in the |
7 | * top-level source directory and at http://www.gromacs.org. |
8 | * |
9 | * GROMACS is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU Lesser General Public License |
11 | * as published by the Free Software Foundation; either version 2.1 |
12 | * of the License, or (at your option) any later version. |
13 | * |
14 | * GROMACS is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | * Lesser General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU Lesser General Public |
20 | * License along with GROMACS; if not, see |
21 | * http://www.gnu.org/licenses, or write to the Free Software Foundation, |
22 | * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
23 | * |
24 | * If you want to redistribute modifications to GROMACS, please |
25 | * consider that scientific software is very special. Version |
26 | * control is crucial - bugs must be traceable. We will be happy to |
27 | * consider code for inclusion in the official distribution, but |
28 | * derived work must not be called official GROMACS. Details are found |
29 | * in the README & COPYING files - if they are missing, get the |
30 | * official version at http://www.gromacs.org. |
31 | * |
32 | * To help us fund GROMACS development, we humbly ask that you cite |
33 | * the research papers on the package. Check out http://www.gromacs.org. |
34 | */ |
35 | |
36 | /* This is the innermost loop contents for the 4 x N atom SIMD kernel. |
37 | * This flavor of the kernel calculates interactions of 4 i-atoms |
38 | * with N j-atoms stored in N wide SIMD registers. |
39 | */ |
40 | |
41 | |
42 | /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ |
43 | * forces on excluded atom pairs here in the non-bonded loops. |
44 | * But when energies and/or virial is required we calculate them |
45 | * separately to as then it is easier to separate the energy and virial |
46 | * contributions. |
47 | */ |
48 | #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM) |
49 | #define EXCL_FORCES |
50 | #endif |
51 | |
52 | /* Without exclusions and energies we only need to mask the cut-off, |
53 | * this can be faster when we have defined gmx_simd_blendv_r, i.e. an instruction |
54 | * that selects from two SIMD registers based on the contents of a third. |
55 | */ |
56 | #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV |
57 | /* With RF and tabulated Coulomb we replace cmp+and with sub+blendv. |
58 | * With gcc this is slower, except for RF on Sandy Bridge. |
59 | * Tested with gcc 4.6.2, 4.6.3 and 4.7.1. |
60 | */ |
61 | #if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__4 || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER)) |
62 | #define NBNXN_CUTOFF_USE_BLENDV |
63 | #endif |
64 | /* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv. |
65 | * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7). |
66 | * Tested with icc 13. |
67 | */ |
68 | #if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER |
69 | #define NBNXN_CUTOFF_USE_BLENDV |
70 | #endif |
71 | #endif |
72 | |
73 | { |
74 | int cj, aj, ajx, ajy, ajz; |
75 | |
76 | #ifdef ENERGY_GROUPS |
77 | /* Energy group indices for two atoms packed into one int */ |
78 | int egp_jj[UNROLLJ(4/1)/2]; |
79 | #endif |
80 | |
81 | #ifdef CHECK_EXCLS |
82 | /* Interaction (non-exclusion) mask of all 1's or 0's */ |
83 | gmx_simd_bool_t__m128 interact_S0; |
84 | gmx_simd_bool_t__m128 interact_S1; |
85 | gmx_simd_bool_t__m128 interact_S2; |
86 | gmx_simd_bool_t__m128 interact_S3; |
87 | #endif |
88 | |
89 | gmx_simd_real_t__m128 jx_S, jy_S, jz_S; |
90 | gmx_simd_real_t__m128 dx_S0, dy_S0, dz_S0; |
91 | gmx_simd_real_t__m128 dx_S1, dy_S1, dz_S1; |
92 | gmx_simd_real_t__m128 dx_S2, dy_S2, dz_S2; |
93 | gmx_simd_real_t__m128 dx_S3, dy_S3, dz_S3; |
94 | gmx_simd_real_t__m128 tx_S0, ty_S0, tz_S0; |
95 | gmx_simd_real_t__m128 tx_S1, ty_S1, tz_S1; |
96 | gmx_simd_real_t__m128 tx_S2, ty_S2, tz_S2; |
97 | gmx_simd_real_t__m128 tx_S3, ty_S3, tz_S3; |
98 | gmx_simd_real_t__m128 rsq_S0, rinv_S0, rinvsq_S0; |
99 | gmx_simd_real_t__m128 rsq_S1, rinv_S1, rinvsq_S1; |
100 | gmx_simd_real_t__m128 rsq_S2, rinv_S2, rinvsq_S2; |
101 | gmx_simd_real_t__m128 rsq_S3, rinv_S3, rinvsq_S3; |
102 | #ifndef NBNXN_CUTOFF_USE_BLENDV |
103 | /* wco: within cut-off, mask of all 1's or 0's */ |
104 | gmx_simd_bool_t__m128 wco_S0; |
105 | gmx_simd_bool_t__m128 wco_S1; |
106 | gmx_simd_bool_t__m128 wco_S2; |
107 | gmx_simd_bool_t__m128 wco_S3; |
108 | #endif |
109 | #ifdef VDW_CUTOFF_CHECK |
110 | gmx_simd_bool_t__m128 wco_vdw_S0; |
111 | gmx_simd_bool_t__m128 wco_vdw_S1; |
112 | #ifndef HALF_LJ |
113 | gmx_simd_bool_t__m128 wco_vdw_S2; |
114 | gmx_simd_bool_t__m128 wco_vdw_S3; |
115 | #endif |
116 | #endif |
117 | |
118 | #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH |
119 | gmx_simd_real_t__m128 r_S0; |
120 | gmx_simd_real_t__m128 r_S1; |
121 | gmx_simd_real_t__m128 r_S2; |
122 | gmx_simd_real_t__m128 r_S3; |
123 | #endif |
124 | |
125 | #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH |
126 | gmx_simd_real_t__m128 rsw_S0, rsw2_S0, rsw2_r_S0; |
127 | gmx_simd_real_t__m128 rsw_S1, rsw2_S1, rsw2_r_S1; |
128 | #ifndef HALF_LJ |
129 | gmx_simd_real_t__m128 rsw_S2, rsw2_S2, rsw2_r_S2; |
130 | gmx_simd_real_t__m128 rsw_S3, rsw2_S3, rsw2_r_S3; |
131 | #endif |
132 | #endif |
133 | |
134 | #ifdef CALC_COULOMB |
135 | #ifdef CHECK_EXCLS |
136 | /* 1/r masked with the interaction mask */ |
137 | gmx_simd_real_t__m128 rinv_ex_S0; |
138 | gmx_simd_real_t__m128 rinv_ex_S1; |
139 | gmx_simd_real_t__m128 rinv_ex_S2; |
140 | gmx_simd_real_t__m128 rinv_ex_S3; |
141 | #endif |
142 | gmx_simd_real_t__m128 jq_S; |
143 | gmx_simd_real_t__m128 qq_S0; |
144 | gmx_simd_real_t__m128 qq_S1; |
145 | gmx_simd_real_t__m128 qq_S2; |
146 | gmx_simd_real_t__m128 qq_S3; |
147 | #ifdef CALC_COUL_TAB |
148 | /* The force (PME mesh force) we need to subtract from 1/r^2 */ |
149 | gmx_simd_real_t__m128 fsub_S0; |
150 | gmx_simd_real_t__m128 fsub_S1; |
151 | gmx_simd_real_t__m128 fsub_S2; |
152 | gmx_simd_real_t__m128 fsub_S3; |
153 | #endif |
154 | #ifdef CALC_COUL_EWALD |
155 | gmx_simd_real_t__m128 brsq_S0, brsq_S1, brsq_S2, brsq_S3; |
156 | gmx_simd_real_t__m128 ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3; |
157 | #endif |
158 | |
159 | /* frcoul = (1/r - fsub)*r */ |
160 | gmx_simd_real_t__m128 frcoul_S0; |
161 | gmx_simd_real_t__m128 frcoul_S1; |
162 | gmx_simd_real_t__m128 frcoul_S2; |
163 | gmx_simd_real_t__m128 frcoul_S3; |
164 | #ifdef CALC_COUL_TAB |
165 | /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */ |
166 | gmx_simd_real_t__m128 rs_S0, rf_S0, frac_S0; |
167 | gmx_simd_real_t__m128 rs_S1, rf_S1, frac_S1; |
168 | gmx_simd_real_t__m128 rs_S2, rf_S2, frac_S2; |
169 | gmx_simd_real_t__m128 rs_S3, rf_S3, frac_S3; |
170 | /* Table index: rs truncated to an int */ |
171 | gmx_simd_int32_t__m128i ti_S0, ti_S1, ti_S2, ti_S3; |
172 | /* Linear force table values */ |
173 | gmx_simd_real_t__m128 ctab0_S0, ctab1_S0; |
174 | gmx_simd_real_t__m128 ctab0_S1, ctab1_S1; |
175 | gmx_simd_real_t__m128 ctab0_S2, ctab1_S2; |
176 | gmx_simd_real_t__m128 ctab0_S3, ctab1_S3; |
177 | #ifdef CALC_ENERGIES |
178 | /* Quadratic energy table value */ |
179 | gmx_simd_real_t__m128 ctabv_S0; |
180 | gmx_simd_real_t__m128 ctabv_S1; |
181 | gmx_simd_real_t__m128 ctabv_S2; |
182 | gmx_simd_real_t__m128 ctabv_S3; |
183 | #endif |
184 | #endif |
185 | #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) |
186 | /* The potential (PME mesh) we need to subtract from 1/r */ |
187 | gmx_simd_real_t__m128 vc_sub_S0; |
188 | gmx_simd_real_t__m128 vc_sub_S1; |
189 | gmx_simd_real_t__m128 vc_sub_S2; |
190 | gmx_simd_real_t__m128 vc_sub_S3; |
191 | #endif |
192 | #ifdef CALC_ENERGIES |
193 | /* Electrostatic potential */ |
194 | gmx_simd_real_t__m128 vcoul_S0; |
195 | gmx_simd_real_t__m128 vcoul_S1; |
196 | gmx_simd_real_t__m128 vcoul_S2; |
197 | gmx_simd_real_t__m128 vcoul_S3; |
198 | #endif |
199 | #endif |
200 | /* The force times 1/r */ |
201 | gmx_simd_real_t__m128 fscal_S0; |
202 | gmx_simd_real_t__m128 fscal_S1; |
203 | gmx_simd_real_t__m128 fscal_S2; |
204 | gmx_simd_real_t__m128 fscal_S3; |
205 | |
206 | #ifdef CALC_LJ |
207 | #ifdef LJ_COMB_LB |
208 | /* LJ sigma_j/2 and sqrt(epsilon_j) */ |
209 | gmx_simd_real_t__m128 hsig_j_S, seps_j_S; |
210 | /* LJ sigma_ij and epsilon_ij */ |
211 | gmx_simd_real_t__m128 sig_S0, eps_S0; |
212 | gmx_simd_real_t__m128 sig_S1, eps_S1; |
213 | #ifndef HALF_LJ |
214 | gmx_simd_real_t__m128 sig_S2, eps_S2; |
215 | gmx_simd_real_t__m128 sig_S3, eps_S3; |
216 | #endif |
217 | #ifdef CALC_ENERGIES |
218 | gmx_simd_real_t__m128 sig2_S0, sig6_S0; |
219 | gmx_simd_real_t__m128 sig2_S1, sig6_S1; |
220 | #ifndef HALF_LJ |
221 | gmx_simd_real_t__m128 sig2_S2, sig6_S2; |
222 | gmx_simd_real_t__m128 sig2_S3, sig6_S3; |
223 | #endif |
224 | #endif /* LJ_COMB_LB */ |
225 | #endif /* CALC_LJ */ |
226 | |
227 | #ifdef LJ_COMB_GEOM |
228 | gmx_simd_real_t__m128 c6s_j_S, c12s_j_S; |
229 | #endif |
230 | |
231 | #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM |
232 | /* Index for loading LJ parameters, complicated when interleaving */ |
233 | int aj2; |
234 | #endif |
235 | |
236 | #ifndef FIX_LJ_C |
237 | /* LJ C6 and C12 parameters, used with geometric comb. rule */ |
238 | gmx_simd_real_t__m128 c6_S0, c12_S0; |
239 | gmx_simd_real_t__m128 c6_S1, c12_S1; |
240 | #ifndef HALF_LJ |
241 | gmx_simd_real_t__m128 c6_S2, c12_S2; |
242 | gmx_simd_real_t__m128 c6_S3, c12_S3; |
243 | #endif |
244 | #endif |
245 | |
246 | /* Intermediate variables for LJ calculation */ |
247 | #ifndef LJ_COMB_LB |
248 | gmx_simd_real_t__m128 rinvsix_S0; |
249 | gmx_simd_real_t__m128 rinvsix_S1; |
250 | #ifndef HALF_LJ |
251 | gmx_simd_real_t__m128 rinvsix_S2; |
252 | gmx_simd_real_t__m128 rinvsix_S3; |
253 | #endif |
254 | #endif |
255 | #ifdef LJ_COMB_LB |
256 | gmx_simd_real_t__m128 sir_S0, sir2_S0, sir6_S0; |
257 | gmx_simd_real_t__m128 sir_S1, sir2_S1, sir6_S1; |
258 | #ifndef HALF_LJ |
259 | gmx_simd_real_t__m128 sir_S2, sir2_S2, sir6_S2; |
260 | gmx_simd_real_t__m128 sir_S3, sir2_S3, sir6_S3; |
261 | #endif |
262 | #endif |
263 | |
264 | gmx_simd_real_t__m128 FrLJ6_S0, FrLJ12_S0, frLJ_S0; |
265 | gmx_simd_real_t__m128 FrLJ6_S1, FrLJ12_S1, frLJ_S1; |
266 | #ifndef HALF_LJ |
267 | gmx_simd_real_t__m128 FrLJ6_S2, FrLJ12_S2, frLJ_S2; |
268 | gmx_simd_real_t__m128 FrLJ6_S3, FrLJ12_S3, frLJ_S3; |
269 | #endif |
270 | #if defined CALC_ENERGIES || defined LJ_POT_SWITCH |
271 | gmx_simd_real_t__m128 VLJ6_S0, VLJ12_S0, VLJ_S0; |
272 | gmx_simd_real_t__m128 VLJ6_S1, VLJ12_S1, VLJ_S1; |
273 | #ifndef HALF_LJ |
274 | gmx_simd_real_t__m128 VLJ6_S2, VLJ12_S2, VLJ_S2; |
275 | gmx_simd_real_t__m128 VLJ6_S3, VLJ12_S3, VLJ_S3; |
276 | #endif |
277 | #endif |
278 | #endif /* CALC_LJ */ |
279 | |
280 | /* j-cluster index */ |
281 | cj = l_cj[cjind].cj; |
282 | |
283 | /* Atom indices (of the first atom in the cluster) */ |
284 | aj = cj*UNROLLJ(4/1); |
285 | #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM) |
286 | #if UNROLLJ(4/1) == STRIDE(4/1) |
287 | aj2 = aj*2; |
288 | #else |
289 | aj2 = (cj>>1)*2*STRIDE(4/1) + (cj & 1)*UNROLLJ(4/1); |
290 | #endif |
291 | #endif |
292 | #if UNROLLJ(4/1) == STRIDE(4/1) |
293 | ajx = aj*DIM3; |
294 | #else |
295 | ajx = (cj>>1)*DIM3*STRIDE(4/1) + (cj & 1)*UNROLLJ(4/1); |
296 | #endif |
297 | ajy = ajx + STRIDE(4/1); |
298 | ajz = ajy + STRIDE(4/1); |
299 | |
300 | #ifdef CHECK_EXCLS |
301 | gmx_load_simd_4xn_interactions(l_cj[cjind].excl, |
302 | filter_S0, filter_S1, |
303 | filter_S2, filter_S3, |
304 | #ifdef GMX_SIMD_IBM_QPX |
305 | l_cj[cjind].interaction_mask_indices, |
306 | nbat->simd_interaction_array, |
307 | #else |
308 | /* The struct fields do not exist |
309 | except on BlueGene/Q */ |
310 | NULL((void*)0), |
311 | NULL((void*)0), |
312 | #endif |
313 | &interact_S0, &interact_S1, |
314 | &interact_S2, &interact_S3); |
315 | #endif /* CHECK_EXCLS */ |
316 | |
317 | /* load j atom coordinates */ |
318 | jx_S = gmx_simd_load_r_mm_load_ps(x+ajx); |
319 | jy_S = gmx_simd_load_r_mm_load_ps(x+ajy); |
320 | jz_S = gmx_simd_load_r_mm_load_ps(x+ajz); |
321 | |
322 | /* Calculate distance */ |
323 | dx_S0 = gmx_simd_sub_r_mm_sub_ps(ix_S0, jx_S); |
324 | dy_S0 = gmx_simd_sub_r_mm_sub_ps(iy_S0, jy_S); |
325 | dz_S0 = gmx_simd_sub_r_mm_sub_ps(iz_S0, jz_S); |
326 | dx_S1 = gmx_simd_sub_r_mm_sub_ps(ix_S1, jx_S); |
327 | dy_S1 = gmx_simd_sub_r_mm_sub_ps(iy_S1, jy_S); |
328 | dz_S1 = gmx_simd_sub_r_mm_sub_ps(iz_S1, jz_S); |
329 | dx_S2 = gmx_simd_sub_r_mm_sub_ps(ix_S2, jx_S); |
330 | dy_S2 = gmx_simd_sub_r_mm_sub_ps(iy_S2, jy_S); |
331 | dz_S2 = gmx_simd_sub_r_mm_sub_ps(iz_S2, jz_S); |
332 | dx_S3 = gmx_simd_sub_r_mm_sub_ps(ix_S3, jx_S); |
333 | dy_S3 = gmx_simd_sub_r_mm_sub_ps(iy_S3, jy_S); |
334 | dz_S3 = gmx_simd_sub_r_mm_sub_ps(iz_S3, jz_S); |
335 | |
336 | /* rsq = dx*dx+dy*dy+dz*dz */ |
337 | rsq_S0 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S0, dy_S0, dz_S0); |
338 | rsq_S1 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S1, dy_S1, dz_S1); |
339 | rsq_S2 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S2, dy_S2, dz_S2); |
340 | rsq_S3 = gmx_simd_calc_rsq_rgmx_simd_norm2_f(dx_S3, dy_S3, dz_S3); |
341 | |
342 | #ifndef NBNXN_CUTOFF_USE_BLENDV |
343 | wco_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S0, rc2_S); |
344 | wco_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S1, rc2_S); |
345 | wco_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S2, rc2_S); |
346 | wco_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S3, rc2_S); |
347 | #endif |
348 | |
349 | #ifdef CHECK_EXCLS |
350 | #ifdef EXCL_FORCES |
351 | /* Only remove the (sub-)diagonal to avoid double counting */ |
352 | #if UNROLLJ(4/1) == UNROLLI4 |
353 | if (cj == ci_sh) |
354 | { |
355 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask_S0); |
356 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask_S1); |
357 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask_S2); |
358 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask_S3); |
359 | } |
360 | #else |
361 | #if UNROLLJ(4/1) < UNROLLI4 |
362 | if (cj == ci_sh*2) |
363 | { |
364 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask0_S0); |
365 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask0_S1); |
366 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask0_S2); |
367 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask0_S3); |
368 | } |
369 | if (cj == ci_sh*2 + 1) |
370 | { |
371 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask1_S0); |
372 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask1_S1); |
373 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask1_S2); |
374 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask1_S3); |
375 | } |
376 | #else |
377 | if (cj*2 == ci_sh) |
378 | { |
379 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask0_S0); |
380 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask0_S1); |
381 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask0_S2); |
382 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask0_S3); |
383 | } |
384 | else if (cj*2 + 1 == ci_sh) |
385 | { |
386 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, diagonal_mask1_S0); |
387 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, diagonal_mask1_S1); |
388 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, diagonal_mask1_S2); |
389 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, diagonal_mask1_S3); |
390 | } |
391 | #endif |
392 | #endif |
393 | #else /* EXCL_FORCES */ |
394 | /* No exclusion forces: remove all excluded atom pairs from the list */ |
395 | wco_S0 = gmx_simd_and_b_mm_and_ps(wco_S0, interact_S0); |
396 | wco_S1 = gmx_simd_and_b_mm_and_ps(wco_S1, interact_S1); |
397 | wco_S2 = gmx_simd_and_b_mm_and_ps(wco_S2, interact_S2); |
398 | wco_S3 = gmx_simd_and_b_mm_and_ps(wco_S3, interact_S3); |
399 | #endif |
400 | #endif |
401 | |
402 | #ifdef COUNT_PAIRS |
403 | { |
404 | int i, j; |
405 | real tmpa[2*GMX_SIMD_REAL_WIDTH4], *tmp; |
406 | tmp = gmx_simd_align_rgmx_simd_align_f(tmpa); |
407 | for (i = 0; i < UNROLLI4; i++) |
408 | { |
409 | gmx_simd_store_r_mm_store_ps(tmp, gmx_simd_sub_r_mm_sub_ps(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3)))); |
410 | for (j = 0; j < UNROLLJ(4/1); j++) |
411 | { |
412 | if (tmp[j] >= 0) |
413 | { |
414 | npair++; |
415 | } |
416 | } |
417 | } |
418 | } |
419 | #endif |
420 | |
421 | #ifdef CHECK_EXCLS |
422 | /* For excluded pairs add a small number to avoid r^-6 = NaN */ |
423 | rsq_S0 = gmx_simd_add_r_mm_add_ps(rsq_S0, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S0)); |
424 | rsq_S1 = gmx_simd_add_r_mm_add_ps(rsq_S1, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S1)); |
425 | rsq_S2 = gmx_simd_add_r_mm_add_ps(rsq_S2, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S2)); |
426 | rsq_S3 = gmx_simd_add_r_mm_add_ps(rsq_S3, gmx_simd_blendv_r_mm_blendv_ps(avoid_sing_S, gmx_simd_setzero_r_mm_setzero_ps(), interact_S3)); |
427 | #endif |
428 | |
429 | /* Calculate 1/r */ |
430 | #ifndef GMX_DOUBLE |
431 | rinv_S0 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S0); |
432 | rinv_S1 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S1); |
433 | rinv_S2 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S2); |
434 | rinv_S3 = gmx_simd_invsqrt_rgmx_simd_invsqrt_f(rsq_S3); |
435 | #else |
436 | gmx_simd_invsqrt_pair_rgmx_simd_invsqrt_pair_f(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1); |
437 | gmx_simd_invsqrt_pair_rgmx_simd_invsqrt_pair_f(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3); |
438 | #endif |
439 | |
440 | #ifdef CALC_COULOMB |
441 | /* Load parameters for j atom */ |
442 | jq_S = gmx_simd_load_r_mm_load_ps(q+aj); |
443 | qq_S0 = gmx_simd_mul_r_mm_mul_ps(iq_S0, jq_S); |
444 | qq_S1 = gmx_simd_mul_r_mm_mul_ps(iq_S1, jq_S); |
445 | qq_S2 = gmx_simd_mul_r_mm_mul_ps(iq_S2, jq_S); |
446 | qq_S3 = gmx_simd_mul_r_mm_mul_ps(iq_S3, jq_S); |
447 | #endif |
448 | |
449 | #ifdef CALC_LJ |
450 | |
451 | #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C |
452 | load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0); |
453 | load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1); |
454 | #ifndef HALF_LJ |
455 | load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2); |
456 | load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3); |
457 | #endif |
458 | #endif /* not defined any LJ rule */ |
459 | |
460 | #ifdef LJ_COMB_GEOM |
461 | c6s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0); |
462 | c12s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+STRIDE(4/1)); |
463 | c6_S0 = gmx_simd_mul_r_mm_mul_ps(c6s_S0, c6s_j_S ); |
464 | c6_S1 = gmx_simd_mul_r_mm_mul_ps(c6s_S1, c6s_j_S ); |
465 | #ifndef HALF_LJ |
466 | c6_S2 = gmx_simd_mul_r_mm_mul_ps(c6s_S2, c6s_j_S ); |
467 | c6_S3 = gmx_simd_mul_r_mm_mul_ps(c6s_S3, c6s_j_S ); |
468 | #endif |
469 | c12_S0 = gmx_simd_mul_r_mm_mul_ps(c12s_S0, c12s_j_S); |
470 | c12_S1 = gmx_simd_mul_r_mm_mul_ps(c12s_S1, c12s_j_S); |
471 | #ifndef HALF_LJ |
472 | c12_S2 = gmx_simd_mul_r_mm_mul_ps(c12s_S2, c12s_j_S); |
473 | c12_S3 = gmx_simd_mul_r_mm_mul_ps(c12s_S3, c12s_j_S); |
474 | #endif |
475 | #endif /* LJ_COMB_GEOM */ |
476 | |
477 | #ifdef LJ_COMB_LB |
478 | hsig_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0); |
479 | seps_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+STRIDE(4/1)); |
480 | |
481 | sig_S0 = gmx_simd_add_r_mm_add_ps(hsig_i_S0, hsig_j_S); |
482 | sig_S1 = gmx_simd_add_r_mm_add_ps(hsig_i_S1, hsig_j_S); |
483 | eps_S0 = gmx_simd_mul_r_mm_mul_ps(seps_i_S0, seps_j_S); |
484 | eps_S1 = gmx_simd_mul_r_mm_mul_ps(seps_i_S1, seps_j_S); |
485 | #ifndef HALF_LJ |
486 | sig_S2 = gmx_simd_add_r_mm_add_ps(hsig_i_S2, hsig_j_S); |
487 | sig_S3 = gmx_simd_add_r_mm_add_ps(hsig_i_S3, hsig_j_S); |
488 | eps_S2 = gmx_simd_mul_r_mm_mul_ps(seps_i_S2, seps_j_S); |
489 | eps_S3 = gmx_simd_mul_r_mm_mul_ps(seps_i_S3, seps_j_S); |
490 | #endif |
491 | #endif /* LJ_COMB_LB */ |
492 | |
493 | #endif /* CALC_LJ */ |
494 | |
495 | #ifndef NBNXN_CUTOFF_USE_BLENDV |
496 | rinv_S0 = gmx_simd_blendzero_r_mm_and_ps(rinv_S0, wco_S0); |
497 | rinv_S1 = gmx_simd_blendzero_r_mm_and_ps(rinv_S1, wco_S1); |
498 | rinv_S2 = gmx_simd_blendzero_r_mm_and_ps(rinv_S2, wco_S2); |
499 | rinv_S3 = gmx_simd_blendzero_r_mm_and_ps(rinv_S3, wco_S3); |
500 | #else |
501 | /* We only need to mask for the cut-off: blendv is faster */ |
502 | rinv_S0 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S0, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S0)); |
503 | rinv_S1 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S1, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S1)); |
504 | rinv_S2 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S2, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S2)); |
505 | rinv_S3 = gmx_simd_blendv_r_mm_blendv_ps(rinv_S3, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S3)); |
506 | #endif |
507 | |
508 | rinvsq_S0 = gmx_simd_mul_r_mm_mul_ps(rinv_S0, rinv_S0); |
509 | rinvsq_S1 = gmx_simd_mul_r_mm_mul_ps(rinv_S1, rinv_S1); |
510 | rinvsq_S2 = gmx_simd_mul_r_mm_mul_ps(rinv_S2, rinv_S2); |
511 | rinvsq_S3 = gmx_simd_mul_r_mm_mul_ps(rinv_S3, rinv_S3); |
512 | |
513 | #ifdef CALC_COULOMB |
514 | /* Note that here we calculate force*r, not the usual force/r. |
515 | * This allows avoiding masking the reaction-field contribution, |
516 | * as frcoul is later multiplied by rinvsq which has been |
517 | * masked with the cut-off check. |
518 | */ |
519 | |
520 | #ifdef EXCL_FORCES |
521 | /* Only add 1/r for non-excluded atom pairs */ |
522 | rinv_ex_S0 = gmx_simd_blendzero_r_mm_and_ps(rinv_S0, interact_S0); |
523 | rinv_ex_S1 = gmx_simd_blendzero_r_mm_and_ps(rinv_S1, interact_S1); |
524 | rinv_ex_S2 = gmx_simd_blendzero_r_mm_and_ps(rinv_S2, interact_S2); |
525 | rinv_ex_S3 = gmx_simd_blendzero_r_mm_and_ps(rinv_S3, interact_S3); |
526 | #else |
527 | /* No exclusion forces, we always need 1/r */ |
528 | #define rinv_ex_S0 rinv_S0 |
529 | #define rinv_ex_S1 rinv_S1 |
530 | #define rinv_ex_S2 rinv_S2 |
531 | #define rinv_ex_S3 rinv_S3 |
532 | #endif |
533 | |
534 | #ifdef CALC_COUL_RF |
535 | /* Electrostatic interactions */ |
536 | frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0)_mm_add_ps(_mm_mul_ps(rsq_S0, mrc_3_S), rinv_ex_S0)); |
537 | frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_fmadd_r(rsq_S1, mrc_3_S, rinv_ex_S1)_mm_add_ps(_mm_mul_ps(rsq_S1, mrc_3_S), rinv_ex_S1)); |
538 | frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2)_mm_add_ps(_mm_mul_ps(rsq_S2, mrc_3_S), rinv_ex_S2)); |
539 | frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_fmadd_r(rsq_S3, mrc_3_S, rinv_ex_S3)_mm_add_ps(_mm_mul_ps(rsq_S3, mrc_3_S), rinv_ex_S3)); |
540 | |
541 | #ifdef CALC_ENERGIES |
542 | vcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_add_r_mm_add_ps(rinv_ex_S0, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S0, hrc_3_S), moh_rc_S))); |
543 | vcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_add_r_mm_add_ps(rinv_ex_S1, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S1, hrc_3_S), moh_rc_S))); |
544 | vcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_add_r_mm_add_ps(rinv_ex_S2, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S2, hrc_3_S), moh_rc_S))); |
545 | vcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_add_r_mm_add_ps(rinv_ex_S3, gmx_simd_add_r_mm_add_ps(gmx_simd_mul_r_mm_mul_ps(rsq_S3, hrc_3_S), moh_rc_S))); |
546 | #endif |
547 | #endif |
548 | |
549 | #ifdef CALC_COUL_EWALD |
550 | /* We need to mask (or limit) rsq for the cut-off, |
551 | * as large distances can cause an overflow in gmx_pmecorrF/V. |
552 | */ |
553 | #ifndef NBNXN_CUTOFF_USE_BLENDV |
554 | brsq_S0 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S0, wco_S0)); |
555 | brsq_S1 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S1, wco_S1)); |
556 | brsq_S2 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S2, wco_S2)); |
557 | brsq_S3 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendzero_r_mm_and_ps(rsq_S3, wco_S3)); |
558 | #else |
559 | /* Strangely, putting mul on a separate line is slower (icc 13) */ |
560 | brsq_S0 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S0, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S0))); |
561 | brsq_S1 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S1, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S1))); |
562 | brsq_S2 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S2, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S2))); |
563 | brsq_S3 = gmx_simd_mul_r_mm_mul_ps(beta2_S, gmx_simd_blendv_r_mm_blendv_ps(rsq_S3, zero_S, gmx_simd_sub_r_mm_sub_ps(rc2_S, rsq_S3))); |
564 | #endif |
565 | ewcorr_S0 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S0), beta_S); |
566 | ewcorr_S1 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S1), beta_S); |
567 | ewcorr_S2 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S2), beta_S); |
568 | ewcorr_S3 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrF_rgmx_simd_pmecorrF_f(brsq_S3), beta_S); |
569 | frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0)_mm_add_ps(_mm_mul_ps(ewcorr_S0, brsq_S0), rinv_ex_S0)); |
570 | frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_fmadd_r(ewcorr_S1, brsq_S1, rinv_ex_S1)_mm_add_ps(_mm_mul_ps(ewcorr_S1, brsq_S1), rinv_ex_S1)); |
571 | frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2)_mm_add_ps(_mm_mul_ps(ewcorr_S2, brsq_S2), rinv_ex_S2)); |
572 | frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_fmadd_r(ewcorr_S3, brsq_S3, rinv_ex_S3)_mm_add_ps(_mm_mul_ps(ewcorr_S3, brsq_S3), rinv_ex_S3)); |
573 | |
574 | #ifdef CALC_ENERGIES |
575 | vc_sub_S0 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S0), beta_S); |
576 | vc_sub_S1 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S1), beta_S); |
577 | vc_sub_S2 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S2), beta_S); |
578 | vc_sub_S3 = gmx_simd_mul_r_mm_mul_ps(gmx_simd_pmecorrV_rgmx_simd_pmecorrV_f(brsq_S3), beta_S); |
579 | #endif |
580 | |
581 | #endif /* CALC_COUL_EWALD */ |
582 | |
583 | #ifdef CALC_COUL_TAB |
584 | /* Electrostatic interactions */ |
585 | r_S0 = gmx_simd_mul_r_mm_mul_ps(rsq_S0, rinv_S0); |
586 | r_S1 = gmx_simd_mul_r_mm_mul_ps(rsq_S1, rinv_S1); |
587 | r_S2 = gmx_simd_mul_r_mm_mul_ps(rsq_S2, rinv_S2); |
588 | r_S3 = gmx_simd_mul_r_mm_mul_ps(rsq_S3, rinv_S3); |
589 | /* Convert r to scaled table units */ |
590 | rs_S0 = gmx_simd_mul_r_mm_mul_ps(r_S0, invtsp_S); |
591 | rs_S1 = gmx_simd_mul_r_mm_mul_ps(r_S1, invtsp_S); |
592 | rs_S2 = gmx_simd_mul_r_mm_mul_ps(r_S2, invtsp_S); |
593 | rs_S3 = gmx_simd_mul_r_mm_mul_ps(r_S3, invtsp_S); |
594 | /* Truncate scaled r to an int */ |
595 | ti_S0 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S0); |
596 | ti_S1 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S1); |
597 | ti_S2 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S2); |
598 | ti_S3 = gmx_simd_cvtt_r2i_mm_cvttps_epi32(rs_S3); |
599 | #ifdef GMX_SIMD_HAVE_TRUNC |
600 | /* SSE4.1 trunc is faster than gmx_cvtepi32_ps int->float cast */ |
601 | rf_S0 = gmx_simd_trunc_r(rs_S0)__extension__ ({ __m128 __X = (rs_S0); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x03))); }); |
602 | rf_S1 = gmx_simd_trunc_r(rs_S1)__extension__ ({ __m128 __X = (rs_S1); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x03))); }); |
603 | rf_S2 = gmx_simd_trunc_r(rs_S2)__extension__ ({ __m128 __X = (rs_S2); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x03))); }); |
604 | rf_S3 = gmx_simd_trunc_r(rs_S3)__extension__ ({ __m128 __X = (rs_S3); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x03))); }); |
605 | #else |
606 | rf_S0 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S0); |
607 | rf_S1 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S1); |
608 | rf_S2 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S2); |
609 | rf_S3 = gmx_simd_cvt_i2r_mm_cvtepi32_ps(ti_S3); |
610 | #endif |
611 | frac_S0 = gmx_simd_sub_r_mm_sub_ps(rs_S0, rf_S0); |
612 | frac_S1 = gmx_simd_sub_r_mm_sub_ps(rs_S1, rf_S1); |
613 | frac_S2 = gmx_simd_sub_r_mm_sub_ps(rs_S2, rf_S2); |
614 | frac_S3 = gmx_simd_sub_r_mm_sub_ps(rs_S3, rf_S3); |
615 | |
616 | /* Load and interpolate table forces and possibly energies. |
617 | * Force and energy can be combined in one table, stride 4: FDV0 |
618 | * or in two separate tables with stride 1: F and V |
619 | * Currently single precision uses FDV0, double F and V. |
620 | */ |
621 | #ifndef CALC_ENERGIES |
622 | load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0); |
623 | load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1); |
624 | load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2); |
625 | load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3); |
626 | #else |
627 | #ifdef TAB_FDV0 |
628 | load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0); |
629 | load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1); |
630 | load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2); |
631 | load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3); |
632 | #else |
633 | load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0); |
634 | load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1); |
635 | load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2); |
636 | load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3); |
637 | #endif |
638 | #endif |
639 | fsub_S0 = gmx_simd_add_r_mm_add_ps(ctab0_S0, gmx_simd_mul_r_mm_mul_ps(frac_S0, ctab1_S0)); |
640 | fsub_S1 = gmx_simd_add_r_mm_add_ps(ctab0_S1, gmx_simd_mul_r_mm_mul_ps(frac_S1, ctab1_S1)); |
641 | fsub_S2 = gmx_simd_add_r_mm_add_ps(ctab0_S2, gmx_simd_mul_r_mm_mul_ps(frac_S2, ctab1_S2)); |
642 | fsub_S3 = gmx_simd_add_r_mm_add_ps(ctab0_S3, gmx_simd_mul_r_mm_mul_ps(frac_S3, ctab1_S3)); |
643 | frcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S0, gmx_simd_mul_r_mm_mul_ps(fsub_S0, r_S0))); |
644 | frcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S1, gmx_simd_mul_r_mm_mul_ps(fsub_S1, r_S1))); |
645 | frcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S2, gmx_simd_mul_r_mm_mul_ps(fsub_S2, r_S2))); |
646 | frcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S3, gmx_simd_mul_r_mm_mul_ps(fsub_S3, r_S3))); |
647 | |
648 | #ifdef CALC_ENERGIES |
649 | vc_sub_S0 = gmx_simd_add_r_mm_add_ps(ctabv_S0, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S0), gmx_simd_add_r_mm_add_ps(ctab0_S0, fsub_S0))); |
650 | vc_sub_S1 = gmx_simd_add_r_mm_add_ps(ctabv_S1, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S1), gmx_simd_add_r_mm_add_ps(ctab0_S1, fsub_S1))); |
651 | vc_sub_S2 = gmx_simd_add_r_mm_add_ps(ctabv_S2, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S2), gmx_simd_add_r_mm_add_ps(ctab0_S2, fsub_S2))); |
652 | vc_sub_S3 = gmx_simd_add_r_mm_add_ps(ctabv_S3, gmx_simd_mul_r_mm_mul_ps(gmx_simd_mul_r_mm_mul_ps(mhalfsp_S, frac_S3), gmx_simd_add_r_mm_add_ps(ctab0_S3, fsub_S3))); |
653 | #endif |
654 | #endif /* CALC_COUL_TAB */ |
655 | |
656 | #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB) |
657 | #ifndef NO_SHIFT_EWALD |
658 | /* Add Ewald potential shift to vc_sub for convenience */ |
659 | #ifdef CHECK_EXCLS |
660 | vc_sub_S0 = gmx_simd_add_r_mm_add_ps(vc_sub_S0, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S0)); |
661 | vc_sub_S1 = gmx_simd_add_r_mm_add_ps(vc_sub_S1, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S1)); |
662 | vc_sub_S2 = gmx_simd_add_r_mm_add_ps(vc_sub_S2, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S2)); |
663 | vc_sub_S3 = gmx_simd_add_r_mm_add_ps(vc_sub_S3, gmx_simd_blendzero_r_mm_and_ps(sh_ewald_S, interact_S3)); |
664 | #else |
665 | vc_sub_S0 = gmx_simd_add_r_mm_add_ps(vc_sub_S0, sh_ewald_S); |
666 | vc_sub_S1 = gmx_simd_add_r_mm_add_ps(vc_sub_S1, sh_ewald_S); |
667 | vc_sub_S2 = gmx_simd_add_r_mm_add_ps(vc_sub_S2, sh_ewald_S); |
668 | vc_sub_S3 = gmx_simd_add_r_mm_add_ps(vc_sub_S3, sh_ewald_S); |
669 | #endif |
670 | #endif |
671 | |
672 | vcoul_S0 = gmx_simd_mul_r_mm_mul_ps(qq_S0, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S0, vc_sub_S0)); |
673 | vcoul_S1 = gmx_simd_mul_r_mm_mul_ps(qq_S1, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S1, vc_sub_S1)); |
674 | vcoul_S2 = gmx_simd_mul_r_mm_mul_ps(qq_S2, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S2, vc_sub_S2)); |
675 | vcoul_S3 = gmx_simd_mul_r_mm_mul_ps(qq_S3, gmx_simd_sub_r_mm_sub_ps(rinv_ex_S3, vc_sub_S3)); |
676 | |
677 | #endif |
678 | |
679 | #ifdef CALC_ENERGIES |
680 | /* Mask energy for cut-off and diagonal */ |
681 | vcoul_S0 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S0, wco_S0); |
682 | vcoul_S1 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S1, wco_S1); |
683 | vcoul_S2 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S2, wco_S2); |
684 | vcoul_S3 = gmx_simd_blendzero_r_mm_and_ps(vcoul_S3, wco_S3); |
685 | #endif |
686 | |
687 | #endif /* CALC_COULOMB */ |
688 | |
689 | #ifdef CALC_LJ |
690 | /* Lennard-Jones interaction */ |
691 | |
692 | #ifdef VDW_CUTOFF_CHECK |
693 | wco_vdw_S0 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S0, rcvdw2_S); |
694 | wco_vdw_S1 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S1, rcvdw2_S); |
695 | #ifndef HALF_LJ |
696 | wco_vdw_S2 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S2, rcvdw2_S); |
697 | wco_vdw_S3 = gmx_simd_cmplt_r_mm_cmplt_ps(rsq_S3, rcvdw2_S); |
698 | #endif |
699 | #else |
700 | /* Same cut-off for Coulomb and VdW, reuse the registers */ |
701 | #define wco_vdw_S0 wco_S0 |
702 | #define wco_vdw_S1 wco_S1 |
703 | #define wco_vdw_S2 wco_S2 |
704 | #define wco_vdw_S3 wco_S3 |
705 | #endif |
706 | |
707 | #ifndef LJ_COMB_LB |
708 | rinvsix_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, rinvsq_S0)); |
709 | rinvsix_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, rinvsq_S1)); |
710 | #ifdef EXCL_FORCES |
711 | rinvsix_S0 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S0, interact_S0); |
712 | rinvsix_S1 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S1, interact_S1); |
713 | #endif |
714 | #ifndef HALF_LJ |
715 | rinvsix_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, rinvsq_S2)); |
716 | rinvsix_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, rinvsq_S3)); |
717 | #ifdef EXCL_FORCES |
718 | rinvsix_S2 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S2, interact_S2); |
719 | rinvsix_S3 = gmx_simd_blendzero_r_mm_and_ps(rinvsix_S3, interact_S3); |
720 | #endif |
721 | #endif |
722 | |
723 | #if defined LJ_CUT || defined LJ_POT_SWITCH |
724 | /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */ |
725 | FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, rinvsix_S0); |
726 | FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, rinvsix_S1); |
727 | #ifndef HALF_LJ |
728 | FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, rinvsix_S2); |
729 | FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, rinvsix_S3); |
730 | #endif |
731 | FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_simd_mul_r_mm_mul_ps(rinvsix_S0, rinvsix_S0)); |
732 | FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_simd_mul_r_mm_mul_ps(rinvsix_S1, rinvsix_S1)); |
733 | #ifndef HALF_LJ |
734 | FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_simd_mul_r_mm_mul_ps(rinvsix_S2, rinvsix_S2)); |
735 | FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_simd_mul_r_mm_mul_ps(rinvsix_S3, rinvsix_S3)); |
736 | #endif |
737 | #endif |
738 | |
739 | #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH |
740 | /* We switch the LJ force */ |
741 | r_S0 = gmx_simd_mul_r_mm_mul_ps(rsq_S0, rinv_S0); |
742 | rsw_S0 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S0, rswitch_S), zero_S); |
743 | rsw2_S0 = gmx_simd_mul_r_mm_mul_ps(rsw_S0, rsw_S0); |
744 | rsw2_r_S0 = gmx_simd_mul_r_mm_mul_ps(rsw2_S0, r_S0); |
745 | r_S1 = gmx_simd_mul_r_mm_mul_ps(rsq_S1, rinv_S1); |
746 | rsw_S1 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S1, rswitch_S), zero_S); |
747 | rsw2_S1 = gmx_simd_mul_r_mm_mul_ps(rsw_S1, rsw_S1); |
748 | rsw2_r_S1 = gmx_simd_mul_r_mm_mul_ps(rsw2_S1, r_S1); |
Value stored to 'rsw2_r_S1' is never read | |
749 | #ifndef HALF_LJ |
750 | r_S2 = gmx_simd_mul_r_mm_mul_ps(rsq_S2, rinv_S2); |
751 | rsw_S2 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S2, rswitch_S), zero_S); |
752 | rsw2_S2 = gmx_simd_mul_r_mm_mul_ps(rsw_S2, rsw_S2); |
753 | rsw2_r_S2 = gmx_simd_mul_r_mm_mul_ps(rsw2_S2, r_S2); |
754 | r_S3 = gmx_simd_mul_r_mm_mul_ps(rsq_S3, rinv_S3); |
755 | rsw_S3 = gmx_simd_max_r_mm_max_ps(gmx_simd_sub_r_mm_sub_ps(r_S3, rswitch_S), zero_S); |
756 | rsw2_S3 = gmx_simd_mul_r_mm_mul_ps(rsw_S3, rsw_S3); |
757 | rsw2_r_S3 = gmx_simd_mul_r_mm_mul_ps(rsw2_S3, r_S3); |
758 | #endif |
759 | #endif |
760 | |
761 | #ifdef LJ_FORCE_SWITCH |
762 | |
763 | #define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c3, rsw, c2), rsw2_r, fr)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c3, rsw), c2), rsw2_r ), fr) |
764 | |
765 | FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S)); |
766 | FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S)); |
767 | #ifndef HALF_LJ |
768 | FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S)); |
769 | FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S)); |
770 | #endif |
771 | FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S0, rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S)); |
772 | FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S1, rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S)); |
773 | #ifndef HALF_LJ |
774 | FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S2, rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S)); |
775 | FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_add_fr_switch(gmx_simd_mul_r_mm_mul_ps(rinvsix_S3, rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S)); |
776 | #endif |
777 | #undef gmx_add_fr_switch |
778 | #endif /* LJ_FORCE_SWITCH */ |
779 | |
780 | #endif /* not LJ_COMB_LB */ |
781 | |
782 | #ifdef LJ_COMB_LB |
783 | sir_S0 = gmx_simd_mul_r_mm_mul_ps(sig_S0, rinv_S0); |
784 | sir_S1 = gmx_simd_mul_r_mm_mul_ps(sig_S1, rinv_S1); |
785 | #ifndef HALF_LJ |
786 | sir_S2 = gmx_simd_mul_r_mm_mul_ps(sig_S2, rinv_S2); |
787 | sir_S3 = gmx_simd_mul_r_mm_mul_ps(sig_S3, rinv_S3); |
788 | #endif |
789 | sir2_S0 = gmx_simd_mul_r_mm_mul_ps(sir_S0, sir_S0); |
790 | sir2_S1 = gmx_simd_mul_r_mm_mul_ps(sir_S1, sir_S1); |
791 | #ifndef HALF_LJ |
792 | sir2_S2 = gmx_simd_mul_r_mm_mul_ps(sir_S2, sir_S2); |
793 | sir2_S3 = gmx_simd_mul_r_mm_mul_ps(sir_S3, sir_S3); |
794 | #endif |
795 | sir6_S0 = gmx_simd_mul_r_mm_mul_ps(sir2_S0, gmx_simd_mul_r_mm_mul_ps(sir2_S0, sir2_S0)); |
796 | sir6_S1 = gmx_simd_mul_r_mm_mul_ps(sir2_S1, gmx_simd_mul_r_mm_mul_ps(sir2_S1, sir2_S1)); |
797 | #ifdef EXCL_FORCES |
798 | sir6_S0 = gmx_simd_blendzero_r_mm_and_ps(sir6_S0, interact_S0); |
799 | sir6_S1 = gmx_simd_blendzero_r_mm_and_ps(sir6_S1, interact_S1); |
800 | #endif |
801 | #ifndef HALF_LJ |
802 | sir6_S2 = gmx_simd_mul_r_mm_mul_ps(sir2_S2, gmx_simd_mul_r_mm_mul_ps(sir2_S2, sir2_S2)); |
803 | sir6_S3 = gmx_simd_mul_r_mm_mul_ps(sir2_S3, gmx_simd_mul_r_mm_mul_ps(sir2_S3, sir2_S3)); |
804 | #ifdef EXCL_FORCES |
805 | sir6_S2 = gmx_simd_blendzero_r_mm_and_ps(sir6_S2, interact_S2); |
806 | sir6_S3 = gmx_simd_blendzero_r_mm_and_ps(sir6_S3, interact_S3); |
807 | #endif |
808 | #endif |
809 | #ifdef VDW_CUTOFF_CHECK |
810 | sir6_S0 = gmx_simd_blendzero_r_mm_and_ps(sir6_S0, wco_vdw_S0); |
811 | sir6_S1 = gmx_simd_blendzero_r_mm_and_ps(sir6_S1, wco_vdw_S1); |
812 | #ifndef HALF_LJ |
813 | sir6_S2 = gmx_simd_blendzero_r_mm_and_ps(sir6_S2, wco_vdw_S2); |
814 | sir6_S3 = gmx_simd_blendzero_r_mm_and_ps(sir6_S3, wco_vdw_S3); |
815 | #endif |
816 | #endif |
817 | FrLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(eps_S0, sir6_S0); |
818 | FrLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(eps_S1, sir6_S1); |
819 | #ifndef HALF_LJ |
820 | FrLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(eps_S2, sir6_S2); |
821 | FrLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(eps_S3, sir6_S3); |
822 | #endif |
823 | FrLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S0, sir6_S0); |
824 | FrLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S1, sir6_S1); |
825 | #ifndef HALF_LJ |
826 | FrLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S2, sir6_S2); |
827 | FrLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(FrLJ6_S3, sir6_S3); |
828 | #endif |
829 | #if defined CALC_ENERGIES |
830 | /* We need C6 and C12 to calculate the LJ potential shift */ |
831 | sig2_S0 = gmx_simd_mul_r_mm_mul_ps(sig_S0, sig_S0); |
832 | sig2_S1 = gmx_simd_mul_r_mm_mul_ps(sig_S1, sig_S1); |
833 | #ifndef HALF_LJ |
834 | sig2_S2 = gmx_simd_mul_r_mm_mul_ps(sig_S2, sig_S2); |
835 | sig2_S3 = gmx_simd_mul_r_mm_mul_ps(sig_S3, sig_S3); |
836 | #endif |
837 | sig6_S0 = gmx_simd_mul_r_mm_mul_ps(sig2_S0, gmx_simd_mul_r_mm_mul_ps(sig2_S0, sig2_S0)); |
838 | sig6_S1 = gmx_simd_mul_r_mm_mul_ps(sig2_S1, gmx_simd_mul_r_mm_mul_ps(sig2_S1, sig2_S1)); |
839 | #ifndef HALF_LJ |
840 | sig6_S2 = gmx_simd_mul_r_mm_mul_ps(sig2_S2, gmx_simd_mul_r_mm_mul_ps(sig2_S2, sig2_S2)); |
841 | sig6_S3 = gmx_simd_mul_r_mm_mul_ps(sig2_S3, gmx_simd_mul_r_mm_mul_ps(sig2_S3, sig2_S3)); |
842 | #endif |
843 | c6_S0 = gmx_simd_mul_r_mm_mul_ps(eps_S0, sig6_S0); |
844 | c6_S1 = gmx_simd_mul_r_mm_mul_ps(eps_S1, sig6_S1); |
845 | #ifndef HALF_LJ |
846 | c6_S2 = gmx_simd_mul_r_mm_mul_ps(eps_S2, sig6_S2); |
847 | c6_S3 = gmx_simd_mul_r_mm_mul_ps(eps_S3, sig6_S3); |
848 | #endif |
849 | c12_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, sig6_S0); |
850 | c12_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, sig6_S1); |
851 | #ifndef HALF_LJ |
852 | c12_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, sig6_S2); |
853 | c12_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, sig6_S3); |
854 | #endif |
855 | #endif |
856 | #endif /* LJ_COMB_LB */ |
857 | |
858 | /* Determine the total scalar LJ force*r */ |
859 | frLJ_S0 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S0, FrLJ6_S0); |
860 | frLJ_S1 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S1, FrLJ6_S1); |
861 | #ifndef HALF_LJ |
862 | frLJ_S2 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S2, FrLJ6_S2); |
863 | frLJ_S3 = gmx_simd_sub_r_mm_sub_ps(FrLJ12_S3, FrLJ6_S3); |
864 | #endif |
865 | |
866 | #if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES |
867 | |
868 | #ifdef LJ_CUT |
869 | /* Calculate the LJ energies, with constant potential shift */ |
870 | VLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S0, p6_cpot_S, FrLJ6_S0)_mm_add_ps(_mm_mul_ps(c6_S0, p6_cpot_S), FrLJ6_S0)); |
871 | VLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S1, p6_cpot_S, FrLJ6_S1)_mm_add_ps(_mm_mul_ps(c6_S1, p6_cpot_S), FrLJ6_S1)); |
872 | #ifndef HALF_LJ |
873 | VLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S2, p6_cpot_S, FrLJ6_S2)_mm_add_ps(_mm_mul_ps(c6_S2, p6_cpot_S), FrLJ6_S2)); |
874 | VLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(sixth_S, gmx_simd_fmadd_r(c6_S3, p6_cpot_S, FrLJ6_S3)_mm_add_ps(_mm_mul_ps(c6_S3, p6_cpot_S), FrLJ6_S3)); |
875 | #endif |
876 | VLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S0, p12_cpot_S, FrLJ12_S0)_mm_add_ps(_mm_mul_ps(c12_S0, p12_cpot_S), FrLJ12_S0)); |
877 | VLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S1, p12_cpot_S, FrLJ12_S1)_mm_add_ps(_mm_mul_ps(c12_S1, p12_cpot_S), FrLJ12_S1)); |
878 | #ifndef HALF_LJ |
879 | VLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S2, p12_cpot_S, FrLJ12_S2)_mm_add_ps(_mm_mul_ps(c12_S2, p12_cpot_S), FrLJ12_S2)); |
880 | VLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(twelveth_S, gmx_simd_fmadd_r(c12_S3, p12_cpot_S, FrLJ12_S3)_mm_add_ps(_mm_mul_ps(c12_S3, p12_cpot_S), FrLJ12_S3)); |
881 | #endif |
882 | #endif /* LJ_CUT */ |
883 | |
884 | #ifdef LJ_FORCE_SWITCH |
885 | #define v_fswitch_r(rsw, rsw2, c0, c3, c4) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), gmx_simd_mul_r(rsw2, rsw), c0)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c4, rsw), c3), _mm_mul_ps (rsw2, rsw)), c0) |
886 | |
887 | VLJ6_S0 = gmx_simd_mul_r_mm_mul_ps(c6_S0, gmx_simd_fmadd_r(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S0), v_fswitch_r(rsw_S0 , rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S))); |
888 | VLJ6_S1 = gmx_simd_mul_r_mm_mul_ps(c6_S1, gmx_simd_fmadd_r(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S1), v_fswitch_r(rsw_S1 , rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S))); |
889 | #ifndef HALF_LJ |
890 | VLJ6_S2 = gmx_simd_mul_r_mm_mul_ps(c6_S2, gmx_simd_fmadd_r(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S2), v_fswitch_r(rsw_S2 , rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S))); |
891 | VLJ6_S3 = gmx_simd_mul_r_mm_mul_ps(c6_S3, gmx_simd_fmadd_r(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S))_mm_add_ps(_mm_mul_ps(sixth_S, rinvsix_S3), v_fswitch_r(rsw_S3 , rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S))); |
892 | #endif |
893 | VLJ12_S0 = gmx_simd_mul_r_mm_mul_ps(c12_S0, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S0, rinvsix_S0 )), v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S ))); |
894 | VLJ12_S1 = gmx_simd_mul_r_mm_mul_ps(c12_S1, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S1, rinvsix_S1), v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S1, rinvsix_S1 )), v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S ))); |
895 | #ifndef HALF_LJ |
896 | VLJ12_S2 = gmx_simd_mul_r_mm_mul_ps(c12_S2, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S2, rinvsix_S2 )), v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S ))); |
897 | VLJ12_S3 = gmx_simd_mul_r_mm_mul_ps(c12_S3, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S3, rinvsix_S3), v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S))_mm_add_ps(_mm_mul_ps(twelveth_S, _mm_mul_ps(rinvsix_S3, rinvsix_S3 )), v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S ))); |
898 | #endif |
899 | #undef v_fswitch_r |
900 | #endif /* LJ_FORCE_SWITCH */ |
901 | |
902 | /* Add up the repulsion and dispersion */ |
903 | VLJ_S0 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S0, VLJ6_S0); |
904 | VLJ_S1 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S1, VLJ6_S1); |
905 | #ifndef HALF_LJ |
906 | VLJ_S2 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S2, VLJ6_S2); |
907 | VLJ_S3 = gmx_simd_sub_r_mm_sub_ps(VLJ12_S3, VLJ6_S3); |
908 | #endif |
909 | |
910 | #endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */ |
911 | |
912 | #ifdef LJ_POT_SWITCH |
913 | /* We always need the potential, since it is needed for the force */ |
914 | VLJ_S0 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S0, gmx_simd_mul_r(twelveth_S, FrLJ12_S0))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S0), _mm_mul_ps(sixth_S , FrLJ6_S0)); |
915 | VLJ_S1 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S1, gmx_simd_mul_r(twelveth_S, FrLJ12_S1))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S1), _mm_mul_ps(sixth_S , FrLJ6_S1)); |
916 | #ifndef HALF_LJ |
917 | VLJ_S2 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S2, gmx_simd_mul_r(twelveth_S, FrLJ12_S2))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S2), _mm_mul_ps(sixth_S , FrLJ6_S2)); |
918 | VLJ_S3 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S3, gmx_simd_mul_r(twelveth_S, FrLJ12_S3))_mm_sub_ps(_mm_mul_ps(twelveth_S, FrLJ12_S3), _mm_mul_ps(sixth_S , FrLJ6_S3)); |
919 | #endif |
920 | |
921 | { |
922 | gmx_simd_real_t__m128 sw_S0, dsw_S0; |
923 | gmx_simd_real_t__m128 sw_S1, dsw_S1; |
924 | #ifndef HALF_LJ |
925 | gmx_simd_real_t__m128 sw_S2, dsw_S2; |
926 | gmx_simd_real_t__m128 sw_S3, dsw_S3; |
927 | #endif |
928 | |
929 | #define switch_r(rsw, rsw2, c3, c4, c5) gmx_simd_fmadd_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c5, rsw, c4), rsw, c3), gmx_simd_mul_r(rsw2, rsw), one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps (c5, rsw), c4), rsw), c3), _mm_mul_ps(rsw2, rsw)), one_S) |
930 | #define dswitch_r(rsw, rsw2, c2, c3, c4) gmx_simd_mul_r_mm_mul_ps(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), rsw, c2)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(c4, rsw), c3), rsw ), c2), rsw2) |
931 | |
932 | sw_S0 = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S); |
933 | dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S); |
934 | sw_S1 = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S); |
935 | dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S); |
936 | #ifndef HALF_LJ |
937 | sw_S2 = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S); |
938 | dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S); |
939 | sw_S3 = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S); |
940 | dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S); |
941 | #endif |
942 | frLJ_S0 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S0, VLJ_S0), r_S0, gmx_simd_mul_r(sw_S0, frLJ_S0))_mm_sub_ps(_mm_mul_ps(sw_S0, frLJ_S0), _mm_mul_ps(_mm_mul_ps( dsw_S0, VLJ_S0), r_S0)); |
943 | frLJ_S1 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S1, VLJ_S1), r_S1, gmx_simd_mul_r(sw_S1, frLJ_S1))_mm_sub_ps(_mm_mul_ps(sw_S1, frLJ_S1), _mm_mul_ps(_mm_mul_ps( dsw_S1, VLJ_S1), r_S1)); |
944 | #ifndef HALF_LJ |
945 | frLJ_S2 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S2, VLJ_S2), r_S2, gmx_simd_mul_r(sw_S2, frLJ_S2))_mm_sub_ps(_mm_mul_ps(sw_S2, frLJ_S2), _mm_mul_ps(_mm_mul_ps( dsw_S2, VLJ_S2), r_S2)); |
946 | frLJ_S3 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S3, VLJ_S3), r_S3, gmx_simd_mul_r(sw_S3, frLJ_S3))_mm_sub_ps(_mm_mul_ps(sw_S3, frLJ_S3), _mm_mul_ps(_mm_mul_ps( dsw_S3, VLJ_S3), r_S3)); |
947 | #endif |
948 | #ifdef CALC_ENERGIES |
949 | VLJ_S0 = gmx_simd_mul_r_mm_mul_ps(sw_S0, VLJ_S0); |
950 | VLJ_S1 = gmx_simd_mul_r_mm_mul_ps(sw_S1, VLJ_S1); |
951 | #ifndef HALF_LJ |
952 | VLJ_S2 = gmx_simd_mul_r_mm_mul_ps(sw_S2, VLJ_S2); |
953 | VLJ_S3 = gmx_simd_mul_r_mm_mul_ps(sw_S3, VLJ_S3); |
954 | #endif |
955 | #endif |
956 | |
957 | #undef switch_r |
958 | #undef dswitch_r |
959 | } |
960 | #endif /* LJ_POT_SWITCH */ |
961 | |
962 | #if defined CALC_ENERGIES && defined CHECK_EXCLS |
963 | /* The potential shift should be removed for excluded pairs */ |
964 | VLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S0, interact_S0); |
965 | VLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S1, interact_S1); |
966 | #ifndef HALF_LJ |
967 | VLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S2, interact_S2); |
968 | VLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S3, interact_S3); |
969 | #endif |
970 | #endif |
971 | |
972 | #ifdef LJ_EWALD_GEOM |
973 | { |
974 | gmx_simd_real_t__m128 c6s_j_S; |
975 | gmx_simd_real_t__m128 c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0; |
976 | gmx_simd_real_t__m128 c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1; |
977 | #ifndef HALF_LJ |
978 | gmx_simd_real_t__m128 c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2; |
979 | gmx_simd_real_t__m128 c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3; |
980 | #endif |
981 | #ifdef CALC_ENERGIES |
982 | gmx_simd_real_t__m128 sh_mask_S0; |
983 | gmx_simd_real_t__m128 sh_mask_S1; |
984 | #ifndef HALF_LJ |
985 | gmx_simd_real_t__m128 sh_mask_S2; |
986 | gmx_simd_real_t__m128 sh_mask_S3; |
987 | #endif |
988 | #endif |
989 | |
990 | /* Determine C6 for the grid using the geometric combination rule */ |
991 | c6s_j_S = gmx_simd_load_r_mm_load_ps(ljc+aj2+0); |
992 | c6grid_S0 = gmx_simd_mul_r_mm_mul_ps(c6s_S0, c6s_j_S); |
993 | c6grid_S1 = gmx_simd_mul_r_mm_mul_ps(c6s_S1, c6s_j_S); |
994 | #ifndef HALF_LJ |
995 | c6grid_S2 = gmx_simd_mul_r_mm_mul_ps(c6s_S2, c6s_j_S); |
996 | c6grid_S3 = gmx_simd_mul_r_mm_mul_ps(c6s_S3, c6s_j_S); |
997 | #endif |
998 | |
999 | #ifdef CHECK_EXCLS |
1000 | /* Recalculate rinvsix without exclusion mask (compiler might optimize) */ |
1001 | rinvsix_nm_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, rinvsq_S0)); |
1002 | rinvsix_nm_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, rinvsq_S1)); |
1003 | #ifndef HALF_LJ |
1004 | rinvsix_nm_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, rinvsq_S2)); |
1005 | rinvsix_nm_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, rinvsq_S3)); |
1006 | #endif |
1007 | #else |
1008 | /* We didn't use a mask, so we can copy */ |
1009 | rinvsix_nm_S0 = rinvsix_S0; |
1010 | rinvsix_nm_S1 = rinvsix_S1; |
1011 | #ifndef HALF_LJ |
1012 | rinvsix_nm_S2 = rinvsix_S2; |
1013 | rinvsix_nm_S3 = rinvsix_S3; |
1014 | #endif |
1015 | #endif |
1016 | |
1017 | cr2_S0 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S0); |
1018 | cr2_S1 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S1); |
1019 | #ifndef HALF_LJ |
1020 | cr2_S2 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S2); |
1021 | cr2_S3 = gmx_simd_mul_r_mm_mul_ps(lje_c2_S, rsq_S3); |
1022 | #endif |
1023 | expmcr2_S0 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S0)); |
1024 | expmcr2_S1 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S1)); |
1025 | #ifndef HALF_LJ |
1026 | expmcr2_S2 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S2)); |
1027 | expmcr2_S3 = gmx_simd_exp_rgmx_simd_exp_f(gmx_simd_mul_r_mm_mul_ps(mone_S, cr2_S3)); |
1028 | #endif |
1029 | |
1030 | /* 1 + cr2 + 1/2*cr2^2 */ |
1031 | poly_S0 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S0, one_S), cr2_S0, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S0), one_S ), cr2_S0), one_S); |
1032 | poly_S1 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S1, one_S), cr2_S1, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S1), one_S ), cr2_S1), one_S); |
1033 | #ifndef HALF_LJ |
1034 | poly_S2 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S2, one_S), cr2_S2, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S2), one_S ), cr2_S2), one_S); |
1035 | poly_S3 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S3, one_S), cr2_S3, one_S)_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(half_S, cr2_S3), one_S ), cr2_S3), one_S); |
1036 | #endif |
1037 | |
1038 | /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use: |
1039 | * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6) |
1040 | */ |
1041 | frLJ_S0 = gmx_simd_fmadd_r(c6grid_S0, gmx_simd_fnmadd_r(expmcr2_S0, gmx_simd_fmadd_r(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0)_mm_add_ps(_mm_mul_ps(c6grid_S0, _mm_sub_ps(rinvsix_nm_S0, _mm_mul_ps (expmcr2_S0, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S0, poly_S0), lje_c6_6_S )))), frLJ_S0); |
1042 | frLJ_S1 = gmx_simd_fmadd_r(c6grid_S1, gmx_simd_fnmadd_r(expmcr2_S1, gmx_simd_fmadd_r(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1), frLJ_S1)_mm_add_ps(_mm_mul_ps(c6grid_S1, _mm_sub_ps(rinvsix_nm_S1, _mm_mul_ps (expmcr2_S1, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S1, poly_S1), lje_c6_6_S )))), frLJ_S1); |
1043 | #ifndef HALF_LJ |
1044 | frLJ_S2 = gmx_simd_fmadd_r(c6grid_S2, gmx_simd_fnmadd_r(expmcr2_S2, gmx_simd_fmadd_r(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2)_mm_add_ps(_mm_mul_ps(c6grid_S2, _mm_sub_ps(rinvsix_nm_S2, _mm_mul_ps (expmcr2_S2, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S2, poly_S2), lje_c6_6_S )))), frLJ_S2); |
1045 | frLJ_S3 = gmx_simd_fmadd_r(c6grid_S3, gmx_simd_fnmadd_r(expmcr2_S3, gmx_simd_fmadd_r(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3), frLJ_S3)_mm_add_ps(_mm_mul_ps(c6grid_S3, _mm_sub_ps(rinvsix_nm_S3, _mm_mul_ps (expmcr2_S3, _mm_add_ps(_mm_mul_ps(rinvsix_nm_S3, poly_S3), lje_c6_6_S )))), frLJ_S3); |
1046 | #endif |
1047 | |
1048 | #ifdef CALC_ENERGIES |
1049 | #ifdef CHECK_EXCLS |
1050 | sh_mask_S0 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S0); |
1051 | sh_mask_S1 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S1); |
1052 | #ifndef HALF_LJ |
1053 | sh_mask_S2 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S2); |
1054 | sh_mask_S3 = gmx_simd_blendzero_r_mm_and_ps(lje_vc_S, interact_S3); |
1055 | #endif |
1056 | #else |
1057 | sh_mask_S0 = lje_vc_S; |
1058 | sh_mask_S1 = lje_vc_S; |
1059 | #ifndef HALF_LJ |
1060 | sh_mask_S2 = lje_vc_S; |
1061 | sh_mask_S3 = lje_vc_S; |
1062 | #endif |
1063 | #endif |
1064 | |
1065 | VLJ_S0 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S0), gmx_simd_fmadd_r(rinvsix_nm_S0, gmx_simd_fnmadd_r(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S0), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S0, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S0 , poly_S0))), sh_mask_S0)), VLJ_S0); |
1066 | VLJ_S1 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S1), gmx_simd_fmadd_r(rinvsix_nm_S1, gmx_simd_fnmadd_r(expmcr2_S1, poly_S1, one_S), sh_mask_S1), VLJ_S1)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S1), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S1, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S1 , poly_S1))), sh_mask_S1)), VLJ_S1); |
1067 | #ifndef HALF_LJ |
1068 | VLJ_S2 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S2), gmx_simd_fmadd_r(rinvsix_nm_S2, gmx_simd_fnmadd_r(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S2), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S2, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S2 , poly_S2))), sh_mask_S2)), VLJ_S2); |
1069 | VLJ_S3 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S3), gmx_simd_fmadd_r(rinvsix_nm_S3, gmx_simd_fnmadd_r(expmcr2_S3, poly_S3, one_S), sh_mask_S3), VLJ_S3)_mm_add_ps(_mm_mul_ps(_mm_mul_ps(sixth_S, c6grid_S3), _mm_add_ps (_mm_mul_ps(rinvsix_nm_S3, _mm_sub_ps(one_S, _mm_mul_ps(expmcr2_S3 , poly_S3))), sh_mask_S3)), VLJ_S3); |
1070 | #endif |
1071 | #endif /* CALC_ENERGIES */ |
1072 | } |
1073 | #endif /* LJ_EWALD_GEOM */ |
1074 | |
1075 | #if defined VDW_CUTOFF_CHECK |
1076 | /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb |
1077 | * cut-off, but if the VdW cut-off is shorter, we need to mask with that. |
1078 | */ |
1079 | frLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S0, wco_vdw_S0); |
1080 | frLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S1, wco_vdw_S1); |
1081 | #ifndef HALF_LJ |
1082 | frLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S2, wco_vdw_S2); |
1083 | frLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(frLJ_S3, wco_vdw_S3); |
1084 | #endif |
1085 | #endif |
1086 | |
1087 | #ifdef CALC_ENERGIES |
1088 | /* The potential shift should be removed for pairs beyond cut-off */ |
1089 | VLJ_S0 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S0, wco_vdw_S0); |
1090 | VLJ_S1 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S1, wco_vdw_S1); |
1091 | #ifndef HALF_LJ |
1092 | VLJ_S2 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S2, wco_vdw_S2); |
1093 | VLJ_S3 = gmx_simd_blendzero_r_mm_and_ps(VLJ_S3, wco_vdw_S3); |
1094 | #endif |
1095 | #endif |
1096 | |
1097 | #endif /* CALC_LJ */ |
1098 | |
1099 | #ifdef CALC_ENERGIES |
1100 | #ifdef ENERGY_GROUPS |
1101 | /* Extract the group pair index per j pair. |
1102 | * Energy groups are stored per i-cluster, so things get |
1103 | * complicated when the i- and j-cluster size don't match. |
1104 | */ |
1105 | { |
1106 | int egps_j; |
1107 | #if UNROLLJ(4/1) == 2 |
1108 | egps_j = nbat->energrp[cj>>1]; |
1109 | egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride; |
1110 | #else |
1111 | /* We assume UNROLLI <= UNROLLJ */ |
1112 | int jdi; |
1113 | for (jdi = 0; jdi < UNROLLJ(4/1)/UNROLLI4; jdi++) |
1114 | { |
1115 | int jj; |
1116 | egps_j = nbat->energrp[cj*(UNROLLJ(4/1)/UNROLLI4)+jdi]; |
1117 | for (jj = 0; jj < (UNROLLI4/2); jj++) |
1118 | { |
1119 | egp_jj[jdi*(UNROLLI4/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride; |
1120 | } |
1121 | } |
1122 | #endif |
1123 | } |
1124 | #endif |
1125 | |
1126 | #ifdef CALC_COULOMB |
1127 | #ifndef ENERGY_GROUPS |
1128 | vctot_S = gmx_simd_add_r_mm_add_ps(vctot_S, gmx_simd_sum4_rgmx_simd_sum4_f(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3)); |
1129 | #else |
1130 | add_ener_grp(vcoul_S0, vctp[0], egp_jj); |
1131 | add_ener_grp(vcoul_S1, vctp[1], egp_jj); |
1132 | add_ener_grp(vcoul_S2, vctp[2], egp_jj); |
1133 | add_ener_grp(vcoul_S3, vctp[3], egp_jj); |
1134 | #endif |
1135 | #endif |
1136 | |
1137 | #ifdef CALC_LJ |
1138 | |
1139 | #ifndef ENERGY_GROUPS |
1140 | #ifndef HALF_LJ |
1141 | Vvdwtot_S = gmx_simd_add_r_mm_add_ps(Vvdwtot_S, |
1142 | gmx_simd_sum4_rgmx_simd_sum4_f(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3) |
1143 | ); |
1144 | #else |
1145 | Vvdwtot_S = gmx_simd_add_r_mm_add_ps(Vvdwtot_S, |
1146 | gmx_simd_add_r_mm_add_ps(VLJ_S0, VLJ_S1) |
1147 | ); |
1148 | #endif |
1149 | #else |
1150 | add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj); |
1151 | add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj); |
1152 | #ifndef HALF_LJ |
1153 | add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj); |
1154 | add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj); |
1155 | #endif |
1156 | #endif |
1157 | #endif /* CALC_LJ */ |
1158 | #endif /* CALC_ENERGIES */ |
1159 | |
1160 | #ifdef CALC_LJ |
1161 | #ifdef CALC_COULOMB |
1162 | fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, gmx_simd_add_r_mm_add_ps(frcoul_S0, frLJ_S0)); |
1163 | #else |
1164 | fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, frLJ_S0); |
1165 | #endif |
1166 | #ifdef CALC_COULOMB |
1167 | fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, gmx_simd_add_r_mm_add_ps(frcoul_S1, frLJ_S1)); |
1168 | #else |
1169 | fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, frLJ_S1); |
1170 | #endif |
1171 | #else |
1172 | fscal_S0 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S0, frcoul_S0); |
1173 | fscal_S1 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S1, frcoul_S1); |
1174 | #endif /* CALC_LJ */ |
1175 | #if defined CALC_LJ && !defined HALF_LJ |
1176 | #ifdef CALC_COULOMB |
1177 | fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, gmx_simd_add_r_mm_add_ps(frcoul_S2, frLJ_S2)); |
1178 | fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, gmx_simd_add_r_mm_add_ps(frcoul_S3, frLJ_S3)); |
1179 | #else |
1180 | fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, frLJ_S2); |
1181 | fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, frLJ_S3); |
1182 | #endif |
1183 | #else |
1184 | /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ |
1185 | fscal_S2 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S2, frcoul_S2); |
1186 | fscal_S3 = gmx_simd_mul_r_mm_mul_ps(rinvsq_S3, frcoul_S3); |
1187 | #endif |
1188 | |
1189 | /* Calculate temporary vectorial force */ |
1190 | tx_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dx_S0); |
1191 | tx_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dx_S1); |
1192 | tx_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dx_S2); |
1193 | tx_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dx_S3); |
1194 | ty_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dy_S0); |
1195 | ty_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dy_S1); |
1196 | ty_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dy_S2); |
1197 | ty_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dy_S3); |
1198 | tz_S0 = gmx_simd_mul_r_mm_mul_ps(fscal_S0, dz_S0); |
1199 | tz_S1 = gmx_simd_mul_r_mm_mul_ps(fscal_S1, dz_S1); |
1200 | tz_S2 = gmx_simd_mul_r_mm_mul_ps(fscal_S2, dz_S2); |
1201 | tz_S3 = gmx_simd_mul_r_mm_mul_ps(fscal_S3, dz_S3); |
1202 | |
1203 | /* Increment i atom force */ |
1204 | fix_S0 = gmx_simd_add_r_mm_add_ps(fix_S0, tx_S0); |
1205 | fix_S1 = gmx_simd_add_r_mm_add_ps(fix_S1, tx_S1); |
1206 | fix_S2 = gmx_simd_add_r_mm_add_ps(fix_S2, tx_S2); |
1207 | fix_S3 = gmx_simd_add_r_mm_add_ps(fix_S3, tx_S3); |
1208 | fiy_S0 = gmx_simd_add_r_mm_add_ps(fiy_S0, ty_S0); |
1209 | fiy_S1 = gmx_simd_add_r_mm_add_ps(fiy_S1, ty_S1); |
1210 | fiy_S2 = gmx_simd_add_r_mm_add_ps(fiy_S2, ty_S2); |
1211 | fiy_S3 = gmx_simd_add_r_mm_add_ps(fiy_S3, ty_S3); |
1212 | fiz_S0 = gmx_simd_add_r_mm_add_ps(fiz_S0, tz_S0); |
1213 | fiz_S1 = gmx_simd_add_r_mm_add_ps(fiz_S1, tz_S1); |
1214 | fiz_S2 = gmx_simd_add_r_mm_add_ps(fiz_S2, tz_S2); |
1215 | fiz_S3 = gmx_simd_add_r_mm_add_ps(fiz_S3, tz_S3); |
1216 | |
1217 | /* Decrement j atom force */ |
1218 | gmx_simd_store_r_mm_store_ps(f+ajx, |
1219 | gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajx), gmx_simd_sum4_rgmx_simd_sum4_f(tx_S0, tx_S1, tx_S2, tx_S3) )); |
1220 | gmx_simd_store_r_mm_store_ps(f+ajy, |
1221 | gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajy), gmx_simd_sum4_rgmx_simd_sum4_f(ty_S0, ty_S1, ty_S2, ty_S3) )); |
1222 | gmx_simd_store_r_mm_store_ps(f+ajz, |
1223 | gmx_simd_sub_r_mm_sub_ps( gmx_simd_load_r_mm_load_ps(f+ajz), gmx_simd_sum4_rgmx_simd_sum4_f(tz_S0, tz_S1, tz_S2, tz_S3) )); |
1224 | } |
1225 | |
1226 | #undef rinv_ex_S0 |
1227 | #undef rinv_ex_S1 |
1228 | #undef rinv_ex_S2 |
1229 | #undef rinv_ex_S3 |
1230 | |
1231 | #undef wco_vdw_S0 |
1232 | #undef wco_vdw_S1 |
1233 | #undef wco_vdw_S2 |
1234 | #undef wco_vdw_S3 |
1235 | |
1236 | #undef NBNXN_CUTOFF_USE_BLENDV |
1237 | |
1238 | #undef EXCL_FORCES |