File: | gromacs/mdlib/perf_est.c |
Location: | line 417, column 48 |
Description: | Branch condition evaluates to a garbage value |
1 | /* | |||
2 | * This file is part of the GROMACS molecular simulation package. | |||
3 | * | |||
4 | * Copyright (c) 1991-2000, University of Groningen, The Netherlands. | |||
5 | * Copyright (c) 2001-2008, The GROMACS development team. | |||
6 | * Copyright (c) 2012,2014, by the GROMACS development team, led by | |||
7 | * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, | |||
8 | * and including many others, as listed in the AUTHORS file in the | |||
9 | * top-level source directory and at http://www.gromacs.org. | |||
10 | * | |||
11 | * GROMACS is free software; you can redistribute it and/or | |||
12 | * modify it under the terms of the GNU Lesser General Public License | |||
13 | * as published by the Free Software Foundation; either version 2.1 | |||
14 | * of the License, or (at your option) any later version. | |||
15 | * | |||
16 | * GROMACS is distributed in the hope that it will be useful, | |||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
19 | * Lesser General Public License for more details. | |||
20 | * | |||
21 | * You should have received a copy of the GNU Lesser General Public | |||
22 | * License along with GROMACS; if not, see | |||
23 | * http://www.gnu.org/licenses, or write to the Free Software Foundation, | |||
24 | * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |||
25 | * | |||
26 | * If you want to redistribute modifications to GROMACS, please | |||
27 | * consider that scientific software is very special. Version | |||
28 | * control is crucial - bugs must be traceable. We will be happy to | |||
29 | * consider code for inclusion in the official distribution, but | |||
30 | * derived work must not be called official GROMACS. Details are found | |||
31 | * in the README & COPYING files - if they are missing, get the | |||
32 | * official version at http://www.gromacs.org. | |||
33 | * | |||
34 | * To help us fund GROMACS development, we humbly ask that you cite | |||
35 | * the research papers on the package. Check out http://www.gromacs.org. | |||
36 | */ | |||
37 | #ifdef HAVE_CONFIG_H1 | |||
38 | #include <config.h> | |||
39 | #endif | |||
40 | ||||
41 | #include <math.h> | |||
42 | ||||
43 | #include "perf_est.h" | |||
44 | #include "physics.h" | |||
45 | #include "gromacs/math/vec.h" | |||
46 | #include "mtop_util.h" | |||
47 | #include "types/commrec.h" | |||
48 | #include "nbnxn_search.h" | |||
49 | #include "nbnxn_consts.h" | |||
50 | ||||
51 | ||||
52 | /* Computational cost of bonded, non-bonded and PME calculations. | |||
53 | * This will be machine dependent. | |||
54 | * The numbers here are accurate for Intel Core2 and AMD Athlon 64 | |||
55 | * in single precision. In double precision PME mesh is slightly cheaper, | |||
56 | * although not so much that the numbers need to be adjusted. | |||
57 | */ | |||
58 | ||||
59 | /* Cost of a pair interaction in the "group" cut-off scheme */ | |||
60 | #define C_GR_FQ1.5 1.5 | |||
61 | #define C_GR_QLJ_CUT1.5 1.5 | |||
62 | #define C_GR_QLJ_TAB2.0 2.0 | |||
63 | #define C_GR_LJ_CUT1.0 1.0 | |||
64 | #define C_GR_LJ_TAB1.75 1.75 | |||
65 | /* Cost of 1 water with one Q/LJ atom */ | |||
66 | #define C_GR_QLJW_CUT2.0 2.0 | |||
67 | #define C_GR_QLJW_TAB2.25 2.25 | |||
68 | /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ | |||
69 | #define C_GR_QW1.75 1.75 | |||
70 | ||||
71 | /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */ | |||
72 | #define C_VT_LJ0.30 0.30 | |||
73 | #define C_VT_QRF_LJ0.40 0.40 | |||
74 | #define C_VT_QRF0.30 0.30 | |||
75 | #define C_VT_QEXP_LJ0.55 0.55 | |||
76 | #define C_VT_QEXP0.50 0.50 | |||
77 | /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */ | |||
78 | #define C_VT_LJEXP_ADD0.20 0.20 | |||
79 | ||||
80 | /* Cost of PME, with all components running with SSE instructions */ | |||
81 | /* Cost of particle reordering and redistribution */ | |||
82 | #define C_PME_REDIST12.0 12.0 | |||
83 | /* Cost of q spreading and force interpolation per charge (mainly memory) */ | |||
84 | #define C_PME_SPREAD0.30 0.30 | |||
85 | /* Cost of fft's, will be multiplied with N log(N) */ | |||
86 | #define C_PME_FFT0.20 0.20 | |||
87 | /* Cost of pme_solve, will be multiplied with N */ | |||
88 | #define C_PME_SOLVE0.50 0.50 | |||
89 | ||||
90 | /* Cost of a bonded interaction divided by the number of (pbc_)dx nrequired */ | |||
91 | #define C_BOND5.0 5.0 | |||
92 | ||||
93 | int n_bonded_dx(gmx_mtop_t *mtop, gmx_bool bExcl) | |||
94 | { | |||
95 | int mb, nmol, ftype, ndxb, ndx_excl; | |||
96 | int ndx; | |||
97 | gmx_moltype_t *molt; | |||
98 | ||||
99 | /* Count the number of pbc_rvec_sub calls required for bonded interactions. | |||
100 | * This number is also roughly proportional to the computational cost. | |||
101 | */ | |||
102 | ndx = 0; | |||
103 | ndx_excl = 0; | |||
104 | for (mb = 0; mb < mtop->nmolblock; mb++) | |||
105 | { | |||
106 | molt = &mtop->moltype[mtop->molblock[mb].type]; | |||
107 | nmol = mtop->molblock[mb].nmol; | |||
108 | for (ftype = 0; ftype < F_NRE; ftype++) | |||
109 | { | |||
110 | if (interaction_function[ftype].flags & IF_BOND1) | |||
111 | { | |||
112 | switch (ftype) | |||
113 | { | |||
114 | case F_POSRES: | |||
115 | case F_FBPOSRES: ndxb = 1; break; | |||
116 | case F_CONNBONDS: ndxb = 0; break; | |||
117 | default: ndxb = NRAL(ftype)(interaction_function[(ftype)].nratoms) - 1; break; | |||
118 | } | |||
119 | ndx += nmol*ndxb*molt->ilist[ftype].nr/(1 + NRAL(ftype)(interaction_function[(ftype)].nratoms)); | |||
120 | } | |||
121 | } | |||
122 | if (bExcl) | |||
123 | { | |||
124 | ndx_excl += nmol*(molt->excls.nra - molt->atoms.nr)/2; | |||
125 | } | |||
126 | else | |||
127 | { | |||
128 | ndx_excl = 0; | |||
129 | } | |||
130 | } | |||
131 | ||||
132 | if (debug) | |||
133 | { | |||
134 | fprintf(debug, "ndx bonded %d exclusions %d\n", ndx, ndx_excl); | |||
135 | } | |||
136 | ||||
137 | ndx += ndx_excl; | |||
138 | ||||
139 | return ndx; | |||
140 | } | |||
141 | ||||
142 | static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, | |||
143 | int *nq_tot, int *nlj_tot, | |||
144 | double *cost_pp, | |||
145 | gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed) | |||
146 | { | |||
147 | t_atom *atom; | |||
148 | int mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj; | |||
149 | gmx_bool bBHAM, bLJcut, bWater, bQ, bLJ; | |||
150 | int nw, nqlj, nq, nlj; | |||
151 | float fq, fqlj, flj, fljtab, fqljw, fqw; | |||
152 | t_iparams *iparams; | |||
153 | gmx_moltype_t *molt; | |||
154 | ||||
155 | bBHAM = (mtop->ffparams.functype[0] == F_BHAM); | |||
156 | ||||
157 | bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM); | |||
158 | ||||
159 | /* Computational cost of bonded, non-bonded and PME calculations. | |||
160 | * This will be machine dependent. | |||
161 | * The numbers here are accurate for Intel Core2 and AMD Athlon 64 | |||
162 | * in single precision. In double precision PME mesh is slightly cheaper, | |||
163 | * although not so much that the numbers need to be adjusted. | |||
164 | */ | |||
165 | fq = C_GR_FQ1.5; | |||
166 | fqlj = (bLJcut ? C_GR_QLJ_CUT1.5 : C_GR_QLJ_TAB2.0); | |||
167 | flj = (bLJcut ? C_GR_LJ_CUT1.0 : C_GR_LJ_TAB1.75); | |||
168 | /* Cost of 1 water with one Q/LJ atom */ | |||
169 | fqljw = (bLJcut ? C_GR_QLJW_CUT2.0 : C_GR_QLJW_TAB2.25); | |||
170 | /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ | |||
171 | fqw = C_GR_QW1.75; | |||
172 | ||||
173 | iparams = mtop->ffparams.iparams; | |||
174 | atnr = mtop->ffparams.atnr; | |||
175 | nw = 0; | |||
176 | nqlj = 0; | |||
177 | nq = 0; | |||
178 | nlj = 0; | |||
179 | *bChargePerturbed = FALSE0; | |||
180 | for (mb = 0; mb < mtop->nmolblock; mb++) | |||
181 | { | |||
182 | molt = &mtop->moltype[mtop->molblock[mb].type]; | |||
183 | atom = molt->atoms.atom; | |||
184 | nmol = mtop->molblock[mb].nmol; | |||
185 | a = 0; | |||
186 | for (cg = 0; cg < molt->cgs.nr; cg++) | |||
187 | { | |||
188 | bWater = !bBHAM; | |||
189 | ncqlj = 0; | |||
190 | ncq = 0; | |||
191 | nclj = 0; | |||
192 | a0 = a; | |||
193 | while (a < molt->cgs.index[cg+1]) | |||
194 | { | |||
195 | bQ = (atom[a].q != 0 || atom[a].qB != 0); | |||
196 | bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 || | |||
197 | iparams[(atnr+1)*atom[a].type].lj.c12 != 0); | |||
198 | if (atom[a].q != atom[a].qB) | |||
199 | { | |||
200 | *bChargePerturbed = TRUE1; | |||
201 | } | |||
202 | if (atom[a].type != atom[a].typeB) | |||
203 | { | |||
204 | *bTypePerturbed = TRUE1; | |||
205 | } | |||
206 | /* This if this atom fits into water optimization */ | |||
207 | if (!((a == a0 && bQ && bLJ) || | |||
208 | (a == a0+1 && bQ && !bLJ) || | |||
209 | (a == a0+2 && bQ && !bLJ && atom[a].q == atom[a-1].q) || | |||
210 | (a == a0+3 && !bQ && bLJ))) | |||
211 | { | |||
212 | bWater = FALSE0; | |||
213 | } | |||
214 | if (bQ && bLJ) | |||
215 | { | |||
216 | ncqlj++; | |||
217 | } | |||
218 | else | |||
219 | { | |||
220 | if (bQ) | |||
221 | { | |||
222 | ncq++; | |||
223 | } | |||
224 | if (bLJ) | |||
225 | { | |||
226 | nclj++; | |||
227 | } | |||
228 | } | |||
229 | a++; | |||
230 | } | |||
231 | if (bWater) | |||
232 | { | |||
233 | nw += nmol; | |||
234 | } | |||
235 | else | |||
236 | { | |||
237 | nqlj += nmol*ncqlj; | |||
238 | nq += nmol*ncq; | |||
239 | nlj += nmol*nclj; | |||
240 | } | |||
241 | } | |||
242 | } | |||
243 | ||||
244 | *nq_tot = nq + nqlj + nw*3; | |||
245 | *nlj_tot = nlj + nqlj + nw; | |||
246 | ||||
247 | if (debug) | |||
248 | { | |||
249 | fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj); | |||
250 | } | |||
251 | ||||
252 | /* For the PP non-bonded cost it is (unrealistically) assumed | |||
253 | * that all atoms are distributed homogeneously in space. | |||
254 | * Factor 3 is used because a water molecule has 3 atoms | |||
255 | * (and TIP4P effectively has 3 interactions with (water) atoms)). | |||
256 | */ | |||
257 | *cost_pp = 0.5*(fqljw*nw*nqlj + | |||
258 | fqw *nw*(3*nw + nq) + | |||
259 | fqlj *nqlj*nqlj + | |||
260 | fq *nq*(3*nw + nqlj + nq) + | |||
261 | flj *nlj*(nw + nqlj + nlj)) | |||
262 | *4/3*M_PI3.14159265358979323846*ir->rlist*ir->rlist*ir->rlist/det(box); | |||
263 | } | |||
264 | ||||
265 | static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, | |||
266 | int *nq_tot, int *nlj_tot, | |||
267 | double *cost_pp, | |||
268 | gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed) | |||
269 | { | |||
270 | t_atom *atom; | |||
271 | int mb, nmol, atnr, cg, a, a0, nqlj, nq, nlj; | |||
272 | gmx_bool bQRF; | |||
273 | t_iparams *iparams; | |||
274 | gmx_moltype_t *molt; | |||
275 | real r_eff; | |||
276 | double c_qlj, c_q, c_lj; | |||
277 | double nat; | |||
278 | /* Conversion factor for reference vs SIMD kernel performance. | |||
279 | * The factor is about right for SSE2/4, but should be 2 higher for AVX256. | |||
280 | */ | |||
281 | #ifdef GMX_DOUBLE | |||
282 | const real nbnxn_refkernel_fac = 4.0; | |||
283 | #else | |||
284 | const real nbnxn_refkernel_fac = 8.0; | |||
285 | #endif | |||
286 | ||||
287 | bQRF = (EEL_RF(ir->coulombtype)((ir->coulombtype) == eelRF || (ir->coulombtype) == eelGRF || (ir->coulombtype) == eelRF_NEC || (ir->coulombtype) == eelRF_ZERO ) || ir->coulombtype == eelCUT); | |||
288 | ||||
289 | iparams = mtop->ffparams.iparams; | |||
290 | atnr = mtop->ffparams.atnr; | |||
291 | nqlj = 0; | |||
292 | nq = 0; | |||
293 | *bChargePerturbed = FALSE0; | |||
294 | for (mb = 0; mb < mtop->nmolblock; mb++) | |||
295 | { | |||
296 | molt = &mtop->moltype[mtop->molblock[mb].type]; | |||
297 | atom = molt->atoms.atom; | |||
298 | nmol = mtop->molblock[mb].nmol; | |||
299 | a = 0; | |||
300 | for (a = 0; a < molt->atoms.nr; a++) | |||
301 | { | |||
302 | if (atom[a].q != 0 || atom[a].qB != 0) | |||
303 | { | |||
304 | if (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 || | |||
305 | iparams[(atnr+1)*atom[a].type].lj.c12 != 0) | |||
306 | { | |||
307 | nqlj += nmol; | |||
308 | } | |||
309 | else | |||
310 | { | |||
311 | nq += nmol; | |||
312 | } | |||
313 | } | |||
314 | if (atom[a].q != atom[a].qB) | |||
315 | { | |||
316 | *bChargePerturbed = TRUE1; | |||
317 | } | |||
318 | if (atom[a].type != atom[a].typeB) | |||
319 | { | |||
320 | *bTypePerturbed = TRUE1; | |||
321 | } | |||
322 | } | |||
323 | } | |||
324 | ||||
325 | nlj = mtop->natoms - nqlj - nq; | |||
326 | ||||
327 | *nq_tot = nqlj + nq; | |||
328 | *nlj_tot = nqlj + nlj; | |||
329 | ||||
330 | /* Effective cut-off for cluster pair list of 4x4 atoms */ | |||
331 | r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE4, mtop->natoms/det(box)); | |||
332 | ||||
333 | if (debug) | |||
334 | { | |||
335 | fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n", | |||
336 | nqlj, nq, nlj, ir->rlist, r_eff); | |||
337 | } | |||
338 | ||||
339 | /* Determine the cost per pair interaction */ | |||
340 | c_qlj = (bQRF ? C_VT_QRF_LJ0.40 : C_VT_QEXP_LJ0.55); | |||
341 | c_q = (bQRF ? C_VT_QRF0.30 : C_VT_QEXP0.50); | |||
342 | c_lj = C_VT_LJ0.30; | |||
343 | if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME)) | |||
344 | { | |||
345 | c_qlj += C_VT_LJEXP_ADD0.20; | |||
346 | c_lj += C_VT_LJEXP_ADD0.20; | |||
347 | } | |||
348 | if (EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME) && ir->ljpme_combination_rule == eljpmeLB) | |||
349 | { | |||
350 | /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */ | |||
351 | c_qlj *= nbnxn_refkernel_fac; | |||
352 | c_q *= nbnxn_refkernel_fac; | |||
353 | c_lj *= nbnxn_refkernel_fac; | |||
354 | } | |||
355 | ||||
356 | /* For the PP non-bonded cost it is (unrealistically) assumed | |||
357 | * that all atoms are distributed homogeneously in space. | |||
358 | */ | |||
359 | /* Convert mtop->natoms to double to avoid int overflow */ | |||
360 | nat = mtop->natoms; | |||
361 | *cost_pp = 0.5*nat*(nqlj*c_qlj + nq*c_q + nlj*c_lj) | |||
362 | *4/3*M_PI3.14159265358979323846*r_eff*r_eff*r_eff/det(box); | |||
363 | } | |||
364 | ||||
365 | float pme_load_estimate(gmx_mtop_t *mtop, t_inputrec *ir, matrix box) | |||
366 | { | |||
367 | t_atom *atom; | |||
368 | int mb, nmol, atnr, cg, a, a0, nq_tot, nlj_tot, f; | |||
369 | gmx_bool bBHAM, bLJcut, bChargePerturbed, bTypePerturbed; | |||
| ||||
370 | gmx_bool bWater, bQ, bLJ; | |||
371 | double cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme; | |||
372 | float ratio; | |||
373 | t_iparams *iparams; | |||
374 | gmx_moltype_t *molt; | |||
375 | ||||
376 | /* Computational cost of bonded, non-bonded and PME calculations. | |||
377 | * This will be machine dependent. | |||
378 | * The numbers here are accurate for Intel Core2 and AMD Athlon 64 | |||
379 | * in single precision. In double precision PME mesh is slightly cheaper, | |||
380 | * although not so much that the numbers need to be adjusted. | |||
381 | */ | |||
382 | ||||
383 | iparams = mtop->ffparams.iparams; | |||
384 | atnr = mtop->ffparams.atnr; | |||
385 | ||||
386 | cost_bond = C_BOND5.0*n_bonded_dx(mtop, TRUE1); | |||
387 | ||||
388 | if (ir->cutoff_scheme == ecutsGROUP) | |||
389 | { | |||
390 | pp_group_load(mtop, ir, box, | |||
391 | &nq_tot, &nlj_tot, &cost_pp, | |||
392 | &bChargePerturbed, &bTypePerturbed); | |||
393 | } | |||
394 | else | |||
395 | { | |||
396 | pp_verlet_load(mtop, ir, box, | |||
397 | &nq_tot, &nlj_tot, &cost_pp, | |||
398 | &bChargePerturbed, &bTypePerturbed); | |||
399 | } | |||
400 | ||||
401 | cost_redist = 0; | |||
402 | cost_spread = 0; | |||
403 | cost_fft = 0; | |||
404 | cost_solve = 0; | |||
405 | ||||
406 | if (EEL_PME(ir->coulombtype)((ir->coulombtype) == eelPME || (ir->coulombtype) == eelPMESWITCH || (ir->coulombtype) == eelPMEUSER || (ir->coulombtype ) == eelPMEUSERSWITCH || (ir->coulombtype) == eelP3M_AD)) | |||
407 | { | |||
408 | f = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1); | |||
409 | cost_redist += C_PME_REDIST12.0*nq_tot; | |||
410 | cost_spread += f*C_PME_SPREAD0.30*nq_tot*pow(ir->pme_order, 3); | |||
411 | cost_fft += f*C_PME_FFT0.20*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); | |||
412 | cost_solve += f*C_PME_SOLVE0.50*ir->nkx*ir->nky*ir->nkz; | |||
413 | } | |||
414 | ||||
415 | if (EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME)) | |||
416 | { | |||
417 | f = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1); | |||
| ||||
418 | if (ir->ljpme_combination_rule == eljpmeLB) | |||
419 | { | |||
420 | /* LB combination rule: we have 7 mesh terms */ | |||
421 | f *= 7; | |||
422 | } | |||
423 | cost_redist += C_PME_REDIST12.0*nlj_tot; | |||
424 | cost_spread += f*C_PME_SPREAD0.30*nlj_tot*pow(ir->pme_order, 3); | |||
425 | cost_fft += f*C_PME_FFT0.20*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); | |||
426 | cost_solve += f*C_PME_SOLVE0.50*ir->nkx*ir->nky*ir->nkz; | |||
427 | } | |||
428 | ||||
429 | cost_pme = cost_redist + cost_spread + cost_fft + cost_solve; | |||
430 | ||||
431 | ratio = cost_pme/(cost_bond + cost_pp + cost_pme); | |||
432 | ||||
433 | if (debug) | |||
434 | { | |||
435 | fprintf(debug, | |||
436 | "cost_bond %f\n" | |||
437 | "cost_pp %f\n" | |||
438 | "cost_redist %f\n" | |||
439 | "cost_spread %f\n" | |||
440 | "cost_fft %f\n" | |||
441 | "cost_solve %f\n", | |||
442 | cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve); | |||
443 | ||||
444 | fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio); | |||
445 | } | |||
446 | ||||
447 | return ratio; | |||
448 | } |