2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/mdlib/nbnxn_consts.h"
46 #include "gromacs/mdlib/nbnxn_search.h"
47 #include "gromacs/mdtypes/commrec.h"
48 #include "gromacs/mdtypes/md_enums.h"
49 #include "gromacs/topology/ifunc.h"
50 #include "gromacs/topology/topology.h"
51 #include "gromacs/utility/fatalerror.h"
53 /* Computational cost of bonded, non-bonded and PME calculations.
54 * This will be machine dependent.
55 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
56 * in single precision. In double precision PME mesh is slightly cheaper,
57 * although not so much that the numbers need to be adjusted.
60 /* Cost of a pair interaction in the "group" cut-off scheme */
62 #define C_GR_QLJ_CUT 1.5
63 #define C_GR_QLJ_TAB 2.0
64 #define C_GR_LJ_CUT 1.0
65 #define C_GR_LJ_TAB 1.75
66 /* Cost of 1 water with one Q/LJ atom */
67 #define C_GR_QLJW_CUT 2.0
68 #define C_GR_QLJW_TAB 2.25
69 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
72 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
74 #define C_VT_QRF_LJ 0.40
76 #define C_VT_QEXP_LJ 0.55
77 #define C_VT_QEXP 0.50
78 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
79 #define C_VT_LJEXP_ADD 0.20
81 /* Cost of PME, with all components running with SSE instructions */
82 /* Cost of particle reordering and redistribution */
83 #define C_PME_REDIST 12.0
84 /* Cost of q spreading and force interpolation per charge (mainly memory) */
85 #define C_PME_SPREAD 0.30
86 /* Cost of fft's, will be multiplied with N log(N) */
87 #define C_PME_FFT 0.20
88 /* Cost of pme_solve, will be multiplied with N */
89 #define C_PME_SOLVE 0.50
91 /* Cost of a bonded interaction divided by the number of (pbc_)dx nrequired */
94 int n_bonded_dx(const gmx_mtop_t *mtop, gmx_bool bExcl)
96 int mb, nmol, ftype, ndxb, ndx_excl;
98 const gmx_moltype_t *molt;
100 /* Count the number of pbc_rvec_sub calls required for bonded interactions.
101 * This number is also roughly proportional to the computational cost.
105 #if defined _ICC && __ICC == 1400 || defined __ICL && __ICL == 1400
106 #pragma novector /* Work-around for incorrect vectorization */
108 for (mb = 0; mb < mtop->nmolblock; mb++)
110 molt = &mtop->moltype[mtop->molblock[mb].type];
111 nmol = mtop->molblock[mb].nmol;
112 for (ftype = 0; ftype < F_NRE; ftype++)
114 if (interaction_function[ftype].flags & IF_BOND)
119 case F_FBPOSRES: ndxb = 1; break;
120 case F_CONNBONDS: ndxb = 0; break;
121 default: ndxb = NRAL(ftype) - 1; break;
123 ndx += nmol*ndxb*molt->ilist[ftype].nr/(1 + NRAL(ftype));
128 ndx_excl += nmol*(molt->excls.nra - molt->atoms.nr)/2;
138 fprintf(debug, "ndx bonded %d exclusions %d\n", ndx, ndx_excl);
146 static void pp_group_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
148 int *nq_tot, int *nlj_tot,
150 gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
153 int mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj;
154 gmx_bool bBHAM, bLJcut, bWater, bQ, bLJ;
155 int nw, nqlj, nq, nlj;
156 float fq, fqlj, flj, fqljw, fqw;
160 bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
162 bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM);
164 /* Computational cost of bonded, non-bonded and PME calculations.
165 * This will be machine dependent.
166 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
167 * in single precision. In double precision PME mesh is slightly cheaper,
168 * although not so much that the numbers need to be adjusted.
171 fqlj = (bLJcut ? C_GR_QLJ_CUT : C_GR_QLJ_TAB);
172 flj = (bLJcut ? C_GR_LJ_CUT : C_GR_LJ_TAB);
173 /* Cost of 1 water with one Q/LJ atom */
174 fqljw = (bLJcut ? C_GR_QLJW_CUT : C_GR_QLJW_TAB);
175 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
178 iparams = mtop->ffparams.iparams;
179 atnr = mtop->ffparams.atnr;
184 *bChargePerturbed = FALSE;
185 *bTypePerturbed = FALSE;
186 for (mb = 0; mb < mtop->nmolblock; mb++)
188 molt = &mtop->moltype[mtop->molblock[mb].type];
189 atom = molt->atoms.atom;
190 nmol = mtop->molblock[mb].nmol;
192 for (cg = 0; cg < molt->cgs.nr; cg++)
199 while (a < molt->cgs.index[cg+1])
201 bQ = (atom[a].q != 0 || atom[a].qB != 0);
202 bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 ||
203 iparams[(atnr+1)*atom[a].type].lj.c12 != 0);
204 if (atom[a].q != atom[a].qB)
206 *bChargePerturbed = TRUE;
208 if (atom[a].type != atom[a].typeB)
210 *bTypePerturbed = TRUE;
212 /* This if this atom fits into water optimization */
213 if (!((a == a0 && bQ && bLJ) ||
214 (a == a0+1 && bQ && !bLJ) ||
215 (a == a0+2 && bQ && !bLJ && atom[a].q == atom[a-1].q) ||
216 (a == a0+3 && !bQ && bLJ)))
250 *nq_tot = nq + nqlj + nw*3;
251 *nlj_tot = nlj + nqlj + nw;
255 fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj);
258 /* For the PP non-bonded cost it is (unrealistically) assumed
259 * that all atoms are distributed homogeneously in space.
260 * Factor 3 is used because a water molecule has 3 atoms
261 * (and TIP4P effectively has 3 interactions with (water) atoms)).
263 *cost_pp = 0.5*(fqljw*nw*nqlj +
264 fqw *nw*(3*nw + nq) +
266 fq *nq*(3*nw + nqlj + nq) +
267 flj *nlj*(nw + nqlj + nlj))
268 *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box);
271 static void pp_verlet_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
273 int *nq_tot, int *nlj_tot,
275 gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
278 int mb, nmol, atnr, a, nqlj, nq, nlj;
283 double c_qlj, c_q, c_lj;
285 /* Conversion factor for reference vs SIMD kernel performance.
286 * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
289 const real nbnxn_refkernel_fac = 4.0;
291 const real nbnxn_refkernel_fac = 8.0;
294 bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);
296 iparams = mtop->ffparams.iparams;
297 atnr = mtop->ffparams.atnr;
300 *bChargePerturbed = FALSE;
301 *bTypePerturbed = FALSE;
302 for (mb = 0; mb < mtop->nmolblock; mb++)
304 molt = &mtop->moltype[mtop->molblock[mb].type];
305 atom = molt->atoms.atom;
306 nmol = mtop->molblock[mb].nmol;
307 for (a = 0; a < molt->atoms.nr; a++)
309 if (atom[a].q != 0 || atom[a].qB != 0)
311 if (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 ||
312 iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
321 if (atom[a].q != atom[a].qB)
323 *bChargePerturbed = TRUE;
325 if (atom[a].type != atom[a].typeB)
327 *bTypePerturbed = TRUE;
332 nlj = mtop->natoms - nqlj - nq;
335 *nlj_tot = nqlj + nlj;
337 /* Effective cut-off for cluster pair list of 4x4 atoms */
338 r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE, mtop->natoms/det(box));
342 fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n",
343 nqlj, nq, nlj, ir->rlist, r_eff);
346 /* Determine the cost per pair interaction */
347 c_qlj = (bQRF ? C_VT_QRF_LJ : C_VT_QEXP_LJ);
348 c_q = (bQRF ? C_VT_QRF : C_VT_QEXP);
350 if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
352 c_qlj += C_VT_LJEXP_ADD;
353 c_lj += C_VT_LJEXP_ADD;
355 if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
357 /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
358 c_qlj *= nbnxn_refkernel_fac;
359 c_q *= nbnxn_refkernel_fac;
360 c_lj *= nbnxn_refkernel_fac;
363 /* For the PP non-bonded cost it is (unrealistically) assumed
364 * that all atoms are distributed homogeneously in space.
366 /* Convert mtop->natoms to double to avoid int overflow */
368 *cost_pp = 0.5*nat*(nqlj*c_qlj + nq*c_q + nlj*c_lj)
369 *4/3*M_PI*r_eff*r_eff*r_eff/det(box);
372 float pme_load_estimate(const gmx_mtop_t *mtop, const t_inputrec *ir,
375 int nq_tot, nlj_tot, f;
376 gmx_bool bChargePerturbed, bTypePerturbed;
377 double cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
380 /* Computational cost of bonded, non-bonded and PME calculations.
381 * This will be machine dependent.
382 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
383 * in single precision. In double precision PME mesh is slightly cheaper,
384 * although not so much that the numbers need to be adjusted.
387 cost_bond = C_BOND*n_bonded_dx(mtop, TRUE);
389 if (ir->cutoff_scheme == ecutsGROUP)
391 pp_group_load(mtop, ir, box,
392 &nq_tot, &nlj_tot, &cost_pp,
393 &bChargePerturbed, &bTypePerturbed);
397 pp_verlet_load(mtop, ir, box,
398 &nq_tot, &nlj_tot, &cost_pp,
399 &bChargePerturbed, &bTypePerturbed);
407 if (EEL_PME(ir->coulombtype))
409 f = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1);
410 cost_redist += C_PME_REDIST*nq_tot;
411 cost_spread += f*C_PME_SPREAD*nq_tot*gmx::power3(ir->pme_order);
412 cost_fft += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*std::log(static_cast<real>(ir->nkx*ir->nky*ir->nkz));
413 cost_solve += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz;
416 if (EVDW_PME(ir->vdwtype))
418 f = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1);
419 if (ir->ljpme_combination_rule == eljpmeLB)
421 /* LB combination rule: we have 7 mesh terms */
424 cost_redist += C_PME_REDIST*nlj_tot;
425 cost_spread += f*C_PME_SPREAD*nlj_tot*gmx::power3(ir->pme_order);
426 cost_fft += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*std::log(static_cast<real>(ir->nkx*ir->nky*ir->nkz));
427 cost_solve += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz;
430 cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
432 ratio = cost_pme/(cost_bond + cost_pp + cost_pme);
443 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
445 fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);