2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/utilities.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/mdlib/nbnxn_consts.h"
47 #include "gromacs/mdlib/nbnxn_search.h"
48 #include "gromacs/mdtypes/commrec.h"
49 #include "gromacs/mdtypes/inputrec.h"
50 #include "gromacs/mdtypes/md_enums.h"
51 #include "gromacs/simd/simd.h"
52 #include "gromacs/topology/ifunc.h"
53 #include "gromacs/topology/topology.h"
54 #include "gromacs/utility/fatalerror.h"
56 /* Computational cost of bonded, non-bonded and PME calculations.
57 * This will be machine dependent.
58 * The numbers are only used for estimating the relative cost of PME vs PP,
59 * so only relative numbers matter.
60 * The numbers here are accurate cycle counts for Haswell in single precision
61 * compiled with gcc5.2. A correction factor for other architectures is given
62 * by simd_cycle_factor().
63 * In double precision PME mesh is slightly cheaper, although not so much
64 * that the numbers need to be adjusted.
 */
67 /* Cost of a pair interaction in the "group" cut-off scheme */
/* Naming: "fq" = charge-charge only, "qlj" = charge+LJ, "lj" = LJ only.
 * "_cut" applies with a plain LJ cut-off, "_tab" with tabulated kernels
 * (selected via bLJcut in pp_group_load() below).
 */
68 static const double c_group_fq = 18.0;
69 static const double c_group_qlj_cut = 18.0;
70 static const double c_group_qlj_tab = 24.0;
71 static const double c_group_lj_cut = 12.0;
72 static const double c_group_lj_tab = 21.0;
73 /* Cost of 1 water with one Q/LJ atom */
74 static const double c_group_qljw_cut = 24.0;
75 static const double c_group_qljw_tab = 27.0;
76 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
77 static const double c_group_qw = 21.0;
79 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
/* Naming: "qrf" = reaction-field (or plain cut-off) electrostatics,
 * "qexp" = Ewald (exponential) electrostatics; the choice is made via
 * bQRF in pp_verlet_load() below.
 */
80 static const double c_nbnxn_lj = 2.5;
81 static const double c_nbnxn_qrf_lj = 2.9;
82 static const double c_nbnxn_qrf = 2.4;
83 static const double c_nbnxn_qexp_lj = 4.2;
84 static const double c_nbnxn_qexp = 3.8;
85 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
86 static const double c_nbnxn_ljexp_add = 1.0;
88 /* Cost of the different components of PME. */
89 /* Cost of particle reordering and redistribution (no SIMD correction).
90 * This will be zero without MPI and can be very high with load imbalance.
91 * Thus we use an approximate value for medium parallelization.
 */
93 static const double c_pme_redist = 100.0;
94 /* Cost of q spreading and force interpolation per charge. This part almost
95 * doesn't accelerate with SIMD, so we don't use SIMD correction.
 */
97 static const double c_pme_spread = 5.0;
98 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
99 * Without MPI the number is 2-3, depending on grid factors and thread count.
100 * We take the high limit to be on the safe side and account for some MPI
101 * communication cost, which will dominate at high parallelization.
 */
103 static const double c_pme_fft = 3.0;
104 /* Cost of pme_solve, will be multiplied with N */
105 static const double c_pme_solve = 9.0;
107 /* Cost of a bonded interaction divided by the number of distance calculations
108 * required in one interaction. The actual cost is nearly proportional to this.
 */
110 static const double c_bond = 25.0;
/* Whether SIMD force kernels are compiled in; used to pick the SIMD-corrected
 * cost estimates throughout this file. */
113 #if GMX_SIMD_HAVE_REAL
114 static const gmx_bool bHaveSIMD = TRUE;
/* NOTE(review): the #else / #endif lines of this conditional are not visible
 * in this excerpt; the line below is the non-SIMD branch. */
116 static const gmx_bool bHaveSIMD = FALSE;
119 /* Gives a correction factor for the currently compiled SIMD implementations
120 * versus the reference used for the coefficients above (8-wide SIMD with FMA).
121 * bUseSIMD sets whether we are asking for plain-C (FALSE) or SIMD (TRUE) code.
 */
123 static double simd_cycle_factor(gmx_bool bUseSIMD)
125 /* The (average) ratio of the time taken by plain-C force calculations
126 * relative to SIMD versions, for the reference platform Haswell:
127 * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
128 * This factor is used for normalization in simd_cycle_factor().
 */
130 const double simd_cycle_no_simd = 5.0;
/* NOTE(review): the declaration of `speedup` and several brace/#else/#endif
 * lines of this function are not visible in this excerpt. */
133 #if GMX_SIMD_HAVE_REAL
136 /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
137 * The actual speed-up depends very much on gather+scatter overhead,
138 * which is different for different bonded and non-bonded kernels.
139 * As a rough, but actually not bad, approximation we use a sqrt
140 * dependence on the width which gives a factor 4 for width=8.
 */
142 speedup = std::sqrt(2.0*GMX_SIMD_REAL_WIDTH);
143 #if GMX_SIMD_HAVE_FMA
144 /* FMA tends to give a bit more speedup */
/* Requesting SIMD timings in a build without SIMD is an internal error: */
155 gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
157 /* No SIMD, no speedup */
161 /* Return speed compared to the reference (Haswell).
162 * For x86 SIMD, the nbnxn kernels are relatively much slower on
163 * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
164 * PME load estimate on SB/IB, which is erring on the safe side.
 */
166 return simd_cycle_no_simd/speedup;
/* Counts the number of distance (pbc_rvec_sub) calculations required for all
 * bonded interactions in the topology. The totals are split into the part
 * handled by plain-C kernels (*ndistance_c) and by SIMD kernels
 * (*ndistance_simd); either output pointer may be nullptr.
 */
169 void count_bonded_distances(const gmx_mtop_t *mtop, const t_inputrec *ir,
170 double *ndistance_c, double *ndistance_simd)
173 double nonsimd_step_frac;
176 double ndtot_c, ndtot_simd;
177 #if GMX_SIMD_HAVE_REAL
178 gmx_bool bSimdBondeds = TRUE;
180 gmx_bool bSimdBondeds = FALSE;
/* With the group scheme, exclusion forces need distances unless a full
 * electrostatics method (e.g. PME) handles exclusions itself: */
183 bExcl = (ir->cutoff_scheme == ecutsGROUP && inputrecExclForces(ir)
184 && !EEL_FULL(ir->coulombtype))
188 /* We only have SIMD versions of these bondeds without energy and
189 * without shift-forces, we take that into account here.
 */
/* nonsimd_step_frac = fraction of MD steps that need energies (and thus the
 * plain-C kernels): energy-calculation steps and pressure-coupling steps. */
191 if (ir->nstcalcenergy > 0)
193 nonsimd_step_frac = 1.0/ir->nstcalcenergy;
197 nonsimd_step_frac = 0;
199 if (ir->epc != epcNO && 1.0/ir->nstpcouple > nonsimd_step_frac)
201 nonsimd_step_frac = 1.0/ir->nstpcouple;
/* presumably the !bSimdBondeds branch: every step uses plain C —
 * surrounding control-flow lines are not visible in this excerpt. */
206 nonsimd_step_frac = 1;
209 /* Count the number of pbc_rvec_sub calls required for bonded interactions.
210 * This number is also roughly proportional to the computational cost.
 */
214 #if defined _ICC && __ICC == 1400 || defined __ICL && __ICL == 1400
215 #pragma novector /* Work-around for incorrect vectorization */
217 for (mb = 0; mb < mtop->nmolblock; mb++)
219 molt = &mtop->moltype[mtop->molblock[mb].type];
220 nmol = mtop->molblock[mb].nmol;
221 for (ftype = 0; ftype < F_NRE; ftype++)
225 if (interaction_function[ftype].flags & IF_BOND)
227 double nd_c, nd_simd;
231 /* For all interactions, except for the three exceptions
232 * in the switch below, #distances = #atoms - 1.
 */
242 /* These bonded potentially use SIMD */
/* Split distances: plain C on energy steps, SIMD on the remainder: */
247 nd_c = nonsimd_step_frac *(NRAL(ftype) - 1);
248 nd_simd = (1 - nonsimd_step_frac)*(NRAL(ftype) - 1);
251 nd_c = NRAL(ftype) - 1;
/* #interaction entries = ilist length / (1 + #atoms per interaction): */
254 nbonds = nmol*molt->ilist[ftype].nr/(1 + NRAL(ftype));
255 ndtot_c += nbonds*nd_c;
256 ndtot_simd += nbonds*nd_simd;
/* Each exclusion pair needs one distance; nra counts both directions and
 * includes self-exclusions, hence the subtraction and the factor 1/2: */
261 ndtot_c += nmol*(molt->excls.nra - molt->atoms.nr)/2;
267 fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
270 if (ndistance_c != nullptr)
272 *ndistance_c = ndtot_c;
274 if (ndistance_simd != nullptr)
276 *ndistance_simd = ndtot_simd;
/* Estimates the particle-particle non-bonded cost (*cost_pp) for the group
 * cut-off scheme, and returns totals of charged (*nq_tot) and LJ (*nlj_tot)
 * atoms plus whether any charge/type is free-energy perturbed.
 * NOTE(review): some parameter lines (box, cost_pp) and several code spans
 * of this function are not visible in this excerpt.
 */
280 static void pp_group_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
282 int *nq_tot, int *nlj_tot,
284 gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
287 int mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj;
288 gmx_bool bBHAM, bLJcut, bWater, bQ, bLJ;
289 int nw, nqlj, nq, nlj;
290 double fq, fqlj, flj, fqljw, fqw;
/* Buckingham force field if the first function type is F_BHAM: */
294 bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
/* Plain LJ cut-off kernels only apply for vdwtype=cut and non-Buckingham: */
296 bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM);
298 /* Computational cost of bonded, non-bonded and PME calculations.
299 * This will be machine dependent.
300 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
301 * in single precision. In double precision PME mesh is slightly cheaper,
302 * although not so much that the numbers need to be adjusted.
 */
305 fqlj = (bLJcut ? c_group_qlj_cut : c_group_qlj_tab);
306 flj = (bLJcut ? c_group_lj_cut : c_group_lj_tab);
307 /* Cost of 1 water with one Q/LJ atom */
308 fqljw = (bLJcut ? c_group_qljw_cut : c_group_qljw_tab);
309 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
312 iparams = mtop->ffparams.iparams;
313 atnr = mtop->ffparams.atnr;
318 *bChargePerturbed = FALSE;
319 *bTypePerturbed = FALSE;
/* Classify every atom of every charge group in the topology: */
320 for (mb = 0; mb < mtop->nmolblock; mb++)
322 molt = &mtop->moltype[mtop->molblock[mb].type];
323 atom = molt->atoms.atom;
324 nmol = mtop->molblock[mb].nmol;
326 for (cg = 0; cg < molt->cgs.nr; cg++)
333 while (a < molt->cgs.index[cg+1])
/* Does this atom carry charge (state A or B) and/or LJ parameters?
 * The diagonal of the LJ parameter matrix is at (atnr+1)*type. */
335 bQ = (atom[a].q != 0 || atom[a].qB != 0);
336 bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 ||
337 iparams[(atnr+1)*atom[a].type].lj.c12 != 0);
338 if (atom[a].q != atom[a].qB)
340 *bChargePerturbed = TRUE;
342 if (atom[a].type != atom[a].typeB)
344 *bTypePerturbed = TRUE;
346 /* Check whether this atom fits into the water optimization */
347 if (!((a == a0 && bQ && bLJ) ||
348 (a == a0+1 && bQ && !bLJ) ||
349 (a == a0+2 && bQ && !bLJ && atom[a].q == atom[a-1].q) ||
350 (a == a0+3 && !bQ && bLJ)))
/* NOTE(review): the per-charge-group counting code that fills nw, nqlj,
 * nq and nlj is not visible in this excerpt. */
384 *nq_tot = nq + nqlj + nw*3;
385 *nlj_tot = nlj + nqlj + nw;
389 fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj);
392 /* For the PP non-bonded cost it is (unrealistically) assumed
393 * that all atoms are distributed homogeneously in space.
394 * Factor 3 is used because a water molecule has 3 atoms
395 * (and TIP4P effectively has 3 interactions with (water) atoms)).
 */
397 *cost_pp = 0.5*(fqljw*nw*nqlj +
398 fqw *nw*(3*nw + nq) +
400 fq *nq*(3*nw + nqlj + nq) +
401 flj *nlj*(nw + nqlj + nlj))
402 *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box);
/* Normalize from reference (Haswell SIMD) cycles to this build: */
404 *cost_pp *= simd_cycle_factor(bHaveSIMD);
/* Estimates the particle-particle non-bonded cost (*cost_pp) for the Verlet
 * cut-off scheme; outputs mirror pp_group_load(). Atoms are classified as
 * charge+LJ (nqlj), charge-only (nq) or LJ-only (nlj).
 * NOTE(review): some parameter lines (box, cost_pp) and several code spans
 * are not visible in this excerpt.
 */
407 static void pp_verlet_load(const gmx_mtop_t *mtop, const t_inputrec *ir,
409 int *nq_tot, int *nlj_tot,
411 gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
414 int mb, nmol, atnr, a, nqlj, nq, nlj;
419 double c_qlj, c_q, c_lj;
422 /* Conversion factor for reference vs SIMD kernel performance.
423 * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 */
426 const real nbnxn_refkernel_fac = 4.0;
428 const real nbnxn_refkernel_fac = 8.0;
/* Reaction-field or plain cut-off electrostatics (otherwise Ewald): */
431 bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);
433 iparams = mtop->ffparams.iparams;
434 atnr = mtop->ffparams.atnr;
437 *bChargePerturbed = FALSE;
438 *bTypePerturbed = FALSE;
/* Classify all atoms and detect free-energy perturbation: */
439 for (mb = 0; mb < mtop->nmolblock; mb++)
441 molt = &mtop->moltype[mtop->molblock[mb].type];
442 atom = molt->atoms.atom;
443 nmol = mtop->molblock[mb].nmol;
444 for (a = 0; a < molt->atoms.nr; a++)
446 if (atom[a].q != 0 || atom[a].qB != 0)
/* Diagonal of the LJ parameter matrix is at (atnr+1)*type: */
448 if (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 ||
449 iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
458 if (atom[a].q != atom[a].qB)
460 *bChargePerturbed = TRUE;
462 if (atom[a].type != atom[a].typeB)
464 *bTypePerturbed = TRUE;
/* Atoms counted as neither charge+LJ nor charge-only are LJ-only: */
469 nlj = mtop->natoms - nqlj - nq;
472 *nlj_tot = nqlj + nlj;
474 /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
475 * This choice should match the one of pick_nbnxn_kernel_cpu().
476 * TODO: Make this function use pick_nbnxn_kernel_cpu().
 */
478 #if GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
483 r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop->natoms/det(box));
485 /* The average number of pairs per atom */
486 nppa = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop->natoms/det(box);
490 fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
491 nqlj, nq, nlj, ir->rlist, r_eff, nppa);
494 /* Determine the cost per pair interaction */
495 c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
496 c_q = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
/* Potential-switch and LJ-PME kernels cost extra per pair: */
498 if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
500 c_qlj += c_nbnxn_ljexp_add;
501 c_lj += c_nbnxn_ljexp_add;
503 if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
505 /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
506 c_qlj *= nbnxn_refkernel_fac;
507 c_q *= nbnxn_refkernel_fac;
508 c_lj *= nbnxn_refkernel_fac;
511 /* For the PP non-bonded cost it is (unrealistically) assumed
512 * that all atoms are distributed homogeneously in space.
 */
514 *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;
/* Normalize from reference (Haswell SIMD) cycles to this build: */
516 *cost_pp *= simd_cycle_factor(bHaveSIMD);
519 float pme_load_estimate(const gmx_mtop_t *mtop, const t_inputrec *ir,
523 gmx_bool bChargePerturbed, bTypePerturbed;
524 double ndistance_c, ndistance_simd;
525 double cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
528 /* Computational cost of bonded, non-bonded and PME calculations.
529 * This will be machine dependent.
530 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
531 * in single precision. In double precision PME mesh is slightly cheaper,
532 * although not so much that the numbers need to be adjusted.
535 count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
536 /* C_BOND is the cost for bonded interactions with SIMD implementations,
537 * so we need to scale the number of bonded interactions for which there
538 * are only C implementations to the number of SIMD equivalents.
540 cost_bond = c_bond*(ndistance_c *simd_cycle_factor(FALSE) +
541 ndistance_simd*simd_cycle_factor(bHaveSIMD));
543 if (ir->cutoff_scheme == ecutsGROUP)
545 pp_group_load(mtop, ir, box,
546 &nq_tot, &nlj_tot, &cost_pp,
547 &bChargePerturbed, &bTypePerturbed);
551 pp_verlet_load(mtop, ir, box,
552 &nq_tot, &nlj_tot, &cost_pp,
553 &bChargePerturbed, &bTypePerturbed);
561 if (EEL_PME(ir->coulombtype))
563 double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);
565 int f = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1);
566 cost_redist += c_pme_redist*nq_tot;
567 cost_spread += f*c_pme_spread*nq_tot*gmx::power3(ir->pme_order);
568 cost_fft += f*c_pme_fft*grid*std::log(grid)/std::log(2.0);
569 cost_solve += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
572 if (EVDW_PME(ir->vdwtype))
574 double grid = ir->nkx*ir->nky*((ir->nkz + 1)/2);
576 int f = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1);
577 if (ir->ljpme_combination_rule == eljpmeLB)
579 /* LB combination rule: we have 7 mesh terms */
582 cost_redist += c_pme_redist*nlj_tot;
583 cost_spread += f*c_pme_spread*nlj_tot*gmx::power3(ir->pme_order);
584 cost_fft += f*c_pme_fft*2*grid*std::log(grid)/std::log(2.0);
585 cost_solve += f*c_pme_solve*grid*simd_cycle_factor(bHaveSIMD);
588 cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
590 ratio = cost_pme/(cost_bond + cost_pp + cost_pme);
601 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
603 fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);