2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2012,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/utilities.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/mdtypes/commrec.h"
47 #include "gromacs/mdtypes/inputrec.h"
48 #include "gromacs/mdtypes/md_enums.h"
49 #include "gromacs/nbnxm/nbnxm_geometry.h"
50 #include "gromacs/simd/simd.h"
51 #include "gromacs/topology/ifunc.h"
52 #include "gromacs/topology/topology.h"
53 #include "gromacs/utility/fatalerror.h"
55 /* Computational cost of bonded, non-bonded and PME calculations.
56 * This will be machine dependent.
57 * The numbers are only used for estimating the relative cost of PME vs PP,
58 * so only relative numbers matter.
59 * The numbers here are accurate cycle counts for Haswell in single precision
60 * compiled with gcc5.2. A correction factor for other architectures is given
61 * by simd_cycle_factor().
62 * In double precision PME mesh is slightly cheaper, although not so much
63 * that the numbers need to be adjusted.
66 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
67 static const double c_nbnxn_lj = 2.5;
68 static const double c_nbnxn_qrf_lj = 2.9;
69 static const double c_nbnxn_qrf = 2.4;
70 static const double c_nbnxn_qexp_lj = 4.2;
71 static const double c_nbnxn_qexp = 3.8;
72 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
73 static const double c_nbnxn_ljexp_add = 1.0;
75 /* Cost of the different components of PME. */
76 /* Cost of particle reordering and redistribution (no SIMD correction).
77 * This will be zero without MPI and can be very high with load imbalance.
78 * Thus we use an approximate value for medium parallelization.
80 static const double c_pme_redist = 100.0;
81 /* Cost of q spreading and force interpolation per charge. This part almost
82 * doesn't accelerate with SIMD, so we don't use SIMD correction.
84 static const double c_pme_spread = 5.0;
85 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
86 * Without MPI the number is 2-3, depending on grid factors and thread count.
87 * We take the high limit to be on the safe side and account for some MPI
88 * communication cost, which will dominate at high parallelization.
90 static const double c_pme_fft = 3.0;
91 /* Cost of pme_solve, will be multiplied with N */
92 static const double c_pme_solve = 9.0;
94 /* Cost of a bonded interaction divided by the number of distances calculations
95 * required in one interaction. The actual cost is nearly propotional to this.
97 static const double c_bond = 25.0;
100 #if GMX_SIMD_HAVE_REAL
101 static const gmx_bool bHaveSIMD = TRUE;
103 static const gmx_bool bHaveSIMD = FALSE;
106 /* Gives a correction factor for the currently compiled SIMD implementations
107 * versus the reference used for the coefficients above (8-wide SIMD with FMA).
108 * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
110 static double simd_cycle_factor(gmx_bool bUseSIMD)
112 /* The (average) ratio of the time taken by plain-C force calculations
113 * relative to SIMD versions, for the reference platform Haswell:
114 * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
115 * This factor is used for normalization in simd_cycle_factor().
117 const double simd_cycle_no_simd = 5.0;
120 #if GMX_SIMD_HAVE_REAL
123 /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
124 * The actual speed-up depends very much on gather+scatter overhead,
125 * which is different for different bonded and non-bonded kernels.
126 * As a rough, but actually not bad, approximation we use a sqrt
127 * dependence on the width which gives a factor 4 for width=8.
129 speedup = std::sqrt(2.0 * GMX_SIMD_REAL_WIDTH);
130 # if GMX_SIMD_HAVE_FMA
131 /* FMA tends to give a bit more speedup */
142 gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
144 /* No SIMD, no speedup */
148 /* Return speed compared to the reference (Haswell).
149 * For x86 SIMD, the nbnxn kernels are relatively much slower on
150 * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
151 * PME load estimate on SB/IB, which is erring on the safe side.
153 return simd_cycle_no_simd / speedup;
156 void count_bonded_distances(const gmx_mtop_t& mtop, const t_inputrec& ir, double* ndistance_c, double* ndistance_simd)
159 double nonsimd_step_frac;
161 double ndtot_c, ndtot_simd;
162 #if GMX_SIMD_HAVE_REAL
163 gmx_bool bSimdBondeds = TRUE;
165 gmx_bool bSimdBondeds = FALSE;
168 bExcl = (ir.cutoff_scheme == ecutsGROUP && inputrecExclForces(&ir) && !EEL_FULL(ir.coulombtype));
172 /* We only have SIMD versions of these bondeds without energy and
173 * without shift-forces, we take that into account here.
175 if (ir.nstcalcenergy > 0)
177 nonsimd_step_frac = 1.0 / ir.nstcalcenergy;
181 nonsimd_step_frac = 0;
183 if (ir.epc != epcNO && 1.0 / ir.nstpcouple > nonsimd_step_frac)
185 nonsimd_step_frac = 1.0 / ir.nstpcouple;
190 nonsimd_step_frac = 1;
193 /* Count the number of pbc_rvec_sub calls required for bonded interactions.
194 * This number is also roughly proportional to the computational cost.
198 for (const gmx_molblock_t& molb : mtop.molblock)
200 const gmx_moltype_t* molt = &mtop.moltype[molb.type];
201 for (ftype = 0; ftype < F_NRE; ftype++)
205 if (interaction_function[ftype].flags & IF_BOND)
207 double nd_c, nd_simd;
211 /* For all interactions, except for the three exceptions
212 * in the switch below, #distances = #atoms - 1.
217 case F_FBPOSRES: nd_c = 1; break;
218 case F_CONNBONDS: break;
219 /* These bonded potentially use SIMD */
224 nd_c = nonsimd_step_frac * (NRAL(ftype) - 1);
225 nd_simd = (1 - nonsimd_step_frac) * (NRAL(ftype) - 1);
227 default: nd_c = NRAL(ftype) - 1; break;
229 nbonds = molb.nmol * molt->ilist[ftype].size() / (1 + NRAL(ftype));
230 ndtot_c += nbonds * nd_c;
231 ndtot_simd += nbonds * nd_simd;
236 ndtot_c += molb.nmol * (molt->excls.numElements() - molt->atoms.nr) / 2.;
242 fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
245 if (ndistance_c != nullptr)
247 *ndistance_c = ndtot_c;
249 if (ndistance_simd != nullptr)
251 *ndistance_simd = ndtot_simd;
255 static void pp_verlet_load(const gmx_mtop_t& mtop,
256 const t_inputrec& ir,
261 gmx_bool* bChargePerturbed,
262 gmx_bool* bTypePerturbed)
264 int atnr, a, nqlj, nq, nlj;
267 double c_qlj, c_q, c_lj;
270 /* Conversion factor for reference vs SIMD kernel performance.
271 * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
274 const real nbnxn_refkernel_fac = 4.0;
276 const real nbnxn_refkernel_fac = 8.0;
279 bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == eelCUT);
281 gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
282 atnr = mtop.ffparams.atnr;
285 *bChargePerturbed = FALSE;
286 *bTypePerturbed = FALSE;
287 for (const gmx_molblock_t& molb : mtop.molblock)
289 const gmx_moltype_t* molt = &mtop.moltype[molb.type];
290 const t_atom* atom = molt->atoms.atom;
291 for (a = 0; a < molt->atoms.nr; a++)
293 if (atom[a].q != 0 || atom[a].qB != 0)
295 if (iparams[(atnr + 1) * atom[a].type].lj.c6 != 0
296 || iparams[(atnr + 1) * atom[a].type].lj.c12 != 0)
305 if (atom[a].q != atom[a].qB)
307 *bChargePerturbed = TRUE;
309 if (atom[a].type != atom[a].typeB)
311 *bTypePerturbed = TRUE;
316 nlj = mtop.natoms - nqlj - nq;
319 *nlj_tot = nqlj + nlj;
321 /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
322 * This choice should match the one of pick_nbnxn_kernel_cpu().
323 * TODO: Make this function use pick_nbnxn_kernel_cpu().
325 #if GMX_SIMD_HAVE_REAL \
326 && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
331 r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms / det(box));
333 /* The average number of pairs per atom */
334 nppa = 0.5 * 4 / 3 * M_PI * r_eff * r_eff * r_eff * mtop.natoms / det(box);
338 fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n", nqlj, nq,
339 nlj, ir.rlist, r_eff, nppa);
342 /* Determine the cost per pair interaction */
343 c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
344 c_q = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
346 if (ir.vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir.vdwtype))
348 c_qlj += c_nbnxn_ljexp_add;
349 c_lj += c_nbnxn_ljexp_add;
351 if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == eljpmeLB)
353 /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
354 c_qlj *= nbnxn_refkernel_fac;
355 c_q *= nbnxn_refkernel_fac;
356 c_lj *= nbnxn_refkernel_fac;
359 /* For the PP non-bonded cost it is (unrealistically) assumed
360 * that all atoms are distributed homogeneously in space.
362 *cost_pp = (nqlj * c_qlj + nq * c_q + nlj * c_lj) * nppa;
364 *cost_pp *= simd_cycle_factor(bHaveSIMD);
367 float pme_load_estimate(const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box)
370 gmx_bool bChargePerturbed, bTypePerturbed;
371 double ndistance_c, ndistance_simd;
372 double cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
375 /* Computational cost of bonded, non-bonded and PME calculations.
376 * This will be machine dependent.
377 * The numbers here are accurate for Intel Core2 and AMD Athlon 64
378 * in single precision. In double precision PME mesh is slightly cheaper,
379 * although not so much that the numbers need to be adjusted.
382 count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
383 /* C_BOND is the cost for bonded interactions with SIMD implementations,
384 * so we need to scale the number of bonded interactions for which there
385 * are only C implementations to the number of SIMD equivalents.
388 * (ndistance_c * simd_cycle_factor(FALSE) + ndistance_simd * simd_cycle_factor(bHaveSIMD));
390 pp_verlet_load(mtop, ir, box, &nq_tot, &nlj_tot, &cost_pp, &bChargePerturbed, &bTypePerturbed);
397 int gridNkzFactor = int{ (ir.nkz + 1) / 2 };
398 if (EEL_PME(ir.coulombtype))
400 double grid = ir.nkx * ir.nky * gridNkzFactor;
402 int f = ((ir.efep != efepNO && bChargePerturbed) ? 2 : 1);
403 cost_redist += c_pme_redist * nq_tot;
404 cost_spread += f * c_pme_spread * nq_tot * gmx::power3(ir.pme_order);
405 cost_fft += f * c_pme_fft * grid * std::log(grid) / std::log(2.0);
406 cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
409 if (EVDW_PME(ir.vdwtype))
411 double grid = ir.nkx * ir.nky * gridNkzFactor;
413 int f = ((ir.efep != efepNO && bTypePerturbed) ? 2 : 1);
414 if (ir.ljpme_combination_rule == eljpmeLB)
416 /* LB combination rule: we have 7 mesh terms */
419 cost_redist += c_pme_redist * nlj_tot;
420 cost_spread += f * c_pme_spread * nlj_tot * gmx::power3(ir.pme_order);
421 cost_fft += f * c_pme_fft * 2 * grid * std::log(grid) / std::log(2.0);
422 cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
425 cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
427 ratio = cost_pme / (cost_bond + cost_pp + cost_pme);
438 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
440 fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);