src/gromacs/mdlib/perf_est.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "perf_est.h"
  41
  42 #include <cmath>
  43
  44 #include "gromacs/math/functions.h"
  45 #include "gromacs/math/utilities.h"
  46 #include "gromacs/math/vec.h"
  47 #include "gromacs/mdtypes/commrec.h"
  48 #include "gromacs/mdtypes/inputrec.h"
  49 #include "gromacs/mdtypes/md_enums.h"
  50 #include "gromacs/nbnxm/nbnxm_geometry.h"
  51 #include "gromacs/simd/simd.h"
  52 #include "gromacs/topology/ifunc.h"
  53 #include "gromacs/topology/topology.h"
  54 #include "gromacs/utility/fatalerror.h"
  55
  56 /* Computational cost of bonded, non-bonded and PME calculations.
  57  * This will be machine dependent.
  58  * The numbers are only used for estimating the relative cost of PME vs PP,
  59  * so only relative numbers matter.
  60  * The numbers here are accurate cycle counts for Haswell in single precision
  61  * compiled with gcc5.2. A correction factor for other architectures is given
  62  * by simd_cycle_factor().
  63  * In double precision PME mesh is slightly cheaper, although not so much
  64  * that the numbers need to be adjusted.
  65  */
  66
  67 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  68 static const double c_nbnxn_lj      = 2.5;
  69 static const double c_nbnxn_qrf_lj  = 2.9;
  70 static const double c_nbnxn_qrf     = 2.4;
  71 static const double c_nbnxn_qexp_lj = 4.2;
  72 static const double c_nbnxn_qexp    = 3.8;
  73 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  74 static const double c_nbnxn_ljexp_add = 1.0;
  75
  76 /* Cost of the different components of PME. */
  77 /* Cost of particle reordering and redistribution (no SIMD correction).
  78  * This will be zero without MPI and can be very high with load imbalance.
  79  * Thus we use an approximate value for medium parallelization.
  80  */
  81 static const double c_pme_redist = 100.0;
  82 /* Cost of q spreading and force interpolation per charge. This part almost
  83  * doesn't accelerate with SIMD, so we don't use SIMD correction.
  84  */
  85 static const double c_pme_spread = 5.0;
  86 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
  87  * Without MPI the number is 2-3, depending on grid factors and thread count.
  88  * We take the high limit to be on the safe side and account for some MPI
  89  * communication cost, which will dominate at high parallelization.
  90  */
  91 static const double c_pme_fft = 3.0;
  92 /* Cost of pme_solve, will be multiplied with N */
  93 static const double c_pme_solve = 9.0;
  94
  95 /* Cost of a bonded interaction divided by the number of distances calculations
  96  * required in one interaction. The actual cost is nearly propotional to this.
  97  */
  98 static const double c_bond = 25.0;
  99
 100
 101 #if GMX_SIMD_HAVE_REAL
 102 static const gmx_bool bHaveSIMD = TRUE;
 103 #else
 104 static const gmx_bool bHaveSIMD = FALSE;
 105 #endif
 106
 107 /* Gives a correction factor for the currently compiled SIMD implementations
 108  * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 109  * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
 110  */
 111 static double simd_cycle_factor(gmx_bool bUseSIMD)
 112 {
 113     /* The (average) ratio of the time taken by plain-C force calculations
 114      * relative to SIMD versions, for the reference platform Haswell:
 115      * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
 116      * This factor is used for normalization in simd_cycle_factor().
 117      */
 118     const double simd_cycle_no_simd = 5.0;
 119     double       speedup;
 120
 121 #if GMX_SIMD_HAVE_REAL
 122     if (bUseSIMD)
 123     {
 124         /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
 125          * The actual speed-up depends very much on gather+scatter overhead,
 126          * which is different for different bonded and non-bonded kernels.
 127          * As a rough, but actually not bad, approximation we use a sqrt
 128          * dependence on the width which gives a factor 4 for width=8.
 129          */
 130         speedup = std::sqrt(2.0 * GMX_SIMD_REAL_WIDTH);
 131 #    if GMX_SIMD_HAVE_FMA
 132         /* FMA tends to give a bit more speedup */
 133         speedup *= 1.25;
 134 #    endif
 135     }
 136     else
 137     {
 138         speedup = 1.0;
 139     }
 140 #else
 141     if (bUseSIMD)
 142     {
 143         gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
 144     }
 145     /* No SIMD, no speedup */
 146     speedup                        = 1.0;
 147 #endif
 148
 149     /* Return speed compared to the reference (Haswell).
 150      * For x86 SIMD, the nbnxn kernels are relatively much slower on
 151      * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
 152      * PME load estimate on SB/IB, which is erring on the safe side.
 153      */
 154     return simd_cycle_no_simd / speedup;
 155 }
 156
 157 void count_bonded_distances(const gmx_mtop_t& mtop, const t_inputrec& ir, double* ndistance_c, double* ndistance_simd)
 158 {
 159     gmx_bool bExcl;
 160     double   nonsimd_step_frac;
 161     int      ftype;
 162     double   ndtot_c, ndtot_simd;
 163 #if GMX_SIMD_HAVE_REAL
 164     gmx_bool bSimdBondeds = TRUE;
 165 #else
 166     gmx_bool   bSimdBondeds        = FALSE;
 167 #endif
 168
 169     bExcl = (ir.cutoff_scheme == ecutsGROUP && inputrecExclForces(&ir) && !EEL_FULL(ir.coulombtype));
 170
 171     if (bSimdBondeds)
 172     {
 173         /* We only have SIMD versions of these bondeds without energy and
 174          * without shift-forces, we take that into account here.
 175          */
 176         if (ir.nstcalcenergy > 0)
 177         {
 178             nonsimd_step_frac = 1.0 / ir.nstcalcenergy;
 179         }
 180         else
 181         {
 182             nonsimd_step_frac = 0;
 183         }
 184         if (ir.epc != PressureCoupling::No && 1.0 / ir.nstpcouple > nonsimd_step_frac)
 185         {
 186             nonsimd_step_frac = 1.0 / ir.nstpcouple;
 187         }
 188     }
 189     else
 190     {
 191         nonsimd_step_frac = 1;
 192     }
 193
 194     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 195      * This number is also roughly proportional to the computational cost.
 196      */
 197     ndtot_c    = 0;
 198     ndtot_simd = 0;
 199     for (const gmx_molblock_t& molb : mtop.molblock)
 200     {
 201         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 202         for (ftype = 0; ftype < F_NRE; ftype++)
 203         {
 204             int nbonds;
 205
 206             if (interaction_function[ftype].flags & IF_BOND)
 207             {
 208                 double nd_c, nd_simd;
 209
 210                 nd_c    = 0;
 211                 nd_simd = 0;
 212                 /* For all interactions, except for the three exceptions
 213                  * in the switch below, #distances = #atoms - 1.
 214                  */
 215                 switch (ftype)
 216                 {
 217                     case F_POSRES:
 218                     case F_FBPOSRES: nd_c = 1; break;
 219                     case F_CONNBONDS: break;
 220                     /* These bonded potentially use SIMD */
 221                     case F_ANGLES:
 222                     case F_PDIHS:
 223                     case F_RBDIHS:
 224                     case F_LJ14:
 225                         nd_c    = nonsimd_step_frac * (NRAL(ftype) - 1);
 226                         nd_simd = (1 - nonsimd_step_frac) * (NRAL(ftype) - 1);
 227                         break;
 228                     default: nd_c = NRAL(ftype) - 1; break;
 229                 }
 230                 nbonds = molb.nmol * molt->ilist[ftype].size() / (1 + NRAL(ftype));
 231                 ndtot_c += nbonds * nd_c;
 232                 ndtot_simd += nbonds * nd_simd;
 233             }
 234         }
 235         if (bExcl)
 236         {
 237             ndtot_c += molb.nmol * (molt->excls.numElements() - molt->atoms.nr) / 2.;
 238         }
 239     }
 240
 241     if (debug)
 242     {
 243         fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
 244     }
 245
 246     if (ndistance_c != nullptr)
 247     {
 248         *ndistance_c = ndtot_c;
 249     }
 250     if (ndistance_simd != nullptr)
 251     {
 252         *ndistance_simd = ndtot_simd;
 253     }
 254 }
 255
 256 static void pp_verlet_load(const gmx_mtop_t& mtop,
 257                            const t_inputrec& ir,
 258                            const matrix      box,
 259                            int*              nq_tot,
 260                            int*              nlj_tot,
 261                            double*           cost_pp,
 262                            gmx_bool*         bChargePerturbed,
 263                            gmx_bool*         bTypePerturbed)
 264 {
 265     int      atnr, a, nqlj, nq, nlj;
 266     gmx_bool bQRF;
 267     real     r_eff;
 268     double   c_qlj, c_q, c_lj;
 269     double   nppa;
 270     int      j_cluster_size;
 271     /* Conversion factor for reference vs SIMD kernel performance.
 272      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 273      */
 274 #if GMX_DOUBLE
 275     const real nbnxn_refkernel_fac = 4.0;
 276 #else
 277     const real nbnxn_refkernel_fac = 8.0;
 278 #endif
 279
 280     bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == eelCUT);
 281
 282     gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
 283     atnr                                   = mtop.ffparams.atnr;
 284     nqlj                                   = 0;
 285     nq                                     = 0;
 286     *bChargePerturbed                      = FALSE;
 287     *bTypePerturbed                        = FALSE;
 288     for (const gmx_molblock_t& molb : mtop.molblock)
 289     {
 290         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 291         const t_atom*        atom = molt->atoms.atom;
 292         for (a = 0; a < molt->atoms.nr; a++)
 293         {
 294             if (atom[a].q != 0 || atom[a].qB != 0)
 295             {
 296                 if (iparams[(atnr + 1) * atom[a].type].lj.c6 != 0
 297                     || iparams[(atnr + 1) * atom[a].type].lj.c12 != 0)
 298                 {
 299                     nqlj += molb.nmol;
 300                 }
 301                 else
 302                 {
 303                     nq += molb.nmol;
 304                 }
 305             }
 306             if (atom[a].q != atom[a].qB)
 307             {
 308                 *bChargePerturbed = TRUE;
 309             }
 310             if (atom[a].type != atom[a].typeB)
 311             {
 312                 *bTypePerturbed = TRUE;
 313             }
 314         }
 315     }
 316
 317     nlj = mtop.natoms - nqlj - nq;
 318
 319     *nq_tot  = nqlj + nq;
 320     *nlj_tot = nqlj + nlj;
 321
 322     /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
 323      * This choice should match the one of pick_nbnxn_kernel_cpu().
 324      * TODO: Make this function use pick_nbnxn_kernel_cpu().
 325      */
 326 #if GMX_SIMD_HAVE_REAL \
 327         && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
 328     j_cluster_size = 8;
 329 #else
 330     j_cluster_size                 = 4;
 331 #endif
 332     r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms / det(box));
 333
 334     /* The average number of pairs per atom */
 335     nppa = 0.5 * 4 / 3 * M_PI * r_eff * r_eff * r_eff * mtop.natoms / det(box);
 336
 337     if (debug)
 338     {
 339         fprintf(debug,
 340                 "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
 341                 nqlj,
 342                 nq,
 343                 nlj,
 344                 ir.rlist,
 345                 r_eff,
 346                 nppa);
 347     }
 348
 349     /* Determine the cost per pair interaction */
 350     c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
 351     c_q   = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
 352     c_lj  = c_nbnxn_lj;
 353     if (ir.vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir.vdwtype))
 354     {
 355         c_qlj += c_nbnxn_ljexp_add;
 356         c_lj += c_nbnxn_ljexp_add;
 357     }
 358     if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == eljpmeLB)
 359     {
 360         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 361         c_qlj *= nbnxn_refkernel_fac;
 362         c_q *= nbnxn_refkernel_fac;
 363         c_lj *= nbnxn_refkernel_fac;
 364     }
 365
 366     /* For the PP non-bonded cost it is (unrealistically) assumed
 367      * that all atoms are distributed homogeneously in space.
 368      */
 369     *cost_pp = (nqlj * c_qlj + nq * c_q + nlj * c_lj) * nppa;
 370
 371     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 372 }
 373
 374 float pme_load_estimate(const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box)
 375 {
 376     int      nq_tot, nlj_tot;
 377     gmx_bool bChargePerturbed, bTypePerturbed;
 378     double   ndistance_c, ndistance_simd;
 379     double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 380     float    ratio;
 381
 382     /* Computational cost of bonded, non-bonded and PME calculations.
 383      * This will be machine dependent.
 384      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 385      * in single precision. In double precision PME mesh is slightly cheaper,
 386      * although not so much that the numbers need to be adjusted.
 387      */
 388
 389     count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
 390     /* C_BOND is the cost for bonded interactions with SIMD implementations,
 391      * so we need to scale the number of bonded interactions for which there
 392      * are only C implementations to the number of SIMD equivalents.
 393      */
 394     cost_bond = c_bond
 395                 * (ndistance_c * simd_cycle_factor(FALSE) + ndistance_simd * simd_cycle_factor(bHaveSIMD));
 396
 397     pp_verlet_load(mtop, ir, box, &nq_tot, &nlj_tot, &cost_pp, &bChargePerturbed, &bTypePerturbed);
 398
 399     cost_redist = 0;
 400     cost_spread = 0;
 401     cost_fft    = 0;
 402     cost_solve  = 0;
 403
 404     int gridNkzFactor = int{ (ir.nkz + 1) / 2 };
 405     if (EEL_PME(ir.coulombtype))
 406     {
 407         double grid = ir.nkx * ir.nky * gridNkzFactor;
 408
 409         int f = ((ir.efep != efepNO && bChargePerturbed) ? 2 : 1);
 410         cost_redist += c_pme_redist * nq_tot;
 411         cost_spread += f * c_pme_spread * nq_tot * gmx::power3(ir.pme_order);
 412         cost_fft += f * c_pme_fft * grid * std::log(grid) / std::log(2.0);
 413         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 414     }
 415
 416     if (EVDW_PME(ir.vdwtype))
 417     {
 418         double grid = ir.nkx * ir.nky * gridNkzFactor;
 419
 420         int f = ((ir.efep != efepNO && bTypePerturbed) ? 2 : 1);
 421         if (ir.ljpme_combination_rule == eljpmeLB)
 422         {
 423             /* LB combination rule: we have 7 mesh terms */
 424             f *= 7;
 425         }
 426         cost_redist += c_pme_redist * nlj_tot;
 427         cost_spread += f * c_pme_spread * nlj_tot * gmx::power3(ir.pme_order);
 428         cost_fft += f * c_pme_fft * 2 * grid * std::log(grid) / std::log(2.0);
 429         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 430     }
 431
 432     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 433
 434     ratio = cost_pme / (cost_bond + cost_pp + cost_pme);
 435
 436     if (debug)
 437     {
 438         fprintf(debug,
 439                 "cost_bond   %f\n"
 440                 "cost_pp     %f\n"
 441                 "cost_redist %f\n"
 442                 "cost_spread %f\n"
 443                 "cost_fft    %f\n"
 444                 "cost_solve  %f\n",
 445                 cost_bond,
 446                 cost_pp,
 447                 cost_redist,
 448                 cost_spread,
 449                 cost_fft,
 450                 cost_solve);
 451
 452         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 453     }
 454
 455     return ratio;
 456 }