src/gromacs/mdlib/perf_est.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "perf_est.h"
  41
  42 #include <cmath>
  43
  44 #include "gromacs/math/functions.h"
  45 #include "gromacs/math/units.h"
  46 #include "gromacs/math/utilities.h"
  47 #include "gromacs/math/vec.h"
  48 #include "gromacs/mdtypes/commrec.h"
  49 #include "gromacs/mdtypes/inputrec.h"
  50 #include "gromacs/mdtypes/md_enums.h"
  51 #include "gromacs/nbnxm/nbnxm_geometry.h"
  52 #include "gromacs/simd/simd.h"
  53 #include "gromacs/topology/ifunc.h"
  54 #include "gromacs/topology/topology.h"
  55 #include "gromacs/utility/fatalerror.h"
  56
  57 /* Computational cost of bonded, non-bonded and PME calculations.
  58  * This will be machine dependent.
  59  * The numbers are only used for estimating the relative cost of PME vs PP,
  60  * so only relative numbers matter.
  61  * The numbers here are accurate cycle counts for Haswell in single precision
  62  * compiled with gcc5.2. A correction factor for other architectures is given
  63  * by simd_cycle_factor().
  64  * In double precision PME mesh is slightly cheaper, although not so much
  65  * that the numbers need to be adjusted.
  66  */
  67
  68 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  69 static const double c_nbnxn_lj      = 2.5;
  70 static const double c_nbnxn_qrf_lj  = 2.9;
  71 static const double c_nbnxn_qrf     = 2.4;
  72 static const double c_nbnxn_qexp_lj = 4.2;
  73 static const double c_nbnxn_qexp    = 3.8;
  74 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  75 static const double c_nbnxn_ljexp_add = 1.0;
  76
  77 /* Cost of the different components of PME. */
  78 /* Cost of particle reordering and redistribution (no SIMD correction).
  79  * This will be zero without MPI and can be very high with load imbalance.
  80  * Thus we use an approximate value for medium parallelization.
  81  */
  82 static const double c_pme_redist = 100.0;
  83 /* Cost of q spreading and force interpolation per charge. This part almost
  84  * doesn't accelerate with SIMD, so we don't use SIMD correction.
  85  */
  86 static const double c_pme_spread = 5.0;
  87 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
  88  * Without MPI the number is 2-3, depending on grid factors and thread count.
  89  * We take the high limit to be on the safe side and account for some MPI
  90  * communication cost, which will dominate at high parallelization.
  91  */
  92 static const double c_pme_fft = 3.0;
  93 /* Cost of pme_solve, will be multiplied with N */
  94 static const double c_pme_solve = 9.0;
  95
  96 /* Cost of a bonded interaction divided by the number of distances calculations
  97  * required in one interaction. The actual cost is nearly propotional to this.
  98  */
  99 static const double c_bond = 25.0;
 100
 101
 102 #if GMX_SIMD_HAVE_REAL
 103 static const gmx_bool bHaveSIMD = TRUE;
 104 #else
 105 static const gmx_bool bHaveSIMD = FALSE;
 106 #endif
 107
 108 /* Gives a correction factor for the currently compiled SIMD implementations
 109  * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 110  * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
 111  */
 112 static double simd_cycle_factor(gmx_bool bUseSIMD)
 113 {
 114     /* The (average) ratio of the time taken by plain-C force calculations
 115      * relative to SIMD versions, for the reference platform Haswell:
 116      * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
 117      * This factor is used for normalization in simd_cycle_factor().
 118      */
 119     const double simd_cycle_no_simd = 5.0;
 120     double       speedup;
 121
 122 #if GMX_SIMD_HAVE_REAL
 123     if (bUseSIMD)
 124     {
 125         /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
 126          * The actual speed-up depends very much on gather+scatter overhead,
 127          * which is different for different bonded and non-bonded kernels.
 128          * As a rough, but actually not bad, approximation we use a sqrt
 129          * dependence on the width which gives a factor 4 for width=8.
 130          */
 131         speedup = std::sqrt(2.0 * GMX_SIMD_REAL_WIDTH);
 132 #    if GMX_SIMD_HAVE_FMA
 133         /* FMA tends to give a bit more speedup */
 134         speedup *= 1.25;
 135 #    endif
 136     }
 137     else
 138     {
 139         speedup = 1.0;
 140     }
 141 #else
 142     if (bUseSIMD)
 143     {
 144         gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
 145     }
 146     /* No SIMD, no speedup */
 147     speedup                        = 1.0;
 148 #endif
 149
 150     /* Return speed compared to the reference (Haswell).
 151      * For x86 SIMD, the nbnxn kernels are relatively much slower on
 152      * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
 153      * PME load estimate on SB/IB, which is erring on the safe side.
 154      */
 155     return simd_cycle_no_simd / speedup;
 156 }
 157
 158 void count_bonded_distances(const gmx_mtop_t& mtop, const t_inputrec& ir, double* ndistance_c, double* ndistance_simd)
 159 {
 160     gmx_bool bExcl;
 161     double   nonsimd_step_frac;
 162     int      ftype;
 163     double   ndtot_c, ndtot_simd;
 164 #if GMX_SIMD_HAVE_REAL
 165     gmx_bool bSimdBondeds = TRUE;
 166 #else
 167     gmx_bool   bSimdBondeds        = FALSE;
 168 #endif
 169
 170     bExcl = (ir.cutoff_scheme == CutoffScheme::Group && inputrecExclForces(&ir)
 171              && !EEL_FULL(ir.coulombtype));
 172
 173     if (bSimdBondeds)
 174     {
 175         /* We only have SIMD versions of these bondeds without energy and
 176          * without shift-forces, we take that into account here.
 177          */
 178         if (ir.nstcalcenergy > 0)
 179         {
 180             nonsimd_step_frac = 1.0 / ir.nstcalcenergy;
 181         }
 182         else
 183         {
 184             nonsimd_step_frac = 0;
 185         }
 186         if (ir.epc != PressureCoupling::No && 1.0 / ir.nstpcouple > nonsimd_step_frac)
 187         {
 188             nonsimd_step_frac = 1.0 / ir.nstpcouple;
 189         }
 190     }
 191     else
 192     {
 193         nonsimd_step_frac = 1;
 194     }
 195
 196     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 197      * This number is also roughly proportional to the computational cost.
 198      */
 199     ndtot_c    = 0;
 200     ndtot_simd = 0;
 201     for (const gmx_molblock_t& molb : mtop.molblock)
 202     {
 203         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 204         for (ftype = 0; ftype < F_NRE; ftype++)
 205         {
 206             int nbonds;
 207
 208             if (interaction_function[ftype].flags & IF_BOND)
 209             {
 210                 double nd_c, nd_simd;
 211
 212                 nd_c    = 0;
 213                 nd_simd = 0;
 214                 /* For all interactions, except for the three exceptions
 215                  * in the switch below, #distances = #atoms - 1.
 216                  */
 217                 switch (ftype)
 218                 {
 219                     case F_POSRES:
 220                     case F_FBPOSRES: nd_c = 1; break;
 221                     case F_CONNBONDS: break;
 222                     /* These bonded potentially use SIMD */
 223                     case F_ANGLES:
 224                     case F_PDIHS:
 225                     case F_RBDIHS:
 226                     case F_LJ14:
 227                         nd_c    = nonsimd_step_frac * (NRAL(ftype) - 1);
 228                         nd_simd = (1 - nonsimd_step_frac) * (NRAL(ftype) - 1);
 229                         break;
 230                     default: nd_c = NRAL(ftype) - 1; break;
 231                 }
 232                 nbonds = molb.nmol * molt->ilist[ftype].size() / (1 + NRAL(ftype));
 233                 ndtot_c += nbonds * nd_c;
 234                 ndtot_simd += nbonds * nd_simd;
 235             }
 236         }
 237         if (bExcl)
 238         {
 239             ndtot_c += molb.nmol * (molt->excls.numElements() - molt->atoms.nr) / 2.;
 240         }
 241     }
 242
 243     if (debug)
 244     {
 245         fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
 246     }
 247
 248     if (ndistance_c != nullptr)
 249     {
 250         *ndistance_c = ndtot_c;
 251     }
 252     if (ndistance_simd != nullptr)
 253     {
 254         *ndistance_simd = ndtot_simd;
 255     }
 256 }
 257
 258 static void pp_verlet_load(const gmx_mtop_t& mtop,
 259                            const t_inputrec& ir,
 260                            const matrix      box,
 261                            int*              nq_tot,
 262                            int*              nlj_tot,
 263                            double*           cost_pp,
 264                            gmx_bool*         bChargePerturbed,
 265                            gmx_bool*         bTypePerturbed)
 266 {
 267     int      atnr, a, nqlj, nq, nlj;
 268     gmx_bool bQRF;
 269     real     r_eff;
 270     double   c_qlj, c_q, c_lj;
 271     double   nppa;
 272     int      j_cluster_size;
 273     /* Conversion factor for reference vs SIMD kernel performance.
 274      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 275      */
 276 #if GMX_DOUBLE
 277     const real nbnxn_refkernel_fac = 4.0;
 278 #else
 279     const real nbnxn_refkernel_fac = 8.0;
 280 #endif
 281
 282     bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == CoulombInteractionType::Cut);
 283
 284     gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
 285     atnr                                   = mtop.ffparams.atnr;
 286     nqlj                                   = 0;
 287     nq                                     = 0;
 288     *bChargePerturbed                      = FALSE;
 289     *bTypePerturbed                        = FALSE;
 290     for (const gmx_molblock_t& molb : mtop.molblock)
 291     {
 292         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 293         const t_atom*        atom = molt->atoms.atom;
 294         for (a = 0; a < molt->atoms.nr; a++)
 295         {
 296             if (atom[a].q != 0 || atom[a].qB != 0)
 297             {
 298                 if (iparams[(atnr + 1) * atom[a].type].lj.c6 != 0
 299                     || iparams[(atnr + 1) * atom[a].type].lj.c12 != 0)
 300                 {
 301                     nqlj += molb.nmol;
 302                 }
 303                 else
 304                 {
 305                     nq += molb.nmol;
 306                 }
 307             }
 308             if (atom[a].q != atom[a].qB)
 309             {
 310                 *bChargePerturbed = TRUE;
 311             }
 312             if (atom[a].type != atom[a].typeB)
 313             {
 314                 *bTypePerturbed = TRUE;
 315             }
 316         }
 317     }
 318
 319     nlj = mtop.natoms - nqlj - nq;
 320
 321     *nq_tot  = nqlj + nq;
 322     *nlj_tot = nqlj + nlj;
 323
 324     /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
 325      * This choice should match the one of pick_nbnxn_kernel_cpu().
 326      * TODO: Make this function use pick_nbnxn_kernel_cpu().
 327      */
 328 #if GMX_SIMD_HAVE_REAL \
 329         && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
 330     j_cluster_size = 8;
 331 #else
 332     j_cluster_size                 = 4;
 333 #endif
 334     r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms / det(box));
 335
 336     /* The average number of pairs per atom */
 337     nppa = 0.5 * 4 / 3 * M_PI * r_eff * r_eff * r_eff * mtop.natoms / det(box);
 338
 339     if (debug)
 340     {
 341         fprintf(debug,
 342                 "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
 343                 nqlj,
 344                 nq,
 345                 nlj,
 346                 ir.rlist,
 347                 r_eff,
 348                 nppa);
 349     }
 350
 351     /* Determine the cost per pair interaction */
 352     c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
 353     c_q   = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
 354     c_lj  = c_nbnxn_lj;
 355     if (ir.vdw_modifier == InteractionModifiers::PotSwitch || EVDW_PME(ir.vdwtype))
 356     {
 357         c_qlj += c_nbnxn_ljexp_add;
 358         c_lj += c_nbnxn_ljexp_add;
 359     }
 360     if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == LongRangeVdW::LB)
 361     {
 362         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 363         c_qlj *= nbnxn_refkernel_fac;
 364         c_q *= nbnxn_refkernel_fac;
 365         c_lj *= nbnxn_refkernel_fac;
 366     }
 367
 368     /* For the PP non-bonded cost it is (unrealistically) assumed
 369      * that all atoms are distributed homogeneously in space.
 370      */
 371     *cost_pp = (nqlj * c_qlj + nq * c_q + nlj * c_lj) * nppa;
 372
 373     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 374 }
 375
 376 float pme_load_estimate(const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box)
 377 {
 378     int      nq_tot, nlj_tot;
 379     gmx_bool bChargePerturbed, bTypePerturbed;
 380     double   ndistance_c, ndistance_simd;
 381     double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 382     float    ratio;
 383
 384     /* Computational cost of bonded, non-bonded and PME calculations.
 385      * This will be machine dependent.
 386      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 387      * in single precision. In double precision PME mesh is slightly cheaper,
 388      * although not so much that the numbers need to be adjusted.
 389      */
 390
 391     count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
 392     /* C_BOND is the cost for bonded interactions with SIMD implementations,
 393      * so we need to scale the number of bonded interactions for which there
 394      * are only C implementations to the number of SIMD equivalents.
 395      */
 396     cost_bond = c_bond
 397                 * (ndistance_c * simd_cycle_factor(FALSE) + ndistance_simd * simd_cycle_factor(bHaveSIMD));
 398
 399     pp_verlet_load(mtop, ir, box, &nq_tot, &nlj_tot, &cost_pp, &bChargePerturbed, &bTypePerturbed);
 400
 401     cost_redist = 0;
 402     cost_spread = 0;
 403     cost_fft    = 0;
 404     cost_solve  = 0;
 405
 406     int gridNkzFactor = int{ (ir.nkz + 1) / 2 };
 407     if (EEL_PME(ir.coulombtype))
 408     {
 409         double grid = ir.nkx * ir.nky * gridNkzFactor;
 410
 411         int f = ((ir.efep != FreeEnergyPerturbationType::No && bChargePerturbed) ? 2 : 1);
 412         cost_redist += c_pme_redist * nq_tot;
 413         cost_spread += f * c_pme_spread * nq_tot * gmx::power3(ir.pme_order);
 414         cost_fft += f * c_pme_fft * grid * std::log(grid) / std::log(2.0);
 415         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 416     }
 417
 418     if (EVDW_PME(ir.vdwtype))
 419     {
 420         double grid = ir.nkx * ir.nky * gridNkzFactor;
 421
 422         int f = ((ir.efep != FreeEnergyPerturbationType::No && bTypePerturbed) ? 2 : 1);
 423         if (ir.ljpme_combination_rule == LongRangeVdW::LB)
 424         {
 425             /* LB combination rule: we have 7 mesh terms */
 426             f *= 7;
 427         }
 428         cost_redist += c_pme_redist * nlj_tot;
 429         cost_spread += f * c_pme_spread * nlj_tot * gmx::power3(ir.pme_order);
 430         cost_fft += f * c_pme_fft * 2 * grid * std::log(grid) / std::log(2.0);
 431         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 432     }
 433
 434     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 435
 436     ratio = cost_pme / (cost_bond + cost_pp + cost_pme);
 437
 438     if (debug)
 439     {
 440         fprintf(debug,
 441                 "cost_bond   %f\n"
 442                 "cost_pp     %f\n"
 443                 "cost_redist %f\n"
 444                 "cost_spread %f\n"
 445                 "cost_fft    %f\n"
 446                 "cost_solve  %f\n",
 447                 cost_bond,
 448                 cost_pp,
 449                 cost_redist,
 450                 cost_spread,
 451                 cost_fft,
 452                 cost_solve);
 453
 454         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 455     }
 456
 457     return ratio;
 458 }