src/gromacs/mdlib/perf_est.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "perf_est.h"
  41
  42 #include <cmath>
  43
  44 #include "gromacs/math/functions.h"
  45 #include "gromacs/math/utilities.h"
  46 #include "gromacs/math/vec.h"
  47 #include "gromacs/mdtypes/commrec.h"
  48 #include "gromacs/mdtypes/inputrec.h"
  49 #include "gromacs/mdtypes/md_enums.h"
  50 #include "gromacs/nbnxm/nbnxm_geometry.h"
  51 #include "gromacs/simd/simd.h"
  52 #include "gromacs/topology/ifunc.h"
  53 #include "gromacs/topology/topology.h"
  54 #include "gromacs/utility/fatalerror.h"
  55
  56 /* Computational cost of bonded, non-bonded and PME calculations.
  57  * This will be machine dependent.
  58  * The numbers are only used for estimating the relative cost of PME vs PP,
  59  * so only relative numbers matter.
  60  * The numbers here are accurate cycle counts for Haswell in single precision
  61  * compiled with gcc5.2. A correction factor for other architectures is given
  62  * by simd_cycle_factor().
  63  * In double precision PME mesh is slightly cheaper, although not so much
  64  * that the numbers need to be adjusted.
  65  */
  66
  67 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  68 static const double c_nbnxn_lj      = 2.5;
  69 static const double c_nbnxn_qrf_lj  = 2.9;
  70 static const double c_nbnxn_qrf     = 2.4;
  71 static const double c_nbnxn_qexp_lj = 4.2;
  72 static const double c_nbnxn_qexp    = 3.8;
  73 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  74 static const double c_nbnxn_ljexp_add = 1.0;
  75
  76 /* Cost of the different components of PME. */
  77 /* Cost of particle reordering and redistribution (no SIMD correction).
  78  * This will be zero without MPI and can be very high with load imbalance.
  79  * Thus we use an approximate value for medium parallelization.
  80  */
  81 static const double c_pme_redist = 100.0;
  82 /* Cost of q spreading and force interpolation per charge. This part almost
  83  * doesn't accelerate with SIMD, so we don't use SIMD correction.
  84  */
  85 static const double c_pme_spread = 5.0;
  86 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
  87  * Without MPI the number is 2-3, depending on grid factors and thread count.
  88  * We take the high limit to be on the safe side and account for some MPI
  89  * communication cost, which will dominate at high parallelization.
  90  */
  91 static const double c_pme_fft = 3.0;
  92 /* Cost of pme_solve, will be multiplied with N */
  93 static const double c_pme_solve = 9.0;
  94
  95 /* Cost of a bonded interaction divided by the number of distances calculations
  96  * required in one interaction. The actual cost is nearly propotional to this.
  97  */
  98 static const double c_bond = 25.0;
  99
 100
 101 #if GMX_SIMD_HAVE_REAL
 102 static const gmx_bool bHaveSIMD = TRUE;
 103 #else
 104 static const gmx_bool bHaveSIMD = FALSE;
 105 #endif
 106
 107 /* Gives a correction factor for the currently compiled SIMD implementations
 108  * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 109  * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
 110  */
 111 static double simd_cycle_factor(gmx_bool bUseSIMD)
 112 {
 113     /* The (average) ratio of the time taken by plain-C force calculations
 114      * relative to SIMD versions, for the reference platform Haswell:
 115      * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
 116      * This factor is used for normalization in simd_cycle_factor().
 117      */
 118     const double simd_cycle_no_simd = 5.0;
 119     double       speedup;
 120
 121 #if GMX_SIMD_HAVE_REAL
 122     if (bUseSIMD)
 123     {
 124         /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
 125          * The actual speed-up depends very much on gather+scatter overhead,
 126          * which is different for different bonded and non-bonded kernels.
 127          * As a rough, but actually not bad, approximation we use a sqrt
 128          * dependence on the width which gives a factor 4 for width=8.
 129          */
 130         speedup = std::sqrt(2.0 * GMX_SIMD_REAL_WIDTH);
 131 #    if GMX_SIMD_HAVE_FMA
 132         /* FMA tends to give a bit more speedup */
 133         speedup *= 1.25;
 134 #    endif
 135     }
 136     else
 137     {
 138         speedup = 1.0;
 139     }
 140 #else
 141     if (bUseSIMD)
 142     {
 143         gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
 144     }
 145     /* No SIMD, no speedup */
 146     speedup                        = 1.0;
 147 #endif
 148
 149     /* Return speed compared to the reference (Haswell).
 150      * For x86 SIMD, the nbnxn kernels are relatively much slower on
 151      * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
 152      * PME load estimate on SB/IB, which is erring on the safe side.
 153      */
 154     return simd_cycle_no_simd / speedup;
 155 }
 156
 157 void count_bonded_distances(const gmx_mtop_t& mtop, const t_inputrec& ir, double* ndistance_c, double* ndistance_simd)
 158 {
 159     gmx_bool bExcl;
 160     double   nonsimd_step_frac;
 161     int      ftype;
 162     double   ndtot_c, ndtot_simd;
 163 #if GMX_SIMD_HAVE_REAL
 164     gmx_bool bSimdBondeds = TRUE;
 165 #else
 166     gmx_bool   bSimdBondeds        = FALSE;
 167 #endif
 168
 169     bExcl = (ir.cutoff_scheme == CutoffScheme::Group && inputrecExclForces(&ir)
 170              && !EEL_FULL(ir.coulombtype));
 171
 172     if (bSimdBondeds)
 173     {
 174         /* We only have SIMD versions of these bondeds without energy and
 175          * without shift-forces, we take that into account here.
 176          */
 177         if (ir.nstcalcenergy > 0)
 178         {
 179             nonsimd_step_frac = 1.0 / ir.nstcalcenergy;
 180         }
 181         else
 182         {
 183             nonsimd_step_frac = 0;
 184         }
 185         if (ir.epc != PressureCoupling::No && 1.0 / ir.nstpcouple > nonsimd_step_frac)
 186         {
 187             nonsimd_step_frac = 1.0 / ir.nstpcouple;
 188         }
 189     }
 190     else
 191     {
 192         nonsimd_step_frac = 1;
 193     }
 194
 195     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 196      * This number is also roughly proportional to the computational cost.
 197      */
 198     ndtot_c    = 0;
 199     ndtot_simd = 0;
 200     for (const gmx_molblock_t& molb : mtop.molblock)
 201     {
 202         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 203         for (ftype = 0; ftype < F_NRE; ftype++)
 204         {
 205             int nbonds;
 206
 207             if (interaction_function[ftype].flags & IF_BOND)
 208             {
 209                 double nd_c, nd_simd;
 210
 211                 nd_c    = 0;
 212                 nd_simd = 0;
 213                 /* For all interactions, except for the three exceptions
 214                  * in the switch below, #distances = #atoms - 1.
 215                  */
 216                 switch (ftype)
 217                 {
 218                     case F_POSRES:
 219                     case F_FBPOSRES: nd_c = 1; break;
 220                     case F_CONNBONDS: break;
 221                     /* These bonded potentially use SIMD */
 222                     case F_ANGLES:
 223                     case F_PDIHS:
 224                     case F_RBDIHS:
 225                     case F_LJ14:
 226                         nd_c    = nonsimd_step_frac * (NRAL(ftype) - 1);
 227                         nd_simd = (1 - nonsimd_step_frac) * (NRAL(ftype) - 1);
 228                         break;
 229                     default: nd_c = NRAL(ftype) - 1; break;
 230                 }
 231                 nbonds = molb.nmol * molt->ilist[ftype].size() / (1 + NRAL(ftype));
 232                 ndtot_c += nbonds * nd_c;
 233                 ndtot_simd += nbonds * nd_simd;
 234             }
 235         }
 236         if (bExcl)
 237         {
 238             ndtot_c += molb.nmol * (molt->excls.numElements() - molt->atoms.nr) / 2.;
 239         }
 240     }
 241
 242     if (debug)
 243     {
 244         fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
 245     }
 246
 247     if (ndistance_c != nullptr)
 248     {
 249         *ndistance_c = ndtot_c;
 250     }
 251     if (ndistance_simd != nullptr)
 252     {
 253         *ndistance_simd = ndtot_simd;
 254     }
 255 }
 256
 257 static void pp_verlet_load(const gmx_mtop_t& mtop,
 258                            const t_inputrec& ir,
 259                            const matrix      box,
 260                            int*              nq_tot,
 261                            int*              nlj_tot,
 262                            double*           cost_pp,
 263                            gmx_bool*         bChargePerturbed,
 264                            gmx_bool*         bTypePerturbed)
 265 {
 266     int      atnr, a, nqlj, nq, nlj;
 267     gmx_bool bQRF;
 268     real     r_eff;
 269     double   c_qlj, c_q, c_lj;
 270     double   nppa;
 271     int      j_cluster_size;
 272     /* Conversion factor for reference vs SIMD kernel performance.
 273      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 274      */
 275 #if GMX_DOUBLE
 276     const real nbnxn_refkernel_fac = 4.0;
 277 #else
 278     const real nbnxn_refkernel_fac = 8.0;
 279 #endif
 280
 281     bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == CoulombInteractionType::Cut);
 282
 283     gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
 284     atnr                                   = mtop.ffparams.atnr;
 285     nqlj                                   = 0;
 286     nq                                     = 0;
 287     *bChargePerturbed                      = FALSE;
 288     *bTypePerturbed                        = FALSE;
 289     for (const gmx_molblock_t& molb : mtop.molblock)
 290     {
 291         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 292         const t_atom*        atom = molt->atoms.atom;
 293         for (a = 0; a < molt->atoms.nr; a++)
 294         {
 295             if (atom[a].q != 0 || atom[a].qB != 0)
 296             {
 297                 if (iparams[(atnr + 1) * atom[a].type].lj.c6 != 0
 298                     || iparams[(atnr + 1) * atom[a].type].lj.c12 != 0)
 299                 {
 300                     nqlj += molb.nmol;
 301                 }
 302                 else
 303                 {
 304                     nq += molb.nmol;
 305                 }
 306             }
 307             if (atom[a].q != atom[a].qB)
 308             {
 309                 *bChargePerturbed = TRUE;
 310             }
 311             if (atom[a].type != atom[a].typeB)
 312             {
 313                 *bTypePerturbed = TRUE;
 314             }
 315         }
 316     }
 317
 318     nlj = mtop.natoms - nqlj - nq;
 319
 320     *nq_tot  = nqlj + nq;
 321     *nlj_tot = nqlj + nlj;
 322
 323     /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
 324      * This choice should match the one of pick_nbnxn_kernel_cpu().
 325      * TODO: Make this function use pick_nbnxn_kernel_cpu().
 326      */
 327 #if GMX_SIMD_HAVE_REAL \
 328         && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
 329     j_cluster_size = 8;
 330 #else
 331     j_cluster_size                 = 4;
 332 #endif
 333     r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms / det(box));
 334
 335     /* The average number of pairs per atom */
 336     nppa = 0.5 * 4 / 3 * M_PI * r_eff * r_eff * r_eff * mtop.natoms / det(box);
 337
 338     if (debug)
 339     {
 340         fprintf(debug,
 341                 "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
 342                 nqlj,
 343                 nq,
 344                 nlj,
 345                 ir.rlist,
 346                 r_eff,
 347                 nppa);
 348     }
 349
 350     /* Determine the cost per pair interaction */
 351     c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
 352     c_q   = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
 353     c_lj  = c_nbnxn_lj;
 354     if (ir.vdw_modifier == InteractionModifiers::PotSwitch || EVDW_PME(ir.vdwtype))
 355     {
 356         c_qlj += c_nbnxn_ljexp_add;
 357         c_lj += c_nbnxn_ljexp_add;
 358     }
 359     if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == LongRangeVdW::LB)
 360     {
 361         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 362         c_qlj *= nbnxn_refkernel_fac;
 363         c_q *= nbnxn_refkernel_fac;
 364         c_lj *= nbnxn_refkernel_fac;
 365     }
 366
 367     /* For the PP non-bonded cost it is (unrealistically) assumed
 368      * that all atoms are distributed homogeneously in space.
 369      */
 370     *cost_pp = (nqlj * c_qlj + nq * c_q + nlj * c_lj) * nppa;
 371
 372     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 373 }
 374
 375 float pme_load_estimate(const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box)
 376 {
 377     int      nq_tot, nlj_tot;
 378     gmx_bool bChargePerturbed, bTypePerturbed;
 379     double   ndistance_c, ndistance_simd;
 380     double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 381     float    ratio;
 382
 383     /* Computational cost of bonded, non-bonded and PME calculations.
 384      * This will be machine dependent.
 385      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 386      * in single precision. In double precision PME mesh is slightly cheaper,
 387      * although not so much that the numbers need to be adjusted.
 388      */
 389
 390     count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
 391     /* C_BOND is the cost for bonded interactions with SIMD implementations,
 392      * so we need to scale the number of bonded interactions for which there
 393      * are only C implementations to the number of SIMD equivalents.
 394      */
 395     cost_bond = c_bond
 396                 * (ndistance_c * simd_cycle_factor(FALSE) + ndistance_simd * simd_cycle_factor(bHaveSIMD));
 397
 398     pp_verlet_load(mtop, ir, box, &nq_tot, &nlj_tot, &cost_pp, &bChargePerturbed, &bTypePerturbed);
 399
 400     cost_redist = 0;
 401     cost_spread = 0;
 402     cost_fft    = 0;
 403     cost_solve  = 0;
 404
 405     int gridNkzFactor = int{ (ir.nkz + 1) / 2 };
 406     if (EEL_PME(ir.coulombtype))
 407     {
 408         double grid = ir.nkx * ir.nky * gridNkzFactor;
 409
 410         int f = ((ir.efep != FreeEnergyPerturbationType::No && bChargePerturbed) ? 2 : 1);
 411         cost_redist += c_pme_redist * nq_tot;
 412         cost_spread += f * c_pme_spread * nq_tot * gmx::power3(ir.pme_order);
 413         cost_fft += f * c_pme_fft * grid * std::log(grid) / std::log(2.0);
 414         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 415     }
 416
 417     if (EVDW_PME(ir.vdwtype))
 418     {
 419         double grid = ir.nkx * ir.nky * gridNkzFactor;
 420
 421         int f = ((ir.efep != FreeEnergyPerturbationType::No && bTypePerturbed) ? 2 : 1);
 422         if (ir.ljpme_combination_rule == LongRangeVdW::LB)
 423         {
 424             /* LB combination rule: we have 7 mesh terms */
 425             f *= 7;
 426         }
 427         cost_redist += c_pme_redist * nlj_tot;
 428         cost_spread += f * c_pme_spread * nlj_tot * gmx::power3(ir.pme_order);
 429         cost_fft += f * c_pme_fft * 2 * grid * std::log(grid) / std::log(2.0);
 430         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 431     }
 432
 433     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 434
 435     ratio = cost_pme / (cost_bond + cost_pp + cost_pme);
 436
 437     if (debug)
 438     {
 439         fprintf(debug,
 440                 "cost_bond   %f\n"
 441                 "cost_pp     %f\n"
 442                 "cost_redist %f\n"
 443                 "cost_spread %f\n"
 444                 "cost_fft    %f\n"
 445                 "cost_solve  %f\n",
 446                 cost_bond,
 447                 cost_pp,
 448                 cost_redist,
 449                 cost_spread,
 450                 cost_fft,
 451                 cost_solve);
 452
 453         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 454     }
 455
 456     return ratio;
 457 }