src/gromacs/mdlib/perf_est.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "perf_est.h"
  40
  41 #include <cmath>
  42
  43 #include "gromacs/math/functions.h"
  44 #include "gromacs/math/utilities.h"
  45 #include "gromacs/math/vec.h"
  46 #include "gromacs/mdtypes/commrec.h"
  47 #include "gromacs/mdtypes/inputrec.h"
  48 #include "gromacs/mdtypes/md_enums.h"
  49 #include "gromacs/nbnxm/nbnxm_geometry.h"
  50 #include "gromacs/simd/simd.h"
  51 #include "gromacs/topology/ifunc.h"
  52 #include "gromacs/topology/topology.h"
  53 #include "gromacs/utility/fatalerror.h"
  54
  55 /* Computational cost of bonded, non-bonded and PME calculations.
  56  * This will be machine dependent.
  57  * The numbers are only used for estimating the relative cost of PME vs PP,
  58  * so only relative numbers matter.
  59  * The numbers here are accurate cycle counts for Haswell in single precision
  60  * compiled with gcc5.2. A correction factor for other architectures is given
  61  * by simd_cycle_factor().
  62  * In double precision PME mesh is slightly cheaper, although not so much
  63  * that the numbers need to be adjusted.
  64  */
  65
  66 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  67 static const double c_nbnxn_lj      = 2.5;
  68 static const double c_nbnxn_qrf_lj  = 2.9;
  69 static const double c_nbnxn_qrf     = 2.4;
  70 static const double c_nbnxn_qexp_lj = 4.2;
  71 static const double c_nbnxn_qexp    = 3.8;
  72 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  73 static const double c_nbnxn_ljexp_add = 1.0;
  74
  75 /* Cost of the different components of PME. */
  76 /* Cost of particle reordering and redistribution (no SIMD correction).
  77  * This will be zero without MPI and can be very high with load imbalance.
  78  * Thus we use an approximate value for medium parallelization.
  79  */
  80 static const double c_pme_redist = 100.0;
  81 /* Cost of q spreading and force interpolation per charge. This part almost
  82  * doesn't accelerate with SIMD, so we don't use SIMD correction.
  83  */
  84 static const double c_pme_spread = 5.0;
  85 /* Cost of fft's, will be multiplied with 2 N log2(N) (no SIMD correction)
  86  * Without MPI the number is 2-3, depending on grid factors and thread count.
  87  * We take the high limit to be on the safe side and account for some MPI
  88  * communication cost, which will dominate at high parallelization.
  89  */
  90 static const double c_pme_fft = 3.0;
  91 /* Cost of pme_solve, will be multiplied with N */
  92 static const double c_pme_solve = 9.0;
  93
  94 /* Cost of a bonded interaction divided by the number of distances calculations
  95  * required in one interaction. The actual cost is nearly propotional to this.
  96  */
  97 static const double c_bond = 25.0;
  98
  99
 100 #if GMX_SIMD_HAVE_REAL
 101 static const gmx_bool bHaveSIMD = TRUE;
 102 #else
 103 static const gmx_bool bHaveSIMD = FALSE;
 104 #endif
 105
 106 /* Gives a correction factor for the currently compiled SIMD implementations
 107  * versus the reference used for the coefficients above (8-wide SIMD with FMA).
 108  * bUseSIMD sets if we asking for plain-C (FALSE) or SIMD (TRUE) code.
 109  */
 110 static double simd_cycle_factor(gmx_bool bUseSIMD)
 111 {
 112     /* The (average) ratio of the time taken by plain-C force calculations
 113      * relative to SIMD versions, for the reference platform Haswell:
 114      * 8-wide SIMD with FMA, factor: sqrt(2*8)*1.25 = 5.
 115      * This factor is used for normalization in simd_cycle_factor().
 116      */
 117     const double simd_cycle_no_simd = 5.0;
 118     double       speedup;
 119
 120 #if GMX_SIMD_HAVE_REAL
 121     if (bUseSIMD)
 122     {
 123         /* We never get full speed-up of a factor GMX_SIMD_REAL_WIDTH.
 124          * The actual speed-up depends very much on gather+scatter overhead,
 125          * which is different for different bonded and non-bonded kernels.
 126          * As a rough, but actually not bad, approximation we use a sqrt
 127          * dependence on the width which gives a factor 4 for width=8.
 128          */
 129         speedup = std::sqrt(2.0 * GMX_SIMD_REAL_WIDTH);
 130 #    if GMX_SIMD_HAVE_FMA
 131         /* FMA tends to give a bit more speedup */
 132         speedup *= 1.25;
 133 #    endif
 134     }
 135     else
 136     {
 137         speedup = 1.0;
 138     }
 139 #else
 140     if (bUseSIMD)
 141     {
 142         gmx_incons("gmx_cycle_factor() compiled without SIMD called with bUseSIMD=TRUE");
 143     }
 144     /* No SIMD, no speedup */
 145     speedup                        = 1.0;
 146 #endif
 147
 148     /* Return speed compared to the reference (Haswell).
 149      * For x86 SIMD, the nbnxn kernels are relatively much slower on
 150      * Sandy/Ivy Bridge than Haswell, but that only leads to a too high
 151      * PME load estimate on SB/IB, which is erring on the safe side.
 152      */
 153     return simd_cycle_no_simd / speedup;
 154 }
 155
 156 void count_bonded_distances(const gmx_mtop_t& mtop, const t_inputrec& ir, double* ndistance_c, double* ndistance_simd)
 157 {
 158     gmx_bool bExcl;
 159     double   nonsimd_step_frac;
 160     int      ftype;
 161     double   ndtot_c, ndtot_simd;
 162 #if GMX_SIMD_HAVE_REAL
 163     gmx_bool bSimdBondeds = TRUE;
 164 #else
 165     gmx_bool   bSimdBondeds        = FALSE;
 166 #endif
 167
 168     bExcl = (ir.cutoff_scheme == ecutsGROUP && inputrecExclForces(&ir) && !EEL_FULL(ir.coulombtype));
 169
 170     if (bSimdBondeds)
 171     {
 172         /* We only have SIMD versions of these bondeds without energy and
 173          * without shift-forces, we take that into account here.
 174          */
 175         if (ir.nstcalcenergy > 0)
 176         {
 177             nonsimd_step_frac = 1.0 / ir.nstcalcenergy;
 178         }
 179         else
 180         {
 181             nonsimd_step_frac = 0;
 182         }
 183         if (ir.epc != epcNO && 1.0 / ir.nstpcouple > nonsimd_step_frac)
 184         {
 185             nonsimd_step_frac = 1.0 / ir.nstpcouple;
 186         }
 187     }
 188     else
 189     {
 190         nonsimd_step_frac = 1;
 191     }
 192
 193     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 194      * This number is also roughly proportional to the computational cost.
 195      */
 196     ndtot_c    = 0;
 197     ndtot_simd = 0;
 198     for (const gmx_molblock_t& molb : mtop.molblock)
 199     {
 200         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 201         for (ftype = 0; ftype < F_NRE; ftype++)
 202         {
 203             int nbonds;
 204
 205             if (interaction_function[ftype].flags & IF_BOND)
 206             {
 207                 double nd_c, nd_simd;
 208
 209                 nd_c    = 0;
 210                 nd_simd = 0;
 211                 /* For all interactions, except for the three exceptions
 212                  * in the switch below, #distances = #atoms - 1.
 213                  */
 214                 switch (ftype)
 215                 {
 216                     case F_POSRES:
 217                     case F_FBPOSRES: nd_c = 1; break;
 218                     case F_CONNBONDS: break;
 219                     /* These bonded potentially use SIMD */
 220                     case F_ANGLES:
 221                     case F_PDIHS:
 222                     case F_RBDIHS:
 223                     case F_LJ14:
 224                         nd_c    = nonsimd_step_frac * (NRAL(ftype) - 1);
 225                         nd_simd = (1 - nonsimd_step_frac) * (NRAL(ftype) - 1);
 226                         break;
 227                     default: nd_c = NRAL(ftype) - 1; break;
 228                 }
 229                 nbonds = molb.nmol * molt->ilist[ftype].size() / (1 + NRAL(ftype));
 230                 ndtot_c += nbonds * nd_c;
 231                 ndtot_simd += nbonds * nd_simd;
 232             }
 233         }
 234         if (bExcl)
 235         {
 236             ndtot_c += molb.nmol * (molt->excls.numElements() - molt->atoms.nr) / 2.;
 237         }
 238     }
 239
 240     if (debug)
 241     {
 242         fprintf(debug, "nr. of distance calculations in bondeds: C %.1f SIMD %.1f\n", ndtot_c, ndtot_simd);
 243     }
 244
 245     if (ndistance_c != nullptr)
 246     {
 247         *ndistance_c = ndtot_c;
 248     }
 249     if (ndistance_simd != nullptr)
 250     {
 251         *ndistance_simd = ndtot_simd;
 252     }
 253 }
 254
 255 static void pp_verlet_load(const gmx_mtop_t& mtop,
 256                            const t_inputrec& ir,
 257                            const matrix      box,
 258                            int*              nq_tot,
 259                            int*              nlj_tot,
 260                            double*           cost_pp,
 261                            gmx_bool*         bChargePerturbed,
 262                            gmx_bool*         bTypePerturbed)
 263 {
 264     int      atnr, a, nqlj, nq, nlj;
 265     gmx_bool bQRF;
 266     real     r_eff;
 267     double   c_qlj, c_q, c_lj;
 268     double   nppa;
 269     int      j_cluster_size;
 270     /* Conversion factor for reference vs SIMD kernel performance.
 271      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 272      */
 273 #if GMX_DOUBLE
 274     const real nbnxn_refkernel_fac = 4.0;
 275 #else
 276     const real nbnxn_refkernel_fac = 8.0;
 277 #endif
 278
 279     bQRF = (EEL_RF(ir.coulombtype) || ir.coulombtype == eelCUT);
 280
 281     gmx::ArrayRef<const t_iparams> iparams = mtop.ffparams.iparams;
 282     atnr                                   = mtop.ffparams.atnr;
 283     nqlj                                   = 0;
 284     nq                                     = 0;
 285     *bChargePerturbed                      = FALSE;
 286     *bTypePerturbed                        = FALSE;
 287     for (const gmx_molblock_t& molb : mtop.molblock)
 288     {
 289         const gmx_moltype_t* molt = &mtop.moltype[molb.type];
 290         const t_atom*        atom = molt->atoms.atom;
 291         for (a = 0; a < molt->atoms.nr; a++)
 292         {
 293             if (atom[a].q != 0 || atom[a].qB != 0)
 294             {
 295                 if (iparams[(atnr + 1) * atom[a].type].lj.c6 != 0
 296                     || iparams[(atnr + 1) * atom[a].type].lj.c12 != 0)
 297                 {
 298                     nqlj += molb.nmol;
 299                 }
 300                 else
 301                 {
 302                     nq += molb.nmol;
 303                 }
 304             }
 305             if (atom[a].q != atom[a].qB)
 306             {
 307                 *bChargePerturbed = TRUE;
 308             }
 309             if (atom[a].type != atom[a].typeB)
 310             {
 311                 *bTypePerturbed = TRUE;
 312             }
 313         }
 314     }
 315
 316     nlj = mtop.natoms - nqlj - nq;
 317
 318     *nq_tot  = nqlj + nq;
 319     *nlj_tot = nqlj + nlj;
 320
 321     /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
 322      * This choice should match the one of pick_nbnxn_kernel_cpu().
 323      * TODO: Make this function use pick_nbnxn_kernel_cpu().
 324      */
 325 #if GMX_SIMD_HAVE_REAL \
 326         && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
 327     j_cluster_size = 8;
 328 #else
 329     j_cluster_size                 = 4;
 330 #endif
 331     r_eff = ir.rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop.natoms / det(box));
 332
 333     /* The average number of pairs per atom */
 334     nppa = 0.5 * 4 / 3 * M_PI * r_eff * r_eff * r_eff * mtop.natoms / det(box);
 335
 336     if (debug)
 337     {
 338         fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n", nqlj, nq,
 339                 nlj, ir.rlist, r_eff, nppa);
 340     }
 341
 342     /* Determine the cost per pair interaction */
 343     c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
 344     c_q   = (bQRF ? c_nbnxn_qrf : c_nbnxn_qexp);
 345     c_lj  = c_nbnxn_lj;
 346     if (ir.vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir.vdwtype))
 347     {
 348         c_qlj += c_nbnxn_ljexp_add;
 349         c_lj += c_nbnxn_ljexp_add;
 350     }
 351     if (EVDW_PME(ir.vdwtype) && ir.ljpme_combination_rule == eljpmeLB)
 352     {
 353         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 354         c_qlj *= nbnxn_refkernel_fac;
 355         c_q *= nbnxn_refkernel_fac;
 356         c_lj *= nbnxn_refkernel_fac;
 357     }
 358
 359     /* For the PP non-bonded cost it is (unrealistically) assumed
 360      * that all atoms are distributed homogeneously in space.
 361      */
 362     *cost_pp = (nqlj * c_qlj + nq * c_q + nlj * c_lj) * nppa;
 363
 364     *cost_pp *= simd_cycle_factor(bHaveSIMD);
 365 }
 366
 367 float pme_load_estimate(const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box)
 368 {
 369     int      nq_tot, nlj_tot;
 370     gmx_bool bChargePerturbed, bTypePerturbed;
 371     double   ndistance_c, ndistance_simd;
 372     double   cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 373     float    ratio;
 374
 375     /* Computational cost of bonded, non-bonded and PME calculations.
 376      * This will be machine dependent.
 377      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 378      * in single precision. In double precision PME mesh is slightly cheaper,
 379      * although not so much that the numbers need to be adjusted.
 380      */
 381
 382     count_bonded_distances(mtop, ir, &ndistance_c, &ndistance_simd);
 383     /* C_BOND is the cost for bonded interactions with SIMD implementations,
 384      * so we need to scale the number of bonded interactions for which there
 385      * are only C implementations to the number of SIMD equivalents.
 386      */
 387     cost_bond = c_bond
 388                 * (ndistance_c * simd_cycle_factor(FALSE) + ndistance_simd * simd_cycle_factor(bHaveSIMD));
 389
 390     pp_verlet_load(mtop, ir, box, &nq_tot, &nlj_tot, &cost_pp, &bChargePerturbed, &bTypePerturbed);
 391
 392     cost_redist = 0;
 393     cost_spread = 0;
 394     cost_fft    = 0;
 395     cost_solve  = 0;
 396
 397     int gridNkzFactor = int{ (ir.nkz + 1) / 2 };
 398     if (EEL_PME(ir.coulombtype))
 399     {
 400         double grid = ir.nkx * ir.nky * gridNkzFactor;
 401
 402         int f = ((ir.efep != efepNO && bChargePerturbed) ? 2 : 1);
 403         cost_redist += c_pme_redist * nq_tot;
 404         cost_spread += f * c_pme_spread * nq_tot * gmx::power3(ir.pme_order);
 405         cost_fft += f * c_pme_fft * grid * std::log(grid) / std::log(2.0);
 406         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 407     }
 408
 409     if (EVDW_PME(ir.vdwtype))
 410     {
 411         double grid = ir.nkx * ir.nky * gridNkzFactor;
 412
 413         int f = ((ir.efep != efepNO && bTypePerturbed) ? 2 : 1);
 414         if (ir.ljpme_combination_rule == eljpmeLB)
 415         {
 416             /* LB combination rule: we have 7 mesh terms */
 417             f *= 7;
 418         }
 419         cost_redist += c_pme_redist * nlj_tot;
 420         cost_spread += f * c_pme_spread * nlj_tot * gmx::power3(ir.pme_order);
 421         cost_fft += f * c_pme_fft * 2 * grid * std::log(grid) / std::log(2.0);
 422         cost_solve += f * c_pme_solve * grid * simd_cycle_factor(bHaveSIMD);
 423     }
 424
 425     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 426
 427     ratio = cost_pme / (cost_bond + cost_pp + cost_pme);
 428
 429     if (debug)
 430     {
 431         fprintf(debug,
 432                 "cost_bond   %f\n"
 433                 "cost_pp     %f\n"
 434                 "cost_redist %f\n"
 435                 "cost_spread %f\n"
 436                 "cost_fft    %f\n"
 437                 "cost_solve  %f\n",
 438                 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
 439
 440         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 441     }
 442
 443     return ratio;
 444 }