src/gromacs/mdlib/perf_est.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2012,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "config.h"
  40
  41 #include <math.h>
  42
  43 #include "gromacs/math/vec.h"
  44 #include "gromacs/topology/topology.h"
  45 #include "gromacs/utility/fatalerror.h"
  46
  47 #include "gromacs/legacyheaders/perf_est.h"
  48 #include "gromacs/legacyheaders/types/commrec.h"
  49 #include "nbnxn_search.h"
  50 #include "nbnxn_consts.h"
  51
  52 /* Computational cost of bonded, non-bonded and PME calculations.
  53  * This will be machine dependent.
  54  * The numbers here are accurate for Intel Core2 and AMD Athlon 64
  55  * in single precision. In double precision PME mesh is slightly cheaper,
  56  * although not so much that the numbers need to be adjusted.
  57  */
  58
  59 /* Cost of a pair interaction in the "group" cut-off scheme */
  60 #define C_GR_FQ        1.5
  61 #define C_GR_QLJ_CUT   1.5
  62 #define C_GR_QLJ_TAB   2.0
  63 #define C_GR_LJ_CUT    1.0
  64 #define C_GR_LJ_TAB    1.75
  65 /* Cost of 1 water with one Q/LJ atom */
  66 #define C_GR_QLJW_CUT  2.0
  67 #define C_GR_QLJW_TAB  2.25
  68 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
  69 #define C_GR_QW        1.75
  70
  71 /* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */
  72 #define C_VT_LJ        0.30
  73 #define C_VT_QRF_LJ    0.40
  74 #define C_VT_QRF       0.30
  75 #define C_VT_QEXP_LJ   0.55
  76 #define C_VT_QEXP      0.50
  77 /* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */
  78 #define C_VT_LJEXP_ADD 0.20
  79
  80 /* Cost of PME, with all components running with SSE instructions */
  81 /* Cost of particle reordering and redistribution */
  82 #define C_PME_REDIST  12.0
  83 /* Cost of q spreading and force interpolation per charge (mainly memory) */
  84 #define C_PME_SPREAD  0.30
  85 /* Cost of fft's, will be multiplied with N log(N) */
  86 #define C_PME_FFT     0.20
  87 /* Cost of pme_solve, will be multiplied with N */
  88 #define C_PME_SOLVE   0.50
  89
   90 /* Cost of a bonded interaction divided by the number of (pbc_)dx required */
  91 #define C_BOND        5.0
  92
  93 int n_bonded_dx(gmx_mtop_t *mtop, gmx_bool bExcl)
  94 {
  95     int            mb, nmol, ftype, ndxb, ndx_excl;
  96     int            ndx;
  97     gmx_moltype_t *molt;
  98
  99     /* Count the number of pbc_rvec_sub calls required for bonded interactions.
 100      * This number is also roughly proportional to the computational cost.
 101      */
 102     ndx      = 0;
 103     ndx_excl = 0;
 104 #if __ICC == 1400 || __ICL == 1400
 105 #pragma novector /* Work-around for incorrect vectorization */
 106 #endif
 107     for (mb = 0; mb < mtop->nmolblock; mb++)
 108     {
 109         molt = &mtop->moltype[mtop->molblock[mb].type];
 110         nmol = mtop->molblock[mb].nmol;
 111         for (ftype = 0; ftype < F_NRE; ftype++)
 112         {
 113             if (interaction_function[ftype].flags & IF_BOND)
 114             {
 115                 switch (ftype)
 116                 {
 117                     case F_POSRES:
 118                     case F_FBPOSRES:  ndxb = 1; break;
 119                     case F_CONNBONDS: ndxb = 0; break;
 120                     default:     ndxb      = NRAL(ftype) - 1; break;
 121                 }
 122                 ndx += nmol*ndxb*molt->ilist[ftype].nr/(1 + NRAL(ftype));
 123             }
 124         }
 125         if (bExcl)
 126         {
 127             ndx_excl += nmol*(molt->excls.nra - molt->atoms.nr)/2;
 128         }
 129         else
 130         {
 131             ndx_excl = 0;
 132         }
 133     }
 134
 135     if (debug)
 136     {
 137         fprintf(debug, "ndx bonded %d exclusions %d\n", ndx, ndx_excl);
 138     }
 139
 140     ndx += ndx_excl;
 141
 142     return ndx;
 143 }
 144
 145 static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box,
 146                           int *nq_tot, int *nlj_tot,
 147                           double *cost_pp,
 148                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
 149 {
 150     t_atom        *atom;
 151     int            mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj;
 152     gmx_bool       bBHAM, bLJcut, bWater, bQ, bLJ;
 153     int            nw, nqlj, nq, nlj;
 154     float          fq, fqlj, flj, fljtab, fqljw, fqw;
 155     t_iparams     *iparams;
 156     gmx_moltype_t *molt;
 157
 158     bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 159
 160     bLJcut = ((ir->vdwtype == evdwCUT) && !bBHAM);
 161
 162     /* Computational cost of bonded, non-bonded and PME calculations.
 163      * This will be machine dependent.
 164      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 165      * in single precision. In double precision PME mesh is slightly cheaper,
 166      * although not so much that the numbers need to be adjusted.
 167      */
 168     fq    = C_GR_FQ;
 169     fqlj  = (bLJcut ? C_GR_QLJ_CUT : C_GR_QLJ_TAB);
 170     flj   = (bLJcut ? C_GR_LJ_CUT  : C_GR_LJ_TAB);
 171     /* Cost of 1 water with one Q/LJ atom */
 172     fqljw = (bLJcut ? C_GR_QLJW_CUT : C_GR_QLJW_TAB);
 173     /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */
 174     fqw   = C_GR_QW;
 175
 176     iparams           = mtop->ffparams.iparams;
 177     atnr              = mtop->ffparams.atnr;
 178     nw                = 0;
 179     nqlj              = 0;
 180     nq                = 0;
 181     nlj               = 0;
 182     *bChargePerturbed = FALSE;
 183     for (mb = 0; mb < mtop->nmolblock; mb++)
 184     {
 185         molt = &mtop->moltype[mtop->molblock[mb].type];
 186         atom = molt->atoms.atom;
 187         nmol = mtop->molblock[mb].nmol;
 188         a    = 0;
 189         for (cg = 0; cg < molt->cgs.nr; cg++)
 190         {
 191             bWater = !bBHAM;
 192             ncqlj  = 0;
 193             ncq    = 0;
 194             nclj   = 0;
 195             a0     = a;
 196             while (a < molt->cgs.index[cg+1])
 197             {
 198                 bQ  = (atom[a].q != 0 || atom[a].qB != 0);
 199                 bLJ = (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
 200                        iparams[(atnr+1)*atom[a].type].lj.c12 != 0);
 201                 if (atom[a].q != atom[a].qB)
 202                 {
 203                     *bChargePerturbed = TRUE;
 204                 }
 205                 if (atom[a].type != atom[a].typeB)
 206                 {
 207                     *bTypePerturbed = TRUE;
 208                 }
 209                 /* This if this atom fits into water optimization */
 210                 if (!((a == a0   &&  bQ &&  bLJ) ||
 211                       (a == a0+1 &&  bQ && !bLJ) ||
 212                       (a == a0+2 &&  bQ && !bLJ && atom[a].q == atom[a-1].q) ||
 213                       (a == a0+3 && !bQ &&  bLJ)))
 214                 {
 215                     bWater = FALSE;
 216                 }
 217                 if (bQ && bLJ)
 218                 {
 219                     ncqlj++;
 220                 }
 221                 else
 222                 {
 223                     if (bQ)
 224                     {
 225                         ncq++;
 226                     }
 227                     if (bLJ)
 228                     {
 229                         nclj++;
 230                     }
 231                 }
 232                 a++;
 233             }
 234             if (bWater)
 235             {
 236                 nw   += nmol;
 237             }
 238             else
 239             {
 240                 nqlj += nmol*ncqlj;
 241                 nq   += nmol*ncq;
 242                 nlj  += nmol*nclj;
 243             }
 244         }
 245     }
 246
 247     *nq_tot  = nq  + nqlj + nw*3;
 248     *nlj_tot = nlj + nqlj + nw;
 249
 250     if (debug)
 251     {
 252         fprintf(debug, "nw %d nqlj %d nq %d nlj %d\n", nw, nqlj, nq, nlj);
 253     }
 254
 255     /* For the PP non-bonded cost it is (unrealistically) assumed
 256      * that all atoms are distributed homogeneously in space.
 257      * Factor 3 is used because a water molecule has 3 atoms
 258      * (and TIP4P effectively has 3 interactions with (water) atoms)).
 259      */
 260     *cost_pp = 0.5*(fqljw*nw*nqlj +
 261                     fqw  *nw*(3*nw + nq) +
 262                     fqlj *nqlj*nqlj +
 263                     fq   *nq*(3*nw + nqlj + nq) +
 264                     flj  *nlj*(nw + nqlj + nlj))
 265         *4/3*M_PI*ir->rlist*ir->rlist*ir->rlist/det(box);
 266 }
 267
 268 static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box,
 269                            int *nq_tot, int *nlj_tot,
 270                            double *cost_pp,
 271                            gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
 272 {
 273     t_atom        *atom;
 274     int            mb, nmol, atnr, cg, a, a0, nqlj, nq, nlj;
 275     gmx_bool       bQRF;
 276     t_iparams     *iparams;
 277     gmx_moltype_t *molt;
 278     real           r_eff;
 279     double         c_qlj, c_q, c_lj;
 280     double         nat;
 281     /* Conversion factor for reference vs SIMD kernel performance.
 282      * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
 283      */
 284 #ifdef GMX_DOUBLE
 285     const real     nbnxn_refkernel_fac = 4.0;
 286 #else
 287     const real     nbnxn_refkernel_fac = 8.0;
 288 #endif
 289
 290     bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);
 291
 292     iparams           = mtop->ffparams.iparams;
 293     atnr              = mtop->ffparams.atnr;
 294     nqlj              = 0;
 295     nq                = 0;
 296     *bChargePerturbed = FALSE;
 297     for (mb = 0; mb < mtop->nmolblock; mb++)
 298     {
 299         molt = &mtop->moltype[mtop->molblock[mb].type];
 300         atom = molt->atoms.atom;
 301         nmol = mtop->molblock[mb].nmol;
 302         a    = 0;
 303         for (a = 0; a < molt->atoms.nr; a++)
 304         {
 305             if (atom[a].q != 0 || atom[a].qB != 0)
 306             {
 307                 if (iparams[(atnr+1)*atom[a].type].lj.c6  != 0 ||
 308                     iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
 309                 {
 310                     nqlj += nmol;
 311                 }
 312                 else
 313                 {
 314                     nq += nmol;
 315                 }
 316             }
 317             if (atom[a].q != atom[a].qB)
 318             {
 319                 *bChargePerturbed = TRUE;
 320             }
 321             if (atom[a].type != atom[a].typeB)
 322             {
 323                 *bTypePerturbed = TRUE;
 324             }
 325         }
 326     }
 327
 328     nlj = mtop->natoms - nqlj - nq;
 329
 330     *nq_tot  = nqlj + nq;
 331     *nlj_tot = nqlj + nlj;
 332
 333     /* Effective cut-off for cluster pair list of 4x4 atoms */
 334     r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE, mtop->natoms/det(box));
 335
 336     if (debug)
 337     {
 338         fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f\n",
 339                 nqlj, nq, nlj, ir->rlist, r_eff);
 340     }
 341
 342     /* Determine the cost per pair interaction */
 343     c_qlj = (bQRF ? C_VT_QRF_LJ : C_VT_QEXP_LJ);
 344     c_q   = (bQRF ? C_VT_QRF    : C_VT_QEXP);
 345     c_lj  = C_VT_LJ;
 346     if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
 347     {
 348         c_qlj += C_VT_LJEXP_ADD;
 349         c_lj  += C_VT_LJEXP_ADD;
 350     }
 351     if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
 352     {
 353         /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
 354         c_qlj *= nbnxn_refkernel_fac;
 355         c_q   *= nbnxn_refkernel_fac;
 356         c_lj  *= nbnxn_refkernel_fac;
 357     }
 358
 359     /* For the PP non-bonded cost it is (unrealistically) assumed
 360      * that all atoms are distributed homogeneously in space.
 361      */
 362     /* Convert mtop->natoms to double to avoid int overflow */
 363     nat      = mtop->natoms;
 364     *cost_pp = 0.5*nat*(nqlj*c_qlj + nq*c_q + nlj*c_lj)
 365         *4/3*M_PI*r_eff*r_eff*r_eff/det(box);
 366 }
 367
 368 float pme_load_estimate(gmx_mtop_t *mtop, t_inputrec *ir, matrix box)
 369 {
 370     t_atom        *atom;
 371     int            mb, nmol, atnr, cg, a, a0, nq_tot, nlj_tot, f;
 372     gmx_bool       bBHAM, bLJcut, bChargePerturbed, bTypePerturbed;
 373     gmx_bool       bWater, bQ, bLJ;
 374     double         cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme;
 375     float          ratio;
 376     t_iparams     *iparams;
 377     gmx_moltype_t *molt;
 378
 379     /* Computational cost of bonded, non-bonded and PME calculations.
 380      * This will be machine dependent.
 381      * The numbers here are accurate for Intel Core2 and AMD Athlon 64
 382      * in single precision. In double precision PME mesh is slightly cheaper,
 383      * although not so much that the numbers need to be adjusted.
 384      */
 385
 386     iparams = mtop->ffparams.iparams;
 387     atnr    = mtop->ffparams.atnr;
 388
 389     cost_bond = C_BOND*n_bonded_dx(mtop, TRUE);
 390
 391     if (ir->cutoff_scheme == ecutsGROUP)
 392     {
 393         pp_group_load(mtop, ir, box,
 394                       &nq_tot, &nlj_tot, &cost_pp,
 395                       &bChargePerturbed, &bTypePerturbed);
 396     }
 397     else
 398     {
 399         pp_verlet_load(mtop, ir, box,
 400                        &nq_tot, &nlj_tot, &cost_pp,
 401                        &bChargePerturbed, &bTypePerturbed);
 402     }
 403
 404     cost_redist = 0;
 405     cost_spread = 0;
 406     cost_fft    = 0;
 407     cost_solve  = 0;
 408
 409     if (EEL_PME(ir->coulombtype))
 410     {
 411         f            = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1);
 412         cost_redist +=   C_PME_REDIST*nq_tot;
 413         cost_spread += f*C_PME_SPREAD*nq_tot*pow(ir->pme_order, 3);
 414         cost_fft    += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz);
 415         cost_solve  += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz;
 416     }
 417
 418     if (EVDW_PME(ir->vdwtype))
 419     {
 420         f            = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1);
 421         if (ir->ljpme_combination_rule == eljpmeLB)
 422         {
 423             /* LB combination rule: we have 7 mesh terms */
 424             f       *= 7;
 425         }
 426         cost_redist +=   C_PME_REDIST*nlj_tot;
 427         cost_spread += f*C_PME_SPREAD*nlj_tot*pow(ir->pme_order, 3);
 428         cost_fft    += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz);
 429         cost_solve  += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz;
 430     }
 431
 432     cost_pme = cost_redist + cost_spread + cost_fft + cost_solve;
 433
 434     ratio = cost_pme/(cost_bond + cost_pp + cost_pme);
 435
 436     if (debug)
 437     {
 438         fprintf(debug,
 439                 "cost_bond   %f\n"
 440                 "cost_pp     %f\n"
 441                 "cost_redist %f\n"
 442                 "cost_spread %f\n"
 443                 "cost_fft    %f\n"
 444                 "cost_solve  %f\n",
 445                 cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve);
 446
 447         fprintf(debug, "Estimate for relative PME load: %.3f\n", ratio);
 448     }
 449
 450     return ratio;
 451 }