src/gromacs/domdec/domdec_setup.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2008,2009,2010,2011,2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file defines functions used by the domdec module
  39  * in its initial setup phase.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_domdec
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include <cassert>
  48 #include <cinttypes>
  49 #include <cmath>
  50 #include <cstdio>
  51
  52 #include "gromacs/domdec/domdec.h"
  53 #include "gromacs/domdec/domdec_struct.h"
  54 #include "gromacs/ewald/pme.h"
  55 #include "gromacs/gmxlib/network.h"
  56 #include "gromacs/math/utilities.h"
  57 #include "gromacs/math/vec.h"
  58 #include "gromacs/mdlib/perf_est.h"
  59 #include "gromacs/mdtypes/commrec.h"
  60 #include "gromacs/mdtypes/inputrec.h"
  61 #include "gromacs/mdtypes/md_enums.h"
  62 #include "gromacs/pbcutil/pbc.h"
  63 #include "gromacs/topology/topology.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/logger.h"
  66 #include "gromacs/utility/stringutil.h"
  67
  68 /*! \brief Margin for setting up the DD grid */
  69 #define DD_GRID_MARGIN_PRES_SCALE 1.05
  70
  71 /*! \brief Factorize \p n.
  72  *
  73  * \param[in]    n     Value to factorize
  74  * \param[out]   fac   Vector of factors (to be allocated in this function)
  75  * \param[out]   mfac  Vector with the number of times each factor repeats in the factorization (to be allocated in this function)
  76  */
  77 static void factorize(int               n,
  78                       std::vector<int> *fac,
  79                       std::vector<int> *mfac)
  80 {
  81     if (n <= 0)
  82     {
  83         gmx_fatal(FARGS, "Can only factorize positive integers.");
  84     }
  85
  86     /* Decompose n in factors */
  87     fac->clear();
  88     mfac->clear();
  89     int d = 2;
  90     while (n > 1)
  91     {
  92         while (n % d == 0)
  93         {
  94             if (fac->empty() || fac->back() != d)
  95             {
  96                 fac->push_back(d);
  97                 mfac->push_back(1);
  98             }
  99             else
 100             {
 101                 mfac->back()++;
 102             }
 103             n /= d;
 104         }
 105         d++;
 106     }
 107 }
 108
 109 /*! \brief Find largest divisor of \p n smaller than \p n*/
 110 static int largest_divisor(int n)
 111 {
 112     std::vector<int> div;
 113     std::vector<int> mdiv;
 114     factorize(n, &div, &mdiv);
 115
 116     return div.back();
 117 }
 118
 119 /*! \brief Compute largest common divisor of \p n1 and \b n2 */
 120 static int lcd(int n1, int n2)
 121 {
 122     int d, i;
 123
 124     d = 1;
 125     for (i = 2; (i <= n1 && i <= n2); i++)
 126     {
 127         if (n1 % i == 0 && n2 % i == 0)
 128         {
 129             d = i;
 130         }
 131     }
 132
 133     return d;
 134 }
 135
 136 /*! \brief Returns TRUE when there are enough PME ranks for the ratio */
 137 static gmx_bool fits_pme_ratio(int nrank_tot, int nrank_pme, float ratio)
 138 {
 139     return (static_cast<double>(nrank_pme)/static_cast<double>(nrank_tot) > 0.95*ratio);
 140 }
 141
 142 /*! \brief Returns TRUE when npme out of ntot ranks doing PME is expected to give reasonable performance */
 143 static gmx_bool fits_pp_pme_perf(int ntot, int npme, float ratio)
 144 {
 145     std::vector<int> div;
 146     std::vector<int> mdiv;
 147     factorize(ntot - npme, &div, &mdiv);
 148
 149     int npp_root3  = gmx::roundToInt(std::cbrt(ntot - npme));
 150     int npme_root2 = gmx::roundToInt(std::sqrt(static_cast<double>(npme)));
 151
 152     /* The check below gives a reasonable division:
 153      * factor 5 allowed at 5 or more PP ranks,
 154      * factor 7 allowed at 49 or more PP ranks.
 155      */
 156     if (div.back() > 3 + npp_root3)
 157     {
 158         return FALSE;
 159     }
 160
 161     /* Check if the number of PP and PME ranks have a reasonable sized
 162      * denominator in common, such that we can use 2D PME decomposition
 163      * when required (which requires nx_pp == nx_pme).
 164      * The factor of 2 allows for a maximum ratio of 2^2=4
 165      * between nx_pme and ny_pme.
 166      */
 167     if (lcd(ntot - npme, npme)*2 < npme_root2)
 168     {
 169         return FALSE;
 170     }
 171
 172     /* Does this division gives a reasonable PME load? */
 173     return fits_pme_ratio(ntot, npme, ratio);
 174 }
 175
 176 /*! \brief Make a guess for the number of PME ranks to use. */
 177 static int guess_npme(const gmx::MDLogger &mdlog,
 178                       const gmx_mtop_t *mtop, const t_inputrec *ir,
 179                       const matrix box,
 180                       int nrank_tot)
 181 {
 182     float      ratio;
 183     int        npme;
 184
 185     ratio = pme_load_estimate(mtop, ir, box);
 186
 187     GMX_LOG(mdlog.info).appendTextFormatted(
 188             "Guess for relative PME load: %.2f", ratio);
 189
 190     /* We assume the optimal rank ratio is close to the load ratio.
 191      * The communication load is neglected,
 192      * but (hopefully) this will balance out between PP and PME.
 193      */
 194
 195     if (!fits_pme_ratio(nrank_tot, nrank_tot/2, ratio))
 196     {
 197         /* We would need more than nrank_tot/2 PME only nodes,
 198          * which is not possible. Since the PME load is very high,
 199          * we will not loose much performance when all ranks do PME.
 200          */
 201
 202         return 0;
 203     }
 204
 205     /* First try to find npme as a factor of nrank_tot up to nrank_tot/3.
 206      * We start with a minimum PME node fraction of 1/16
 207      * and avoid ratios which lead to large prime factors in nnodes-npme.
 208      */
 209     npme = (nrank_tot + 15)/16;
 210     while (npme <= nrank_tot/3)
 211     {
 212         if (nrank_tot % npme == 0)
 213         {
 214             /* Note that fits_perf might change the PME grid,
 215              * in the current implementation it does not.
 216              */
 217             if (fits_pp_pme_perf(nrank_tot, npme, ratio))
 218             {
 219                 break;
 220             }
 221         }
 222         npme++;
 223     }
 224     if (npme > nrank_tot/3)
 225     {
 226         /* Try any possible number for npme */
 227         npme = 1;
 228         while (npme <= nrank_tot/2)
 229         {
 230             /* Note that fits_perf may change the PME grid */
 231             if (fits_pp_pme_perf(nrank_tot, npme, ratio))
 232             {
 233                 break;
 234             }
 235             npme++;
 236         }
 237     }
 238     if (npme > nrank_tot/2)
 239     {
 240         gmx_fatal(FARGS, "Could not find an appropriate number of separate PME ranks. i.e. >= %5f*#ranks (%d) and <= #ranks/2 (%d) and reasonable performance wise (grid_x=%d, grid_y=%d).\n"
 241                   "Use the -npme option of mdrun or change the number of ranks or the PME grid dimensions, see the manual for details.",
 242                   ratio, gmx::roundToInt(0.95*ratio*nrank_tot), nrank_tot/2, ir->nkx, ir->nky);
 243     }
 244     else
 245     {
 246         GMX_LOG(mdlog.info).appendTextFormatted(
 247                 "Will use %d particle-particle and %d PME only ranks\n"
 248                 "This is a guess, check the performance at the end of the log file",
 249                 nrank_tot - npme, npme);
 250     }
 251
 252     return npme;
 253 }
 254
 255 /*! \brief Return \p n divided by \p f rounded up to the next integer. */
 256 static int div_up(int n, int f)
 257 {
 258     return (n + f - 1)/f;
 259 }
 260
 261 real comm_box_frac(const ivec dd_nc, real cutoff, const gmx_ddbox_t *ddbox)
 262 {
 263     int  i, j, k;
 264     rvec nw;
 265     real comm_vol;
 266
 267     for (i = 0; i < DIM; i++)
 268     {
 269         real bt = ddbox->box_size[i]*ddbox->skew_fac[i];
 270         nw[i] = dd_nc[i]*cutoff/bt;
 271     }
 272
 273     comm_vol = 0;
 274     for (i = 0; i < DIM; i++)
 275     {
 276         if (dd_nc[i] > 1)
 277         {
 278             comm_vol += nw[i];
 279             for (j = i+1; j < DIM; j++)
 280             {
 281                 if (dd_nc[j] > 1)
 282                 {
 283                     comm_vol += nw[i]*nw[j]*M_PI/4;
 284                     for (k = j+1; k < DIM; k++)
 285                     {
 286                         if (dd_nc[k] > 1)
 287                         {
 288                             comm_vol += nw[i]*nw[j]*nw[k]*M_PI/6;
 289                         }
 290                     }
 291                 }
 292             }
 293         }
 294     }
 295
 296     return comm_vol;
 297 }
 298
 299 /*! \brief Return whether the DD inhomogeneous in the z direction */
 300 static gmx_bool inhomogeneous_z(const t_inputrec *ir)
 301 {
 302     return ((EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) &&
 303             ir->ePBC == epbcXYZ && ir->ewald_geometry == eewg3DC);
 304 }
 305
 306 /*! \brief Estimate cost of PME FFT communication
 307  *
 308  * This only takes the communication into account and not imbalance
 309  * in the calculation. But the imbalance in communication and calculation
 310  * are similar and therefore these formulas also prefer load balance
 311  * in the FFT and pme_solve calculation.
 312  */
 313 static float comm_pme_cost_vol(int npme, int a, int b, int c)
 314 {
 315     /* We use a float here, since an integer might overflow */
 316     float comm_vol;
 317
 318     comm_vol  = npme - 1;
 319     comm_vol *= npme;
 320     comm_vol *= div_up(a, npme);
 321     comm_vol *= div_up(b, npme);
 322     comm_vol *= c;
 323
 324     return comm_vol;
 325 }
 326
 327 /*! \brief Estimate cost of communication for a possible domain decomposition. */
 328 static float comm_cost_est(real limit, real cutoff,
 329                            const matrix box, const gmx_ddbox_t *ddbox,
 330                            int natoms, const t_inputrec *ir,
 331                            float pbcdxr,
 332                            int npme_tot, ivec nc)
 333 {
 334     ivec  npme = {1, 1, 1};
 335     int   i, j, nk, overlap;
 336     rvec  bt;
 337     float comm_vol, comm_vol_xf, comm_pme, cost_pbcdx;
 338     /* This is the cost of a pbc_dx call relative to the cost
 339      * of communicating the coordinate and force of an atom.
 340      * This will be machine dependent.
 341      * These factors are for x86 with SMP or Infiniband.
 342      */
 343     float pbcdx_rect_fac = 0.1;
 344     float pbcdx_tric_fac = 0.2;
 345     float temp;
 346
 347     /* Check the DD algorithm restrictions */
 348     if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
 349         (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))
 350     {
 351         return -1;
 352     }
 353
 354     if (inhomogeneous_z(ir) && nc[ZZ] > 1)
 355     {
 356         return -1;
 357     }
 358
 359     assert(ddbox->npbcdim <= DIM);
 360
 361     /* Check if the triclinic requirements are met */
 362     for (i = 0; i < DIM; i++)
 363     {
 364         for (j = i+1; j < ddbox->npbcdim; j++)
 365         {
 366             if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
 367                 (ir->epc != epcNO && ir->compress[j][i] != 0))
 368             {
 369                 if (nc[j] > 1 && nc[i] == 1)
 370                 {
 371                     return -1;
 372                 }
 373             }
 374         }
 375     }
 376
 377     for (i = 0; i < DIM; i++)
 378     {
 379         bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];
 380
 381         /* Without PBC and with 2 cells, there are no lower limits on the cell size */
 382         if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
 383         {
 384             return -1;
 385         }
 386         /* With PBC, check if the cut-off fits in nc[i]-1 cells */
 387         if (i < ddbox->npbcdim && nc[i] > 1 && (nc[i] - 1)*bt[i] < nc[i]*cutoff)
 388         {
 389             return -1;
 390         }
 391     }
 392
 393     if (npme_tot > 1)
 394     {
 395         /* The following choices should match those
 396          * in init_domain_decomposition in domdec.c.
 397          */
 398         if (nc[XX] == 1 && nc[YY] > 1)
 399         {
 400             npme[XX] = 1;
 401             npme[YY] = npme_tot;
 402         }
 403         else if (nc[YY] == 1)
 404         {
 405             npme[XX] = npme_tot;
 406             npme[YY] = 1;
 407         }
 408         else
 409         {
 410             /* Will we use 1D or 2D PME decomposition? */
 411             npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot;
 412             npme[YY] = npme_tot/npme[XX];
 413         }
 414     }
 415
 416     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 417     {
 418         /* Check the PME grid restrictions.
 419          * Currently these can only be invalid here with too few grid lines
 420          * along the x dimension per rank doing PME.
 421          */
 422         int npme_x = (npme_tot > 1 ? npme[XX] : nc[XX]);
 423
 424         /* Currently we don't have the OpenMP thread count available here.
 425          * But with threads we have only tighter restrictions and it's
 426          * probably better anyhow to avoid settings where we need to reduce
 427          * grid lines over multiple ranks, as the thread check will do.
 428          */
 429         bool useThreads     = true;
 430         bool errorsAreFatal = false;
 431         if (!gmx_pme_check_restrictions(ir->pme_order, ir->nkx, ir->nky, ir->nkz,
 432                                         npme_x, useThreads, errorsAreFatal))
 433         {
 434             return -1;
 435         }
 436     }
 437
 438     /* When two dimensions are (nearly) equal, use more cells
 439      * for the smallest index, so the decomposition does not
 440      * depend sensitively on the rounding of the box elements.
 441      */
 442     for (i = 0; i < DIM; i++)
 443     {
 444         for (j = i+1; j < DIM; j++)
 445         {
 446             /* Check if the box size is nearly identical,
 447              * in that case we prefer nx > ny  and ny > nz.
 448              */
 449             if (std::fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
 450             {
 451                 /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
 452                  * this means the swapped nc has nc[XX]==npme[XX],
 453                  * and we can also swap X and Y for PME.
 454                  */
 455                 /* Check if dimension i and j are equivalent for PME.
 456                  * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
 457                  * For y/z: we can not have PME decomposition in z
 458                  */
 459                 if (npme_tot <= 1 ||
 460                     !((i == XX && j == YY && nc[YY] != npme[YY]) ||
 461                       (i == YY && j == ZZ && npme[YY] > 1)))
 462                 {
 463                     return -1;
 464                 }
 465             }
 466         }
 467     }
 468
 469     /* This function determines only half of the communication cost.
 470      * All PP, PME and PP-PME communication is symmetric
 471      * and the "back"-communication cost is identical to the forward cost.
 472      */
 473
 474     comm_vol = comm_box_frac(nc, cutoff, ddbox);
 475
 476     comm_pme = 0;
 477     for (i = 0; i < 2; i++)
 478     {
 479         /* Determine the largest volume for PME x/f redistribution */
 480         if (nc[i] % npme[i] != 0)
 481         {
 482             if (nc[i] > npme[i])
 483             {
 484                 comm_vol_xf = (npme[i] == 2 ? 1.0/3.0 : 0.5);
 485             }
 486             else
 487             {
 488                 comm_vol_xf = 1.0 - lcd(nc[i], npme[i])/static_cast<double>(npme[i]);
 489             }
 490             comm_pme += 3*natoms*comm_vol_xf;
 491         }
 492
 493         /* Grid overlap communication */
 494         if (npme[i] > 1)
 495         {
 496             nk        = (i == 0 ? ir->nkx : ir->nky);
 497             overlap   = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order);
 498             temp      = npme[i];
 499             temp     *= overlap;
 500             temp     *= ir->nkx;
 501             temp     *= ir->nky;
 502             temp     *= ir->nkz;
 503             temp     /= nk;
 504             comm_pme += temp;
 505 /* Old line comm_pme += npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk; */
 506         }
 507     }
 508
 509     comm_pme += comm_pme_cost_vol(npme[YY], ir->nky, ir->nkz, ir->nkx);
 510     comm_pme += comm_pme_cost_vol(npme[XX], ir->nkx, ir->nky, ir->nkz);
 511
 512     /* Add cost of pbc_dx for bondeds */
 513     cost_pbcdx = 0;
 514     if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
 515     {
 516         if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
 517             (ddbox->tric_dir[YY] && nc[YY] == 1))
 518         {
 519             cost_pbcdx = pbcdxr*pbcdx_tric_fac;
 520         }
 521         else
 522         {
 523             cost_pbcdx = pbcdxr*pbcdx_rect_fac;
 524         }
 525     }
 526
 527     if (debug)
 528     {
 529         fprintf(debug,
 530                 "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
 531                 nc[XX], nc[YY], nc[ZZ], npme[XX], npme[YY],
 532                 comm_vol, cost_pbcdx, comm_pme/(3*natoms),
 533                 comm_vol + cost_pbcdx + comm_pme/(3*natoms));
 534     }
 535
 536     return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme;
 537 }
 538
 539 /*! \brief Assign penalty factors to possible domain decompositions, based on the estimated communication costs. */
 540 static void assign_factors(real limit, real cutoff,
 541                            const matrix box, const gmx_ddbox_t *ddbox,
 542                            int natoms, const t_inputrec *ir,
 543                            float pbcdxr, int npme,
 544                            int ndiv, const int *div, const int *mdiv,
 545                            ivec ir_try, ivec opt)
 546 {
 547     int   x, y, i;
 548     float ce;
 549
 550     if (ndiv == 0)
 551     {
 552         ce = comm_cost_est(limit, cutoff, box, ddbox,
 553                            natoms, ir, pbcdxr, npme, ir_try);
 554         if (ce >= 0 && (opt[XX] == 0 ||
 555                         ce < comm_cost_est(limit, cutoff, box, ddbox,
 556                                            natoms, ir, pbcdxr,
 557                                            npme, opt)))
 558         {
 559             copy_ivec(ir_try, opt);
 560         }
 561
 562         return;
 563     }
 564
 565     for (x = mdiv[0]; x >= 0; x--)
 566     {
 567         for (i = 0; i < x; i++)
 568         {
 569             ir_try[XX] *= div[0];
 570         }
 571         for (y = mdiv[0]-x; y >= 0; y--)
 572         {
 573             for (i = 0; i < y; i++)
 574             {
 575                 ir_try[YY] *= div[0];
 576             }
 577             for (i = 0; i < mdiv[0]-x-y; i++)
 578             {
 579                 ir_try[ZZ] *= div[0];
 580             }
 581
 582             /* recurse */
 583             assign_factors(limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme,
 584                            ndiv-1, div+1, mdiv+1, ir_try, opt);
 585
 586             for (i = 0; i < mdiv[0]-x-y; i++)
 587             {
 588                 ir_try[ZZ] /= div[0];
 589             }
 590             for (i = 0; i < y; i++)
 591             {
 592                 ir_try[YY] /= div[0];
 593             }
 594         }
 595         for (i = 0; i < x; i++)
 596         {
 597             ir_try[XX] /= div[0];
 598         }
 599     }
 600 }
 601
 602 /*! \brief Determine the optimal distribution of DD cells for the simulation system and number of MPI ranks */
 603 static real optimize_ncells(const gmx::MDLogger &mdlog,
 604                             int nnodes_tot, int npme_only,
 605                             gmx_bool bDynLoadBal, real dlb_scale,
 606                             const gmx_mtop_t *mtop,
 607                             const matrix box, const gmx_ddbox_t *ddbox,
 608                             const t_inputrec *ir,
 609                             real cellsize_limit, real cutoff,
 610                             const bool haveInterDomainBondeds,
 611                             ivec nc)
 612 {
 613     int      npp, npme, d, nmax;
 614     double   pbcdxr;
 615     real     limit;
 616     ivec     itry;
 617
 618     limit  = cellsize_limit;
 619
 620     npp = nnodes_tot - npme_only;
 621     if (EEL_PME(ir->coulombtype))
 622     {
 623         npme = (npme_only > 0 ? npme_only : npp);
 624     }
 625     else
 626     {
 627         npme = 0;
 628     }
 629
 630     if (haveInterDomainBondeds)
 631     {
 632         /* If we can skip PBC for distance calculations in plain-C bondeds,
 633          * we can save some time (e.g. 3D DD with pbc=xyz).
 634          * Here we ignore SIMD bondeds as they always do (fast) PBC.
 635          */
 636         count_bonded_distances(mtop, ir, &pbcdxr, nullptr);
 637         pbcdxr /= static_cast<double>(mtop->natoms);
 638     }
 639     else
 640     {
 641         /* Every molecule is a single charge group: no pbc required */
 642         pbcdxr = 0;
 643     }
 644     /* Add a margin for DLB and/or pressure scaling */
 645     if (bDynLoadBal)
 646     {
 647         if (dlb_scale >= 1.0)
 648         {
 649             gmx_fatal(FARGS, "The value for option -dds should be smaller than 1");
 650         }
 651         GMX_LOG(mdlog.info).appendTextFormatted(
 652                 "Scaling the initial minimum size with 1/%g (option -dds) = %g",
 653                 dlb_scale, 1/dlb_scale);
 654         limit /= dlb_scale;
 655     }
 656     else if (ir->epc != epcNO)
 657     {
 658         GMX_LOG(mdlog.info).appendTextFormatted(
 659                 "To account for pressure scaling, scaling the initial minimum size with %g",
 660                 DD_GRID_MARGIN_PRES_SCALE);
 661         limit *= DD_GRID_MARGIN_PRES_SCALE;
 662     }
 663
 664     GMX_LOG(mdlog.info).appendTextFormatted(
 665             "Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm",
 666             npp, limit);
 667
 668     if (inhomogeneous_z(ir))
 669     {
 670         GMX_LOG(mdlog.info).appendTextFormatted(
 671                 "Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.",
 672                 eewg_names[ir->ewald_geometry]);
 673     }
 674
 675     if (limit > 0)
 676     {
 677         std::string maximumCells = "The maximum allowed number of cells is:";
 678         for (d = 0; d < DIM; d++)
 679         {
 680             nmax = static_cast<int>(ddbox->box_size[d]*ddbox->skew_fac[d]/limit);
 681             if (d >= ddbox->npbcdim && nmax < 2)
 682             {
 683                 nmax = 2;
 684             }
 685             if (d == ZZ && inhomogeneous_z(ir))
 686             {
 687                 nmax = 1;
 688             }
 689             maximumCells += gmx::formatString(" %c %d", 'X' + d, nmax);
 690         }
 691         GMX_LOG(mdlog.info).appendText(maximumCells);
 692     }
 693
 694     if (debug)
 695     {
 696         fprintf(debug, "Average nr of pbc_dx calls per atom %.2f\n", pbcdxr);
 697     }
 698
 699     /* Decompose npp in factors */
 700     std::vector<int> div;
 701     std::vector<int> mdiv;
 702     factorize(npp, &div, &mdiv);
 703
 704     itry[XX] = 1;
 705     itry[YY] = 1;
 706     itry[ZZ] = 1;
 707     clear_ivec(nc);
 708     assign_factors(limit, cutoff, box, ddbox, mtop->natoms, ir, pbcdxr,
 709                    npme, div.size(), div.data(), mdiv.data(), itry, nc);
 710
 711     return limit;
 712 }
 713
 714 DDSetup
 715 dd_choose_grid(const gmx::MDLogger &mdlog,
 716                const t_commrec *cr,
 717                const t_inputrec *ir,
 718                const gmx_mtop_t *mtop,
 719                const matrix box, const gmx_ddbox_t *ddbox,
 720                const int numPmeRanksRequested,
 721                const gmx_bool bDynLoadBal, const real dlb_scale,
 722                const real cellsize_limit, const real cutoff_dd,
 723                const bool haveInterDomainBondeds)
 724 {
 725     DDSetup ddSetup;
 726
 727     if (MASTER(cr))
 728     {
 729         int64_t nnodes_div = cr->nnodes;
 730         if (EEL_PME(ir->coulombtype))
 731         {
 732             if (numPmeRanksRequested > 0)
 733             {
 734                 if (numPmeRanksRequested >= cr->nnodes)
 735                 {
 736                     gmx_fatal(FARGS,
 737                               "Cannot have %d separate PME ranks with just %d total ranks",
 738                               numPmeRanksRequested, cr->nnodes);
 739                 }
 740
 741                 /* If the user purposely selected the number of PME nodes,
 742                  * only check for large primes in the PP node count.
 743                  */
 744                 nnodes_div -= numPmeRanksRequested;
 745             }
 746         }
 747         else
 748         {
 749             ddSetup.numPmeRanks = 0;
 750         }
 751
 752         if (nnodes_div > 12)
 753         {
 754             const int64_t ldiv = largest_divisor(nnodes_div);
 755             /* Check if the largest divisor is more than nnodes^2/3 */
 756             if (ldiv*ldiv*ldiv > nnodes_div*nnodes_div)
 757             {
 758                 gmx_fatal(FARGS, "The number of ranks you selected (%" PRId64 ") contains a large prime factor %" PRId64 ". In most cases this will lead to bad performance. Choose a number with smaller prime factors or set the decomposition (option -dd) manually.",
 759                           nnodes_div, ldiv);
 760             }
 761         }
 762
 763         if (EEL_PME(ir->coulombtype))
 764         {
 765             if (numPmeRanksRequested < 0)
 766             {
 767                 /* Use PME nodes when the number of nodes is more than 16 */
 768                 if (cr->nnodes <= 18)
 769                 {
 770                     ddSetup.numPmeRanks = 0;
 771                     GMX_LOG(mdlog.info).appendTextFormatted(
 772                             "Using %d separate PME ranks, as there are too few total\n"
 773                             " ranks for efficient splitting",
 774                             cr->npmenodes);
 775                 }
 776                 else
 777                 {
 778                     ddSetup.numPmeRanks = guess_npme(mdlog, mtop, ir, box, cr->nnodes);
 779                     GMX_LOG(mdlog.info).appendTextFormatted(
 780                             "Using %d separate PME ranks, as guessed by mdrun", ddSetup.numPmeRanks);
 781                 }
 782             }
 783             else
 784             {
 785                 /* We checked above that nPmeRanks is a valid number */
 786                 ddSetup.numPmeRanks = numPmeRanksRequested;
 787                 GMX_LOG(mdlog.info).appendTextFormatted(
 788                         "Using %d separate PME ranks", ddSetup.numPmeRanks);
 789                 // TODO: there was a ", per user request" note here, but it's not correct anymore,
 790                 // as with GPUs decision about nPmeRanks can be made in runner() as well.
 791                 // Consider a single spot for setting nPmeRanks.
 792             }
 793         }
 794
 795         ddSetup.cellsizeLimit =
 796             optimize_ncells(mdlog, cr->nnodes, ddSetup.numPmeRanks,
 797                             bDynLoadBal, dlb_scale,
 798                             mtop, box, ddbox, ir,
 799                             cellsize_limit, cutoff_dd,
 800                             haveInterDomainBondeds,
 801                             ddSetup.numDomains);
 802     }
 803
 804     /* Communicate the information set by the master to all nodes */
 805     gmx_bcast(sizeof(ddSetup.numDomains), ddSetup.numDomains, cr);
 806     if (EEL_PME(ir->coulombtype))
 807     {
 808         gmx_bcast(sizeof(ddSetup.numPmeRanks), &ddSetup.numPmeRanks, cr);
 809     }
 810     else
 811     {
 812         ddSetup.numPmeRanks = 0;
 813     }
 814
 815     return ddSetup;
 816 }