src/gromacs/domdec/domdec_setup.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief This file defines functions used by the domdec module
  39  * in its initial setup phase.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_domdec
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include <assert.h>
  48 #include <stdio.h>
  49
  50 #include <cmath>
  51
  52 #include "gromacs/domdec/domdec.h"
  53 #include "gromacs/domdec/domdec_struct.h"
  54 #include "gromacs/gmxlib/network.h"
  55 #include "gromacs/math/utilities.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/perf_est.h"
  58 #include "gromacs/mdtypes/commrec.h"
  59 #include "gromacs/mdtypes/inputrec.h"
  60 #include "gromacs/mdtypes/md_enums.h"
  61 #include "gromacs/pbcutil/pbc.h"
  62 #include "gromacs/utility/fatalerror.h"
  63 #include "gromacs/utility/smalloc.h"
  64
  65 /*! \brief Margin for setting up the DD grid */
  66 #define DD_GRID_MARGIN_PRES_SCALE 1.05
  67
  68 /*! \brief Factorize \p n.
  69  *
  70  * \param[in]    n     Value to factorize
  71  * \param[out]   fac   Pointer to array of factors (to be allocated in this function)
  72  * \param[out]   mfac  Pointer to array of the number of times each factor repeats in the factorization (to be allocated in this function)
  73  * \return             The number of unique factors
  74  */
  75 static int factorize(int n, int **fac, int **mfac)
  76 {
  77     int d, ndiv;
  78
  79     if (n <= 0)
  80     {
  81         gmx_fatal(FARGS, "Can only factorize positive integers.");
  82     }
  83
  84     /* Decompose n in factors */
  85     snew(*fac, n/2);
  86     snew(*mfac, n/2);
  87     d    = 2;
  88     ndiv = 0;
  89     while (n > 1)
  90     {
  91         while (n % d == 0)
  92         {
  93             if (ndiv == 0 || (*fac)[ndiv-1] != d)
  94             {
  95                 ndiv++;
  96                 (*fac)[ndiv-1] = d;
  97             }
  98             (*mfac)[ndiv-1]++;
  99             n /= d;
 100         }
 101         d++;
 102     }
 103
 104     return ndiv;
 105 }
 106
 107 /*! \brief Find largest divisor of \p n smaller than \p n*/
 108 static gmx_bool largest_divisor(int n)
 109 {
 110     int ndiv, *div, *mdiv, ldiv;
 111
 112     ndiv = factorize(n, &div, &mdiv);
 113     ldiv = div[ndiv-1];
 114     sfree(div);
 115     sfree(mdiv);
 116
 117     return ldiv;
 118 }
 119
 120 /*! \brief Compute largest common divisor of \p n1 and \b n2 */
 121 static int lcd(int n1, int n2)
 122 {
 123     int d, i;
 124
 125     d = 1;
 126     for (i = 2; (i <= n1 && i <= n2); i++)
 127     {
 128         if (n1 % i == 0 && n2 % i == 0)
 129         {
 130             d = i;
 131         }
 132     }
 133
 134     return d;
 135 }
 136
 137 /*! \brief Returns TRUE when there are enough PME ranks for the ratio */
 138 static gmx_bool fits_pme_ratio(int nrank_tot, int nrank_pme, float ratio)
 139 {
 140     return ((double)nrank_pme/(double)nrank_tot > 0.95*ratio);
 141 }
 142
 143 /*! \brief Returns TRUE when npme out of ntot ranks doing PME is expected to give reasonable performance */
 144 static gmx_bool fits_pp_pme_perf(int ntot, int npme, float ratio)
 145 {
 146     int ndiv, *div, *mdiv, ldiv;
 147     int npp_root3, npme_root2;
 148
 149     ndiv = factorize(ntot - npme, &div, &mdiv);
 150     ldiv = div[ndiv-1];
 151     sfree(div);
 152     sfree(mdiv);
 153
 154     npp_root3  = static_cast<int>(std::cbrt(ntot - npme) + 0.5);
 155     npme_root2 = static_cast<int>(std::sqrt(static_cast<double>(npme)) + 0.5);
 156
 157     /* The check below gives a reasonable division:
 158      * factor 5 allowed at 5 or more PP ranks,
 159      * factor 7 allowed at 49 or more PP ranks.
 160      */
 161     if (ldiv > 3 + npp_root3)
 162     {
 163         return FALSE;
 164     }
 165
 166     /* Check if the number of PP and PME ranks have a reasonable sized
 167      * denominator in common, such that we can use 2D PME decomposition
 168      * when required (which requires nx_pp == nx_pme).
 169      * The factor of 2 allows for a maximum ratio of 2^2=4
 170      * between nx_pme and ny_pme.
 171      */
 172     if (lcd(ntot - npme, npme)*2 < npme_root2)
 173     {
 174         return FALSE;
 175     }
 176
 177     /* Does this division gives a reasonable PME load? */
 178     return fits_pme_ratio(ntot, npme, ratio);
 179 }
 180
 181 /*! \brief Make a guess for the number of PME ranks to use. */
 182 static int guess_npme(FILE *fplog, const gmx_mtop_t *mtop, const t_inputrec *ir,
 183                       matrix box,
 184                       int nrank_tot)
 185 {
 186     float      ratio;
 187     int        npme;
 188
 189     ratio = pme_load_estimate(mtop, ir, box);
 190
 191     if (fplog)
 192     {
 193         fprintf(fplog, "Guess for relative PME load: %.2f\n", ratio);
 194     }
 195
 196     /* We assume the optimal rank ratio is close to the load ratio.
 197      * The communication load is neglected,
 198      * but (hopefully) this will balance out between PP and PME.
 199      */
 200
 201     if (!fits_pme_ratio(nrank_tot, nrank_tot/2, ratio))
 202     {
 203         /* We would need more than nrank_tot/2 PME only nodes,
 204          * which is not possible. Since the PME load is very high,
 205          * we will not loose much performance when all ranks do PME.
 206          */
 207
 208         return 0;
 209     }
 210
 211     /* First try to find npme as a factor of nrank_tot up to nrank_tot/3.
 212      * We start with a minimum PME node fraction of 1/16
 213      * and avoid ratios which lead to large prime factors in nnodes-npme.
 214      */
 215     npme = (nrank_tot + 15)/16;
 216     while (npme <= nrank_tot/3)
 217     {
 218         if (nrank_tot % npme == 0)
 219         {
 220             /* Note that fits_perf might change the PME grid,
 221              * in the current implementation it does not.
 222              */
 223             if (fits_pp_pme_perf(nrank_tot, npme, ratio))
 224             {
 225                 break;
 226             }
 227         }
 228         npme++;
 229     }
 230     if (npme > nrank_tot/3)
 231     {
 232         /* Try any possible number for npme */
 233         npme = 1;
 234         while (npme <= nrank_tot/2)
 235         {
 236             /* Note that fits_perf may change the PME grid */
 237             if (fits_pp_pme_perf(nrank_tot, npme, ratio))
 238             {
 239                 break;
 240             }
 241             npme++;
 242         }
 243     }
 244     if (npme > nrank_tot/2)
 245     {
 246         gmx_fatal(FARGS, "Could not find an appropriate number of separate PME ranks. i.e. >= %5f*#ranks (%d) and <= #ranks/2 (%d) and reasonable performance wise (grid_x=%d, grid_y=%d).\n"
 247                   "Use the -npme option of mdrun or change the number of ranks or the PME grid dimensions, see the manual for details.",
 248                   ratio, (int)(0.95*ratio*nrank_tot + 0.5), nrank_tot/2, ir->nkx, ir->nky);
 249         /* Keep the compiler happy */
 250         npme = 0;
 251     }
 252     else
 253     {
 254         if (fplog)
 255         {
 256             fprintf(fplog,
 257                     "Will use %d particle-particle and %d PME only ranks\n"
 258                     "This is a guess, check the performance at the end of the log file\n",
 259                     nrank_tot - npme, npme);
 260         }
 261         fprintf(stderr, "\n"
 262                 "Will use %d particle-particle and %d PME only ranks\n"
 263                 "This is a guess, check the performance at the end of the log file\n",
 264                 nrank_tot - npme, npme);
 265     }
 266
 267     return npme;
 268 }
 269
 270 /*! \brief Return \p n divided by \p f rounded up to the next integer. */
 271 static int div_up(int n, int f)
 272 {
 273     return (n + f - 1)/f;
 274 }
 275
 276 real comm_box_frac(const ivec dd_nc, real cutoff, const gmx_ddbox_t *ddbox)
 277 {
 278     int  i, j, k;
 279     rvec nw;
 280     real comm_vol;
 281
 282     for (i = 0; i < DIM; i++)
 283     {
 284         real bt = ddbox->box_size[i]*ddbox->skew_fac[i];
 285         nw[i] = dd_nc[i]*cutoff/bt;
 286     }
 287
 288     comm_vol = 0;
 289     for (i = 0; i < DIM; i++)
 290     {
 291         if (dd_nc[i] > 1)
 292         {
 293             comm_vol += nw[i];
 294             for (j = i+1; j < DIM; j++)
 295             {
 296                 if (dd_nc[j] > 1)
 297                 {
 298                     comm_vol += nw[i]*nw[j]*M_PI/4;
 299                     for (k = j+1; k < DIM; k++)
 300                     {
 301                         if (dd_nc[k] > 1)
 302                         {
 303                             comm_vol += nw[i]*nw[j]*nw[k]*M_PI/6;
 304                         }
 305                     }
 306                 }
 307             }
 308         }
 309     }
 310
 311     return comm_vol;
 312 }
 313
 314 /*! \brief Return whether the DD inhomogeneous in the z direction */
 315 static gmx_bool inhomogeneous_z(const t_inputrec *ir)
 316 {
 317     return ((EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) &&
 318             ir->ePBC == epbcXYZ && ir->ewald_geometry == eewg3DC);
 319 }
 320
 321 /*! \brief Estimate cost of PME FFT communication
 322  *
 323  * This only takes the communication into account and not imbalance
 324  * in the calculation. But the imbalance in communication and calculation
 325  * are similar and therefore these formulas also prefer load balance
 326  * in the FFT and pme_solve calculation.
 327  */
 328 static float comm_pme_cost_vol(int npme, int a, int b, int c)
 329 {
 330     /* We use a float here, since an integer might overflow */
 331     float comm_vol;
 332
 333     comm_vol  = npme - 1;
 334     comm_vol *= npme;
 335     comm_vol *= div_up(a, npme);
 336     comm_vol *= div_up(b, npme);
 337     comm_vol *= c;
 338
 339     return comm_vol;
 340 }
 341
 342 /*! \brief Estimate cost of communication for a possible domain decomposition. */
 343 static float comm_cost_est(real limit, real cutoff,
 344                            matrix box, const gmx_ddbox_t *ddbox,
 345                            int natoms, const t_inputrec *ir,
 346                            float pbcdxr,
 347                            int npme_tot, ivec nc)
 348 {
 349     ivec  npme = {1, 1, 1};
 350     int   i, j, nk, overlap;
 351     rvec  bt;
 352     float comm_vol, comm_vol_xf, comm_pme, cost_pbcdx;
 353     /* This is the cost of a pbc_dx call relative to the cost
 354      * of communicating the coordinate and force of an atom.
 355      * This will be machine dependent.
 356      * These factors are for x86 with SMP or Infiniband.
 357      */
 358     float pbcdx_rect_fac = 0.1;
 359     float pbcdx_tric_fac = 0.2;
 360     float temp;
 361
 362     /* Check the DD algorithm restrictions */
 363     if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
 364         (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))
 365     {
 366         return -1;
 367     }
 368
 369     if (inhomogeneous_z(ir) && nc[ZZ] > 1)
 370     {
 371         return -1;
 372     }
 373
 374     assert(ddbox->npbcdim <= DIM);
 375
 376     /* Check if the triclinic requirements are met */
 377     for (i = 0; i < DIM; i++)
 378     {
 379         for (j = i+1; j < ddbox->npbcdim; j++)
 380         {
 381             if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
 382                 (ir->epc != epcNO && ir->compress[j][i] != 0))
 383             {
 384                 if (nc[j] > 1 && nc[i] == 1)
 385                 {
 386                     return -1;
 387                 }
 388             }
 389         }
 390     }
 391
 392     for (i = 0; i < DIM; i++)
 393     {
 394         bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];
 395
 396         /* Without PBC and with 2 cells, there are no lower limits on the cell size */
 397         if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
 398         {
 399             return -1;
 400         }
 401         /* With PBC, check if the cut-off fits in nc[i]-1 cells */
 402         if (i < ddbox->npbcdim && nc[i] > 1 && (nc[i] - 1)*bt[i] < nc[i]*cutoff)
 403         {
 404             return -1;
 405         }
 406     }
 407
 408     if (npme_tot > 1)
 409     {
 410         /* The following choices should match those
 411          * in init_domain_decomposition in domdec.c.
 412          */
 413         if (nc[XX] == 1 && nc[YY] > 1)
 414         {
 415             npme[XX] = 1;
 416             npme[YY] = npme_tot;
 417         }
 418         else if (nc[YY] == 1)
 419         {
 420             npme[XX] = npme_tot;
 421             npme[YY] = 1;
 422         }
 423         else
 424         {
 425             /* Will we use 1D or 2D PME decomposition? */
 426             npme[XX] = (npme_tot % nc[XX] == 0) ? nc[XX] : npme_tot;
 427             npme[YY] = npme_tot/npme[XX];
 428         }
 429     }
 430
 431     /* When two dimensions are (nearly) equal, use more cells
 432      * for the smallest index, so the decomposition does not
 433      * depend sensitively on the rounding of the box elements.
 434      */
 435     for (i = 0; i < DIM; i++)
 436     {
 437         for (j = i+1; j < DIM; j++)
 438         {
 439             /* Check if the box size is nearly identical,
 440              * in that case we prefer nx > ny  and ny > nz.
 441              */
 442             if (fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
 443             {
 444                 /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
 445                  * this means the swapped nc has nc[XX]==npme[XX],
 446                  * and we can also swap X and Y for PME.
 447                  */
 448                 /* Check if dimension i and j are equivalent for PME.
 449                  * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
 450                  * For y/z: we can not have PME decomposition in z
 451                  */
 452                 if (npme_tot <= 1 ||
 453                     !((i == XX && j == YY && nc[YY] != npme[YY]) ||
 454                       (i == YY && j == ZZ && npme[YY] > 1)))
 455                 {
 456                     return -1;
 457                 }
 458             }
 459         }
 460     }
 461
 462     /* This function determines only half of the communication cost.
 463      * All PP, PME and PP-PME communication is symmetric
 464      * and the "back"-communication cost is identical to the forward cost.
 465      */
 466
 467     comm_vol = comm_box_frac(nc, cutoff, ddbox);
 468
 469     comm_pme = 0;
 470     for (i = 0; i < 2; i++)
 471     {
 472         /* Determine the largest volume for PME x/f redistribution */
 473         if (nc[i] % npme[i] != 0)
 474         {
 475             if (nc[i] > npme[i])
 476             {
 477                 comm_vol_xf = (npme[i] == 2 ? 1.0/3.0 : 0.5);
 478             }
 479             else
 480             {
 481                 comm_vol_xf = 1.0 - lcd(nc[i], npme[i])/(double)npme[i];
 482             }
 483             comm_pme += 3*natoms*comm_vol_xf;
 484         }
 485
 486         /* Grid overlap communication */
 487         if (npme[i] > 1)
 488         {
 489             nk        = (i == 0 ? ir->nkx : ir->nky);
 490             overlap   = (nk % npme[i] == 0 ? ir->pme_order-1 : ir->pme_order);
 491             temp      = npme[i];
 492             temp     *= overlap;
 493             temp     *= ir->nkx;
 494             temp     *= ir->nky;
 495             temp     *= ir->nkz;
 496             temp     /= nk;
 497             comm_pme += temp;
 498 /* Old line comm_pme += npme[i]*overlap*ir->nkx*ir->nky*ir->nkz/nk; */
 499         }
 500     }
 501
 502     comm_pme += comm_pme_cost_vol(npme[YY], ir->nky, ir->nkz, ir->nkx);
 503     comm_pme += comm_pme_cost_vol(npme[XX], ir->nkx, ir->nky, ir->nkz);
 504
 505     /* Add cost of pbc_dx for bondeds */
 506     cost_pbcdx = 0;
 507     if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
 508     {
 509         if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
 510             (ddbox->tric_dir[YY] && nc[YY] == 1))
 511         {
 512             cost_pbcdx = pbcdxr*pbcdx_tric_fac;
 513         }
 514         else
 515         {
 516             cost_pbcdx = pbcdxr*pbcdx_rect_fac;
 517         }
 518     }
 519
 520     if (debug)
 521     {
 522         fprintf(debug,
 523                 "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
 524                 nc[XX], nc[YY], nc[ZZ], npme[XX], npme[YY],
 525                 comm_vol, cost_pbcdx, comm_pme/(3*natoms),
 526                 comm_vol + cost_pbcdx + comm_pme/(3*natoms));
 527     }
 528
 529     return 3*natoms*(comm_vol + cost_pbcdx) + comm_pme;
 530 }
 531
 532 /*! \brief Assign penalty factors to possible domain decompositions, based on the estimated communication costs. */
 533 static void assign_factors(const gmx_domdec_t *dd,
 534                            real limit, real cutoff,
 535                            matrix box, const gmx_ddbox_t *ddbox,
 536                            int natoms, const t_inputrec *ir,
 537                            float pbcdxr, int npme,
 538                            int ndiv, int *div, int *mdiv, ivec ir_try, ivec opt)
 539 {
 540     int   x, y, i;
 541     float ce;
 542
 543     if (ndiv == 0)
 544     {
 545         ce = comm_cost_est(limit, cutoff, box, ddbox,
 546                            natoms, ir, pbcdxr, npme, ir_try);
 547         if (ce >= 0 && (opt[XX] == 0 ||
 548                         ce < comm_cost_est(limit, cutoff, box, ddbox,
 549                                            natoms, ir, pbcdxr,
 550                                            npme, opt)))
 551         {
 552             copy_ivec(ir_try, opt);
 553         }
 554
 555         return;
 556     }
 557
 558     for (x = mdiv[0]; x >= 0; x--)
 559     {
 560         for (i = 0; i < x; i++)
 561         {
 562             ir_try[XX] *= div[0];
 563         }
 564         for (y = mdiv[0]-x; y >= 0; y--)
 565         {
 566             for (i = 0; i < y; i++)
 567             {
 568                 ir_try[YY] *= div[0];
 569             }
 570             for (i = 0; i < mdiv[0]-x-y; i++)
 571             {
 572                 ir_try[ZZ] *= div[0];
 573             }
 574
 575             /* recurse */
 576             assign_factors(dd, limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme,
 577                            ndiv-1, div+1, mdiv+1, ir_try, opt);
 578
 579             for (i = 0; i < mdiv[0]-x-y; i++)
 580             {
 581                 ir_try[ZZ] /= div[0];
 582             }
 583             for (i = 0; i < y; i++)
 584             {
 585                 ir_try[YY] /= div[0];
 586             }
 587         }
 588         for (i = 0; i < x; i++)
 589         {
 590             ir_try[XX] /= div[0];
 591         }
 592     }
 593 }
 594
 595 /*! \brief Determine the optimal distribution of DD cells for the simulation system and number of MPI ranks */
 596 static real optimize_ncells(FILE *fplog,
 597                             int nnodes_tot, int npme_only,
 598                             gmx_bool bDynLoadBal, real dlb_scale,
 599                             const gmx_mtop_t *mtop,
 600                             matrix box, const gmx_ddbox_t *ddbox,
 601                             const t_inputrec *ir,
 602                             gmx_domdec_t *dd,
 603                             real cellsize_limit, real cutoff,
 604                             gmx_bool bInterCGBondeds,
 605                             ivec nc)
 606 {
 607     int      npp, npme, ndiv, *div, *mdiv, d, nmax;
 608     double   pbcdxr;
 609     real     limit;
 610     ivec     itry;
 611
 612     limit  = cellsize_limit;
 613
 614     dd->nc[XX] = 1;
 615     dd->nc[YY] = 1;
 616     dd->nc[ZZ] = 1;
 617
 618     npp = nnodes_tot - npme_only;
 619     if (EEL_PME(ir->coulombtype))
 620     {
 621         npme = (npme_only > 0 ? npme_only : npp);
 622     }
 623     else
 624     {
 625         npme = 0;
 626     }
 627
 628     if (bInterCGBondeds)
 629     {
 630         /* If we can skip PBC for distance calculations in plain-C bondeds,
 631          * we can save some time (e.g. 3D DD with pbc=xyz).
 632          * Here we ignore SIMD bondeds as they always do (fast) PBC.
 633          */
 634         count_bonded_distances(mtop, ir, &pbcdxr, NULL);
 635         pbcdxr /= (double)mtop->natoms;
 636     }
 637     else
 638     {
 639         /* Every molecule is a single charge group: no pbc required */
 640         pbcdxr = 0;
 641     }
 642     /* Add a margin for DLB and/or pressure scaling */
 643     if (bDynLoadBal)
 644     {
 645         if (dlb_scale >= 1.0)
 646         {
 647             gmx_fatal(FARGS, "The value for option -dds should be smaller than 1");
 648         }
 649         if (fplog)
 650         {
 651             fprintf(fplog, "Scaling the initial minimum size with 1/%g (option -dds) = %g\n", dlb_scale, 1/dlb_scale);
 652         }
 653         limit /= dlb_scale;
 654     }
 655     else if (ir->epc != epcNO)
 656     {
 657         if (fplog)
 658         {
 659             fprintf(fplog, "To account for pressure scaling, scaling the initial minimum size with %g\n", DD_GRID_MARGIN_PRES_SCALE);
 660             limit *= DD_GRID_MARGIN_PRES_SCALE;
 661         }
 662     }
 663
 664     if (fplog)
 665     {
 666         fprintf(fplog, "Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm\n", npp, limit);
 667
 668         if (inhomogeneous_z(ir))
 669         {
 670             fprintf(fplog, "Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.\n", eewg_names[ir->ewald_geometry]);
 671         }
 672
 673         if (limit > 0)
 674         {
 675             fprintf(fplog, "The maximum allowed number of cells is:");
 676             for (d = 0; d < DIM; d++)
 677             {
 678                 nmax = (int)(ddbox->box_size[d]*ddbox->skew_fac[d]/limit);
 679                 if (d >= ddbox->npbcdim && nmax < 2)
 680                 {
 681                     nmax = 2;
 682                 }
 683                 if (d == ZZ && inhomogeneous_z(ir))
 684                 {
 685                     nmax = 1;
 686                 }
 687                 fprintf(fplog, " %c %d", 'X' + d, nmax);
 688             }
 689             fprintf(fplog, "\n");
 690         }
 691     }
 692
 693     if (debug)
 694     {
 695         fprintf(debug, "Average nr of pbc_dx calls per atom %.2f\n", pbcdxr);
 696     }
 697
 698     /* Decompose npp in factors */
 699     ndiv = factorize(npp, &div, &mdiv);
 700
 701     itry[XX] = 1;
 702     itry[YY] = 1;
 703     itry[ZZ] = 1;
 704     clear_ivec(nc);
 705     assign_factors(dd, limit, cutoff, box, ddbox, mtop->natoms, ir, pbcdxr,
 706                    npme, ndiv, div, mdiv, itry, nc);
 707
 708     sfree(div);
 709     sfree(mdiv);
 710
 711     return limit;
 712 }
 713
 714 real dd_choose_grid(FILE *fplog,
 715                     t_commrec *cr, gmx_domdec_t *dd,
 716                     const t_inputrec *ir,
 717                     const gmx_mtop_t *mtop,
 718                     matrix box, const gmx_ddbox_t *ddbox,
 719                     int nPmeRanks,
 720                     gmx_bool bDynLoadBal, real dlb_scale,
 721                     real cellsize_limit, real cutoff_dd,
 722                     gmx_bool bInterCGBondeds)
 723 {
 724     gmx_int64_t     nnodes_div, ldiv;
 725     real            limit;
 726
 727     if (MASTER(cr))
 728     {
 729         nnodes_div = cr->nnodes;
 730         if (EEL_PME(ir->coulombtype))
 731         {
 732             if (nPmeRanks > 0)
 733             {
 734                 if (nPmeRanks >= cr->nnodes)
 735                 {
 736                     gmx_fatal(FARGS,
 737                               "Cannot have %d separate PME ranks with just %d total ranks",
 738                               nPmeRanks, cr->nnodes);
 739                 }
 740
 741                 /* If the user purposely selected the number of PME nodes,
 742                  * only check for large primes in the PP node count.
 743                  */
 744                 nnodes_div -= nPmeRanks;
 745             }
 746         }
 747         else
 748         {
 749             cr->npmenodes = 0;
 750         }
 751
 752         if (nnodes_div > 12)
 753         {
 754             ldiv = largest_divisor(nnodes_div);
 755             /* Check if the largest divisor is more than nnodes^2/3 */
 756             if (ldiv*ldiv*ldiv > nnodes_div*nnodes_div)
 757             {
 758                 gmx_fatal(FARGS, "The number of ranks you selected (%d) contains a large prime factor %d. In most cases this will lead to bad performance. Choose a number with smaller prime factors or set the decomposition (option -dd) manually.",
 759                           nnodes_div, ldiv);
 760             }
 761         }
 762
 763         if (EEL_PME(ir->coulombtype))
 764         {
 765             if (nPmeRanks < 0)
 766             {
 767                 /* Use PME nodes when the number of nodes is more than 16 */
 768                 if (cr->nnodes <= 18)
 769                 {
 770                     cr->npmenodes = 0;
 771                     if (fplog)
 772                     {
 773                         fprintf(fplog, "Using %d separate PME ranks, as there are too few total\n ranks for efficient splitting\n", cr->npmenodes);
 774                     }
 775                 }
 776                 else
 777                 {
 778                     cr->npmenodes = guess_npme(fplog, mtop, ir, box, cr->nnodes);
 779                     if (fplog)
 780                     {
 781                         fprintf(fplog, "Using %d separate PME ranks, as guessed by mdrun\n", cr->npmenodes);
 782                     }
 783                 }
 784             }
 785             else
 786             {
 787                 /* We checked above that nPmeRanks is a valid number */
 788                 cr->npmenodes = nPmeRanks;
 789                 if (fplog)
 790                 {
 791                     fprintf(fplog, "Using %d separate PME ranks, per user request\n", cr->npmenodes);
 792                 }
 793             }
 794         }
 795
 796         limit = optimize_ncells(fplog, cr->nnodes, cr->npmenodes,
 797                                 bDynLoadBal, dlb_scale,
 798                                 mtop, box, ddbox, ir, dd,
 799                                 cellsize_limit, cutoff_dd,
 800                                 bInterCGBondeds,
 801                                 dd->nc);
 802     }
 803     else
 804     {
 805         limit = 0;
 806     }
 807     /* Communicate the information set by the master to all nodes */
 808     gmx_bcast(sizeof(dd->nc), dd->nc, cr);
 809     if (EEL_PME(ir->coulombtype))
 810     {
 811         gmx_bcast(sizeof(cr->npmenodes), &cr->npmenodes, cr);
 812     }
 813     else
 814     {
 815         cr->npmenodes = 0;
 816     }
 817
 818     return limit;
 819 }