src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51
  52 #include "gromacs/compat/make_unique.h"
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/partition.h"
  59 #include "gromacs/gmxlib/network.h"
  60 #include "gromacs/gmxlib/nrnb.h"
  61 #include "gromacs/gpu_utils/gpu_utils.h"
  62 #include "gromacs/hardware/hw_info.h"
  63 #include "gromacs/listed-forces/manage-threading.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdlib/calc_verletbuf.h"
  67 #include "gromacs/mdlib/constr.h"
  68 #include "gromacs/mdlib/constraintrange.h"
  69 #include "gromacs/mdlib/mdrun.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/inputrec.h"
  74 #include "gromacs/mdtypes/state.h"
  75 #include "gromacs/pbcutil/ishift.h"
  76 #include "gromacs/pbcutil/pbc.h"
  77 #include "gromacs/pulling/pull.h"
  78 #include "gromacs/timing/wallcycle.h"
  79 #include "gromacs/topology/block.h"
  80 #include "gromacs/topology/idef.h"
  81 #include "gromacs/topology/ifunc.h"
  82 #include "gromacs/topology/mtop_lookup.h"
  83 #include "gromacs/topology/mtop_util.h"
  84 #include "gromacs/topology/topology.h"
  85 #include "gromacs/utility/basedefinitions.h"
  86 #include "gromacs/utility/basenetwork.h"
  87 #include "gromacs/utility/cstringutil.h"
  88 #include "gromacs/utility/exceptions.h"
  89 #include "gromacs/utility/fatalerror.h"
  90 #include "gromacs/utility/gmxmpi.h"
  91 #include "gromacs/utility/logger.h"
  92 #include "gromacs/utility/real.h"
  93 #include "gromacs/utility/smalloc.h"
  94 #include "gromacs/utility/strconvert.h"
  95 #include "gromacs/utility/stringstream.h"
  96 #include "gromacs/utility/stringutil.h"
  97 #include "gromacs/utility/textwriter.h"
  98
  99 #include "atomdistribution.h"
 100 #include "box.h"
 101 #include "cellsizes.h"
 102 #include "distribute.h"
 103 #include "domdec_constraints.h"
 104 #include "domdec_internal.h"
 105 #include "domdec_vsite.h"
 106 #include "redistribute.h"
 107 #include "utility.h"
 108
 109 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 110
 111 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 112 #define DD_CGIBS 2
 113
 114 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 115 #define DD_FLAG_NRCG  65535
 116 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 117 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 118
 119 /* The DD zone order */
 120 static const ivec dd_zo[DD_MAXZONE] =
 121 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 122
 123 /* The non-bonded zone-pair setup for domain decomposition
 124  * The first number is the i-zone, the second number the first j-zone seen by
 125  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 126  * As is, this is for 3D decomposition, where there are 4 i-zones.
 127  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 128  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 129  */
 130 static const int
 131     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 132                                                  {1, 3, 6},
 133                                                  {2, 5, 6},
 134                                                  {3, 5, 7}};
 135
 136
 137 /*
 138    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 139
 140    static void index2xyz(ivec nc,int ind,ivec xyz)
 141    {
 142    xyz[XX] = ind % nc[XX];
 143    xyz[YY] = (ind / nc[XX]) % nc[YY];
 144    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 145    }
 146  */
 147
 148 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 149 {
 150     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 151     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 152     xyz[ZZ] = ind % nc[ZZ];
 153 }
 154
 155 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 156 {
 157     int ddindex;
 158     int ddnodeid = -1;
 159
 160     ddindex = dd_index(dd->nc, c);
 161     if (dd->comm->bCartesianPP_PME)
 162     {
 163         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 164     }
 165     else if (dd->comm->bCartesianPP)
 166     {
 167 #if GMX_MPI
 168         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 169 #endif
 170     }
 171     else
 172     {
 173         ddnodeid = ddindex;
 174     }
 175
 176     return ddnodeid;
 177 }
 178
 179 int ddglatnr(const gmx_domdec_t *dd, int i)
 180 {
 181     int atnr;
 182
 183     if (dd == nullptr)
 184     {
 185         atnr = i + 1;
 186     }
 187     else
 188     {
 189         if (i >= dd->comm->atomRanges.numAtomsTotal())
 190         {
 191             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 192         }
 193         atnr = dd->globalAtomIndices[i] + 1;
 194     }
 195
 196     return atnr;
 197 }
 198
 199 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 200 {
 201     return &dd->comm->cgs_gl;
 202 }
 203
 204 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 205 {
 206     int i;
 207
 208     if (state->ddp_count != dd->ddp_count)
 209     {
 210         gmx_incons("The MD state does not match the domain decomposition state");
 211     }
 212
 213     state->cg_gl.resize(dd->ncg_home);
 214     for (i = 0; i < dd->ncg_home; i++)
 215     {
 216         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 217     }
 218
 219     state->ddp_count_cg_gl = dd->ddp_count;
 220 }
 221
 222 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 223 {
 224     return &dd->comm->zones;
 225 }
 226
 227 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 228                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 229 {
 230     gmx_domdec_zones_t *zones;
 231     int                 izone, d, dim;
 232
 233     zones = &dd->comm->zones;
 234
 235     izone = 0;
 236     while (icg >= zones->izone[izone].cg1)
 237     {
 238         izone++;
 239     }
 240
 241     if (izone == 0)
 242     {
 243         *jcg0 = icg;
 244     }
 245     else if (izone < zones->nizone)
 246     {
 247         *jcg0 = zones->izone[izone].jcg0;
 248     }
 249     else
 250     {
 251         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 252                   icg, izone, zones->nizone);
 253     }
 254
 255     *jcg1 = zones->izone[izone].jcg1;
 256
 257     for (d = 0; d < dd->ndim; d++)
 258     {
 259         dim         = dd->dim[d];
 260         shift0[dim] = zones->izone[izone].shift0[dim];
 261         shift1[dim] = zones->izone[izone].shift1[dim];
 262         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 263         {
 264             /* A conservative approach, this can be optimized */
 265             shift0[dim] -= 1;
 266             shift1[dim] += 1;
 267         }
 268     }
 269 }
 270
 271 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 272 {
 273     return dd.comm->atomRanges.numHomeAtoms();
 274 }
 275
 276 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 277 {
 278     /* We currently set mdatoms entries for all atoms:
 279      * local + non-local + communicated for vsite + constraints
 280      */
 281
 282     return dd->comm->atomRanges.numAtomsTotal();
 283 }
 284
 285 int dd_natoms_vsite(const gmx_domdec_t *dd)
 286 {
 287     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 288 }
 289
 290 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 291 {
 292     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 293     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 294 }
 295
 296 void dd_move_x(gmx_domdec_t             *dd,
 297                matrix                    box,
 298                gmx::ArrayRef<gmx::RVec>  x,
 299                gmx_wallcycle            *wcycle)
 300 {
 301     wallcycle_start(wcycle, ewcMOVEX);
 302
 303     int                    nzone, nat_tot;
 304     gmx_domdec_comm_t     *comm;
 305     gmx_domdec_comm_dim_t *cd;
 306     rvec                   shift = {0, 0, 0};
 307     gmx_bool               bPBC, bScrew;
 308
 309     comm = dd->comm;
 310
 311     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 312
 313     nzone   = 1;
 314     nat_tot = comm->atomRanges.numHomeAtoms();
 315     for (int d = 0; d < dd->ndim; d++)
 316     {
 317         bPBC   = (dd->ci[dd->dim[d]] == 0);
 318         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 319         if (bPBC)
 320         {
 321             copy_rvec(box[dd->dim[d]], shift);
 322         }
 323         cd = &comm->cd[d];
 324         for (const gmx_domdec_ind_t &ind : cd->ind)
 325         {
 326             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 327             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 328             int                        n          = 0;
 329             if (!bPBC)
 330             {
 331                 for (int g : ind.index)
 332                 {
 333                     for (int j : atomGrouping.block(g))
 334                     {
 335                         sendBuffer[n] = x[j];
 336                         n++;
 337                     }
 338                 }
 339             }
 340             else if (!bScrew)
 341             {
 342                 for (int g : ind.index)
 343                 {
 344                     for (int j : atomGrouping.block(g))
 345                     {
 346                         /* We need to shift the coordinates */
 347                         for (int d = 0; d < DIM; d++)
 348                         {
 349                             sendBuffer[n][d] = x[j][d] + shift[d];
 350                         }
 351                         n++;
 352                     }
 353                 }
 354             }
 355             else
 356             {
 357                 for (int g : ind.index)
 358                 {
 359                     for (int j : atomGrouping.block(g))
 360                     {
 361                         /* Shift x */
 362                         sendBuffer[n][XX] = x[j][XX] + shift[XX];
 363                         /* Rotate y and z.
 364                          * This operation requires a special shift force
 365                          * treatment, which is performed in calc_vir.
 366                          */
 367                         sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 368                         sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 369                         n++;
 370                     }
 371                 }
 372             }
 373
 374             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 375
 376             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 377             if (cd->receiveInPlace)
 378             {
 379                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 380             }
 381             else
 382             {
 383                 receiveBuffer = receiveBufferAccess.buffer;
 384             }
 385             /* Send and receive the coordinates */
 386             ddSendrecv(dd, d, dddirBackward,
 387                        sendBuffer, receiveBuffer);
 388
 389             if (!cd->receiveInPlace)
 390             {
 391                 int j = 0;
 392                 for (int zone = 0; zone < nzone; zone++)
 393                 {
 394                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 395                     {
 396                         x[i] = receiveBuffer[j++];
 397                     }
 398                 }
 399             }
 400             nat_tot += ind.nrecv[nzone+1];
 401         }
 402         nzone += nzone;
 403     }
 404
 405     wallcycle_stop(wcycle, ewcMOVEX);
 406 }
 407
 408 void dd_move_f(gmx_domdec_t             *dd,
 409                gmx::ArrayRef<gmx::RVec>  f,
 410                rvec                     *fshift,
 411                gmx_wallcycle            *wcycle)
 412 {
 413     wallcycle_start(wcycle, ewcMOVEF);
 414
 415     int                    nzone, nat_tot;
 416     gmx_domdec_comm_t     *comm;
 417     gmx_domdec_comm_dim_t *cd;
 418     ivec                   vis;
 419     int                    is;
 420     gmx_bool               bShiftForcesNeedPbc, bScrew;
 421
 422     comm = dd->comm;
 423
 424     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 425
 426     nzone   = comm->zones.n/2;
 427     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 428     for (int d = dd->ndim-1; d >= 0; d--)
 429     {
 430         /* Only forces in domains near the PBC boundaries need to
 431            consider PBC in the treatment of fshift */
 432         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 433         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 434         if (fshift == nullptr && !bScrew)
 435         {
 436             bShiftForcesNeedPbc = FALSE;
 437         }
 438         /* Determine which shift vector we need */
 439         clear_ivec(vis);
 440         vis[dd->dim[d]] = 1;
 441         is              = IVEC2IS(vis);
 442
 443         cd = &comm->cd[d];
 444         for (int p = cd->numPulses() - 1; p >= 0; p--)
 445         {
 446             const gmx_domdec_ind_t    &ind  = cd->ind[p];
 447             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 448             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 449
 450             nat_tot                        -= ind.nrecv[nzone+1];
 451
 452             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 453
 454             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 455             if (cd->receiveInPlace)
 456             {
 457                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 458             }
 459             else
 460             {
 461                 sendBuffer = sendBufferAccess.buffer;
 462                 int j = 0;
 463                 for (int zone = 0; zone < nzone; zone++)
 464                 {
 465                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 466                     {
 467                         sendBuffer[j++] = f[i];
 468                     }
 469                 }
 470             }
 471             /* Communicate the forces */
 472             ddSendrecv(dd, d, dddirForward,
 473                        sendBuffer, receiveBuffer);
 474             /* Add the received forces */
 475             int n = 0;
 476             if (!bShiftForcesNeedPbc)
 477             {
 478                 for (int g : ind.index)
 479                 {
 480                     for (int j : atomGrouping.block(g))
 481                     {
 482                         for (int d = 0; d < DIM; d++)
 483                         {
 484                             f[j][d] += receiveBuffer[n][d];
 485                         }
 486                         n++;
 487                     }
 488                 }
 489             }
 490             else if (!bScrew)
 491             {
 492                 /* fshift should always be defined if this function is
 493                  * called when bShiftForcesNeedPbc is true */
 494                 assert(nullptr != fshift);
 495                 for (int g : ind.index)
 496                 {
 497                     for (int j : atomGrouping.block(g))
 498                     {
 499                         for (int d = 0; d < DIM; d++)
 500                         {
 501                             f[j][d] += receiveBuffer[n][d];
 502                         }
 503                         /* Add this force to the shift force */
 504                         for (int d = 0; d < DIM; d++)
 505                         {
 506                             fshift[is][d] += receiveBuffer[n][d];
 507                         }
 508                         n++;
 509                     }
 510                 }
 511             }
 512             else
 513             {
 514                 for (int g : ind.index)
 515                 {
 516                     for (int j : atomGrouping.block(g))
 517                     {
 518                         /* Rotate the force */
 519                         f[j][XX] += receiveBuffer[n][XX];
 520                         f[j][YY] -= receiveBuffer[n][YY];
 521                         f[j][ZZ] -= receiveBuffer[n][ZZ];
 522                         if (fshift)
 523                         {
 524                             /* Add this force to the shift force */
 525                             for (int d = 0; d < DIM; d++)
 526                             {
 527                                 fshift[is][d] += receiveBuffer[n][d];
 528                             }
 529                         }
 530                         n++;
 531                     }
 532                 }
 533             }
 534         }
 535         nzone /= 2;
 536     }
 537     wallcycle_stop(wcycle, ewcMOVEF);
 538 }
 539
 540 /* Convenience function for extracting a real buffer from an rvec buffer
 541  *
 542  * To reduce the number of temporary communication buffers and avoid
 543  * cache polution, we reuse gmx::RVec buffers for storing reals.
 544  * This functions return a real buffer reference with the same number
 545  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 546  */
 547 static gmx::ArrayRef<real>
 548 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 549 {
 550     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 551                                   arrayRef.size());
 552 }
 553
 554 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 555 {
 556     int                    nzone, nat_tot;
 557     gmx_domdec_comm_t     *comm;
 558     gmx_domdec_comm_dim_t *cd;
 559
 560     comm = dd->comm;
 561
 562     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 563
 564     nzone   = 1;
 565     nat_tot = comm->atomRanges.numHomeAtoms();
 566     for (int d = 0; d < dd->ndim; d++)
 567     {
 568         cd = &comm->cd[d];
 569         for (const gmx_domdec_ind_t &ind : cd->ind)
 570         {
 571             /* Note: We provision for RVec instead of real, so a factor of 3
 572              * more than needed. The buffer actually already has this size
 573              * and we pass a plain pointer below, so this does not matter.
 574              */
 575             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 576             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 577             int                       n          = 0;
 578             for (int g : ind.index)
 579             {
 580                 for (int j : atomGrouping.block(g))
 581                 {
 582                     sendBuffer[n++] = v[j];
 583                 }
 584             }
 585
 586             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 587
 588             gmx::ArrayRef<real>       receiveBuffer;
 589             if (cd->receiveInPlace)
 590             {
 591                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 592             }
 593             else
 594             {
 595                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 596             }
 597             /* Send and receive the data */
 598             ddSendrecv(dd, d, dddirBackward,
 599                        sendBuffer, receiveBuffer);
 600             if (!cd->receiveInPlace)
 601             {
 602                 int j = 0;
 603                 for (int zone = 0; zone < nzone; zone++)
 604                 {
 605                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 606                     {
 607                         v[i] = receiveBuffer[j++];
 608                     }
 609                 }
 610             }
 611             nat_tot += ind.nrecv[nzone+1];
 612         }
 613         nzone += nzone;
 614     }
 615 }
 616
 617 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 618 {
 619     int                    nzone, nat_tot;
 620     gmx_domdec_comm_t     *comm;
 621     gmx_domdec_comm_dim_t *cd;
 622
 623     comm = dd->comm;
 624
 625     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 626
 627     nzone   = comm->zones.n/2;
 628     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 629     for (int d = dd->ndim-1; d >= 0; d--)
 630     {
 631         cd = &comm->cd[d];
 632         for (int p = cd->numPulses() - 1; p >= 0; p--)
 633         {
 634             const gmx_domdec_ind_t &ind = cd->ind[p];
 635
 636             /* Note: We provision for RVec instead of real, so a factor of 3
 637              * more than needed. The buffer actually already has this size
 638              * and we typecast, so this works as intended.
 639              */
 640             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 641             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 642             nat_tot -= ind.nrecv[nzone + 1];
 643
 644             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 645
 646             gmx::ArrayRef<real>       sendBuffer;
 647             if (cd->receiveInPlace)
 648             {
 649                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 650             }
 651             else
 652             {
 653                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 654                 int j = 0;
 655                 for (int zone = 0; zone < nzone; zone++)
 656                 {
 657                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 658                     {
 659                         sendBuffer[j++] = v[i];
 660                     }
 661                 }
 662             }
 663             /* Communicate the forces */
 664             ddSendrecv(dd, d, dddirForward,
 665                        sendBuffer, receiveBuffer);
 666             /* Add the received forces */
 667             int n = 0;
 668             for (int g : ind.index)
 669             {
 670                 for (int j : atomGrouping.block(g))
 671                 {
 672                     v[j] += receiveBuffer[n];
 673                     n++;
 674                 }
 675             }
 676         }
 677         nzone /= 2;
 678     }
 679 }
 680
 681 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 682 {
 683     gmx_domdec_comm_t *comm;
 684     int                di;
 685     real               r;
 686
 687     comm = dd->comm;
 688
 689     r = -1;
 690     if (comm->bInterCGBondeds)
 691     {
 692         if (comm->cutoff_mbody > 0)
 693         {
 694             r = comm->cutoff_mbody;
 695         }
 696         else
 697         {
 698             /* cutoff_mbody=0 means we do not have DLB */
 699             r = comm->cellsize_min[dd->dim[0]];
 700             for (di = 1; di < dd->ndim; di++)
 701             {
 702                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 703             }
 704             if (comm->bBondComm)
 705             {
 706                 r = std::max(r, comm->cutoff_mbody);
 707             }
 708             else
 709             {
 710                 r = std::min(r, comm->cutoff);
 711             }
 712         }
 713     }
 714
 715     return r;
 716 }
 717
 718 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 719 {
 720     real r_mb;
 721
 722     r_mb = dd_cutoff_multibody(dd);
 723
 724     return std::max(dd->comm->cutoff, r_mb);
 725 }
 726
 727
 728 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 729                                    ivec coord_pme)
 730 {
 731     int nc, ntot;
 732
 733     nc   = dd->nc[dd->comm->cartpmedim];
 734     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 735     copy_ivec(coord, coord_pme);
 736     coord_pme[dd->comm->cartpmedim] =
 737         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 738 }
 739
 740 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 741 {
 742     int npp, npme;
 743
 744     npp  = dd->nnodes;
 745     npme = dd->comm->npmenodes;
 746
 747     /* Here we assign a PME node to communicate with this DD node
 748      * by assuming that the major index of both is x.
 749      * We add cr->npmenodes/2 to obtain an even distribution.
 750      */
 751     return (ddindex*npme + npme/2)/npp;
 752 }
 753
 754 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 755 {
 756     int *pme_rank;
 757     int  n, i, p0, p1;
 758
 759     snew(pme_rank, dd->comm->npmenodes);
 760     n = 0;
 761     for (i = 0; i < dd->nnodes; i++)
 762     {
 763         p0 = ddindex2pmeindex(dd, i);
 764         p1 = ddindex2pmeindex(dd, i+1);
 765         if (i+1 == dd->nnodes || p1 > p0)
 766         {
 767             if (debug)
 768             {
 769                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 770             }
 771             pme_rank[n] = i + 1 + n;
 772             n++;
 773         }
 774     }
 775
 776     return pme_rank;
 777 }
 778
 779 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 780 {
 781     gmx_domdec_t *dd;
 782     ivec          coords;
 783     int           slab;
 784
 785     dd = cr->dd;
 786     /*
 787        if (dd->comm->bCartesian) {
 788        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 789        dd_coords2pmecoords(dd,coords,coords_pme);
 790        copy_ivec(dd->ntot,nc);
 791        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 792        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 793
 794        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 795        } else {
 796        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 797        }
 798      */
 799     coords[XX] = x;
 800     coords[YY] = y;
 801     coords[ZZ] = z;
 802     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 803
 804     return slab;
 805 }
 806
 807 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 808 {
 809     gmx_domdec_comm_t *comm;
 810     ivec               coords;
 811     int                ddindex, nodeid = -1;
 812
 813     comm = cr->dd->comm;
 814
 815     coords[XX] = x;
 816     coords[YY] = y;
 817     coords[ZZ] = z;
 818     if (comm->bCartesianPP_PME)
 819     {
 820 #if GMX_MPI
 821         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 822 #endif
 823     }
 824     else
 825     {
 826         ddindex = dd_index(cr->dd->nc, coords);
 827         if (comm->bCartesianPP)
 828         {
 829             nodeid = comm->ddindex2simnodeid[ddindex];
 830         }
 831         else
 832         {
 833             if (comm->pmenodes)
 834             {
 835                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 836             }
 837             else
 838             {
 839                 nodeid = ddindex;
 840             }
 841         }
 842     }
 843
 844     return nodeid;
 845 }
 846
 847 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 848                               const t_commrec gmx_unused *cr,
 849                               int                         sim_nodeid)
 850 {
 851     int pmenode = -1;
 852
 853     const gmx_domdec_comm_t *comm = dd->comm;
 854
 855     /* This assumes a uniform x domain decomposition grid cell size */
 856     if (comm->bCartesianPP_PME)
 857     {
 858 #if GMX_MPI
 859         ivec coord, coord_pme;
 860         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 861         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 862         {
 863             /* This is a PP node */
 864             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 865             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 866         }
 867 #endif
 868     }
 869     else if (comm->bCartesianPP)
 870     {
 871         if (sim_nodeid < dd->nnodes)
 872         {
 873             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 874         }
 875     }
 876     else
 877     {
 878         /* This assumes DD cells with identical x coordinates
 879          * are numbered sequentially.
 880          */
 881         if (dd->comm->pmenodes == nullptr)
 882         {
 883             if (sim_nodeid < dd->nnodes)
 884             {
 885                 /* The DD index equals the nodeid */
 886                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 887             }
 888         }
 889         else
 890         {
 891             int i = 0;
 892             while (sim_nodeid > dd->comm->pmenodes[i])
 893             {
 894                 i++;
 895             }
 896             if (sim_nodeid < dd->comm->pmenodes[i])
 897             {
 898                 pmenode = dd->comm->pmenodes[i];
 899             }
 900         }
 901     }
 902
 903     return pmenode;
 904 }
 905
 906 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 907 {
 908     if (dd != nullptr)
 909     {
 910         return {
 911                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 912         };
 913     }
 914     else
 915     {
 916         return {
 917                    1, 1
 918         };
 919     }
 920 }
 921
 922 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 923 {
 924     gmx_domdec_t *dd;
 925     int           x, y, z;
 926     ivec          coord, coord_pme;
 927
 928     dd = cr->dd;
 929
 930     std::vector<int> ddranks;
 931     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 932
 933     for (x = 0; x < dd->nc[XX]; x++)
 934     {
 935         for (y = 0; y < dd->nc[YY]; y++)
 936         {
 937             for (z = 0; z < dd->nc[ZZ]; z++)
 938             {
 939                 if (dd->comm->bCartesianPP_PME)
 940                 {
 941                     coord[XX] = x;
 942                     coord[YY] = y;
 943                     coord[ZZ] = z;
 944                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 945                     if (dd->ci[XX] == coord_pme[XX] &&
 946                         dd->ci[YY] == coord_pme[YY] &&
 947                         dd->ci[ZZ] == coord_pme[ZZ])
 948                     {
 949                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 950                     }
 951                 }
 952                 else
 953                 {
 954                     /* The slab corresponds to the nodeid in the PME group */
 955                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 956                     {
 957                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 958                     }
 959                 }
 960             }
 961         }
 962     }
 963     return ddranks;
 964 }
 965
 966 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 967 {
 968     gmx_bool bReceive = TRUE;
 969
 970     if (cr->npmenodes < dd->nnodes)
 971     {
 972         gmx_domdec_comm_t *comm = dd->comm;
 973         if (comm->bCartesianPP_PME)
 974         {
 975 #if GMX_MPI
 976             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 977             ivec coords;
 978             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 979             coords[comm->cartpmedim]++;
 980             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 981             {
 982                 int rank;
 983                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 984                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 985                 {
 986                     /* This is not the last PP node for pmenode */
 987                     bReceive = FALSE;
 988                 }
 989             }
 990 #else
 991             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 992 #endif
 993         }
 994         else
 995         {
 996             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 997             if (cr->sim_nodeid+1 < cr->nnodes &&
 998                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 999             {
1000                 /* This is not the last PP node for pmenode */
1001                 bReceive = FALSE;
1002             }
1003         }
1004     }
1005
1006     return bReceive;
1007 }
1008
1009 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
1010 {
1011     gmx_domdec_comm_t *comm;
1012     int                i;
1013
1014     comm = dd->comm;
1015
1016     snew(*dim_f, dd->nc[dim]+1);
1017     (*dim_f)[0] = 0;
1018     for (i = 1; i < dd->nc[dim]; i++)
1019     {
1020         if (comm->slb_frac[dim])
1021         {
1022             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1023         }
1024         else
1025         {
1026             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1027         }
1028     }
1029     (*dim_f)[dd->nc[dim]] = 1;
1030 }
1031
1032 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1033 {
1034     int  pmeindex, slab, nso, i;
1035     ivec xyz;
1036
1037     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1038     {
1039         ddpme->dim = YY;
1040     }
1041     else
1042     {
1043         ddpme->dim = dimind;
1044     }
1045     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1046
1047     ddpme->nslab = (ddpme->dim == 0 ?
1048                     dd->comm->npmenodes_x :
1049                     dd->comm->npmenodes_y);
1050
1051     if (ddpme->nslab <= 1)
1052     {
1053         return;
1054     }
1055
1056     nso = dd->comm->npmenodes/ddpme->nslab;
1057     /* Determine for each PME slab the PP location range for dimension dim */
1058     snew(ddpme->pp_min, ddpme->nslab);
1059     snew(ddpme->pp_max, ddpme->nslab);
1060     for (slab = 0; slab < ddpme->nslab; slab++)
1061     {
1062         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1063         ddpme->pp_max[slab] = 0;
1064     }
1065     for (i = 0; i < dd->nnodes; i++)
1066     {
1067         ddindex2xyz(dd->nc, i, xyz);
1068         /* For y only use our y/z slab.
1069          * This assumes that the PME x grid size matches the DD grid size.
1070          */
1071         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1072         {
1073             pmeindex = ddindex2pmeindex(dd, i);
1074             if (dimind == 0)
1075             {
1076                 slab = pmeindex/nso;
1077             }
1078             else
1079             {
1080                 slab = pmeindex % ddpme->nslab;
1081             }
1082             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1083             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1084         }
1085     }
1086
1087     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1088 }
1089
1090 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1091 {
1092     if (dd->comm->ddpme[0].dim == XX)
1093     {
1094         return dd->comm->ddpme[0].maxshift;
1095     }
1096     else
1097     {
1098         return 0;
1099     }
1100 }
1101
1102 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1103 {
1104     if (dd->comm->ddpme[0].dim == YY)
1105     {
1106         return dd->comm->ddpme[0].maxshift;
1107     }
1108     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1109     {
1110         return dd->comm->ddpme[1].maxshift;
1111     }
1112     else
1113     {
1114         return 0;
1115     }
1116 }
1117
1118 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1119 {
1120     /* Note that the cycles value can be incorrect, either 0 or some
1121      * extremely large value, when our thread migrated to another core
1122      * with an unsynchronized cycle counter. If this happens less often
1123      * that once per nstlist steps, this will not cause issues, since
1124      * we later subtract the maximum value from the sum over nstlist steps.
1125      * A zero count will slightly lower the total, but that's a small effect.
1126      * Note that the main purpose of the subtraction of the maximum value
1127      * is to avoid throwing off the load balancing when stalls occur due
1128      * e.g. system activity or network congestion.
1129      */
1130     dd->comm->cycl[ddCycl] += cycles;
1131     dd->comm->cycl_n[ddCycl]++;
1132     if (cycles > dd->comm->cycl_max[ddCycl])
1133     {
1134         dd->comm->cycl_max[ddCycl] = cycles;
1135     }
1136 }
1137
1138 #if GMX_MPI
1139 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1140 {
1141     MPI_Comm           c_row;
1142     int                dim, i, rank;
1143     ivec               loc_c;
1144     gmx_bool           bPartOfGroup = FALSE;
1145
1146     dim = dd->dim[dim_ind];
1147     copy_ivec(loc, loc_c);
1148     for (i = 0; i < dd->nc[dim]; i++)
1149     {
1150         loc_c[dim] = i;
1151         rank       = dd_index(dd->nc, loc_c);
1152         if (rank == dd->rank)
1153         {
1154             /* This process is part of the group */
1155             bPartOfGroup = TRUE;
1156         }
1157     }
1158     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1159                    &c_row);
1160     if (bPartOfGroup)
1161     {
1162         dd->comm->mpi_comm_load[dim_ind] = c_row;
1163         if (!isDlbDisabled(dd->comm))
1164         {
1165             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1166
1167             if (dd->ci[dim] == dd->master_ci[dim])
1168             {
1169                 /* This is the root process of this row */
1170                 cellsizes.rowMaster  = gmx::compat::make_unique<RowMaster>();
1171
1172                 RowMaster &rowMaster = *cellsizes.rowMaster;
1173                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1174                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1175                 rowMaster.isCellMin.resize(dd->nc[dim]);
1176                 if (dim_ind > 0)
1177                 {
1178                     rowMaster.bounds.resize(dd->nc[dim]);
1179                 }
1180                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1181             }
1182             else
1183             {
1184                 /* This is not a root process, we only need to receive cell_f */
1185                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1186             }
1187         }
1188         if (dd->ci[dim] == dd->master_ci[dim])
1189         {
1190             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1191         }
1192     }
1193 }
1194 #endif
1195
1196 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1197                                    int                   gpu_id)
1198 {
1199 #if GMX_MPI
1200     int           physicalnode_id_hash;
1201     gmx_domdec_t *dd;
1202     MPI_Comm      mpi_comm_pp_physicalnode;
1203
1204     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1205     {
1206         /* Only ranks with short-ranged tasks (currently) use GPUs.
1207          * If we don't have GPUs assigned, there are no resources to share.
1208          */
1209         return;
1210     }
1211
1212     physicalnode_id_hash = gmx_physicalnode_id_hash();
1213
1214     dd = cr->dd;
1215
1216     if (debug)
1217     {
1218         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1219         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1220                 dd->rank, physicalnode_id_hash, gpu_id);
1221     }
1222     /* Split the PP communicator over the physical nodes */
1223     /* TODO: See if we should store this (before), as it's also used for
1224      * for the nodecomm summation.
1225      */
1226     // TODO PhysicalNodeCommunicator could be extended/used to handle
1227     // the need for per-node per-group communicators.
1228     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1229                    &mpi_comm_pp_physicalnode);
1230     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1231                    &dd->comm->mpi_comm_gpu_shared);
1232     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1233     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1234
1235     if (debug)
1236     {
1237         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1238     }
1239
1240     /* Note that some ranks could share a GPU, while others don't */
1241
1242     if (dd->comm->nrank_gpu_shared == 1)
1243     {
1244         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1245     }
1246 #else
1247     GMX_UNUSED_VALUE(cr);
1248     GMX_UNUSED_VALUE(gpu_id);
1249 #endif
1250 }
1251
1252 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1253 {
1254 #if GMX_MPI
1255     int  dim0, dim1, i, j;
1256     ivec loc;
1257
1258     if (debug)
1259     {
1260         fprintf(debug, "Making load communicators\n");
1261     }
1262
1263     snew(dd->comm->load,          std::max(dd->ndim, 1));
1264     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1265
1266     if (dd->ndim == 0)
1267     {
1268         return;
1269     }
1270
1271     clear_ivec(loc);
1272     make_load_communicator(dd, 0, loc);
1273     if (dd->ndim > 1)
1274     {
1275         dim0 = dd->dim[0];
1276         for (i = 0; i < dd->nc[dim0]; i++)
1277         {
1278             loc[dim0] = i;
1279             make_load_communicator(dd, 1, loc);
1280         }
1281     }
1282     if (dd->ndim > 2)
1283     {
1284         dim0 = dd->dim[0];
1285         for (i = 0; i < dd->nc[dim0]; i++)
1286         {
1287             loc[dim0] = i;
1288             dim1      = dd->dim[1];
1289             for (j = 0; j < dd->nc[dim1]; j++)
1290             {
1291                 loc[dim1] = j;
1292                 make_load_communicator(dd, 2, loc);
1293             }
1294         }
1295     }
1296
1297     if (debug)
1298     {
1299         fprintf(debug, "Finished making load communicators\n");
1300     }
1301 #endif
1302 }
1303
1304 /*! \brief Sets up the relation between neighboring domains and zones */
1305 static void setup_neighbor_relations(gmx_domdec_t *dd)
1306 {
1307     int                     d, dim, i, j, m;
1308     ivec                    tmp, s;
1309     gmx_domdec_zones_t     *zones;
1310     gmx_domdec_ns_ranges_t *izone;
1311
1312     for (d = 0; d < dd->ndim; d++)
1313     {
1314         dim = dd->dim[d];
1315         copy_ivec(dd->ci, tmp);
1316         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1317         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1318         copy_ivec(dd->ci, tmp);
1319         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1320         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1321         if (debug)
1322         {
1323             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1324                     dd->rank, dim,
1325                     dd->neighbor[d][0],
1326                     dd->neighbor[d][1]);
1327         }
1328     }
1329
1330     int nzone  = (1 << dd->ndim);
1331     int nizone = (1 << std::max(dd->ndim - 1, 0));
1332     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1333
1334     zones = &dd->comm->zones;
1335
1336     for (i = 0; i < nzone; i++)
1337     {
1338         m = 0;
1339         clear_ivec(zones->shift[i]);
1340         for (d = 0; d < dd->ndim; d++)
1341         {
1342             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1343         }
1344     }
1345
1346     zones->n = nzone;
1347     for (i = 0; i < nzone; i++)
1348     {
1349         for (d = 0; d < DIM; d++)
1350         {
1351             s[d] = dd->ci[d] - zones->shift[i][d];
1352             if (s[d] < 0)
1353             {
1354                 s[d] += dd->nc[d];
1355             }
1356             else if (s[d] >= dd->nc[d])
1357             {
1358                 s[d] -= dd->nc[d];
1359             }
1360         }
1361     }
1362     zones->nizone = nizone;
1363     for (i = 0; i < zones->nizone; i++)
1364     {
1365         assert(ddNonbondedZonePairRanges[i][0] == i);
1366
1367         izone     = &zones->izone[i];
1368         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1369          * j-zones up to nzone.
1370          */
1371         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1372         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1373         for (dim = 0; dim < DIM; dim++)
1374         {
1375             if (dd->nc[dim] == 1)
1376             {
1377                 /* All shifts should be allowed */
1378                 izone->shift0[dim] = -1;
1379                 izone->shift1[dim] = 1;
1380             }
1381             else
1382             {
1383                 /* Determine the min/max j-zone shift wrt the i-zone */
1384                 izone->shift0[dim] = 1;
1385                 izone->shift1[dim] = -1;
1386                 for (j = izone->j0; j < izone->j1; j++)
1387                 {
1388                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1389                     if (shift_diff < izone->shift0[dim])
1390                     {
1391                         izone->shift0[dim] = shift_diff;
1392                     }
1393                     if (shift_diff > izone->shift1[dim])
1394                     {
1395                         izone->shift1[dim] = shift_diff;
1396                     }
1397                 }
1398             }
1399         }
1400     }
1401
1402     if (!isDlbDisabled(dd->comm))
1403     {
1404         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1405     }
1406
1407     if (dd->comm->bRecordLoad)
1408     {
1409         make_load_communicators(dd);
1410     }
1411 }
1412
1413 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1414                                  gmx_domdec_t         *dd,
1415                                  t_commrec gmx_unused *cr,
1416                                  bool gmx_unused       reorder)
1417 {
1418 #if GMX_MPI
1419     gmx_domdec_comm_t *comm;
1420     int                rank, *buf;
1421     ivec               periods;
1422     MPI_Comm           comm_cart;
1423
1424     comm = dd->comm;
1425
1426     if (comm->bCartesianPP)
1427     {
1428         /* Set up cartesian communication for the particle-particle part */
1429         GMX_LOG(mdlog.info).appendTextFormatted(
1430                 "Will use a Cartesian communicator: %d x %d x %d",
1431                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1432
1433         for (int i = 0; i < DIM; i++)
1434         {
1435             periods[i] = TRUE;
1436         }
1437         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1438                         &comm_cart);
1439         /* We overwrite the old communicator with the new cartesian one */
1440         cr->mpi_comm_mygroup = comm_cart;
1441     }
1442
1443     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1444     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1445
1446     if (comm->bCartesianPP_PME)
1447     {
1448         /* Since we want to use the original cartesian setup for sim,
1449          * and not the one after split, we need to make an index.
1450          */
1451         snew(comm->ddindex2ddnodeid, dd->nnodes);
1452         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1453         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1454         /* Get the rank of the DD master,
1455          * above we made sure that the master node is a PP node.
1456          */
1457         if (MASTER(cr))
1458         {
1459             rank = dd->rank;
1460         }
1461         else
1462         {
1463             rank = 0;
1464         }
1465         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1466     }
1467     else if (comm->bCartesianPP)
1468     {
1469         if (cr->npmenodes == 0)
1470         {
1471             /* The PP communicator is also
1472              * the communicator for this simulation
1473              */
1474             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1475         }
1476         cr->nodeid = dd->rank;
1477
1478         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1479
1480         /* We need to make an index to go from the coordinates
1481          * to the nodeid of this simulation.
1482          */
1483         snew(comm->ddindex2simnodeid, dd->nnodes);
1484         snew(buf, dd->nnodes);
1485         if (thisRankHasDuty(cr, DUTY_PP))
1486         {
1487             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1488         }
1489         /* Communicate the ddindex to simulation nodeid index */
1490         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1491                       cr->mpi_comm_mysim);
1492         sfree(buf);
1493
1494         /* Determine the master coordinates and rank.
1495          * The DD master should be the same node as the master of this sim.
1496          */
1497         for (int i = 0; i < dd->nnodes; i++)
1498         {
1499             if (comm->ddindex2simnodeid[i] == 0)
1500             {
1501                 ddindex2xyz(dd->nc, i, dd->master_ci);
1502                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1503             }
1504         }
1505         if (debug)
1506         {
1507             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1508         }
1509     }
1510     else
1511     {
1512         /* No Cartesian communicators */
1513         /* We use the rank in dd->comm->all as DD index */
1514         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1515         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1516         dd->masterrank = 0;
1517         clear_ivec(dd->master_ci);
1518     }
1519 #endif
1520
1521     GMX_LOG(mdlog.info).appendTextFormatted(
1522             "Domain decomposition rank %d, coordinates %d %d %d\n",
1523             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1524     if (debug)
1525     {
1526         fprintf(debug,
1527                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1528                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1529     }
1530 }
1531
1532 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1533                                       t_commrec            *cr)
1534 {
1535 #if GMX_MPI
1536     gmx_domdec_comm_t *comm = dd->comm;
1537
1538     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1539     {
1540         int *buf;
1541         snew(comm->ddindex2simnodeid, dd->nnodes);
1542         snew(buf, dd->nnodes);
1543         if (thisRankHasDuty(cr, DUTY_PP))
1544         {
1545             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1546         }
1547         /* Communicate the ddindex to simulation nodeid index */
1548         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1549                       cr->mpi_comm_mysim);
1550         sfree(buf);
1551     }
1552 #else
1553     GMX_UNUSED_VALUE(dd);
1554     GMX_UNUSED_VALUE(cr);
1555 #endif
1556 }
1557
1558 static void split_communicator(const gmx::MDLogger &mdlog,
1559                                t_commrec *cr, gmx_domdec_t *dd,
1560                                DdRankOrder gmx_unused rankOrder,
1561                                bool gmx_unused reorder)
1562 {
1563     gmx_domdec_comm_t *comm;
1564     int                i;
1565     gmx_bool           bDiv[DIM];
1566 #if GMX_MPI
1567     MPI_Comm           comm_cart;
1568 #endif
1569
1570     comm = dd->comm;
1571
1572     if (comm->bCartesianPP)
1573     {
1574         for (i = 1; i < DIM; i++)
1575         {
1576             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1577         }
1578         if (bDiv[YY] || bDiv[ZZ])
1579         {
1580             comm->bCartesianPP_PME = TRUE;
1581             /* If we have 2D PME decomposition, which is always in x+y,
1582              * we stack the PME only nodes in z.
1583              * Otherwise we choose the direction that provides the thinnest slab
1584              * of PME only nodes as this will have the least effect
1585              * on the PP communication.
1586              * But for the PME communication the opposite might be better.
1587              */
1588             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1589                              !bDiv[YY] ||
1590                              dd->nc[YY] > dd->nc[ZZ]))
1591             {
1592                 comm->cartpmedim = ZZ;
1593             }
1594             else
1595             {
1596                 comm->cartpmedim = YY;
1597             }
1598             comm->ntot[comm->cartpmedim]
1599                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1600         }
1601         else
1602         {
1603             GMX_LOG(mdlog.info).appendTextFormatted(
1604                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1605                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1606             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1607         }
1608     }
1609
1610     if (comm->bCartesianPP_PME)
1611     {
1612 #if GMX_MPI
1613         int  rank;
1614         ivec periods;
1615
1616         GMX_LOG(mdlog.info).appendTextFormatted(
1617                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1618                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1619
1620         for (i = 0; i < DIM; i++)
1621         {
1622             periods[i] = TRUE;
1623         }
1624         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1625                         &comm_cart);
1626         MPI_Comm_rank(comm_cart, &rank);
1627         if (MASTER(cr) && rank != 0)
1628         {
1629             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1630         }
1631
1632         /* With this assigment we loose the link to the original communicator
1633          * which will usually be MPI_COMM_WORLD, unless have multisim.
1634          */
1635         cr->mpi_comm_mysim = comm_cart;
1636         cr->sim_nodeid     = rank;
1637
1638         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1639
1640         GMX_LOG(mdlog.info).appendTextFormatted(
1641                 "Cartesian rank %d, coordinates %d %d %d\n",
1642                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1643
1644         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1645         {
1646             cr->duty = DUTY_PP;
1647         }
1648         if (cr->npmenodes == 0 ||
1649             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1650         {
1651             cr->duty = DUTY_PME;
1652         }
1653
1654         /* Split the sim communicator into PP and PME only nodes */
1655         MPI_Comm_split(cr->mpi_comm_mysim,
1656                        getThisRankDuties(cr),
1657                        dd_index(comm->ntot, dd->ci),
1658                        &cr->mpi_comm_mygroup);
1659 #endif
1660     }
1661     else
1662     {
1663         switch (rankOrder)
1664         {
1665             case DdRankOrder::pp_pme:
1666                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1667                 break;
1668             case DdRankOrder::interleave:
1669                 /* Interleave the PP-only and PME-only ranks */
1670                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1671                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1672                 break;
1673             case DdRankOrder::cartesian:
1674                 break;
1675             default:
1676                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1677         }
1678
1679         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1680         {
1681             cr->duty = DUTY_PME;
1682         }
1683         else
1684         {
1685             cr->duty = DUTY_PP;
1686         }
1687 #if GMX_MPI
1688         /* Split the sim communicator into PP and PME only nodes */
1689         MPI_Comm_split(cr->mpi_comm_mysim,
1690                        getThisRankDuties(cr),
1691                        cr->nodeid,
1692                        &cr->mpi_comm_mygroup);
1693         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1694 #endif
1695     }
1696
1697     GMX_LOG(mdlog.info).appendTextFormatted(
1698             "This rank does only %s work.\n",
1699             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1700 }
1701
1702 /*! \brief Generates the MPI communicators for domain decomposition */
1703 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1704                                   t_commrec *cr,
1705                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1706 {
1707     gmx_domdec_comm_t *comm;
1708     bool               CartReorder;
1709
1710     comm = dd->comm;
1711
1712     copy_ivec(dd->nc, comm->ntot);
1713
1714     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1715     comm->bCartesianPP_PME = FALSE;
1716
1717     /* Reorder the nodes by default. This might change the MPI ranks.
1718      * Real reordering is only supported on very few architectures,
1719      * Blue Gene is one of them.
1720      */
1721     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1722
1723     if (cr->npmenodes > 0)
1724     {
1725         /* Split the communicator into a PP and PME part */
1726         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1727         if (comm->bCartesianPP_PME)
1728         {
1729             /* We (possibly) reordered the nodes in split_communicator,
1730              * so it is no longer required in make_pp_communicator.
1731              */
1732             CartReorder = FALSE;
1733         }
1734     }
1735     else
1736     {
1737         /* All nodes do PP and PME */
1738 #if GMX_MPI
1739         /* We do not require separate communicators */
1740         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1741 #endif
1742     }
1743
1744     if (thisRankHasDuty(cr, DUTY_PP))
1745     {
1746         /* Copy or make a new PP communicator */
1747         make_pp_communicator(mdlog, dd, cr, CartReorder);
1748     }
1749     else
1750     {
1751         receive_ddindex2simnodeid(dd, cr);
1752     }
1753
1754     if (!thisRankHasDuty(cr, DUTY_PME))
1755     {
1756         /* Set up the commnuication to our PME node */
1757         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1758         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1759         if (debug)
1760         {
1761             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1762                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1763         }
1764     }
1765     else
1766     {
1767         dd->pme_nodeid = -1;
1768     }
1769
1770     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1771     if (MASTER(cr))
1772     {
1773         dd->ma = gmx::compat::make_unique<AtomDistribution>(dd->nc,
1774                                                             comm->cgs_gl.nr,
1775                                                             comm->cgs_gl.index[comm->cgs_gl.nr]);
1776     }
1777 }
1778
1779 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1780                           const char *dir, int nc, const char *size_string)
1781 {
1782     real  *slb_frac, tot;
1783     int    i, n;
1784     double dbl;
1785
1786     slb_frac = nullptr;
1787     if (nc > 1 && size_string != nullptr)
1788     {
1789         GMX_LOG(mdlog.info).appendTextFormatted(
1790                 "Using static load balancing for the %s direction", dir);
1791         snew(slb_frac, nc);
1792         tot = 0;
1793         for (i = 0; i < nc; i++)
1794         {
1795             dbl = 0;
1796             sscanf(size_string, "%20lf%n", &dbl, &n);
1797             if (dbl == 0)
1798             {
1799                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1800             }
1801             slb_frac[i]  = dbl;
1802             size_string += n;
1803             tot         += slb_frac[i];
1804         }
1805         /* Normalize */
1806         std::string relativeCellSizes = "Relative cell sizes:";
1807         for (i = 0; i < nc; i++)
1808         {
1809             slb_frac[i]       /= tot;
1810             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1811         }
1812         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1813     }
1814
1815     return slb_frac;
1816 }
1817
1818 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1819 {
1820     int                  n     = 0;
1821     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1822     int                  nmol;
1823     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1824     {
1825         for (auto &ilist : extractILists(*ilists, IF_BOND))
1826         {
1827             if (NRAL(ilist.functionType) >  2)
1828             {
1829                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1830             }
1831         }
1832     }
1833
1834     return n;
1835 }
1836
1837 static int dd_getenv(const gmx::MDLogger &mdlog,
1838                      const char *env_var, int def)
1839 {
1840     char *val;
1841     int   nst;
1842
1843     nst = def;
1844     val = getenv(env_var);
1845     if (val)
1846     {
1847         if (sscanf(val, "%20d", &nst) <= 0)
1848         {
1849             nst = 1;
1850         }
1851         GMX_LOG(mdlog.info).appendTextFormatted(
1852                 "Found env.var. %s = %s, using value %d",
1853                 env_var, val, nst);
1854     }
1855
1856     return nst;
1857 }
1858
1859 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1860                                   const t_inputrec    *ir,
1861                                   const gmx::MDLogger &mdlog)
1862 {
1863     if (ir->ePBC == epbcSCREW &&
1864         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1865     {
1866         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1867     }
1868
1869     if (ir->ns_type == ensSIMPLE)
1870     {
1871         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1872     }
1873
1874     if (ir->nstlist == 0)
1875     {
1876         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1877     }
1878
1879     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1880     {
1881         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1882     }
1883 }
1884
1885 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1886 {
1887     int  di, d;
1888     real r;
1889
1890     r = ddbox->box_size[XX];
1891     for (di = 0; di < dd->ndim; di++)
1892     {
1893         d = dd->dim[di];
1894         /* Check using the initial average cell size */
1895         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1896     }
1897
1898     return r;
1899 }
1900
1901 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1902  */
1903 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1904                                   const std::string   &reasonStr,
1905                                   const gmx::MDLogger &mdlog)
1906 {
1907     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1908     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1909
1910     if (cmdlineDlbState == DlbState::onUser)
1911     {
1912         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1913     }
1914     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1915     {
1916         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1917     }
1918     return DlbState::offForever;
1919 }
1920
1921 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1922  *
1923  * This function parses the parameters of "-dlb" command line option setting
1924  * corresponding state values. Then it checks the consistency of the determined
1925  * state with other run parameters and settings. As a result, the initial state
1926  * may be altered or an error may be thrown if incompatibility of options is detected.
1927  *
1928  * \param [in] mdlog       Logger.
1929  * \param [in] dlbOption   Enum value for the DLB option.
1930  * \param [in] bRecordLoad True if the load balancer is recording load information.
1931  * \param [in] mdrunOptions  Options for mdrun.
1932  * \param [in] ir          Pointer mdrun to input parameters.
1933  * \returns                DLB initial/startup state.
1934  */
1935 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1936                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1937                                          const MdrunOptions &mdrunOptions,
1938                                          const t_inputrec *ir)
1939 {
1940     DlbState dlbState = DlbState::offCanTurnOn;
1941
1942     switch (dlbOption)
1943     {
1944         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1945         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1946         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1947         default: gmx_incons("Invalid dlbOption enum value");
1948     }
1949
1950     /* Reruns don't support DLB: bail or override auto mode */
1951     if (mdrunOptions.rerun)
1952     {
1953         std::string reasonStr = "it is not supported in reruns.";
1954         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1955     }
1956
1957     /* Unsupported integrators */
1958     if (!EI_DYNAMICS(ir->eI))
1959     {
1960         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1961         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1962     }
1963
1964     /* Without cycle counters we can't time work to balance on */
1965     if (!bRecordLoad)
1966     {
1967         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1968         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1969     }
1970
1971     if (mdrunOptions.reproducible)
1972     {
1973         std::string reasonStr = "you started a reproducible run.";
1974         switch (dlbState)
1975         {
1976             case DlbState::offUser:
1977                 break;
1978             case DlbState::offForever:
1979                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1980                 break;
1981             case DlbState::offCanTurnOn:
1982                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1983             case DlbState::onCanTurnOff:
1984                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1985                 break;
1986             case DlbState::onUser:
1987                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1988             default:
1989                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1990         }
1991     }
1992
1993     return dlbState;
1994 }
1995
1996 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1997 {
1998     dd->ndim = 0;
1999     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
2000     {
2001         /* Decomposition order z,y,x */
2002         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
2003         for (int dim = DIM-1; dim >= 0; dim--)
2004         {
2005             if (dd->nc[dim] > 1)
2006             {
2007                 dd->dim[dd->ndim++] = dim;
2008             }
2009         }
2010     }
2011     else
2012     {
2013         /* Decomposition order x,y,z */
2014         for (int dim = 0; dim < DIM; dim++)
2015         {
2016             if (dd->nc[dim] > 1)
2017             {
2018                 dd->dim[dd->ndim++] = dim;
2019             }
2020         }
2021     }
2022
2023     if (dd->ndim == 0)
2024     {
2025         /* Set dim[0] to avoid extra checks on ndim in several places */
2026         dd->dim[0] = XX;
2027     }
2028 }
2029
2030 static gmx_domdec_comm_t *init_dd_comm()
2031 {
2032     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2033
2034     comm->n_load_have      = 0;
2035     comm->n_load_collect   = 0;
2036
2037     comm->haveTurnedOffDlb = false;
2038
2039     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2040     {
2041         comm->sum_nat[i] = 0;
2042     }
2043     comm->ndecomp   = 0;
2044     comm->nload     = 0;
2045     comm->load_step = 0;
2046     comm->load_sum  = 0;
2047     comm->load_max  = 0;
2048     clear_ivec(comm->load_lim);
2049     comm->load_mdf  = 0;
2050     comm->load_pme  = 0;
2051
2052     /* This should be replaced by a unique pointer */
2053     comm->balanceRegion = ddBalanceRegionAllocate();
2054
2055     return comm;
2056 }
2057
2058 /* Returns whether mtop contains constraints and/or vsites */
2059 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2060 {
2061     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2062     int  nmol;
2063     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2064     {
2065         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2066         {
2067             return true;
2068         }
2069     }
2070
2071     return false;
2072 }
2073
2074 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2075                               const gmx_mtop_t    &mtop,
2076                               const t_inputrec    &inputrec,
2077                               real                 cutoffMargin,
2078                               int                  numMpiRanksTotal,
2079                               gmx_domdec_comm_t   *comm)
2080 {
2081     /* When we have constraints and/or vsites, it is beneficial to use
2082      * update groups (when possible) to allow independent update of groups.
2083      */
2084     if (!systemHasConstraintsOrVsites(mtop))
2085     {
2086         /* No constraints or vsites, atoms can be updated independently */
2087         return;
2088     }
2089
2090     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2091     comm->useUpdateGroups               = !comm->updateGroupingPerMoleculetype.empty();
2092
2093     if (comm->useUpdateGroups)
2094     {
2095         int numUpdateGroups = 0;
2096         for (const auto &molblock : mtop.molblock)
2097         {
2098             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2099         }
2100
2101         /* Note: We would like to use dd->nnodes for the atom count estimate,
2102          *       but that is not yet available here. But this anyhow only
2103          *       affect performance up to the second dd_partition_system call.
2104          */
2105         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2106         comm->updateGroupsCog =
2107             gmx::compat::make_unique<gmx::UpdateGroupsCog>(mtop,
2108                                                            comm->updateGroupingPerMoleculetype,
2109                                                            maxReferenceTemperature(inputrec),
2110                                                            homeAtomCountEstimate);
2111
2112         /* To use update groups, the large domain-to-domain cutoff distance
2113          * should be compatible with the box size.
2114          */
2115         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2116
2117         if (comm->useUpdateGroups)
2118         {
2119             GMX_LOG(mdlog.info).appendTextFormatted(
2120                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2121                     numUpdateGroups,
2122                     mtop.natoms/static_cast<double>(numUpdateGroups),
2123                     comm->updateGroupsCog->maxUpdateGroupRadius());
2124         }
2125         else
2126         {
2127             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2128             comm->updateGroupingPerMoleculetype.clear();
2129             comm->updateGroupsCog.reset(nullptr);
2130         }
2131     }
2132 }
2133
2134 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2135 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2136                                    t_commrec *cr, gmx_domdec_t *dd,
2137                                    const DomdecOptions &options,
2138                                    const MdrunOptions &mdrunOptions,
2139                                    const gmx_mtop_t *mtop,
2140                                    const t_inputrec *ir,
2141                                    const matrix box,
2142                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2143                                    gmx_ddbox_t *ddbox)
2144 {
2145     real               r_bonded         = -1;
2146     real               r_bonded_limit   = -1;
2147     const real         tenPercentMargin = 1.1;
2148     gmx_domdec_comm_t *comm             = dd->comm;
2149
2150     dd->npbcdim              = ePBC2npbcdim(ir->ePBC);
2151     dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2152     dd->haveDynamicBox       = inputrecDynamicBox(ir);
2153     dd->bScrewPBC            = (ir->ePBC == epbcSCREW);
2154
2155     dd->pme_recv_f_alloc = 0;
2156     dd->pme_recv_f_buf   = nullptr;
2157
2158     /* Initialize to GPU share count to 0, might change later */
2159     comm->nrank_gpu_shared = 0;
2160
2161     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2162     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2163     /* To consider turning DLB on after 2*nstlist steps we need to check
2164      * at partitioning count 3. Thus we need to increase the first count by 2.
2165      */
2166     comm->ddPartioningCountFirstDlbOff += 2;
2167
2168     GMX_LOG(mdlog.info).appendTextFormatted(
2169             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2170
2171     comm->bPMELoadBalDLBLimits = FALSE;
2172
2173     /* Allocate the charge group/atom sorting struct */
2174     comm->sort = gmx::compat::make_unique<gmx_domdec_sort_t>();
2175
2176     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
2177
2178     /* We need to decide on update groups early, as this affects communication distances */
2179     comm->useUpdateGroups = false;
2180     if (ir->cutoff_scheme == ecutsVERLET)
2181     {
2182         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2183         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2184     }
2185
2186     comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
2187                              mtop->bIntermolecularInteractions);
2188     if (comm->bInterCGBondeds)
2189     {
2190         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
2191     }
2192     else
2193     {
2194         comm->bInterCGMultiBody = FALSE;
2195     }
2196
2197     if (comm->useUpdateGroups)
2198     {
2199         dd->splitConstraints = false;
2200         dd->splitSettles     = false;
2201     }
2202     else
2203     {
2204         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2205         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2206     }
2207
2208     if (ir->rlist == 0)
2209     {
2210         /* Set the cut-off to some very large value,
2211          * so we don't need if statements everywhere in the code.
2212          * We use sqrt, since the cut-off is squared in some places.
2213          */
2214         comm->cutoff   = GMX_CUTOFF_INF;
2215     }
2216     else
2217     {
2218         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2219     }
2220     comm->cutoff_mbody = 0;
2221
2222     /* Determine the minimum cell size limit, affected by many factors */
2223     comm->cellsize_limit = 0;
2224     comm->bBondComm      = FALSE;
2225
2226     /* We do not allow home atoms to move beyond the neighboring domain
2227      * between domain decomposition steps, which limits the cell size.
2228      * Get an estimate of cell size limit due to atom displacement.
2229      * In most cases this is a large overestimate, because it assumes
2230      * non-interaction atoms.
2231      * We set the chance to 1 in a trillion steps.
2232      */
2233     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2234     const real     limitForAtomDisplacement          =
2235         minCellSizeForAtomDisplacement(*mtop, *ir,
2236                                        comm->updateGroupingPerMoleculetype,
2237                                        c_chanceThatAtomMovesBeyondDomain);
2238     GMX_LOG(mdlog.info).appendTextFormatted(
2239             "Minimum cell size due to atom displacement: %.3f nm",
2240             limitForAtomDisplacement);
2241
2242     comm->cellsize_limit = std::max(comm->cellsize_limit,
2243                                     limitForAtomDisplacement);
2244
2245     /* TODO: PME decomposition currently requires atoms not to be more than
2246      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2247      *       In nearly all cases, limitForAtomDisplacement will be smaller
2248      *       than 2/3*rlist, so the PME requirement is satisfied.
2249      *       But it would be better for both correctness and performance
2250      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2251      *       Note that we would need to improve the pairlist buffer case.
2252      */
2253
2254     if (comm->bInterCGBondeds)
2255     {
2256         if (options.minimumCommunicationRange > 0)
2257         {
2258             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2259             if (options.useBondedCommunication)
2260             {
2261                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2262             }
2263             else
2264             {
2265                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2266             }
2267             r_bonded_limit = comm->cutoff_mbody;
2268         }
2269         else if (ir->bPeriodicMols)
2270         {
2271             /* Can not easily determine the required cut-off */
2272             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2273             comm->cutoff_mbody = comm->cutoff/2;
2274             r_bonded_limit     = comm->cutoff_mbody;
2275         }
2276         else
2277         {
2278             real r_2b, r_mb;
2279
2280             if (MASTER(cr))
2281             {
2282                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2283                                       options.checkBondedInteractions,
2284                                       &r_2b, &r_mb);
2285             }
2286             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2287             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2288
2289             /* We use an initial margin of 10% for the minimum cell size,
2290              * except when we are just below the non-bonded cut-off.
2291              */
2292             if (options.useBondedCommunication)
2293             {
2294                 if (std::max(r_2b, r_mb) > comm->cutoff)
2295                 {
2296                     r_bonded        = std::max(r_2b, r_mb);
2297                     r_bonded_limit  = tenPercentMargin*r_bonded;
2298                     comm->bBondComm = TRUE;
2299                 }
2300                 else
2301                 {
2302                     r_bonded       = r_mb;
2303                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2304                 }
2305                 /* We determine cutoff_mbody later */
2306             }
2307             else
2308             {
2309                 /* No special bonded communication,
2310                  * simply increase the DD cut-off.
2311                  */
2312                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2313                 comm->cutoff_mbody = r_bonded_limit;
2314                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2315             }
2316         }
2317         GMX_LOG(mdlog.info).appendTextFormatted(
2318                 "Minimum cell size due to bonded interactions: %.3f nm",
2319                 r_bonded_limit);
2320
2321         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2322     }
2323
2324     real rconstr = 0;
2325     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2326     {
2327         /* There is a cell size limit due to the constraints (P-LINCS) */
2328         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2329         GMX_LOG(mdlog.info).appendTextFormatted(
2330                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2331                 rconstr);
2332         if (rconstr > comm->cellsize_limit)
2333         {
2334             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2335         }
2336     }
2337     else if (options.constraintCommunicationRange > 0)
2338     {
2339         /* Here we do not check for dd->splitConstraints.
2340          * because one can also set a cell size limit for virtual sites only
2341          * and at this point we don't know yet if there are intercg v-sites.
2342          */
2343         GMX_LOG(mdlog.info).appendTextFormatted(
2344                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2345                 options.constraintCommunicationRange);
2346         rconstr = options.constraintCommunicationRange;
2347     }
2348     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2349
2350     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2351
2352     if (options.numCells[XX] > 0)
2353     {
2354         copy_ivec(options.numCells, dd->nc);
2355         set_dd_dim(mdlog, dd);
2356         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2357
2358         if (options.numPmeRanks >= 0)
2359         {
2360             cr->npmenodes = options.numPmeRanks;
2361         }
2362         else
2363         {
2364             /* When the DD grid is set explicitly and -npme is set to auto,
2365              * don't use PME ranks. We check later if the DD grid is
2366              * compatible with the total number of ranks.
2367              */
2368             cr->npmenodes = 0;
2369         }
2370
2371         real acs = average_cellsize_min(dd, ddbox);
2372         if (acs < comm->cellsize_limit)
2373         {
2374             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2375                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2376                                  acs, comm->cellsize_limit);
2377         }
2378     }
2379     else
2380     {
2381         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2382
2383         /* We need to choose the optimal DD grid and possibly PME nodes */
2384         real limit =
2385             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2386                            options.numPmeRanks,
2387                            !isDlbDisabled(comm),
2388                            options.dlbScaling,
2389                            comm->cellsize_limit, comm->cutoff,
2390                            comm->bInterCGBondeds);
2391
2392         if (dd->nc[XX] == 0)
2393         {
2394             char     buf[STRLEN];
2395             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2396             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2397                     !bC ? "-rdd" : "-rcon",
2398                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2399                     bC ? " or your LINCS settings" : "");
2400
2401             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2402                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2403                                  "%s\n"
2404                                  "Look in the log file for details on the domain decomposition",
2405                                  cr->nnodes-cr->npmenodes, limit, buf);
2406         }
2407         set_dd_dim(mdlog, dd);
2408     }
2409
2410     GMX_LOG(mdlog.info).appendTextFormatted(
2411             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2412             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2413
2414     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2415     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2416     {
2417         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2418                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2419                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2420     }
2421     if (cr->npmenodes > dd->nnodes)
2422     {
2423         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2424                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2425     }
2426     if (cr->npmenodes > 0)
2427     {
2428         comm->npmenodes = cr->npmenodes;
2429     }
2430     else
2431     {
2432         comm->npmenodes = dd->nnodes;
2433     }
2434
2435     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2436     {
2437         /* The following choices should match those
2438          * in comm_cost_est in domdec_setup.c.
2439          * Note that here the checks have to take into account
2440          * that the decomposition might occur in a different order than xyz
2441          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2442          * in which case they will not match those in comm_cost_est,
2443          * but since that is mainly for testing purposes that's fine.
2444          */
2445         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2446             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2447             getenv("GMX_PMEONEDD") == nullptr)
2448         {
2449             comm->npmedecompdim = 2;
2450             comm->npmenodes_x   = dd->nc[XX];
2451             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2452         }
2453         else
2454         {
2455             /* In case nc is 1 in both x and y we could still choose to
2456              * decompose pme in y instead of x, but we use x for simplicity.
2457              */
2458             comm->npmedecompdim = 1;
2459             if (dd->dim[0] == YY)
2460             {
2461                 comm->npmenodes_x = 1;
2462                 comm->npmenodes_y = comm->npmenodes;
2463             }
2464             else
2465             {
2466                 comm->npmenodes_x = comm->npmenodes;
2467                 comm->npmenodes_y = 1;
2468             }
2469         }
2470         GMX_LOG(mdlog.info).appendTextFormatted(
2471                 "PME domain decomposition: %d x %d x %d",
2472                 comm->npmenodes_x, comm->npmenodes_y, 1);
2473     }
2474     else
2475     {
2476         comm->npmedecompdim = 0;
2477         comm->npmenodes_x   = 0;
2478         comm->npmenodes_y   = 0;
2479     }
2480
2481     snew(comm->slb_frac, DIM);
2482     if (isDlbDisabled(comm))
2483     {
2484         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2485         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2486         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2487     }
2488
2489     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
2490     {
2491         if (comm->bBondComm || !isDlbDisabled(comm))
2492         {
2493             /* Set the bonded communication distance to halfway
2494              * the minimum and the maximum,
2495              * since the extra communication cost is nearly zero.
2496              */
2497             real acs           = average_cellsize_min(dd, ddbox);
2498             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2499             if (!isDlbDisabled(comm))
2500             {
2501                 /* Check if this does not limit the scaling */
2502                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2503                                               options.dlbScaling*acs);
2504             }
2505             if (!comm->bBondComm)
2506             {
2507                 /* Without bBondComm do not go beyond the n.b. cut-off */
2508                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2509                 if (comm->cellsize_limit >= comm->cutoff)
2510                 {
2511                     /* We don't loose a lot of efficieny
2512                      * when increasing it to the n.b. cut-off.
2513                      * It can even be slightly faster, because we need
2514                      * less checks for the communication setup.
2515                      */
2516                     comm->cutoff_mbody = comm->cutoff;
2517                 }
2518             }
2519             /* Check if we did not end up below our original limit */
2520             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2521
2522             if (comm->cutoff_mbody > comm->cellsize_limit)
2523             {
2524                 comm->cellsize_limit = comm->cutoff_mbody;
2525             }
2526         }
2527         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2528     }
2529
2530     if (debug)
2531     {
2532         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2533                 "cellsize limit %f\n",
2534                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2535     }
2536
2537     if (MASTER(cr))
2538     {
2539         check_dd_restrictions(dd, ir, mdlog);
2540     }
2541 }
2542
2543 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2544 {
2545     int   ncg, cg;
2546     char *bLocalCG;
2547
2548     ncg = ncg_mtop(mtop);
2549     snew(bLocalCG, ncg);
2550     for (cg = 0; cg < ncg; cg++)
2551     {
2552         bLocalCG[cg] = FALSE;
2553     }
2554
2555     return bLocalCG;
2556 }
2557
2558 void dd_init_bondeds(FILE *fplog,
2559                      gmx_domdec_t *dd,
2560                      const gmx_mtop_t *mtop,
2561                      const gmx_vsite_t *vsite,
2562                      const t_inputrec *ir,
2563                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2564 {
2565     gmx_domdec_comm_t *comm;
2566
2567     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2568
2569     comm = dd->comm;
2570
2571     if (comm->bBondComm)
2572     {
2573         /* Communicate atoms beyond the cut-off for bonded interactions */
2574         comm = dd->comm;
2575
2576         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2577
2578         comm->bLocalCG = init_bLocalCG(mtop);
2579     }
2580     else
2581     {
2582         /* Only communicate atoms based on cut-off */
2583         comm->cglink   = nullptr;
2584         comm->bLocalCG = nullptr;
2585     }
2586 }
2587
2588 static void writeSettings(gmx::TextWriter       *log,
2589                           gmx_domdec_t          *dd,
2590                           const gmx_mtop_t      *mtop,
2591                           const t_inputrec      *ir,
2592                           gmx_bool               bDynLoadBal,
2593                           real                   dlb_scale,
2594                           const gmx_ddbox_t     *ddbox)
2595 {
2596     gmx_domdec_comm_t *comm;
2597     int                d;
2598     ivec               np;
2599     real               limit, shrink;
2600
2601     comm = dd->comm;
2602
2603     if (bDynLoadBal)
2604     {
2605         log->writeString("The maximum number of communication pulses is:");
2606         for (d = 0; d < dd->ndim; d++)
2607         {
2608             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2609         }
2610         log->ensureLineBreak();
2611         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2612         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2613         log->writeString("The allowed shrink of domain decomposition cells is:");
2614         for (d = 0; d < DIM; d++)
2615         {
2616             if (dd->nc[d] > 1)
2617             {
2618                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2619                 {
2620                     shrink = 0;
2621                 }
2622                 else
2623                 {
2624                     shrink =
2625                         comm->cellsize_min_dlb[d]/
2626                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2627                 }
2628                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2629             }
2630         }
2631         log->ensureLineBreak();
2632     }
2633     else
2634     {
2635         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2636         log->writeString("The initial number of communication pulses is:");
2637         for (d = 0; d < dd->ndim; d++)
2638         {
2639             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2640         }
2641         log->ensureLineBreak();
2642         log->writeString("The initial domain decomposition cell size is:");
2643         for (d = 0; d < DIM; d++)
2644         {
2645             if (dd->nc[d] > 1)
2646             {
2647                 log->writeStringFormatted(" %c %.2f nm",
2648                                           dim2char(d), dd->comm->cellsize_min[d]);
2649             }
2650         }
2651         log->ensureLineBreak();
2652         log->writeLine();
2653     }
2654
2655     gmx_bool bInterCGVsites = count_intercg_vsites(mtop) != 0;
2656
2657     if (comm->bInterCGBondeds ||
2658         bInterCGVsites ||
2659         dd->splitConstraints || dd->splitSettles)
2660     {
2661         std::string decompUnits;
2662         if (comm->bCGs)
2663         {
2664             decompUnits = "charge groups";
2665         }
2666         else if (comm->useUpdateGroups)
2667         {
2668             decompUnits = "atom groups";
2669         }
2670         else
2671         {
2672             decompUnits = "atoms";
2673         }
2674
2675         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2676         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2677
2678         if (bDynLoadBal)
2679         {
2680             limit = dd->comm->cellsize_limit;
2681         }
2682         else
2683         {
2684             if (dynamic_dd_box(*dd))
2685             {
2686                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2687             }
2688             limit = dd->comm->cellsize_min[XX];
2689             for (d = 1; d < DIM; d++)
2690             {
2691                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2692             }
2693         }
2694
2695         if (comm->bInterCGBondeds)
2696         {
2697             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2698                                     "two-body bonded interactions", "(-rdd)",
2699                                     std::max(comm->cutoff, comm->cutoff_mbody));
2700             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2701                                     "multi-body bonded interactions", "(-rdd)",
2702                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2703         }
2704         if (bInterCGVsites)
2705         {
2706             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2707                                     "virtual site constructions", "(-rcon)", limit);
2708         }
2709         if (dd->splitConstraints || dd->splitSettles)
2710         {
2711             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2712                                                        1+ir->nProjOrder);
2713             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2714                                     separation.c_str(), "(-rcon)", limit);
2715         }
2716         log->ensureLineBreak();
2717     }
2718 }
2719
2720 static void logSettings(const gmx::MDLogger &mdlog,
2721                         gmx_domdec_t        *dd,
2722                         const gmx_mtop_t    *mtop,
2723                         const t_inputrec    *ir,
2724                         real                 dlb_scale,
2725                         const gmx_ddbox_t   *ddbox)
2726 {
2727     gmx::StringOutputStream stream;
2728     gmx::TextWriter         log(&stream);
2729     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2730     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2731     {
2732         {
2733             log.ensureEmptyLine();
2734             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2735         }
2736         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2737     }
2738     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2739 }
2740
2741 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2742                                 gmx_domdec_t        *dd,
2743                                 real                 dlb_scale,
2744                                 const t_inputrec    *ir,
2745                                 const gmx_ddbox_t   *ddbox)
2746 {
2747     gmx_domdec_comm_t *comm;
2748     int                d, dim, npulse, npulse_d_max, npulse_d;
2749     gmx_bool           bNoCutOff;
2750
2751     comm = dd->comm;
2752
2753     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2754
2755     /* Determine the maximum number of comm. pulses in one dimension */
2756
2757     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2758
2759     /* Determine the maximum required number of grid pulses */
2760     if (comm->cellsize_limit >= comm->cutoff)
2761     {
2762         /* Only a single pulse is required */
2763         npulse = 1;
2764     }
2765     else if (!bNoCutOff && comm->cellsize_limit > 0)
2766     {
2767         /* We round down slightly here to avoid overhead due to the latency
2768          * of extra communication calls when the cut-off
2769          * would be only slightly longer than the cell size.
2770          * Later cellsize_limit is redetermined,
2771          * so we can not miss interactions due to this rounding.
2772          */
2773         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2774     }
2775     else
2776     {
2777         /* There is no cell size limit */
2778         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2779     }
2780
2781     if (!bNoCutOff && npulse > 1)
2782     {
2783         /* See if we can do with less pulses, based on dlb_scale */
2784         npulse_d_max = 0;
2785         for (d = 0; d < dd->ndim; d++)
2786         {
2787             dim      = dd->dim[d];
2788             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2789                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2790             npulse_d_max = std::max(npulse_d_max, npulse_d);
2791         }
2792         npulse = std::min(npulse, npulse_d_max);
2793     }
2794
2795     /* This env var can override npulse */
2796     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2797     if (d > 0)
2798     {
2799         npulse = d;
2800     }
2801
2802     comm->maxpulse       = 1;
2803     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2804     for (d = 0; d < dd->ndim; d++)
2805     {
2806         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2807         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2808         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2809         {
2810             comm->bVacDLBNoLimit = FALSE;
2811         }
2812     }
2813
2814     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2815     if (!comm->bVacDLBNoLimit)
2816     {
2817         comm->cellsize_limit = std::max(comm->cellsize_limit,
2818                                         comm->cutoff/comm->maxpulse);
2819     }
2820     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2821     /* Set the minimum cell size for each DD dimension */
2822     for (d = 0; d < dd->ndim; d++)
2823     {
2824         if (comm->bVacDLBNoLimit ||
2825             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2826         {
2827             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2828         }
2829         else
2830         {
2831             comm->cellsize_min_dlb[dd->dim[d]] =
2832                 comm->cutoff/comm->cd[d].np_dlb;
2833         }
2834     }
2835     if (comm->cutoff_mbody <= 0)
2836     {
2837         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2838     }
2839     if (isDlbOn(comm))
2840     {
2841         set_dlb_limits(dd);
2842     }
2843 }
2844
2845 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2846 {
2847     /* If each molecule is a single charge group
2848      * or we use domain decomposition for each periodic dimension,
2849      * we do not need to take pbc into account for the bonded interactions.
2850      */
2851     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
2852             !(dd->nc[XX] > 1 &&
2853               dd->nc[YY] > 1 &&
2854               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2855 }
2856
2857 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2858 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2859                                   gmx_domdec_t *dd, real dlb_scale,
2860                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2861                                   const gmx_ddbox_t *ddbox)
2862 {
2863     gmx_domdec_comm_t *comm;
2864     int                natoms_tot;
2865     real               vol_frac;
2866
2867     comm = dd->comm;
2868
2869     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2870     {
2871         init_ddpme(dd, &comm->ddpme[0], 0);
2872         if (comm->npmedecompdim >= 2)
2873         {
2874             init_ddpme(dd, &comm->ddpme[1], 1);
2875         }
2876     }
2877     else
2878     {
2879         comm->npmenodes = 0;
2880         if (dd->pme_nodeid >= 0)
2881         {
2882             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2883                                  "Can not have separate PME ranks without PME electrostatics");
2884         }
2885     }
2886
2887     if (debug)
2888     {
2889         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2890     }
2891     if (!isDlbDisabled(comm))
2892     {
2893         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2894     }
2895
2896     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2897
2898     if (ir->ePBC == epbcNONE)
2899     {
2900         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2901     }
2902     else
2903     {
2904         vol_frac =
2905             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2906     }
2907     if (debug)
2908     {
2909         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2910     }
2911     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2912
2913     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2914                                  static_cast<int>(vol_frac*natoms_tot));
2915 }
2916
2917 /*! \brief Set some important DD parameters that can be modified by env.vars */
2918 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2919                                   gmx_domdec_t *dd, int rank_mysim)
2920 {
2921     gmx_domdec_comm_t *comm = dd->comm;
2922
2923     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2924     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2925     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2926     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2927     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2928     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2929     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2930
2931     if (dd->bSendRecv2)
2932     {
2933         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2934     }
2935
2936     if (comm->eFlop)
2937     {
2938         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2939         if (comm->eFlop > 1)
2940         {
2941             srand(1 + rank_mysim);
2942         }
2943         comm->bRecordLoad = TRUE;
2944     }
2945     else
2946     {
2947         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2948     }
2949 }
2950
2951 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2952                                         t_commrec                     *cr,
2953                                         const DomdecOptions           &options,
2954                                         const MdrunOptions            &mdrunOptions,
2955                                         const gmx_mtop_t              *mtop,
2956                                         const t_inputrec              *ir,
2957                                         const matrix                   box,
2958                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2959                                         gmx::LocalAtomSetManager      *atomSets)
2960 {
2961     gmx_domdec_t      *dd;
2962
2963     GMX_LOG(mdlog.info).appendTextFormatted(
2964             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2965
2966     dd = new gmx_domdec_t;
2967
2968     dd->comm = init_dd_comm();
2969
2970     /* Initialize DD paritioning counters */
2971     dd->comm->partition_step = INT_MIN;
2972     dd->ddp_count            = 0;
2973
2974     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2975
2976     gmx_ddbox_t ddbox = {0};
2977     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2978                            mtop, ir,
2979                            box, xGlobal,
2980                            &ddbox);
2981
2982     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2983
2984     if (thisRankHasDuty(cr, DUTY_PP))
2985     {
2986         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2987
2988         setup_neighbor_relations(dd);
2989     }
2990
2991     /* Set overallocation to avoid frequent reallocation of arrays */
2992     set_over_alloc_dd(TRUE);
2993
2994     clear_dd_cycle_counts(dd);
2995
2996     dd->atomSets = atomSets;
2997
2998     return dd;
2999 }
3000
3001 static gmx_bool test_dd_cutoff(t_commrec     *cr,
3002                                const t_state &state,
3003                                real           cutoffRequested)
3004 {
3005     gmx_domdec_t *dd;
3006     gmx_ddbox_t   ddbox;
3007     int           d, dim, np;
3008     real          inv_cell_size;
3009     int           LocallyLimited;
3010
3011     dd = cr->dd;
3012
3013     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
3014
3015     LocallyLimited = 0;
3016
3017     for (d = 0; d < dd->ndim; d++)
3018     {
3019         dim = dd->dim[d];
3020
3021         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3022         if (dynamic_dd_box(*dd))
3023         {
3024             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3025         }
3026
3027         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3028
3029         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3030         {
3031             if (np > dd->comm->cd[d].np_dlb)
3032             {
3033                 return FALSE;
3034             }
3035
3036             /* If a current local cell size is smaller than the requested
3037              * cut-off, we could still fix it, but this gets very complicated.
3038              * Without fixing here, we might actually need more checks.
3039              */
3040             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3041             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3042             {
3043                 LocallyLimited = 1;
3044             }
3045         }
3046     }
3047
3048     if (!isDlbDisabled(dd->comm))
3049     {
3050         /* If DLB is not active yet, we don't need to check the grid jumps.
3051          * Actually we shouldn't, because then the grid jump data is not set.
3052          */
3053         if (isDlbOn(dd->comm) &&
3054             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3055         {
3056             LocallyLimited = 1;
3057         }
3058
3059         gmx_sumi(1, &LocallyLimited, cr);
3060
3061         if (LocallyLimited > 0)
3062         {
3063             return FALSE;
3064         }
3065     }
3066
3067     return TRUE;
3068 }
3069
3070 gmx_bool change_dd_cutoff(t_commrec     *cr,
3071                           const t_state &state,
3072                           real           cutoffRequested)
3073 {
3074     gmx_bool bCutoffAllowed;
3075
3076     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3077
3078     if (bCutoffAllowed)
3079     {
3080         cr->dd->comm->cutoff = cutoffRequested;
3081     }
3082
3083     return bCutoffAllowed;
3084 }