src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51
  52 #include "gromacs/compat/make_unique.h"
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/partition.h"
  59 #include "gromacs/gmxlib/network.h"
  60 #include "gromacs/gmxlib/nrnb.h"
  61 #include "gromacs/gpu_utils/gpu_utils.h"
  62 #include "gromacs/hardware/hw_info.h"
  63 #include "gromacs/listed_forces/manage-threading.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdlib/calc_verletbuf.h"
  67 #include "gromacs/mdlib/constr.h"
  68 #include "gromacs/mdlib/constraintrange.h"
  69 #include "gromacs/mdlib/mdrun.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/inputrec.h"
  74 #include "gromacs/mdtypes/state.h"
  75 #include "gromacs/pbcutil/ishift.h"
  76 #include "gromacs/pbcutil/pbc.h"
  77 #include "gromacs/pulling/pull.h"
  78 #include "gromacs/timing/wallcycle.h"
  79 #include "gromacs/topology/block.h"
  80 #include "gromacs/topology/idef.h"
  81 #include "gromacs/topology/ifunc.h"
  82 #include "gromacs/topology/mtop_lookup.h"
  83 #include "gromacs/topology/mtop_util.h"
  84 #include "gromacs/topology/topology.h"
  85 #include "gromacs/utility/basedefinitions.h"
  86 #include "gromacs/utility/basenetwork.h"
  87 #include "gromacs/utility/cstringutil.h"
  88 #include "gromacs/utility/exceptions.h"
  89 #include "gromacs/utility/fatalerror.h"
  90 #include "gromacs/utility/gmxmpi.h"
  91 #include "gromacs/utility/logger.h"
  92 #include "gromacs/utility/real.h"
  93 #include "gromacs/utility/smalloc.h"
  94 #include "gromacs/utility/strconvert.h"
  95 #include "gromacs/utility/stringstream.h"
  96 #include "gromacs/utility/stringutil.h"
  97 #include "gromacs/utility/textwriter.h"
  98
  99 #include "atomdistribution.h"
 100 #include "box.h"
 101 #include "cellsizes.h"
 102 #include "distribute.h"
 103 #include "domdec_constraints.h"
 104 #include "domdec_internal.h"
 105 #include "domdec_vsite.h"
 106 #include "redistribute.h"
 107 #include "utility.h"
 108
 109 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 110
 111 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 112 #define DD_CGIBS 2
 113
 114 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 115 #define DD_FLAG_NRCG  65535
 116 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 117 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 118
 119 /* The DD zone order */
 120 static const ivec dd_zo[DD_MAXZONE] =
 121 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 122
 123 /* The non-bonded zone-pair setup for domain decomposition
 124  * The first number is the i-zone, the second number the first j-zone seen by
 125  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 126  * As is, this is for 3D decomposition, where there are 4 i-zones.
 127  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 128  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 129  */
 130 static const int
 131     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 132                                                  {1, 3, 6},
 133                                                  {2, 5, 6},
 134                                                  {3, 5, 7}};
 135
 136
 137 /*
 138    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 139
 140    static void index2xyz(ivec nc,int ind,ivec xyz)
 141    {
 142    xyz[XX] = ind % nc[XX];
 143    xyz[YY] = (ind / nc[XX]) % nc[YY];
 144    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 145    }
 146  */
 147
 148 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 149 {
 150     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 151     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 152     xyz[ZZ] = ind % nc[ZZ];
 153 }
 154
 155 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 156 {
 157     int ddindex;
 158     int ddnodeid = -1;
 159
 160     ddindex = dd_index(dd->nc, c);
 161     if (dd->comm->bCartesianPP_PME)
 162     {
 163         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 164     }
 165     else if (dd->comm->bCartesianPP)
 166     {
 167 #if GMX_MPI
 168         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 169 #endif
 170     }
 171     else
 172     {
 173         ddnodeid = ddindex;
 174     }
 175
 176     return ddnodeid;
 177 }
 178
 179 int ddglatnr(const gmx_domdec_t *dd, int i)
 180 {
 181     int atnr;
 182
 183     if (dd == nullptr)
 184     {
 185         atnr = i + 1;
 186     }
 187     else
 188     {
 189         if (i >= dd->comm->atomRanges.numAtomsTotal())
 190         {
 191             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 192         }
 193         atnr = dd->globalAtomIndices[i] + 1;
 194     }
 195
 196     return atnr;
 197 }
 198
 199 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 200 {
 201     return &dd->comm->cgs_gl;
 202 }
 203
 204 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 205 {
 206     int i;
 207
 208     if (state->ddp_count != dd->ddp_count)
 209     {
 210         gmx_incons("The MD state does not match the domain decomposition state");
 211     }
 212
 213     state->cg_gl.resize(dd->ncg_home);
 214     for (i = 0; i < dd->ncg_home; i++)
 215     {
 216         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 217     }
 218
 219     state->ddp_count_cg_gl = dd->ddp_count;
 220 }
 221
 222 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 223 {
 224     return &dd->comm->zones;
 225 }
 226
 227 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 228                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 229 {
 230     gmx_domdec_zones_t *zones;
 231     int                 izone, d, dim;
 232
 233     zones = &dd->comm->zones;
 234
 235     izone = 0;
 236     while (icg >= zones->izone[izone].cg1)
 237     {
 238         izone++;
 239     }
 240
 241     if (izone == 0)
 242     {
 243         *jcg0 = icg;
 244     }
 245     else if (izone < zones->nizone)
 246     {
 247         *jcg0 = zones->izone[izone].jcg0;
 248     }
 249     else
 250     {
 251         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 252                   icg, izone, zones->nizone);
 253     }
 254
 255     *jcg1 = zones->izone[izone].jcg1;
 256
 257     for (d = 0; d < dd->ndim; d++)
 258     {
 259         dim         = dd->dim[d];
 260         shift0[dim] = zones->izone[izone].shift0[dim];
 261         shift1[dim] = zones->izone[izone].shift1[dim];
 262         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 263         {
 264             /* A conservative approach, this can be optimized */
 265             shift0[dim] -= 1;
 266             shift1[dim] += 1;
 267         }
 268     }
 269 }
 270
 271 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 272 {
 273     return dd.comm->atomRanges.numHomeAtoms();
 274 }
 275
 276 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 277 {
 278     /* We currently set mdatoms entries for all atoms:
 279      * local + non-local + communicated for vsite + constraints
 280      */
 281
 282     return dd->comm->atomRanges.numAtomsTotal();
 283 }
 284
 285 int dd_natoms_vsite(const gmx_domdec_t *dd)
 286 {
 287     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 288 }
 289
 290 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 291 {
 292     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 293     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 294 }
 295
 296 void dd_move_x(gmx_domdec_t             *dd,
 297                matrix                    box,
 298                gmx::ArrayRef<gmx::RVec>  x,
 299                gmx_wallcycle            *wcycle)
 300 {
 301     wallcycle_start(wcycle, ewcMOVEX);
 302
 303     int                    nzone, nat_tot;
 304     gmx_domdec_comm_t     *comm;
 305     gmx_domdec_comm_dim_t *cd;
 306     rvec                   shift = {0, 0, 0};
 307     gmx_bool               bPBC, bScrew;
 308
 309     comm = dd->comm;
 310
 311     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 312
 313     nzone   = 1;
 314     nat_tot = comm->atomRanges.numHomeAtoms();
 315     for (int d = 0; d < dd->ndim; d++)
 316     {
 317         bPBC   = (dd->ci[dd->dim[d]] == 0);
 318         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 319         if (bPBC)
 320         {
 321             copy_rvec(box[dd->dim[d]], shift);
 322         }
 323         cd = &comm->cd[d];
 324         for (const gmx_domdec_ind_t &ind : cd->ind)
 325         {
 326             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 327             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 328             int                        n          = 0;
 329             if (!bPBC)
 330             {
 331                 for (int g : ind.index)
 332                 {
 333                     for (int j : atomGrouping.block(g))
 334                     {
 335                         sendBuffer[n] = x[j];
 336                         n++;
 337                     }
 338                 }
 339             }
 340             else if (!bScrew)
 341             {
 342                 for (int g : ind.index)
 343                 {
 344                     for (int j : atomGrouping.block(g))
 345                     {
 346                         /* We need to shift the coordinates */
 347                         for (int d = 0; d < DIM; d++)
 348                         {
 349                             sendBuffer[n][d] = x[j][d] + shift[d];
 350                         }
 351                         n++;
 352                     }
 353                 }
 354             }
 355             else
 356             {
 357                 for (int g : ind.index)
 358                 {
 359                     for (int j : atomGrouping.block(g))
 360                     {
 361                         /* Shift x */
 362                         sendBuffer[n][XX] = x[j][XX] + shift[XX];
 363                         /* Rotate y and z.
 364                          * This operation requires a special shift force
 365                          * treatment, which is performed in calc_vir.
 366                          */
 367                         sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 368                         sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 369                         n++;
 370                     }
 371                 }
 372             }
 373
 374             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 375
 376             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 377             if (cd->receiveInPlace)
 378             {
 379                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 380             }
 381             else
 382             {
 383                 receiveBuffer = receiveBufferAccess.buffer;
 384             }
 385             /* Send and receive the coordinates */
 386             ddSendrecv(dd, d, dddirBackward,
 387                        sendBuffer, receiveBuffer);
 388
 389             if (!cd->receiveInPlace)
 390             {
 391                 int j = 0;
 392                 for (int zone = 0; zone < nzone; zone++)
 393                 {
 394                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 395                     {
 396                         x[i] = receiveBuffer[j++];
 397                     }
 398                 }
 399             }
 400             nat_tot += ind.nrecv[nzone+1];
 401         }
 402         nzone += nzone;
 403     }
 404
 405     wallcycle_stop(wcycle, ewcMOVEX);
 406 }
 407
 408 void dd_move_f(gmx_domdec_t             *dd,
 409                gmx::ArrayRef<gmx::RVec>  f,
 410                rvec                     *fshift,
 411                gmx_wallcycle            *wcycle)
 412 {
 413     wallcycle_start(wcycle, ewcMOVEF);
 414
 415     int                    nzone, nat_tot;
 416     gmx_domdec_comm_t     *comm;
 417     gmx_domdec_comm_dim_t *cd;
 418     ivec                   vis;
 419     int                    is;
 420     gmx_bool               bShiftForcesNeedPbc, bScrew;
 421
 422     comm = dd->comm;
 423
 424     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 425
 426     nzone   = comm->zones.n/2;
 427     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 428     for (int d = dd->ndim-1; d >= 0; d--)
 429     {
 430         /* Only forces in domains near the PBC boundaries need to
 431            consider PBC in the treatment of fshift */
 432         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 433         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 434         if (fshift == nullptr && !bScrew)
 435         {
 436             bShiftForcesNeedPbc = FALSE;
 437         }
 438         /* Determine which shift vector we need */
 439         clear_ivec(vis);
 440         vis[dd->dim[d]] = 1;
 441         is              = IVEC2IS(vis);
 442
 443         cd = &comm->cd[d];
 444         for (int p = cd->numPulses() - 1; p >= 0; p--)
 445         {
 446             const gmx_domdec_ind_t    &ind  = cd->ind[p];
 447             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 448             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 449
 450             nat_tot                        -= ind.nrecv[nzone+1];
 451
 452             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 453
 454             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 455             if (cd->receiveInPlace)
 456             {
 457                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 458             }
 459             else
 460             {
 461                 sendBuffer = sendBufferAccess.buffer;
 462                 int j = 0;
 463                 for (int zone = 0; zone < nzone; zone++)
 464                 {
 465                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 466                     {
 467                         sendBuffer[j++] = f[i];
 468                     }
 469                 }
 470             }
 471             /* Communicate the forces */
 472             ddSendrecv(dd, d, dddirForward,
 473                        sendBuffer, receiveBuffer);
 474             /* Add the received forces */
 475             int n = 0;
 476             if (!bShiftForcesNeedPbc)
 477             {
 478                 for (int g : ind.index)
 479                 {
 480                     for (int j : atomGrouping.block(g))
 481                     {
 482                         for (int d = 0; d < DIM; d++)
 483                         {
 484                             f[j][d] += receiveBuffer[n][d];
 485                         }
 486                         n++;
 487                     }
 488                 }
 489             }
 490             else if (!bScrew)
 491             {
 492                 /* fshift should always be defined if this function is
 493                  * called when bShiftForcesNeedPbc is true */
 494                 assert(nullptr != fshift);
 495                 for (int g : ind.index)
 496                 {
 497                     for (int j : atomGrouping.block(g))
 498                     {
 499                         for (int d = 0; d < DIM; d++)
 500                         {
 501                             f[j][d] += receiveBuffer[n][d];
 502                         }
 503                         /* Add this force to the shift force */
 504                         for (int d = 0; d < DIM; d++)
 505                         {
 506                             fshift[is][d] += receiveBuffer[n][d];
 507                         }
 508                         n++;
 509                     }
 510                 }
 511             }
 512             else
 513             {
 514                 for (int g : ind.index)
 515                 {
 516                     for (int j : atomGrouping.block(g))
 517                     {
 518                         /* Rotate the force */
 519                         f[j][XX] += receiveBuffer[n][XX];
 520                         f[j][YY] -= receiveBuffer[n][YY];
 521                         f[j][ZZ] -= receiveBuffer[n][ZZ];
 522                         if (fshift)
 523                         {
 524                             /* Add this force to the shift force */
 525                             for (int d = 0; d < DIM; d++)
 526                             {
 527                                 fshift[is][d] += receiveBuffer[n][d];
 528                             }
 529                         }
 530                         n++;
 531                     }
 532                 }
 533             }
 534         }
 535         nzone /= 2;
 536     }
 537     wallcycle_stop(wcycle, ewcMOVEF);
 538 }
 539
 540 /* Convenience function for extracting a real buffer from an rvec buffer
 541  *
 542  * To reduce the number of temporary communication buffers and avoid
 543  * cache polution, we reuse gmx::RVec buffers for storing reals.
 544  * This functions return a real buffer reference with the same number
 545  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 546  */
 547 static gmx::ArrayRef<real>
 548 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 549 {
 550     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 551                                   arrayRef.size());
 552 }
 553
 554 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 555 {
 556     int                    nzone, nat_tot;
 557     gmx_domdec_comm_t     *comm;
 558     gmx_domdec_comm_dim_t *cd;
 559
 560     comm = dd->comm;
 561
 562     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 563
 564     nzone   = 1;
 565     nat_tot = comm->atomRanges.numHomeAtoms();
 566     for (int d = 0; d < dd->ndim; d++)
 567     {
 568         cd = &comm->cd[d];
 569         for (const gmx_domdec_ind_t &ind : cd->ind)
 570         {
 571             /* Note: We provision for RVec instead of real, so a factor of 3
 572              * more than needed. The buffer actually already has this size
 573              * and we pass a plain pointer below, so this does not matter.
 574              */
 575             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 576             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 577             int                       n          = 0;
 578             for (int g : ind.index)
 579             {
 580                 for (int j : atomGrouping.block(g))
 581                 {
 582                     sendBuffer[n++] = v[j];
 583                 }
 584             }
 585
 586             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 587
 588             gmx::ArrayRef<real>       receiveBuffer;
 589             if (cd->receiveInPlace)
 590             {
 591                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 592             }
 593             else
 594             {
 595                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 596             }
 597             /* Send and receive the data */
 598             ddSendrecv(dd, d, dddirBackward,
 599                        sendBuffer, receiveBuffer);
 600             if (!cd->receiveInPlace)
 601             {
 602                 int j = 0;
 603                 for (int zone = 0; zone < nzone; zone++)
 604                 {
 605                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 606                     {
 607                         v[i] = receiveBuffer[j++];
 608                     }
 609                 }
 610             }
 611             nat_tot += ind.nrecv[nzone+1];
 612         }
 613         nzone += nzone;
 614     }
 615 }
 616
 617 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 618 {
 619     int                    nzone, nat_tot;
 620     gmx_domdec_comm_t     *comm;
 621     gmx_domdec_comm_dim_t *cd;
 622
 623     comm = dd->comm;
 624
 625     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 626
 627     nzone   = comm->zones.n/2;
 628     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 629     for (int d = dd->ndim-1; d >= 0; d--)
 630     {
 631         cd = &comm->cd[d];
 632         for (int p = cd->numPulses() - 1; p >= 0; p--)
 633         {
 634             const gmx_domdec_ind_t &ind = cd->ind[p];
 635
 636             /* Note: We provision for RVec instead of real, so a factor of 3
 637              * more than needed. The buffer actually already has this size
 638              * and we typecast, so this works as intended.
 639              */
 640             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 641             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 642             nat_tot -= ind.nrecv[nzone + 1];
 643
 644             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 645
 646             gmx::ArrayRef<real>       sendBuffer;
 647             if (cd->receiveInPlace)
 648             {
 649                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 650             }
 651             else
 652             {
 653                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 654                 int j = 0;
 655                 for (int zone = 0; zone < nzone; zone++)
 656                 {
 657                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 658                     {
 659                         sendBuffer[j++] = v[i];
 660                     }
 661                 }
 662             }
 663             /* Communicate the forces */
 664             ddSendrecv(dd, d, dddirForward,
 665                        sendBuffer, receiveBuffer);
 666             /* Add the received forces */
 667             int n = 0;
 668             for (int g : ind.index)
 669             {
 670                 for (int j : atomGrouping.block(g))
 671                 {
 672                     v[j] += receiveBuffer[n];
 673                     n++;
 674                 }
 675             }
 676         }
 677         nzone /= 2;
 678     }
 679 }
 680
 681 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 682 {
 683     gmx_domdec_comm_t *comm;
 684     int                di;
 685     real               r;
 686
 687     comm = dd->comm;
 688
 689     r = -1;
 690     if (comm->bInterCGBondeds)
 691     {
 692         if (comm->cutoff_mbody > 0)
 693         {
 694             r = comm->cutoff_mbody;
 695         }
 696         else
 697         {
 698             /* cutoff_mbody=0 means we do not have DLB */
 699             r = comm->cellsize_min[dd->dim[0]];
 700             for (di = 1; di < dd->ndim; di++)
 701             {
 702                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 703             }
 704             if (comm->bBondComm)
 705             {
 706                 r = std::max(r, comm->cutoff_mbody);
 707             }
 708             else
 709             {
 710                 r = std::min(r, comm->cutoff);
 711             }
 712         }
 713     }
 714
 715     return r;
 716 }
 717
 718 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 719 {
 720     real r_mb;
 721
 722     r_mb = dd_cutoff_multibody(dd);
 723
 724     return std::max(dd->comm->cutoff, r_mb);
 725 }
 726
 727
 728 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 729                                    ivec coord_pme)
 730 {
 731     int nc, ntot;
 732
 733     nc   = dd->nc[dd->comm->cartpmedim];
 734     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 735     copy_ivec(coord, coord_pme);
 736     coord_pme[dd->comm->cartpmedim] =
 737         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 738 }
 739
 740 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 741 {
 742     int npp, npme;
 743
 744     npp  = dd->nnodes;
 745     npme = dd->comm->npmenodes;
 746
 747     /* Here we assign a PME node to communicate with this DD node
 748      * by assuming that the major index of both is x.
 749      * We add cr->npmenodes/2 to obtain an even distribution.
 750      */
 751     return (ddindex*npme + npme/2)/npp;
 752 }
 753
 754 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 755 {
 756     int *pme_rank;
 757     int  n, i, p0, p1;
 758
 759     snew(pme_rank, dd->comm->npmenodes);
 760     n = 0;
 761     for (i = 0; i < dd->nnodes; i++)
 762     {
 763         p0 = ddindex2pmeindex(dd, i);
 764         p1 = ddindex2pmeindex(dd, i+1);
 765         if (i+1 == dd->nnodes || p1 > p0)
 766         {
 767             if (debug)
 768             {
 769                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 770             }
 771             pme_rank[n] = i + 1 + n;
 772             n++;
 773         }
 774     }
 775
 776     return pme_rank;
 777 }
 778
 779 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 780 {
 781     gmx_domdec_t *dd;
 782     ivec          coords;
 783     int           slab;
 784
 785     dd = cr->dd;
 786     /*
 787        if (dd->comm->bCartesian) {
 788        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 789        dd_coords2pmecoords(dd,coords,coords_pme);
 790        copy_ivec(dd->ntot,nc);
 791        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 792        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 793
 794        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 795        } else {
 796        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 797        }
 798      */
 799     coords[XX] = x;
 800     coords[YY] = y;
 801     coords[ZZ] = z;
 802     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 803
 804     return slab;
 805 }
 806
 807 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 808 {
 809     gmx_domdec_comm_t *comm;
 810     ivec               coords;
 811     int                ddindex, nodeid = -1;
 812
 813     comm = cr->dd->comm;
 814
 815     coords[XX] = x;
 816     coords[YY] = y;
 817     coords[ZZ] = z;
 818     if (comm->bCartesianPP_PME)
 819     {
 820 #if GMX_MPI
 821         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 822 #endif
 823     }
 824     else
 825     {
 826         ddindex = dd_index(cr->dd->nc, coords);
 827         if (comm->bCartesianPP)
 828         {
 829             nodeid = comm->ddindex2simnodeid[ddindex];
 830         }
 831         else
 832         {
 833             if (comm->pmenodes)
 834             {
 835                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 836             }
 837             else
 838             {
 839                 nodeid = ddindex;
 840             }
 841         }
 842     }
 843
 844     return nodeid;
 845 }
 846
 847 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 848                               const t_commrec gmx_unused *cr,
 849                               int                         sim_nodeid)
 850 {
 851     int pmenode = -1;
 852
 853     const gmx_domdec_comm_t *comm = dd->comm;
 854
 855     /* This assumes a uniform x domain decomposition grid cell size */
 856     if (comm->bCartesianPP_PME)
 857     {
 858 #if GMX_MPI
 859         ivec coord, coord_pme;
 860         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 861         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 862         {
 863             /* This is a PP node */
 864             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 865             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 866         }
 867 #endif
 868     }
 869     else if (comm->bCartesianPP)
 870     {
 871         if (sim_nodeid < dd->nnodes)
 872         {
 873             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 874         }
 875     }
 876     else
 877     {
 878         /* This assumes DD cells with identical x coordinates
 879          * are numbered sequentially.
 880          */
 881         if (dd->comm->pmenodes == nullptr)
 882         {
 883             if (sim_nodeid < dd->nnodes)
 884             {
 885                 /* The DD index equals the nodeid */
 886                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 887             }
 888         }
 889         else
 890         {
 891             int i = 0;
 892             while (sim_nodeid > dd->comm->pmenodes[i])
 893             {
 894                 i++;
 895             }
 896             if (sim_nodeid < dd->comm->pmenodes[i])
 897             {
 898                 pmenode = dd->comm->pmenodes[i];
 899             }
 900         }
 901     }
 902
 903     return pmenode;
 904 }
 905
 906 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 907 {
 908     if (dd != nullptr)
 909     {
 910         return {
 911                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 912         };
 913     }
 914     else
 915     {
 916         return {
 917                    1, 1
 918         };
 919     }
 920 }
 921
 922 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 923 {
 924     gmx_domdec_t *dd;
 925     int           x, y, z;
 926     ivec          coord, coord_pme;
 927
 928     dd = cr->dd;
 929
 930     std::vector<int> ddranks;
 931     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 932
 933     for (x = 0; x < dd->nc[XX]; x++)
 934     {
 935         for (y = 0; y < dd->nc[YY]; y++)
 936         {
 937             for (z = 0; z < dd->nc[ZZ]; z++)
 938             {
 939                 if (dd->comm->bCartesianPP_PME)
 940                 {
 941                     coord[XX] = x;
 942                     coord[YY] = y;
 943                     coord[ZZ] = z;
 944                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 945                     if (dd->ci[XX] == coord_pme[XX] &&
 946                         dd->ci[YY] == coord_pme[YY] &&
 947                         dd->ci[ZZ] == coord_pme[ZZ])
 948                     {
 949                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 950                     }
 951                 }
 952                 else
 953                 {
 954                     /* The slab corresponds to the nodeid in the PME group */
 955                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 956                     {
 957                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 958                     }
 959                 }
 960             }
 961         }
 962     }
 963     return ddranks;
 964 }
 965
 966 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 967 {
 968     gmx_bool bReceive = TRUE;
 969
 970     if (cr->npmenodes < dd->nnodes)
 971     {
 972         gmx_domdec_comm_t *comm = dd->comm;
 973         if (comm->bCartesianPP_PME)
 974         {
 975 #if GMX_MPI
 976             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 977             ivec coords;
 978             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 979             coords[comm->cartpmedim]++;
 980             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 981             {
 982                 int rank;
 983                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 984                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 985                 {
 986                     /* This is not the last PP node for pmenode */
 987                     bReceive = FALSE;
 988                 }
 989             }
 990 #else
 991             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 992 #endif
 993         }
 994         else
 995         {
 996             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 997             if (cr->sim_nodeid+1 < cr->nnodes &&
 998                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 999             {
1000                 /* This is not the last PP node for pmenode */
1001                 bReceive = FALSE;
1002             }
1003         }
1004     }
1005
1006     return bReceive;
1007 }
1008
1009 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
1010 {
1011     gmx_domdec_comm_t *comm;
1012     int                i;
1013
1014     comm = dd->comm;
1015
1016     snew(*dim_f, dd->nc[dim]+1);
1017     (*dim_f)[0] = 0;
1018     for (i = 1; i < dd->nc[dim]; i++)
1019     {
1020         if (comm->slb_frac[dim])
1021         {
1022             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1023         }
1024         else
1025         {
1026             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1027         }
1028     }
1029     (*dim_f)[dd->nc[dim]] = 1;
1030 }
1031
1032 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1033 {
1034     int  pmeindex, slab, nso, i;
1035     ivec xyz;
1036
1037     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1038     {
1039         ddpme->dim = YY;
1040     }
1041     else
1042     {
1043         ddpme->dim = dimind;
1044     }
1045     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1046
1047     ddpme->nslab = (ddpme->dim == 0 ?
1048                     dd->comm->npmenodes_x :
1049                     dd->comm->npmenodes_y);
1050
1051     if (ddpme->nslab <= 1)
1052     {
1053         return;
1054     }
1055
1056     nso = dd->comm->npmenodes/ddpme->nslab;
1057     /* Determine for each PME slab the PP location range for dimension dim */
1058     snew(ddpme->pp_min, ddpme->nslab);
1059     snew(ddpme->pp_max, ddpme->nslab);
1060     for (slab = 0; slab < ddpme->nslab; slab++)
1061     {
1062         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1063         ddpme->pp_max[slab] = 0;
1064     }
1065     for (i = 0; i < dd->nnodes; i++)
1066     {
1067         ddindex2xyz(dd->nc, i, xyz);
1068         /* For y only use our y/z slab.
1069          * This assumes that the PME x grid size matches the DD grid size.
1070          */
1071         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1072         {
1073             pmeindex = ddindex2pmeindex(dd, i);
1074             if (dimind == 0)
1075             {
1076                 slab = pmeindex/nso;
1077             }
1078             else
1079             {
1080                 slab = pmeindex % ddpme->nslab;
1081             }
1082             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1083             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1084         }
1085     }
1086
1087     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1088 }
1089
1090 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1091 {
1092     if (dd->comm->ddpme[0].dim == XX)
1093     {
1094         return dd->comm->ddpme[0].maxshift;
1095     }
1096     else
1097     {
1098         return 0;
1099     }
1100 }
1101
1102 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1103 {
1104     if (dd->comm->ddpme[0].dim == YY)
1105     {
1106         return dd->comm->ddpme[0].maxshift;
1107     }
1108     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1109     {
1110         return dd->comm->ddpme[1].maxshift;
1111     }
1112     else
1113     {
1114         return 0;
1115     }
1116 }
1117
1118 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1119 {
1120     /* Note that the cycles value can be incorrect, either 0 or some
1121      * extremely large value, when our thread migrated to another core
1122      * with an unsynchronized cycle counter. If this happens less often
1123      * that once per nstlist steps, this will not cause issues, since
1124      * we later subtract the maximum value from the sum over nstlist steps.
1125      * A zero count will slightly lower the total, but that's a small effect.
1126      * Note that the main purpose of the subtraction of the maximum value
1127      * is to avoid throwing off the load balancing when stalls occur due
1128      * e.g. system activity or network congestion.
1129      */
1130     dd->comm->cycl[ddCycl] += cycles;
1131     dd->comm->cycl_n[ddCycl]++;
1132     if (cycles > dd->comm->cycl_max[ddCycl])
1133     {
1134         dd->comm->cycl_max[ddCycl] = cycles;
1135     }
1136 }
1137
1138 #if GMX_MPI
1139 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1140 {
1141     MPI_Comm           c_row;
1142     int                dim, i, rank;
1143     ivec               loc_c;
1144     gmx_bool           bPartOfGroup = FALSE;
1145
1146     dim = dd->dim[dim_ind];
1147     copy_ivec(loc, loc_c);
1148     for (i = 0; i < dd->nc[dim]; i++)
1149     {
1150         loc_c[dim] = i;
1151         rank       = dd_index(dd->nc, loc_c);
1152         if (rank == dd->rank)
1153         {
1154             /* This process is part of the group */
1155             bPartOfGroup = TRUE;
1156         }
1157     }
1158     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1159                    &c_row);
1160     if (bPartOfGroup)
1161     {
1162         dd->comm->mpi_comm_load[dim_ind] = c_row;
1163         if (!isDlbDisabled(dd->comm))
1164         {
1165             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1166
1167             if (dd->ci[dim] == dd->master_ci[dim])
1168             {
1169                 /* This is the root process of this row */
1170                 cellsizes.rowMaster  = gmx::compat::make_unique<RowMaster>();
1171
1172                 RowMaster &rowMaster = *cellsizes.rowMaster;
1173                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1174                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1175                 rowMaster.isCellMin.resize(dd->nc[dim]);
1176                 if (dim_ind > 0)
1177                 {
1178                     rowMaster.bounds.resize(dd->nc[dim]);
1179                 }
1180                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1181             }
1182             else
1183             {
1184                 /* This is not a root process, we only need to receive cell_f */
1185                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1186             }
1187         }
1188         if (dd->ci[dim] == dd->master_ci[dim])
1189         {
1190             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1191         }
1192     }
1193 }
1194 #endif
1195
1196 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1197                                    int                   gpu_id)
1198 {
1199 #if GMX_MPI
1200     int           physicalnode_id_hash;
1201     gmx_domdec_t *dd;
1202     MPI_Comm      mpi_comm_pp_physicalnode;
1203
1204     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1205     {
1206         /* Only ranks with short-ranged tasks (currently) use GPUs.
1207          * If we don't have GPUs assigned, there are no resources to share.
1208          */
1209         return;
1210     }
1211
1212     physicalnode_id_hash = gmx_physicalnode_id_hash();
1213
1214     dd = cr->dd;
1215
1216     if (debug)
1217     {
1218         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1219         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1220                 dd->rank, physicalnode_id_hash, gpu_id);
1221     }
1222     /* Split the PP communicator over the physical nodes */
1223     /* TODO: See if we should store this (before), as it's also used for
1224      * for the nodecomm summation.
1225      */
1226     // TODO PhysicalNodeCommunicator could be extended/used to handle
1227     // the need for per-node per-group communicators.
1228     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1229                    &mpi_comm_pp_physicalnode);
1230     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1231                    &dd->comm->mpi_comm_gpu_shared);
1232     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1233     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1234
1235     if (debug)
1236     {
1237         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1238     }
1239
1240     /* Note that some ranks could share a GPU, while others don't */
1241
1242     if (dd->comm->nrank_gpu_shared == 1)
1243     {
1244         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1245     }
1246 #else
1247     GMX_UNUSED_VALUE(cr);
1248     GMX_UNUSED_VALUE(gpu_id);
1249 #endif
1250 }
1251
1252 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1253 {
1254 #if GMX_MPI
1255     int  dim0, dim1, i, j;
1256     ivec loc;
1257
1258     if (debug)
1259     {
1260         fprintf(debug, "Making load communicators\n");
1261     }
1262
1263     snew(dd->comm->load,          std::max(dd->ndim, 1));
1264     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1265
1266     if (dd->ndim == 0)
1267     {
1268         return;
1269     }
1270
1271     clear_ivec(loc);
1272     make_load_communicator(dd, 0, loc);
1273     if (dd->ndim > 1)
1274     {
1275         dim0 = dd->dim[0];
1276         for (i = 0; i < dd->nc[dim0]; i++)
1277         {
1278             loc[dim0] = i;
1279             make_load_communicator(dd, 1, loc);
1280         }
1281     }
1282     if (dd->ndim > 2)
1283     {
1284         dim0 = dd->dim[0];
1285         for (i = 0; i < dd->nc[dim0]; i++)
1286         {
1287             loc[dim0] = i;
1288             dim1      = dd->dim[1];
1289             for (j = 0; j < dd->nc[dim1]; j++)
1290             {
1291                 loc[dim1] = j;
1292                 make_load_communicator(dd, 2, loc);
1293             }
1294         }
1295     }
1296
1297     if (debug)
1298     {
1299         fprintf(debug, "Finished making load communicators\n");
1300     }
1301 #endif
1302 }
1303
1304 /*! \brief Sets up the relation between neighboring domains and zones */
1305 static void setup_neighbor_relations(gmx_domdec_t *dd)
1306 {
1307     int                     d, dim, i, j, m;
1308     ivec                    tmp, s;
1309     gmx_domdec_zones_t     *zones;
1310     gmx_domdec_ns_ranges_t *izone;
1311
1312     for (d = 0; d < dd->ndim; d++)
1313     {
1314         dim = dd->dim[d];
1315         copy_ivec(dd->ci, tmp);
1316         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1317         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1318         copy_ivec(dd->ci, tmp);
1319         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1320         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1321         if (debug)
1322         {
1323             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1324                     dd->rank, dim,
1325                     dd->neighbor[d][0],
1326                     dd->neighbor[d][1]);
1327         }
1328     }
1329
1330     int nzone  = (1 << dd->ndim);
1331     int nizone = (1 << std::max(dd->ndim - 1, 0));
1332     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1333
1334     zones = &dd->comm->zones;
1335
1336     for (i = 0; i < nzone; i++)
1337     {
1338         m = 0;
1339         clear_ivec(zones->shift[i]);
1340         for (d = 0; d < dd->ndim; d++)
1341         {
1342             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1343         }
1344     }
1345
1346     zones->n = nzone;
1347     for (i = 0; i < nzone; i++)
1348     {
1349         for (d = 0; d < DIM; d++)
1350         {
1351             s[d] = dd->ci[d] - zones->shift[i][d];
1352             if (s[d] < 0)
1353             {
1354                 s[d] += dd->nc[d];
1355             }
1356             else if (s[d] >= dd->nc[d])
1357             {
1358                 s[d] -= dd->nc[d];
1359             }
1360         }
1361     }
1362     zones->nizone = nizone;
1363     for (i = 0; i < zones->nizone; i++)
1364     {
1365         assert(ddNonbondedZonePairRanges[i][0] == i);
1366
1367         izone     = &zones->izone[i];
1368         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1369          * j-zones up to nzone.
1370          */
1371         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1372         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1373         for (dim = 0; dim < DIM; dim++)
1374         {
1375             if (dd->nc[dim] == 1)
1376             {
1377                 /* All shifts should be allowed */
1378                 izone->shift0[dim] = -1;
1379                 izone->shift1[dim] = 1;
1380             }
1381             else
1382             {
1383                 /* Determine the min/max j-zone shift wrt the i-zone */
1384                 izone->shift0[dim] = 1;
1385                 izone->shift1[dim] = -1;
1386                 for (j = izone->j0; j < izone->j1; j++)
1387                 {
1388                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1389                     if (shift_diff < izone->shift0[dim])
1390                     {
1391                         izone->shift0[dim] = shift_diff;
1392                     }
1393                     if (shift_diff > izone->shift1[dim])
1394                     {
1395                         izone->shift1[dim] = shift_diff;
1396                     }
1397                 }
1398             }
1399         }
1400     }
1401
1402     if (!isDlbDisabled(dd->comm))
1403     {
1404         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1405     }
1406
1407     if (dd->comm->bRecordLoad)
1408     {
1409         make_load_communicators(dd);
1410     }
1411 }
1412
1413 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1414                                  gmx_domdec_t         *dd,
1415                                  t_commrec gmx_unused *cr,
1416                                  bool gmx_unused       reorder)
1417 {
1418 #if GMX_MPI
1419     gmx_domdec_comm_t *comm;
1420     int                rank, *buf;
1421     ivec               periods;
1422     MPI_Comm           comm_cart;
1423
1424     comm = dd->comm;
1425
1426     if (comm->bCartesianPP)
1427     {
1428         /* Set up cartesian communication for the particle-particle part */
1429         GMX_LOG(mdlog.info).appendTextFormatted(
1430                 "Will use a Cartesian communicator: %d x %d x %d",
1431                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1432
1433         for (int i = 0; i < DIM; i++)
1434         {
1435             periods[i] = TRUE;
1436         }
1437         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1438                         &comm_cart);
1439         /* We overwrite the old communicator with the new cartesian one */
1440         cr->mpi_comm_mygroup = comm_cart;
1441     }
1442
1443     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1444     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1445
1446     if (comm->bCartesianPP_PME)
1447     {
1448         /* Since we want to use the original cartesian setup for sim,
1449          * and not the one after split, we need to make an index.
1450          */
1451         snew(comm->ddindex2ddnodeid, dd->nnodes);
1452         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1453         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1454         /* Get the rank of the DD master,
1455          * above we made sure that the master node is a PP node.
1456          */
1457         if (MASTER(cr))
1458         {
1459             rank = dd->rank;
1460         }
1461         else
1462         {
1463             rank = 0;
1464         }
1465         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1466     }
1467     else if (comm->bCartesianPP)
1468     {
1469         if (cr->npmenodes == 0)
1470         {
1471             /* The PP communicator is also
1472              * the communicator for this simulation
1473              */
1474             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1475         }
1476         cr->nodeid = dd->rank;
1477
1478         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1479
1480         /* We need to make an index to go from the coordinates
1481          * to the nodeid of this simulation.
1482          */
1483         snew(comm->ddindex2simnodeid, dd->nnodes);
1484         snew(buf, dd->nnodes);
1485         if (thisRankHasDuty(cr, DUTY_PP))
1486         {
1487             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1488         }
1489         /* Communicate the ddindex to simulation nodeid index */
1490         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1491                       cr->mpi_comm_mysim);
1492         sfree(buf);
1493
1494         /* Determine the master coordinates and rank.
1495          * The DD master should be the same node as the master of this sim.
1496          */
1497         for (int i = 0; i < dd->nnodes; i++)
1498         {
1499             if (comm->ddindex2simnodeid[i] == 0)
1500             {
1501                 ddindex2xyz(dd->nc, i, dd->master_ci);
1502                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1503             }
1504         }
1505         if (debug)
1506         {
1507             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1508         }
1509     }
1510     else
1511     {
1512         /* No Cartesian communicators */
1513         /* We use the rank in dd->comm->all as DD index */
1514         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1515         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1516         dd->masterrank = 0;
1517         clear_ivec(dd->master_ci);
1518     }
1519 #endif
1520
1521     GMX_LOG(mdlog.info).appendTextFormatted(
1522             "Domain decomposition rank %d, coordinates %d %d %d\n",
1523             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1524     if (debug)
1525     {
1526         fprintf(debug,
1527                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1528                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1529     }
1530 }
1531
1532 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1533                                       t_commrec            *cr)
1534 {
1535 #if GMX_MPI
1536     gmx_domdec_comm_t *comm = dd->comm;
1537
1538     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1539     {
1540         int *buf;
1541         snew(comm->ddindex2simnodeid, dd->nnodes);
1542         snew(buf, dd->nnodes);
1543         if (thisRankHasDuty(cr, DUTY_PP))
1544         {
1545             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1546         }
1547         /* Communicate the ddindex to simulation nodeid index */
1548         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1549                       cr->mpi_comm_mysim);
1550         sfree(buf);
1551     }
1552 #else
1553     GMX_UNUSED_VALUE(dd);
1554     GMX_UNUSED_VALUE(cr);
1555 #endif
1556 }
1557
1558 static void split_communicator(const gmx::MDLogger &mdlog,
1559                                t_commrec *cr, gmx_domdec_t *dd,
1560                                DdRankOrder gmx_unused rankOrder,
1561                                bool gmx_unused reorder)
1562 {
1563     gmx_domdec_comm_t *comm;
1564     int                i;
1565     gmx_bool           bDiv[DIM];
1566 #if GMX_MPI
1567     MPI_Comm           comm_cart;
1568 #endif
1569
1570     comm = dd->comm;
1571
1572     if (comm->bCartesianPP)
1573     {
1574         for (i = 1; i < DIM; i++)
1575         {
1576             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1577         }
1578         if (bDiv[YY] || bDiv[ZZ])
1579         {
1580             comm->bCartesianPP_PME = TRUE;
1581             /* If we have 2D PME decomposition, which is always in x+y,
1582              * we stack the PME only nodes in z.
1583              * Otherwise we choose the direction that provides the thinnest slab
1584              * of PME only nodes as this will have the least effect
1585              * on the PP communication.
1586              * But for the PME communication the opposite might be better.
1587              */
1588             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1589                              !bDiv[YY] ||
1590                              dd->nc[YY] > dd->nc[ZZ]))
1591             {
1592                 comm->cartpmedim = ZZ;
1593             }
1594             else
1595             {
1596                 comm->cartpmedim = YY;
1597             }
1598             comm->ntot[comm->cartpmedim]
1599                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1600         }
1601         else
1602         {
1603             GMX_LOG(mdlog.info).appendTextFormatted(
1604                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1605                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1606             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1607         }
1608     }
1609
1610     if (comm->bCartesianPP_PME)
1611     {
1612 #if GMX_MPI
1613         int  rank;
1614         ivec periods;
1615
1616         GMX_LOG(mdlog.info).appendTextFormatted(
1617                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1618                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1619
1620         for (i = 0; i < DIM; i++)
1621         {
1622             periods[i] = TRUE;
1623         }
1624         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1625                         &comm_cart);
1626         MPI_Comm_rank(comm_cart, &rank);
1627         if (MASTER(cr) && rank != 0)
1628         {
1629             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1630         }
1631
1632         /* With this assigment we loose the link to the original communicator
1633          * which will usually be MPI_COMM_WORLD, unless have multisim.
1634          */
1635         cr->mpi_comm_mysim = comm_cart;
1636         cr->sim_nodeid     = rank;
1637
1638         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1639
1640         GMX_LOG(mdlog.info).appendTextFormatted(
1641                 "Cartesian rank %d, coordinates %d %d %d\n",
1642                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1643
1644         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1645         {
1646             cr->duty = DUTY_PP;
1647         }
1648         if (cr->npmenodes == 0 ||
1649             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1650         {
1651             cr->duty = DUTY_PME;
1652         }
1653
1654         /* Split the sim communicator into PP and PME only nodes */
1655         MPI_Comm_split(cr->mpi_comm_mysim,
1656                        getThisRankDuties(cr),
1657                        dd_index(comm->ntot, dd->ci),
1658                        &cr->mpi_comm_mygroup);
1659 #endif
1660     }
1661     else
1662     {
1663         switch (rankOrder)
1664         {
1665             case DdRankOrder::pp_pme:
1666                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1667                 break;
1668             case DdRankOrder::interleave:
1669                 /* Interleave the PP-only and PME-only ranks */
1670                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1671                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1672                 break;
1673             case DdRankOrder::cartesian:
1674                 break;
1675             default:
1676                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1677         }
1678
1679         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1680         {
1681             cr->duty = DUTY_PME;
1682         }
1683         else
1684         {
1685             cr->duty = DUTY_PP;
1686         }
1687 #if GMX_MPI
1688         /* Split the sim communicator into PP and PME only nodes */
1689         MPI_Comm_split(cr->mpi_comm_mysim,
1690                        getThisRankDuties(cr),
1691                        cr->nodeid,
1692                        &cr->mpi_comm_mygroup);
1693         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1694 #endif
1695     }
1696
1697     GMX_LOG(mdlog.info).appendTextFormatted(
1698             "This rank does only %s work.\n",
1699             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1700 }
1701
1702 /*! \brief Generates the MPI communicators for domain decomposition */
1703 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1704                                   t_commrec *cr,
1705                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1706 {
1707     gmx_domdec_comm_t *comm;
1708     bool               CartReorder;
1709
1710     comm = dd->comm;
1711
1712     copy_ivec(dd->nc, comm->ntot);
1713
1714     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1715     comm->bCartesianPP_PME = FALSE;
1716
1717     /* Reorder the nodes by default. This might change the MPI ranks.
1718      * Real reordering is only supported on very few architectures,
1719      * Blue Gene is one of them.
1720      */
1721     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1722
1723     if (cr->npmenodes > 0)
1724     {
1725         /* Split the communicator into a PP and PME part */
1726         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1727         if (comm->bCartesianPP_PME)
1728         {
1729             /* We (possibly) reordered the nodes in split_communicator,
1730              * so it is no longer required in make_pp_communicator.
1731              */
1732             CartReorder = FALSE;
1733         }
1734     }
1735     else
1736     {
1737         /* All nodes do PP and PME */
1738 #if GMX_MPI
1739         /* We do not require separate communicators */
1740         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1741 #endif
1742     }
1743
1744     if (thisRankHasDuty(cr, DUTY_PP))
1745     {
1746         /* Copy or make a new PP communicator */
1747         make_pp_communicator(mdlog, dd, cr, CartReorder);
1748     }
1749     else
1750     {
1751         receive_ddindex2simnodeid(dd, cr);
1752     }
1753
1754     if (!thisRankHasDuty(cr, DUTY_PME))
1755     {
1756         /* Set up the commnuication to our PME node */
1757         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1758         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1759         if (debug)
1760         {
1761             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1762                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1763         }
1764     }
1765     else
1766     {
1767         dd->pme_nodeid = -1;
1768     }
1769
1770     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1771     if (MASTER(cr))
1772     {
1773         dd->ma = gmx::compat::make_unique<AtomDistribution>(dd->nc,
1774                                                             comm->cgs_gl.nr,
1775                                                             comm->cgs_gl.index[comm->cgs_gl.nr]);
1776     }
1777 }
1778
1779 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1780                           const char *dir, int nc, const char *size_string)
1781 {
1782     real  *slb_frac, tot;
1783     int    i, n;
1784     double dbl;
1785
1786     slb_frac = nullptr;
1787     if (nc > 1 && size_string != nullptr)
1788     {
1789         GMX_LOG(mdlog.info).appendTextFormatted(
1790                 "Using static load balancing for the %s direction", dir);
1791         snew(slb_frac, nc);
1792         tot = 0;
1793         for (i = 0; i < nc; i++)
1794         {
1795             dbl = 0;
1796             sscanf(size_string, "%20lf%n", &dbl, &n);
1797             if (dbl == 0)
1798             {
1799                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1800             }
1801             slb_frac[i]  = dbl;
1802             size_string += n;
1803             tot         += slb_frac[i];
1804         }
1805         /* Normalize */
1806         std::string relativeCellSizes = "Relative cell sizes:";
1807         for (i = 0; i < nc; i++)
1808         {
1809             slb_frac[i]       /= tot;
1810             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1811         }
1812         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1813     }
1814
1815     return slb_frac;
1816 }
1817
1818 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1819 {
1820     int                  n     = 0;
1821     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1822     int                  nmol;
1823     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1824     {
1825         for (auto &ilist : extractILists(*ilists, IF_BOND))
1826         {
1827             if (NRAL(ilist.functionType) >  2)
1828             {
1829                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1830             }
1831         }
1832     }
1833
1834     return n;
1835 }
1836
1837 static int dd_getenv(const gmx::MDLogger &mdlog,
1838                      const char *env_var, int def)
1839 {
1840     char *val;
1841     int   nst;
1842
1843     nst = def;
1844     val = getenv(env_var);
1845     if (val)
1846     {
1847         if (sscanf(val, "%20d", &nst) <= 0)
1848         {
1849             nst = 1;
1850         }
1851         GMX_LOG(mdlog.info).appendTextFormatted(
1852                 "Found env.var. %s = %s, using value %d",
1853                 env_var, val, nst);
1854     }
1855
1856     return nst;
1857 }
1858
1859 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1860                                   const t_inputrec    *ir,
1861                                   const gmx::MDLogger &mdlog)
1862 {
1863     if (ir->ePBC == epbcSCREW &&
1864         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1865     {
1866         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1867     }
1868
1869     if (ir->ns_type == ensSIMPLE)
1870     {
1871         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1872     }
1873
1874     if (ir->nstlist == 0)
1875     {
1876         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1877     }
1878
1879     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1880     {
1881         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1882     }
1883 }
1884
1885 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1886 {
1887     int  di, d;
1888     real r;
1889
1890     r = ddbox->box_size[XX];
1891     for (di = 0; di < dd->ndim; di++)
1892     {
1893         d = dd->dim[di];
1894         /* Check using the initial average cell size */
1895         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1896     }
1897
1898     return r;
1899 }
1900
1901 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1902  */
1903 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1904                                   const std::string   &reasonStr,
1905                                   const gmx::MDLogger &mdlog)
1906 {
1907     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1908     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1909
1910     if (cmdlineDlbState == DlbState::onUser)
1911     {
1912         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1913     }
1914     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1915     {
1916         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1917     }
1918     return DlbState::offForever;
1919 }
1920
1921 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1922  *
1923  * This function parses the parameters of "-dlb" command line option setting
1924  * corresponding state values. Then it checks the consistency of the determined
1925  * state with other run parameters and settings. As a result, the initial state
1926  * may be altered or an error may be thrown if incompatibility of options is detected.
1927  *
1928  * \param [in] mdlog       Logger.
1929  * \param [in] dlbOption   Enum value for the DLB option.
1930  * \param [in] bRecordLoad True if the load balancer is recording load information.
1931  * \param [in] mdrunOptions  Options for mdrun.
1932  * \param [in] ir          Pointer mdrun to input parameters.
1933  * \returns                DLB initial/startup state.
1934  */
1935 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1936                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1937                                          const MdrunOptions &mdrunOptions,
1938                                          const t_inputrec *ir)
1939 {
1940     DlbState dlbState = DlbState::offCanTurnOn;
1941
1942     switch (dlbOption)
1943     {
1944         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1945         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1946         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1947         default: gmx_incons("Invalid dlbOption enum value");
1948     }
1949
1950     /* Reruns don't support DLB: bail or override auto mode */
1951     if (mdrunOptions.rerun)
1952     {
1953         std::string reasonStr = "it is not supported in reruns.";
1954         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1955     }
1956
1957     /* Unsupported integrators */
1958     if (!EI_DYNAMICS(ir->eI))
1959     {
1960         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1961         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1962     }
1963
1964     /* Without cycle counters we can't time work to balance on */
1965     if (!bRecordLoad)
1966     {
1967         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1968         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1969     }
1970
1971     if (mdrunOptions.reproducible)
1972     {
1973         std::string reasonStr = "you started a reproducible run.";
1974         switch (dlbState)
1975         {
1976             case DlbState::offUser:
1977                 break;
1978             case DlbState::offForever:
1979                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1980                 break;
1981             case DlbState::offCanTurnOn:
1982                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1983             case DlbState::onCanTurnOff:
1984                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1985                 break;
1986             case DlbState::onUser:
1987                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1988             default:
1989                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1990         }
1991     }
1992
1993     return dlbState;
1994 }
1995
1996 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1997 {
1998     dd->ndim = 0;
1999     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
2000     {
2001         /* Decomposition order z,y,x */
2002         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
2003         for (int dim = DIM-1; dim >= 0; dim--)
2004         {
2005             if (dd->nc[dim] > 1)
2006             {
2007                 dd->dim[dd->ndim++] = dim;
2008             }
2009         }
2010     }
2011     else
2012     {
2013         /* Decomposition order x,y,z */
2014         for (int dim = 0; dim < DIM; dim++)
2015         {
2016             if (dd->nc[dim] > 1)
2017             {
2018                 dd->dim[dd->ndim++] = dim;
2019             }
2020         }
2021     }
2022
2023     if (dd->ndim == 0)
2024     {
2025         /* Set dim[0] to avoid extra checks on ndim in several places */
2026         dd->dim[0] = XX;
2027     }
2028 }
2029
2030 static gmx_domdec_comm_t *init_dd_comm()
2031 {
2032     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2033
2034     comm->n_load_have      = 0;
2035     comm->n_load_collect   = 0;
2036
2037     comm->haveTurnedOffDlb = false;
2038
2039     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2040     {
2041         comm->sum_nat[i] = 0;
2042     }
2043     comm->ndecomp   = 0;
2044     comm->nload     = 0;
2045     comm->load_step = 0;
2046     comm->load_sum  = 0;
2047     comm->load_max  = 0;
2048     clear_ivec(comm->load_lim);
2049     comm->load_mdf  = 0;
2050     comm->load_pme  = 0;
2051
2052     /* This should be replaced by a unique pointer */
2053     comm->balanceRegion = ddBalanceRegionAllocate();
2054
2055     return comm;
2056 }
2057
2058 /* Returns whether mtop contains constraints and/or vsites */
2059 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2060 {
2061     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2062     int  nmol;
2063     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2064     {
2065         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2066         {
2067             return true;
2068         }
2069     }
2070
2071     return false;
2072 }
2073
2074 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2075                               const gmx_mtop_t    &mtop,
2076                               const t_inputrec    &inputrec,
2077                               real                 cutoffMargin,
2078                               int                  numMpiRanksTotal,
2079                               gmx_domdec_comm_t   *comm)
2080 {
2081     /* When we have constraints and/or vsites, it is beneficial to use
2082      * update groups (when possible) to allow independent update of groups.
2083      */
2084     if (!systemHasConstraintsOrVsites(mtop))
2085     {
2086         /* No constraints or vsites, atoms can be updated independently */
2087         return;
2088     }
2089
2090     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2091     comm->useUpdateGroups               =
2092         (!comm->updateGroupingPerMoleculetype.empty() &&
2093          getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2094
2095     if (comm->useUpdateGroups)
2096     {
2097         int numUpdateGroups = 0;
2098         for (const auto &molblock : mtop.molblock)
2099         {
2100             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2101         }
2102
2103         /* Note: We would like to use dd->nnodes for the atom count estimate,
2104          *       but that is not yet available here. But this anyhow only
2105          *       affect performance up to the second dd_partition_system call.
2106          */
2107         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2108         comm->updateGroupsCog =
2109             gmx::compat::make_unique<gmx::UpdateGroupsCog>(mtop,
2110                                                            comm->updateGroupingPerMoleculetype,
2111                                                            maxReferenceTemperature(inputrec),
2112                                                            homeAtomCountEstimate);
2113
2114         /* To use update groups, the large domain-to-domain cutoff distance
2115          * should be compatible with the box size.
2116          */
2117         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2118
2119         if (comm->useUpdateGroups)
2120         {
2121             GMX_LOG(mdlog.info).appendTextFormatted(
2122                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2123                     numUpdateGroups,
2124                     mtop.natoms/static_cast<double>(numUpdateGroups),
2125                     comm->updateGroupsCog->maxUpdateGroupRadius());
2126         }
2127         else
2128         {
2129             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2130             comm->updateGroupingPerMoleculetype.clear();
2131             comm->updateGroupsCog.reset(nullptr);
2132         }
2133     }
2134 }
2135
2136 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2137 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2138                                    t_commrec *cr, gmx_domdec_t *dd,
2139                                    const DomdecOptions &options,
2140                                    const MdrunOptions &mdrunOptions,
2141                                    const gmx_mtop_t *mtop,
2142                                    const t_inputrec *ir,
2143                                    const matrix box,
2144                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2145                                    gmx_ddbox_t *ddbox)
2146 {
2147     real               r_bonded         = -1;
2148     real               r_bonded_limit   = -1;
2149     const real         tenPercentMargin = 1.1;
2150     gmx_domdec_comm_t *comm             = dd->comm;
2151
2152     dd->npbcdim              = ePBC2npbcdim(ir->ePBC);
2153     dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2154     dd->haveDynamicBox       = inputrecDynamicBox(ir);
2155     dd->bScrewPBC            = (ir->ePBC == epbcSCREW);
2156
2157     dd->pme_recv_f_alloc = 0;
2158     dd->pme_recv_f_buf   = nullptr;
2159
2160     /* Initialize to GPU share count to 0, might change later */
2161     comm->nrank_gpu_shared = 0;
2162
2163     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2164     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2165     /* To consider turning DLB on after 2*nstlist steps we need to check
2166      * at partitioning count 3. Thus we need to increase the first count by 2.
2167      */
2168     comm->ddPartioningCountFirstDlbOff += 2;
2169
2170     GMX_LOG(mdlog.info).appendTextFormatted(
2171             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2172
2173     comm->bPMELoadBalDLBLimits = FALSE;
2174
2175     /* Allocate the charge group/atom sorting struct */
2176     comm->sort = gmx::compat::make_unique<gmx_domdec_sort_t>();
2177
2178     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
2179
2180     /* We need to decide on update groups early, as this affects communication distances */
2181     comm->useUpdateGroups = false;
2182     if (ir->cutoff_scheme == ecutsVERLET)
2183     {
2184         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2185         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2186     }
2187
2188     comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
2189                              mtop->bIntermolecularInteractions);
2190     if (comm->bInterCGBondeds)
2191     {
2192         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
2193     }
2194     else
2195     {
2196         comm->bInterCGMultiBody = FALSE;
2197     }
2198
2199     if (comm->useUpdateGroups)
2200     {
2201         dd->splitConstraints = false;
2202         dd->splitSettles     = false;
2203     }
2204     else
2205     {
2206         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2207         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2208     }
2209
2210     if (ir->rlist == 0)
2211     {
2212         /* Set the cut-off to some very large value,
2213          * so we don't need if statements everywhere in the code.
2214          * We use sqrt, since the cut-off is squared in some places.
2215          */
2216         comm->cutoff   = GMX_CUTOFF_INF;
2217     }
2218     else
2219     {
2220         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2221     }
2222     comm->cutoff_mbody = 0;
2223
2224     /* Determine the minimum cell size limit, affected by many factors */
2225     comm->cellsize_limit = 0;
2226     comm->bBondComm      = FALSE;
2227
2228     /* We do not allow home atoms to move beyond the neighboring domain
2229      * between domain decomposition steps, which limits the cell size.
2230      * Get an estimate of cell size limit due to atom displacement.
2231      * In most cases this is a large overestimate, because it assumes
2232      * non-interaction atoms.
2233      * We set the chance to 1 in a trillion steps.
2234      */
2235     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2236     const real     limitForAtomDisplacement          =
2237         minCellSizeForAtomDisplacement(*mtop, *ir,
2238                                        comm->updateGroupingPerMoleculetype,
2239                                        c_chanceThatAtomMovesBeyondDomain);
2240     GMX_LOG(mdlog.info).appendTextFormatted(
2241             "Minimum cell size due to atom displacement: %.3f nm",
2242             limitForAtomDisplacement);
2243
2244     comm->cellsize_limit = std::max(comm->cellsize_limit,
2245                                     limitForAtomDisplacement);
2246
2247     /* TODO: PME decomposition currently requires atoms not to be more than
2248      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2249      *       In nearly all cases, limitForAtomDisplacement will be smaller
2250      *       than 2/3*rlist, so the PME requirement is satisfied.
2251      *       But it would be better for both correctness and performance
2252      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2253      *       Note that we would need to improve the pairlist buffer case.
2254      */
2255
2256     if (comm->bInterCGBondeds)
2257     {
2258         if (options.minimumCommunicationRange > 0)
2259         {
2260             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2261             if (options.useBondedCommunication)
2262             {
2263                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2264             }
2265             else
2266             {
2267                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2268             }
2269             r_bonded_limit = comm->cutoff_mbody;
2270         }
2271         else if (ir->bPeriodicMols)
2272         {
2273             /* Can not easily determine the required cut-off */
2274             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2275             comm->cutoff_mbody = comm->cutoff/2;
2276             r_bonded_limit     = comm->cutoff_mbody;
2277         }
2278         else
2279         {
2280             real r_2b, r_mb;
2281
2282             if (MASTER(cr))
2283             {
2284                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2285                                       options.checkBondedInteractions,
2286                                       &r_2b, &r_mb);
2287             }
2288             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2289             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2290
2291             /* We use an initial margin of 10% for the minimum cell size,
2292              * except when we are just below the non-bonded cut-off.
2293              */
2294             if (options.useBondedCommunication)
2295             {
2296                 if (std::max(r_2b, r_mb) > comm->cutoff)
2297                 {
2298                     r_bonded        = std::max(r_2b, r_mb);
2299                     r_bonded_limit  = tenPercentMargin*r_bonded;
2300                     comm->bBondComm = TRUE;
2301                 }
2302                 else
2303                 {
2304                     r_bonded       = r_mb;
2305                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2306                 }
2307                 /* We determine cutoff_mbody later */
2308             }
2309             else
2310             {
2311                 /* No special bonded communication,
2312                  * simply increase the DD cut-off.
2313                  */
2314                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2315                 comm->cutoff_mbody = r_bonded_limit;
2316                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2317             }
2318         }
2319         GMX_LOG(mdlog.info).appendTextFormatted(
2320                 "Minimum cell size due to bonded interactions: %.3f nm",
2321                 r_bonded_limit);
2322
2323         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2324     }
2325
2326     real rconstr = 0;
2327     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2328     {
2329         /* There is a cell size limit due to the constraints (P-LINCS) */
2330         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2331         GMX_LOG(mdlog.info).appendTextFormatted(
2332                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2333                 rconstr);
2334         if (rconstr > comm->cellsize_limit)
2335         {
2336             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2337         }
2338     }
2339     else if (options.constraintCommunicationRange > 0)
2340     {
2341         /* Here we do not check for dd->splitConstraints.
2342          * because one can also set a cell size limit for virtual sites only
2343          * and at this point we don't know yet if there are intercg v-sites.
2344          */
2345         GMX_LOG(mdlog.info).appendTextFormatted(
2346                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2347                 options.constraintCommunicationRange);
2348         rconstr = options.constraintCommunicationRange;
2349     }
2350     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2351
2352     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2353
2354     if (options.numCells[XX] > 0)
2355     {
2356         copy_ivec(options.numCells, dd->nc);
2357         set_dd_dim(mdlog, dd);
2358         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2359
2360         if (options.numPmeRanks >= 0)
2361         {
2362             cr->npmenodes = options.numPmeRanks;
2363         }
2364         else
2365         {
2366             /* When the DD grid is set explicitly and -npme is set to auto,
2367              * don't use PME ranks. We check later if the DD grid is
2368              * compatible with the total number of ranks.
2369              */
2370             cr->npmenodes = 0;
2371         }
2372
2373         real acs = average_cellsize_min(dd, ddbox);
2374         if (acs < comm->cellsize_limit)
2375         {
2376             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2377                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2378                                  acs, comm->cellsize_limit);
2379         }
2380     }
2381     else
2382     {
2383         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2384
2385         /* We need to choose the optimal DD grid and possibly PME nodes */
2386         real limit =
2387             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2388                            options.numPmeRanks,
2389                            !isDlbDisabled(comm),
2390                            options.dlbScaling,
2391                            comm->cellsize_limit, comm->cutoff,
2392                            comm->bInterCGBondeds);
2393
2394         if (dd->nc[XX] == 0)
2395         {
2396             char     buf[STRLEN];
2397             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2398             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2399                     !bC ? "-rdd" : "-rcon",
2400                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2401                     bC ? " or your LINCS settings" : "");
2402
2403             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2404                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2405                                  "%s\n"
2406                                  "Look in the log file for details on the domain decomposition",
2407                                  cr->nnodes-cr->npmenodes, limit, buf);
2408         }
2409         set_dd_dim(mdlog, dd);
2410     }
2411
2412     GMX_LOG(mdlog.info).appendTextFormatted(
2413             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2414             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2415
2416     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2417     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2418     {
2419         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2420                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2421                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2422     }
2423     if (cr->npmenodes > dd->nnodes)
2424     {
2425         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2426                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2427     }
2428     if (cr->npmenodes > 0)
2429     {
2430         comm->npmenodes = cr->npmenodes;
2431     }
2432     else
2433     {
2434         comm->npmenodes = dd->nnodes;
2435     }
2436
2437     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2438     {
2439         /* The following choices should match those
2440          * in comm_cost_est in domdec_setup.c.
2441          * Note that here the checks have to take into account
2442          * that the decomposition might occur in a different order than xyz
2443          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2444          * in which case they will not match those in comm_cost_est,
2445          * but since that is mainly for testing purposes that's fine.
2446          */
2447         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2448             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2449             getenv("GMX_PMEONEDD") == nullptr)
2450         {
2451             comm->npmedecompdim = 2;
2452             comm->npmenodes_x   = dd->nc[XX];
2453             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2454         }
2455         else
2456         {
2457             /* In case nc is 1 in both x and y we could still choose to
2458              * decompose pme in y instead of x, but we use x for simplicity.
2459              */
2460             comm->npmedecompdim = 1;
2461             if (dd->dim[0] == YY)
2462             {
2463                 comm->npmenodes_x = 1;
2464                 comm->npmenodes_y = comm->npmenodes;
2465             }
2466             else
2467             {
2468                 comm->npmenodes_x = comm->npmenodes;
2469                 comm->npmenodes_y = 1;
2470             }
2471         }
2472         GMX_LOG(mdlog.info).appendTextFormatted(
2473                 "PME domain decomposition: %d x %d x %d",
2474                 comm->npmenodes_x, comm->npmenodes_y, 1);
2475     }
2476     else
2477     {
2478         comm->npmedecompdim = 0;
2479         comm->npmenodes_x   = 0;
2480         comm->npmenodes_y   = 0;
2481     }
2482
2483     snew(comm->slb_frac, DIM);
2484     if (isDlbDisabled(comm))
2485     {
2486         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2487         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2488         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2489     }
2490
2491     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
2492     {
2493         if (comm->bBondComm || !isDlbDisabled(comm))
2494         {
2495             /* Set the bonded communication distance to halfway
2496              * the minimum and the maximum,
2497              * since the extra communication cost is nearly zero.
2498              */
2499             real acs           = average_cellsize_min(dd, ddbox);
2500             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2501             if (!isDlbDisabled(comm))
2502             {
2503                 /* Check if this does not limit the scaling */
2504                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2505                                               options.dlbScaling*acs);
2506             }
2507             if (!comm->bBondComm)
2508             {
2509                 /* Without bBondComm do not go beyond the n.b. cut-off */
2510                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2511                 if (comm->cellsize_limit >= comm->cutoff)
2512                 {
2513                     /* We don't loose a lot of efficieny
2514                      * when increasing it to the n.b. cut-off.
2515                      * It can even be slightly faster, because we need
2516                      * less checks for the communication setup.
2517                      */
2518                     comm->cutoff_mbody = comm->cutoff;
2519                 }
2520             }
2521             /* Check if we did not end up below our original limit */
2522             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2523
2524             if (comm->cutoff_mbody > comm->cellsize_limit)
2525             {
2526                 comm->cellsize_limit = comm->cutoff_mbody;
2527             }
2528         }
2529         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2530     }
2531
2532     if (debug)
2533     {
2534         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2535                 "cellsize limit %f\n",
2536                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2537     }
2538
2539     if (MASTER(cr))
2540     {
2541         check_dd_restrictions(dd, ir, mdlog);
2542     }
2543 }
2544
2545 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2546 {
2547     int   ncg, cg;
2548     char *bLocalCG;
2549
2550     ncg = ncg_mtop(mtop);
2551     snew(bLocalCG, ncg);
2552     for (cg = 0; cg < ncg; cg++)
2553     {
2554         bLocalCG[cg] = FALSE;
2555     }
2556
2557     return bLocalCG;
2558 }
2559
2560 void dd_init_bondeds(FILE *fplog,
2561                      gmx_domdec_t *dd,
2562                      const gmx_mtop_t *mtop,
2563                      const gmx_vsite_t *vsite,
2564                      const t_inputrec *ir,
2565                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2566 {
2567     gmx_domdec_comm_t *comm;
2568
2569     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2570
2571     comm = dd->comm;
2572
2573     if (comm->bBondComm)
2574     {
2575         /* Communicate atoms beyond the cut-off for bonded interactions */
2576         comm = dd->comm;
2577
2578         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2579
2580         comm->bLocalCG = init_bLocalCG(mtop);
2581     }
2582     else
2583     {
2584         /* Only communicate atoms based on cut-off */
2585         comm->cglink   = nullptr;
2586         comm->bLocalCG = nullptr;
2587     }
2588 }
2589
2590 static void writeSettings(gmx::TextWriter       *log,
2591                           gmx_domdec_t          *dd,
2592                           const gmx_mtop_t      *mtop,
2593                           const t_inputrec      *ir,
2594                           gmx_bool               bDynLoadBal,
2595                           real                   dlb_scale,
2596                           const gmx_ddbox_t     *ddbox)
2597 {
2598     gmx_domdec_comm_t *comm;
2599     int                d;
2600     ivec               np;
2601     real               limit, shrink;
2602
2603     comm = dd->comm;
2604
2605     if (bDynLoadBal)
2606     {
2607         log->writeString("The maximum number of communication pulses is:");
2608         for (d = 0; d < dd->ndim; d++)
2609         {
2610             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2611         }
2612         log->ensureLineBreak();
2613         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2614         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2615         log->writeString("The allowed shrink of domain decomposition cells is:");
2616         for (d = 0; d < DIM; d++)
2617         {
2618             if (dd->nc[d] > 1)
2619             {
2620                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2621                 {
2622                     shrink = 0;
2623                 }
2624                 else
2625                 {
2626                     shrink =
2627                         comm->cellsize_min_dlb[d]/
2628                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2629                 }
2630                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2631             }
2632         }
2633         log->ensureLineBreak();
2634     }
2635     else
2636     {
2637         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2638         log->writeString("The initial number of communication pulses is:");
2639         for (d = 0; d < dd->ndim; d++)
2640         {
2641             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2642         }
2643         log->ensureLineBreak();
2644         log->writeString("The initial domain decomposition cell size is:");
2645         for (d = 0; d < DIM; d++)
2646         {
2647             if (dd->nc[d] > 1)
2648             {
2649                 log->writeStringFormatted(" %c %.2f nm",
2650                                           dim2char(d), dd->comm->cellsize_min[d]);
2651             }
2652         }
2653         log->ensureLineBreak();
2654         log->writeLine();
2655     }
2656
2657     gmx_bool bInterCGVsites = count_intercg_vsites(mtop) != 0;
2658
2659     if (comm->bInterCGBondeds ||
2660         bInterCGVsites ||
2661         dd->splitConstraints || dd->splitSettles)
2662     {
2663         std::string decompUnits;
2664         if (comm->bCGs)
2665         {
2666             decompUnits = "charge groups";
2667         }
2668         else if (comm->useUpdateGroups)
2669         {
2670             decompUnits = "atom groups";
2671         }
2672         else
2673         {
2674             decompUnits = "atoms";
2675         }
2676
2677         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2678         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2679
2680         if (bDynLoadBal)
2681         {
2682             limit = dd->comm->cellsize_limit;
2683         }
2684         else
2685         {
2686             if (dynamic_dd_box(*dd))
2687             {
2688                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2689             }
2690             limit = dd->comm->cellsize_min[XX];
2691             for (d = 1; d < DIM; d++)
2692             {
2693                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2694             }
2695         }
2696
2697         if (comm->bInterCGBondeds)
2698         {
2699             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2700                                     "two-body bonded interactions", "(-rdd)",
2701                                     std::max(comm->cutoff, comm->cutoff_mbody));
2702             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2703                                     "multi-body bonded interactions", "(-rdd)",
2704                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2705         }
2706         if (bInterCGVsites)
2707         {
2708             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2709                                     "virtual site constructions", "(-rcon)", limit);
2710         }
2711         if (dd->splitConstraints || dd->splitSettles)
2712         {
2713             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2714                                                        1+ir->nProjOrder);
2715             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2716                                     separation.c_str(), "(-rcon)", limit);
2717         }
2718         log->ensureLineBreak();
2719     }
2720 }
2721
2722 static void logSettings(const gmx::MDLogger &mdlog,
2723                         gmx_domdec_t        *dd,
2724                         const gmx_mtop_t    *mtop,
2725                         const t_inputrec    *ir,
2726                         real                 dlb_scale,
2727                         const gmx_ddbox_t   *ddbox)
2728 {
2729     gmx::StringOutputStream stream;
2730     gmx::TextWriter         log(&stream);
2731     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2732     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2733     {
2734         {
2735             log.ensureEmptyLine();
2736             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2737         }
2738         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2739     }
2740     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2741 }
2742
2743 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2744                                 gmx_domdec_t        *dd,
2745                                 real                 dlb_scale,
2746                                 const t_inputrec    *ir,
2747                                 const gmx_ddbox_t   *ddbox)
2748 {
2749     gmx_domdec_comm_t *comm;
2750     int                d, dim, npulse, npulse_d_max, npulse_d;
2751     gmx_bool           bNoCutOff;
2752
2753     comm = dd->comm;
2754
2755     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2756
2757     /* Determine the maximum number of comm. pulses in one dimension */
2758
2759     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2760
2761     /* Determine the maximum required number of grid pulses */
2762     if (comm->cellsize_limit >= comm->cutoff)
2763     {
2764         /* Only a single pulse is required */
2765         npulse = 1;
2766     }
2767     else if (!bNoCutOff && comm->cellsize_limit > 0)
2768     {
2769         /* We round down slightly here to avoid overhead due to the latency
2770          * of extra communication calls when the cut-off
2771          * would be only slightly longer than the cell size.
2772          * Later cellsize_limit is redetermined,
2773          * so we can not miss interactions due to this rounding.
2774          */
2775         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2776     }
2777     else
2778     {
2779         /* There is no cell size limit */
2780         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2781     }
2782
2783     if (!bNoCutOff && npulse > 1)
2784     {
2785         /* See if we can do with less pulses, based on dlb_scale */
2786         npulse_d_max = 0;
2787         for (d = 0; d < dd->ndim; d++)
2788         {
2789             dim      = dd->dim[d];
2790             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2791                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2792             npulse_d_max = std::max(npulse_d_max, npulse_d);
2793         }
2794         npulse = std::min(npulse, npulse_d_max);
2795     }
2796
2797     /* This env var can override npulse */
2798     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2799     if (d > 0)
2800     {
2801         npulse = d;
2802     }
2803
2804     comm->maxpulse       = 1;
2805     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2806     for (d = 0; d < dd->ndim; d++)
2807     {
2808         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2809         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2810         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2811         {
2812             comm->bVacDLBNoLimit = FALSE;
2813         }
2814     }
2815
2816     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2817     if (!comm->bVacDLBNoLimit)
2818     {
2819         comm->cellsize_limit = std::max(comm->cellsize_limit,
2820                                         comm->cutoff/comm->maxpulse);
2821     }
2822     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2823     /* Set the minimum cell size for each DD dimension */
2824     for (d = 0; d < dd->ndim; d++)
2825     {
2826         if (comm->bVacDLBNoLimit ||
2827             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2828         {
2829             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2830         }
2831         else
2832         {
2833             comm->cellsize_min_dlb[dd->dim[d]] =
2834                 comm->cutoff/comm->cd[d].np_dlb;
2835         }
2836     }
2837     if (comm->cutoff_mbody <= 0)
2838     {
2839         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2840     }
2841     if (isDlbOn(comm))
2842     {
2843         set_dlb_limits(dd);
2844     }
2845 }
2846
2847 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2848 {
2849     /* If each molecule is a single charge group
2850      * or we use domain decomposition for each periodic dimension,
2851      * we do not need to take pbc into account for the bonded interactions.
2852      */
2853     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
2854             !(dd->nc[XX] > 1 &&
2855               dd->nc[YY] > 1 &&
2856               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2857 }
2858
2859 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2860 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2861                                   gmx_domdec_t *dd, real dlb_scale,
2862                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2863                                   const gmx_ddbox_t *ddbox)
2864 {
2865     gmx_domdec_comm_t *comm;
2866     int                natoms_tot;
2867     real               vol_frac;
2868
2869     comm = dd->comm;
2870
2871     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2872     {
2873         init_ddpme(dd, &comm->ddpme[0], 0);
2874         if (comm->npmedecompdim >= 2)
2875         {
2876             init_ddpme(dd, &comm->ddpme[1], 1);
2877         }
2878     }
2879     else
2880     {
2881         comm->npmenodes = 0;
2882         if (dd->pme_nodeid >= 0)
2883         {
2884             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2885                                  "Can not have separate PME ranks without PME electrostatics");
2886         }
2887     }
2888
2889     if (debug)
2890     {
2891         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2892     }
2893     if (!isDlbDisabled(comm))
2894     {
2895         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2896     }
2897
2898     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2899
2900     if (ir->ePBC == epbcNONE)
2901     {
2902         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2903     }
2904     else
2905     {
2906         vol_frac =
2907             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2908     }
2909     if (debug)
2910     {
2911         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2912     }
2913     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2914
2915     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2916                                  static_cast<int>(vol_frac*natoms_tot));
2917 }
2918
2919 /*! \brief Set some important DD parameters that can be modified by env.vars */
2920 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2921                                   gmx_domdec_t *dd, int rank_mysim)
2922 {
2923     gmx_domdec_comm_t *comm = dd->comm;
2924
2925     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2926     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2927     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2928     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2929     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2930     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2931     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2932
2933     if (dd->bSendRecv2)
2934     {
2935         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2936     }
2937
2938     if (comm->eFlop)
2939     {
2940         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2941         if (comm->eFlop > 1)
2942         {
2943             srand(1 + rank_mysim);
2944         }
2945         comm->bRecordLoad = TRUE;
2946     }
2947     else
2948     {
2949         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2950     }
2951 }
2952
2953 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2954                                         t_commrec                     *cr,
2955                                         const DomdecOptions           &options,
2956                                         const MdrunOptions            &mdrunOptions,
2957                                         const gmx_mtop_t              *mtop,
2958                                         const t_inputrec              *ir,
2959                                         const matrix                   box,
2960                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2961                                         gmx::LocalAtomSetManager      *atomSets)
2962 {
2963     gmx_domdec_t      *dd;
2964
2965     GMX_LOG(mdlog.info).appendTextFormatted(
2966             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2967
2968     dd = new gmx_domdec_t;
2969
2970     dd->comm = init_dd_comm();
2971
2972     /* Initialize DD paritioning counters */
2973     dd->comm->partition_step = INT_MIN;
2974     dd->ddp_count            = 0;
2975
2976     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2977
2978     gmx_ddbox_t ddbox = {0};
2979     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2980                            mtop, ir,
2981                            box, xGlobal,
2982                            &ddbox);
2983
2984     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2985
2986     if (thisRankHasDuty(cr, DUTY_PP))
2987     {
2988         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2989
2990         setup_neighbor_relations(dd);
2991     }
2992
2993     /* Set overallocation to avoid frequent reallocation of arrays */
2994     set_over_alloc_dd(TRUE);
2995
2996     clear_dd_cycle_counts(dd);
2997
2998     dd->atomSets = atomSets;
2999
3000     return dd;
3001 }
3002
3003 static gmx_bool test_dd_cutoff(t_commrec     *cr,
3004                                const t_state &state,
3005                                real           cutoffRequested)
3006 {
3007     gmx_domdec_t *dd;
3008     gmx_ddbox_t   ddbox;
3009     int           d, dim, np;
3010     real          inv_cell_size;
3011     int           LocallyLimited;
3012
3013     dd = cr->dd;
3014
3015     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
3016
3017     LocallyLimited = 0;
3018
3019     for (d = 0; d < dd->ndim; d++)
3020     {
3021         dim = dd->dim[d];
3022
3023         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3024         if (dynamic_dd_box(*dd))
3025         {
3026             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3027         }
3028
3029         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3030
3031         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3032         {
3033             if (np > dd->comm->cd[d].np_dlb)
3034             {
3035                 return FALSE;
3036             }
3037
3038             /* If a current local cell size is smaller than the requested
3039              * cut-off, we could still fix it, but this gets very complicated.
3040              * Without fixing here, we might actually need more checks.
3041              */
3042             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3043             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3044             {
3045                 LocallyLimited = 1;
3046             }
3047         }
3048     }
3049
3050     if (!isDlbDisabled(dd->comm))
3051     {
3052         /* If DLB is not active yet, we don't need to check the grid jumps.
3053          * Actually we shouldn't, because then the grid jump data is not set.
3054          */
3055         if (isDlbOn(dd->comm) &&
3056             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3057         {
3058             LocallyLimited = 1;
3059         }
3060
3061         gmx_sumi(1, &LocallyLimited, cr);
3062
3063         if (LocallyLimited > 0)
3064         {
3065             return FALSE;
3066         }
3067     }
3068
3069     return TRUE;
3070 }
3071
3072 gmx_bool change_dd_cutoff(t_commrec     *cr,
3073                           const t_state &state,
3074                           real           cutoffRequested)
3075 {
3076     gmx_bool bCutoffAllowed;
3077
3078     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3079
3080     if (bCutoffAllowed)
3081     {
3082         cr->dd->comm->cutoff = cutoffRequested;
3083     }
3084
3085     return bCutoffAllowed;
3086 }