src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51 #include <memory>
  52
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/options.h"
  59 #include "gromacs/domdec/partition.h"
  60 #include "gromacs/gmxlib/network.h"
  61 #include "gromacs/gmxlib/nrnb.h"
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/hardware/hw_info.h"
  64 #include "gromacs/listed_forces/manage_threading.h"
  65 #include "gromacs/math/vec.h"
  66 #include "gromacs/math/vectypes.h"
  67 #include "gromacs/mdlib/calc_verletbuf.h"
  68 #include "gromacs/mdlib/constr.h"
  69 #include "gromacs/mdlib/constraintrange.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/inputrec.h"
  74 #include "gromacs/mdtypes/mdrunoptions.h"
  75 #include "gromacs/mdtypes/state.h"
  76 #include "gromacs/pbcutil/ishift.h"
  77 #include "gromacs/pbcutil/pbc.h"
  78 #include "gromacs/pulling/pull.h"
  79 #include "gromacs/timing/wallcycle.h"
  80 #include "gromacs/topology/block.h"
  81 #include "gromacs/topology/idef.h"
  82 #include "gromacs/topology/ifunc.h"
  83 #include "gromacs/topology/mtop_lookup.h"
  84 #include "gromacs/topology/mtop_util.h"
  85 #include "gromacs/topology/topology.h"
  86 #include "gromacs/utility/basedefinitions.h"
  87 #include "gromacs/utility/basenetwork.h"
  88 #include "gromacs/utility/cstringutil.h"
  89 #include "gromacs/utility/exceptions.h"
  90 #include "gromacs/utility/fatalerror.h"
  91 #include "gromacs/utility/gmxmpi.h"
  92 #include "gromacs/utility/logger.h"
  93 #include "gromacs/utility/real.h"
  94 #include "gromacs/utility/smalloc.h"
  95 #include "gromacs/utility/strconvert.h"
  96 #include "gromacs/utility/stringstream.h"
  97 #include "gromacs/utility/stringutil.h"
  98 #include "gromacs/utility/textwriter.h"
  99
 100 #include "atomdistribution.h"
 101 #include "box.h"
 102 #include "cellsizes.h"
 103 #include "distribute.h"
 104 #include "domdec_constraints.h"
 105 #include "domdec_internal.h"
 106 #include "domdec_vsite.h"
 107 #include "redistribute.h"
 108 #include "utility.h"
 109
 110 // TODO remove this when moving domdec into gmx namespace
 111 using gmx::DdRankOrder;
 112 using gmx::DlbOption;
 113 using gmx::DomdecOptions;
 114
 115 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 116
 117 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 118 #define DD_CGIBS 2
 119
 120 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 121 #define DD_FLAG_NRCG  65535
 122 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 123 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 124
 125 /* The DD zone order */
 126 static const ivec dd_zo[DD_MAXZONE] =
 127 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 128
 129 /* The non-bonded zone-pair setup for domain decomposition
 130  * The first number is the i-zone, the second number the first j-zone seen by
 131  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 132  * As is, this is for 3D decomposition, where there are 4 i-zones.
 133  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 134  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 135  */
 136 static const int
 137     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 138                                                  {1, 3, 6},
 139                                                  {2, 5, 6},
 140                                                  {3, 5, 7}};
 141
 142
 143 /*
 144    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 145
 146    static void index2xyz(ivec nc,int ind,ivec xyz)
 147    {
 148    xyz[XX] = ind % nc[XX];
 149    xyz[YY] = (ind / nc[XX]) % nc[YY];
 150    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 151    }
 152  */
 153
 154 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 155 {
 156     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 157     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 158     xyz[ZZ] = ind % nc[ZZ];
 159 }
 160
 161 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 162 {
 163     int ddindex;
 164     int ddnodeid = -1;
 165
 166     ddindex = dd_index(dd->nc, c);
 167     if (dd->comm->bCartesianPP_PME)
 168     {
 169         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 170     }
 171     else if (dd->comm->bCartesianPP)
 172     {
 173 #if GMX_MPI
 174         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 175 #endif
 176     }
 177     else
 178     {
 179         ddnodeid = ddindex;
 180     }
 181
 182     return ddnodeid;
 183 }
 184
 185 int ddglatnr(const gmx_domdec_t *dd, int i)
 186 {
 187     int atnr;
 188
 189     if (dd == nullptr)
 190     {
 191         atnr = i + 1;
 192     }
 193     else
 194     {
 195         if (i >= dd->comm->atomRanges.numAtomsTotal())
 196         {
 197             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 198         }
 199         atnr = dd->globalAtomIndices[i] + 1;
 200     }
 201
 202     return atnr;
 203 }
 204
 205 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 206 {
 207     return &dd->comm->cgs_gl;
 208 }
 209
 210 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t &dd)
 211 {
 212     GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 213     return dd.comm->updateGroupingPerMoleculetype;
 214 }
 215
 216 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 217 {
 218     int i;
 219
 220     if (state->ddp_count != dd->ddp_count)
 221     {
 222         gmx_incons("The MD state does not match the domain decomposition state");
 223     }
 224
 225     state->cg_gl.resize(dd->ncg_home);
 226     for (i = 0; i < dd->ncg_home; i++)
 227     {
 228         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 229     }
 230
 231     state->ddp_count_cg_gl = dd->ddp_count;
 232 }
 233
 234 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 235 {
 236     return &dd->comm->zones;
 237 }
 238
 239 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 240                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 241 {
 242     gmx_domdec_zones_t *zones;
 243     int                 izone, d, dim;
 244
 245     zones = &dd->comm->zones;
 246
 247     izone = 0;
 248     while (icg >= zones->izone[izone].cg1)
 249     {
 250         izone++;
 251     }
 252
 253     if (izone == 0)
 254     {
 255         *jcg0 = icg;
 256     }
 257     else if (izone < zones->nizone)
 258     {
 259         *jcg0 = zones->izone[izone].jcg0;
 260     }
 261     else
 262     {
 263         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 264                   icg, izone, zones->nizone);
 265     }
 266
 267     *jcg1 = zones->izone[izone].jcg1;
 268
 269     for (d = 0; d < dd->ndim; d++)
 270     {
 271         dim         = dd->dim[d];
 272         shift0[dim] = zones->izone[izone].shift0[dim];
 273         shift1[dim] = zones->izone[izone].shift1[dim];
 274         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 275         {
 276             /* A conservative approach, this can be optimized */
 277             shift0[dim] -= 1;
 278             shift1[dim] += 1;
 279         }
 280     }
 281 }
 282
 283 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 284 {
 285     return dd.comm->atomRanges.numHomeAtoms();
 286 }
 287
 288 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 289 {
 290     /* We currently set mdatoms entries for all atoms:
 291      * local + non-local + communicated for vsite + constraints
 292      */
 293
 294     return dd->comm->atomRanges.numAtomsTotal();
 295 }
 296
 297 int dd_natoms_vsite(const gmx_domdec_t *dd)
 298 {
 299     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 300 }
 301
 302 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 303 {
 304     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 305     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 306 }
 307
 308 void dd_move_x(gmx_domdec_t             *dd,
 309                const matrix              box,
 310                gmx::ArrayRef<gmx::RVec>  x,
 311                gmx_wallcycle            *wcycle)
 312 {
 313     wallcycle_start(wcycle, ewcMOVEX);
 314
 315     int                    nzone, nat_tot;
 316     gmx_domdec_comm_t     *comm;
 317     gmx_domdec_comm_dim_t *cd;
 318     rvec                   shift = {0, 0, 0};
 319     gmx_bool               bPBC, bScrew;
 320
 321     comm = dd->comm;
 322
 323     nzone   = 1;
 324     nat_tot = comm->atomRanges.numHomeAtoms();
 325     for (int d = 0; d < dd->ndim; d++)
 326     {
 327         bPBC   = (dd->ci[dd->dim[d]] == 0);
 328         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 329         if (bPBC)
 330         {
 331             copy_rvec(box[dd->dim[d]], shift);
 332         }
 333         cd = &comm->cd[d];
 334         for (const gmx_domdec_ind_t &ind : cd->ind)
 335         {
 336             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 337             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 338             int                        n          = 0;
 339             if (!bPBC)
 340             {
 341                 for (int j : ind.index)
 342                 {
 343                     sendBuffer[n] = x[j];
 344                     n++;
 345                 }
 346             }
 347             else if (!bScrew)
 348             {
 349                 for (int j : ind.index)
 350                 {
 351                     /* We need to shift the coordinates */
 352                     for (int d = 0; d < DIM; d++)
 353                     {
 354                         sendBuffer[n][d] = x[j][d] + shift[d];
 355                     }
 356                     n++;
 357                 }
 358             }
 359             else
 360             {
 361                 for (int j : ind.index)
 362                 {
 363                     /* Shift x */
 364                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 365                     /* Rotate y and z.
 366                      * This operation requires a special shift force
 367                      * treatment, which is performed in calc_vir.
 368                      */
 369                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 370                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 371                     n++;
 372                 }
 373             }
 374
 375             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 376
 377             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 378             if (cd->receiveInPlace)
 379             {
 380                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 381             }
 382             else
 383             {
 384                 receiveBuffer = receiveBufferAccess.buffer;
 385             }
 386             /* Send and receive the coordinates */
 387             ddSendrecv(dd, d, dddirBackward,
 388                        sendBuffer, receiveBuffer);
 389
 390             if (!cd->receiveInPlace)
 391             {
 392                 int j = 0;
 393                 for (int zone = 0; zone < nzone; zone++)
 394                 {
 395                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 396                     {
 397                         x[i] = receiveBuffer[j++];
 398                     }
 399                 }
 400             }
 401             nat_tot += ind.nrecv[nzone+1];
 402         }
 403         nzone += nzone;
 404     }
 405
 406     wallcycle_stop(wcycle, ewcMOVEX);
 407 }
 408
 409 void dd_move_f(gmx_domdec_t             *dd,
 410                gmx::ArrayRef<gmx::RVec>  f,
 411                rvec                     *fshift,
 412                gmx_wallcycle            *wcycle)
 413 {
 414     wallcycle_start(wcycle, ewcMOVEF);
 415
 416     int                    nzone, nat_tot;
 417     gmx_domdec_comm_t     *comm;
 418     gmx_domdec_comm_dim_t *cd;
 419     ivec                   vis;
 420     int                    is;
 421     gmx_bool               bShiftForcesNeedPbc, bScrew;
 422
 423     comm = dd->comm;
 424
 425     nzone   = comm->zones.n/2;
 426     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 427     for (int d = dd->ndim-1; d >= 0; d--)
 428     {
 429         /* Only forces in domains near the PBC boundaries need to
 430            consider PBC in the treatment of fshift */
 431         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 432         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 433         if (fshift == nullptr && !bScrew)
 434         {
 435             bShiftForcesNeedPbc = FALSE;
 436         }
 437         /* Determine which shift vector we need */
 438         clear_ivec(vis);
 439         vis[dd->dim[d]] = 1;
 440         is              = IVEC2IS(vis);
 441
 442         cd = &comm->cd[d];
 443         for (int p = cd->numPulses() - 1; p >= 0; p--)
 444         {
 445             const gmx_domdec_ind_t    &ind  = cd->ind[p];
 446             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 447             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 448
 449             nat_tot                        -= ind.nrecv[nzone+1];
 450
 451             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 452
 453             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 454             if (cd->receiveInPlace)
 455             {
 456                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 457             }
 458             else
 459             {
 460                 sendBuffer = sendBufferAccess.buffer;
 461                 int j = 0;
 462                 for (int zone = 0; zone < nzone; zone++)
 463                 {
 464                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 465                     {
 466                         sendBuffer[j++] = f[i];
 467                     }
 468                 }
 469             }
 470             /* Communicate the forces */
 471             ddSendrecv(dd, d, dddirForward,
 472                        sendBuffer, receiveBuffer);
 473             /* Add the received forces */
 474             int n = 0;
 475             if (!bShiftForcesNeedPbc)
 476             {
 477                 for (int j : ind.index)
 478                 {
 479                     for (int d = 0; d < DIM; d++)
 480                     {
 481                         f[j][d] += receiveBuffer[n][d];
 482                     }
 483                     n++;
 484                 }
 485             }
 486             else if (!bScrew)
 487             {
 488                 /* fshift should always be defined if this function is
 489                  * called when bShiftForcesNeedPbc is true */
 490                 assert(nullptr != fshift);
 491                 for (int j : ind.index)
 492                 {
 493                     for (int d = 0; d < DIM; d++)
 494                     {
 495                         f[j][d] += receiveBuffer[n][d];
 496                     }
 497                     /* Add this force to the shift force */
 498                     for (int d = 0; d < DIM; d++)
 499                     {
 500                         fshift[is][d] += receiveBuffer[n][d];
 501                     }
 502                     n++;
 503                 }
 504             }
 505             else
 506             {
 507                 for (int j : ind.index)
 508                 {
 509                     /* Rotate the force */
 510                     f[j][XX] += receiveBuffer[n][XX];
 511                     f[j][YY] -= receiveBuffer[n][YY];
 512                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 513                     if (fshift)
 514                     {
 515                         /* Add this force to the shift force */
 516                         for (int d = 0; d < DIM; d++)
 517                         {
 518                             fshift[is][d] += receiveBuffer[n][d];
 519                         }
 520                     }
 521                     n++;
 522                 }
 523             }
 524         }
 525         nzone /= 2;
 526     }
 527     wallcycle_stop(wcycle, ewcMOVEF);
 528 }
 529
 530 /* Convenience function for extracting a real buffer from an rvec buffer
 531  *
 532  * To reduce the number of temporary communication buffers and avoid
 533  * cache polution, we reuse gmx::RVec buffers for storing reals.
 534  * This functions return a real buffer reference with the same number
 535  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 536  */
 537 static gmx::ArrayRef<real>
 538 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 539 {
 540     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 541                                   arrayRef.size());
 542 }
 543
 544 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 545 {
 546     int                    nzone, nat_tot;
 547     gmx_domdec_comm_t     *comm;
 548     gmx_domdec_comm_dim_t *cd;
 549
 550     comm = dd->comm;
 551
 552     nzone   = 1;
 553     nat_tot = comm->atomRanges.numHomeAtoms();
 554     for (int d = 0; d < dd->ndim; d++)
 555     {
 556         cd = &comm->cd[d];
 557         for (const gmx_domdec_ind_t &ind : cd->ind)
 558         {
 559             /* Note: We provision for RVec instead of real, so a factor of 3
 560              * more than needed. The buffer actually already has this size
 561              * and we pass a plain pointer below, so this does not matter.
 562              */
 563             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 564             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 565             int                       n          = 0;
 566             for (int j : ind.index)
 567             {
 568                 sendBuffer[n++] = v[j];
 569             }
 570
 571             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 572
 573             gmx::ArrayRef<real>       receiveBuffer;
 574             if (cd->receiveInPlace)
 575             {
 576                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 577             }
 578             else
 579             {
 580                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 581             }
 582             /* Send and receive the data */
 583             ddSendrecv(dd, d, dddirBackward,
 584                        sendBuffer, receiveBuffer);
 585             if (!cd->receiveInPlace)
 586             {
 587                 int j = 0;
 588                 for (int zone = 0; zone < nzone; zone++)
 589                 {
 590                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 591                     {
 592                         v[i] = receiveBuffer[j++];
 593                     }
 594                 }
 595             }
 596             nat_tot += ind.nrecv[nzone+1];
 597         }
 598         nzone += nzone;
 599     }
 600 }
 601
 602 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 603 {
 604     int                    nzone, nat_tot;
 605     gmx_domdec_comm_t     *comm;
 606     gmx_domdec_comm_dim_t *cd;
 607
 608     comm = dd->comm;
 609
 610     nzone   = comm->zones.n/2;
 611     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 612     for (int d = dd->ndim-1; d >= 0; d--)
 613     {
 614         cd = &comm->cd[d];
 615         for (int p = cd->numPulses() - 1; p >= 0; p--)
 616         {
 617             const gmx_domdec_ind_t &ind = cd->ind[p];
 618
 619             /* Note: We provision for RVec instead of real, so a factor of 3
 620              * more than needed. The buffer actually already has this size
 621              * and we typecast, so this works as intended.
 622              */
 623             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 624             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 625             nat_tot -= ind.nrecv[nzone + 1];
 626
 627             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 628
 629             gmx::ArrayRef<real>       sendBuffer;
 630             if (cd->receiveInPlace)
 631             {
 632                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 633             }
 634             else
 635             {
 636                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 637                 int j = 0;
 638                 for (int zone = 0; zone < nzone; zone++)
 639                 {
 640                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 641                     {
 642                         sendBuffer[j++] = v[i];
 643                     }
 644                 }
 645             }
 646             /* Communicate the forces */
 647             ddSendrecv(dd, d, dddirForward,
 648                        sendBuffer, receiveBuffer);
 649             /* Add the received forces */
 650             int n = 0;
 651             for (int j : ind.index)
 652             {
 653                 v[j] += receiveBuffer[n];
 654                 n++;
 655             }
 656         }
 657         nzone /= 2;
 658     }
 659 }
 660
 661 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 662 {
 663     gmx_domdec_comm_t *comm;
 664     int                di;
 665     real               r;
 666
 667     comm = dd->comm;
 668
 669     r = -1;
 670     if (comm->haveInterDomainMultiBodyBondeds)
 671     {
 672         if (comm->cutoff_mbody > 0)
 673         {
 674             r = comm->cutoff_mbody;
 675         }
 676         else
 677         {
 678             /* cutoff_mbody=0 means we do not have DLB */
 679             r = comm->cellsize_min[dd->dim[0]];
 680             for (di = 1; di < dd->ndim; di++)
 681             {
 682                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 683             }
 684             if (comm->bBondComm)
 685             {
 686                 r = std::max(r, comm->cutoff_mbody);
 687             }
 688             else
 689             {
 690                 r = std::min(r, comm->cutoff);
 691             }
 692         }
 693     }
 694
 695     return r;
 696 }
 697
 698 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 699 {
 700     real r_mb;
 701
 702     r_mb = dd_cutoff_multibody(dd);
 703
 704     return std::max(dd->comm->cutoff, r_mb);
 705 }
 706
 707
 708 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 709                                    ivec coord_pme)
 710 {
 711     int nc, ntot;
 712
 713     nc   = dd->nc[dd->comm->cartpmedim];
 714     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 715     copy_ivec(coord, coord_pme);
 716     coord_pme[dd->comm->cartpmedim] =
 717         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 718 }
 719
 720 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 721 {
 722     int npp, npme;
 723
 724     npp  = dd->nnodes;
 725     npme = dd->comm->npmenodes;
 726
 727     /* Here we assign a PME node to communicate with this DD node
 728      * by assuming that the major index of both is x.
 729      * We add cr->npmenodes/2 to obtain an even distribution.
 730      */
 731     return (ddindex*npme + npme/2)/npp;
 732 }
 733
 734 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 735 {
 736     int *pme_rank;
 737     int  n, i, p0, p1;
 738
 739     snew(pme_rank, dd->comm->npmenodes);
 740     n = 0;
 741     for (i = 0; i < dd->nnodes; i++)
 742     {
 743         p0 = ddindex2pmeindex(dd, i);
 744         p1 = ddindex2pmeindex(dd, i+1);
 745         if (i+1 == dd->nnodes || p1 > p0)
 746         {
 747             if (debug)
 748             {
 749                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 750             }
 751             pme_rank[n] = i + 1 + n;
 752             n++;
 753         }
 754     }
 755
 756     return pme_rank;
 757 }
 758
 759 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 760 {
 761     gmx_domdec_t *dd;
 762     ivec          coords;
 763     int           slab;
 764
 765     dd = cr->dd;
 766     /*
 767        if (dd->comm->bCartesian) {
 768        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 769        dd_coords2pmecoords(dd,coords,coords_pme);
 770        copy_ivec(dd->ntot,nc);
 771        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 772        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 773
 774        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 775        } else {
 776        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 777        }
 778      */
 779     coords[XX] = x;
 780     coords[YY] = y;
 781     coords[ZZ] = z;
 782     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 783
 784     return slab;
 785 }
 786
 787 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 788 {
 789     gmx_domdec_comm_t *comm;
 790     ivec               coords;
 791     int                ddindex, nodeid = -1;
 792
 793     comm = cr->dd->comm;
 794
 795     coords[XX] = x;
 796     coords[YY] = y;
 797     coords[ZZ] = z;
 798     if (comm->bCartesianPP_PME)
 799     {
 800 #if GMX_MPI
 801         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 802 #endif
 803     }
 804     else
 805     {
 806         ddindex = dd_index(cr->dd->nc, coords);
 807         if (comm->bCartesianPP)
 808         {
 809             nodeid = comm->ddindex2simnodeid[ddindex];
 810         }
 811         else
 812         {
 813             if (comm->pmenodes)
 814             {
 815                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 816             }
 817             else
 818             {
 819                 nodeid = ddindex;
 820             }
 821         }
 822     }
 823
 824     return nodeid;
 825 }
 826
 827 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 828                               const t_commrec gmx_unused *cr,
 829                               int                         sim_nodeid)
 830 {
 831     int pmenode = -1;
 832
 833     const gmx_domdec_comm_t *comm = dd->comm;
 834
 835     /* This assumes a uniform x domain decomposition grid cell size */
 836     if (comm->bCartesianPP_PME)
 837     {
 838 #if GMX_MPI
 839         ivec coord, coord_pme;
 840         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 841         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 842         {
 843             /* This is a PP node */
 844             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 845             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 846         }
 847 #endif
 848     }
 849     else if (comm->bCartesianPP)
 850     {
 851         if (sim_nodeid < dd->nnodes)
 852         {
 853             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 854         }
 855     }
 856     else
 857     {
 858         /* This assumes DD cells with identical x coordinates
 859          * are numbered sequentially.
 860          */
 861         if (dd->comm->pmenodes == nullptr)
 862         {
 863             if (sim_nodeid < dd->nnodes)
 864             {
 865                 /* The DD index equals the nodeid */
 866                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 867             }
 868         }
 869         else
 870         {
 871             int i = 0;
 872             while (sim_nodeid > dd->comm->pmenodes[i])
 873             {
 874                 i++;
 875             }
 876             if (sim_nodeid < dd->comm->pmenodes[i])
 877             {
 878                 pmenode = dd->comm->pmenodes[i];
 879             }
 880         }
 881     }
 882
 883     return pmenode;
 884 }
 885
 886 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 887 {
 888     if (dd != nullptr)
 889     {
 890         return {
 891                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 892         };
 893     }
 894     else
 895     {
 896         return {
 897                    1, 1
 898         };
 899     }
 900 }
 901
 902 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 903 {
 904     gmx_domdec_t *dd;
 905     int           x, y, z;
 906     ivec          coord, coord_pme;
 907
 908     dd = cr->dd;
 909
 910     std::vector<int> ddranks;
 911     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 912
 913     for (x = 0; x < dd->nc[XX]; x++)
 914     {
 915         for (y = 0; y < dd->nc[YY]; y++)
 916         {
 917             for (z = 0; z < dd->nc[ZZ]; z++)
 918             {
 919                 if (dd->comm->bCartesianPP_PME)
 920                 {
 921                     coord[XX] = x;
 922                     coord[YY] = y;
 923                     coord[ZZ] = z;
 924                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 925                     if (dd->ci[XX] == coord_pme[XX] &&
 926                         dd->ci[YY] == coord_pme[YY] &&
 927                         dd->ci[ZZ] == coord_pme[ZZ])
 928                     {
 929                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 930                     }
 931                 }
 932                 else
 933                 {
 934                     /* The slab corresponds to the nodeid in the PME group */
 935                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 936                     {
 937                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 938                     }
 939                 }
 940             }
 941         }
 942     }
 943     return ddranks;
 944 }
 945
 946 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 947 {
 948     gmx_bool bReceive = TRUE;
 949
 950     if (cr->npmenodes < dd->nnodes)
 951     {
 952         gmx_domdec_comm_t *comm = dd->comm;
 953         if (comm->bCartesianPP_PME)
 954         {
 955 #if GMX_MPI
 956             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 957             ivec coords;
 958             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 959             coords[comm->cartpmedim]++;
 960             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 961             {
 962                 int rank;
 963                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 964                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 965                 {
 966                     /* This is not the last PP node for pmenode */
 967                     bReceive = FALSE;
 968                 }
 969             }
 970 #else
 971             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 972 #endif
 973         }
 974         else
 975         {
 976             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 977             if (cr->sim_nodeid+1 < cr->nnodes &&
 978                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 979             {
 980                 /* This is not the last PP node for pmenode */
 981                 bReceive = FALSE;
 982             }
 983         }
 984     }
 985
 986     return bReceive;
 987 }
 988
 989 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 990 {
 991     gmx_domdec_comm_t *comm;
 992     int                i;
 993
 994     comm = dd->comm;
 995
 996     snew(*dim_f, dd->nc[dim]+1);
 997     (*dim_f)[0] = 0;
 998     for (i = 1; i < dd->nc[dim]; i++)
 999     {
1000         if (comm->slb_frac[dim])
1001         {
1002             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1003         }
1004         else
1005         {
1006             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1007         }
1008     }
1009     (*dim_f)[dd->nc[dim]] = 1;
1010 }
1011
1012 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1013 {
1014     int  pmeindex, slab, nso, i;
1015     ivec xyz;
1016
1017     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1018     {
1019         ddpme->dim = YY;
1020     }
1021     else
1022     {
1023         ddpme->dim = dimind;
1024     }
1025     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1026
1027     ddpme->nslab = (ddpme->dim == 0 ?
1028                     dd->comm->npmenodes_x :
1029                     dd->comm->npmenodes_y);
1030
1031     if (ddpme->nslab <= 1)
1032     {
1033         return;
1034     }
1035
1036     nso = dd->comm->npmenodes/ddpme->nslab;
1037     /* Determine for each PME slab the PP location range for dimension dim */
1038     snew(ddpme->pp_min, ddpme->nslab);
1039     snew(ddpme->pp_max, ddpme->nslab);
1040     for (slab = 0; slab < ddpme->nslab; slab++)
1041     {
1042         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1043         ddpme->pp_max[slab] = 0;
1044     }
1045     for (i = 0; i < dd->nnodes; i++)
1046     {
1047         ddindex2xyz(dd->nc, i, xyz);
1048         /* For y only use our y/z slab.
1049          * This assumes that the PME x grid size matches the DD grid size.
1050          */
1051         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1052         {
1053             pmeindex = ddindex2pmeindex(dd, i);
1054             if (dimind == 0)
1055             {
1056                 slab = pmeindex/nso;
1057             }
1058             else
1059             {
1060                 slab = pmeindex % ddpme->nslab;
1061             }
1062             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1063             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1064         }
1065     }
1066
1067     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1068 }
1069
1070 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1071 {
1072     if (dd->comm->ddpme[0].dim == XX)
1073     {
1074         return dd->comm->ddpme[0].maxshift;
1075     }
1076     else
1077     {
1078         return 0;
1079     }
1080 }
1081
1082 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1083 {
1084     if (dd->comm->ddpme[0].dim == YY)
1085     {
1086         return dd->comm->ddpme[0].maxshift;
1087     }
1088     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1089     {
1090         return dd->comm->ddpme[1].maxshift;
1091     }
1092     else
1093     {
1094         return 0;
1095     }
1096 }
1097
1098 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1099 {
1100     /* Note that the cycles value can be incorrect, either 0 or some
1101      * extremely large value, when our thread migrated to another core
1102      * with an unsynchronized cycle counter. If this happens less often
1103      * that once per nstlist steps, this will not cause issues, since
1104      * we later subtract the maximum value from the sum over nstlist steps.
1105      * A zero count will slightly lower the total, but that's a small effect.
1106      * Note that the main purpose of the subtraction of the maximum value
1107      * is to avoid throwing off the load balancing when stalls occur due
1108      * e.g. system activity or network congestion.
1109      */
1110     dd->comm->cycl[ddCycl] += cycles;
1111     dd->comm->cycl_n[ddCycl]++;
1112     if (cycles > dd->comm->cycl_max[ddCycl])
1113     {
1114         dd->comm->cycl_max[ddCycl] = cycles;
1115     }
1116 }
1117
1118 #if GMX_MPI
1119 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1120 {
1121     MPI_Comm           c_row;
1122     int                dim, i, rank;
1123     ivec               loc_c;
1124     gmx_bool           bPartOfGroup = FALSE;
1125
1126     dim = dd->dim[dim_ind];
1127     copy_ivec(loc, loc_c);
1128     for (i = 0; i < dd->nc[dim]; i++)
1129     {
1130         loc_c[dim] = i;
1131         rank       = dd_index(dd->nc, loc_c);
1132         if (rank == dd->rank)
1133         {
1134             /* This process is part of the group */
1135             bPartOfGroup = TRUE;
1136         }
1137     }
1138     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1139                    &c_row);
1140     if (bPartOfGroup)
1141     {
1142         dd->comm->mpi_comm_load[dim_ind] = c_row;
1143         if (!isDlbDisabled(dd->comm))
1144         {
1145             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1146
1147             if (dd->ci[dim] == dd->master_ci[dim])
1148             {
1149                 /* This is the root process of this row */
1150                 cellsizes.rowMaster  = std::make_unique<RowMaster>();
1151
1152                 RowMaster &rowMaster = *cellsizes.rowMaster;
1153                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1154                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1155                 rowMaster.isCellMin.resize(dd->nc[dim]);
1156                 if (dim_ind > 0)
1157                 {
1158                     rowMaster.bounds.resize(dd->nc[dim]);
1159                 }
1160                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1161             }
1162             else
1163             {
1164                 /* This is not a root process, we only need to receive cell_f */
1165                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1166             }
1167         }
1168         if (dd->ci[dim] == dd->master_ci[dim])
1169         {
1170             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1171         }
1172     }
1173 }
1174 #endif
1175
1176 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1177                                    int                   gpu_id)
1178 {
1179 #if GMX_MPI
1180     int           physicalnode_id_hash;
1181     gmx_domdec_t *dd;
1182     MPI_Comm      mpi_comm_pp_physicalnode;
1183
1184     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1185     {
1186         /* Only ranks with short-ranged tasks (currently) use GPUs.
1187          * If we don't have GPUs assigned, there are no resources to share.
1188          */
1189         return;
1190     }
1191
1192     physicalnode_id_hash = gmx_physicalnode_id_hash();
1193
1194     dd = cr->dd;
1195
1196     if (debug)
1197     {
1198         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1199         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1200                 dd->rank, physicalnode_id_hash, gpu_id);
1201     }
1202     /* Split the PP communicator over the physical nodes */
1203     /* TODO: See if we should store this (before), as it's also used for
1204      * for the nodecomm summation.
1205      */
1206     // TODO PhysicalNodeCommunicator could be extended/used to handle
1207     // the need for per-node per-group communicators.
1208     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1209                    &mpi_comm_pp_physicalnode);
1210     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1211                    &dd->comm->mpi_comm_gpu_shared);
1212     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1213     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1214
1215     if (debug)
1216     {
1217         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1218     }
1219
1220     /* Note that some ranks could share a GPU, while others don't */
1221
1222     if (dd->comm->nrank_gpu_shared == 1)
1223     {
1224         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1225     }
1226 #else
1227     GMX_UNUSED_VALUE(cr);
1228     GMX_UNUSED_VALUE(gpu_id);
1229 #endif
1230 }
1231
1232 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1233 {
1234 #if GMX_MPI
1235     int  dim0, dim1, i, j;
1236     ivec loc;
1237
1238     if (debug)
1239     {
1240         fprintf(debug, "Making load communicators\n");
1241     }
1242
1243     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1244     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1245
1246     if (dd->ndim == 0)
1247     {
1248         return;
1249     }
1250
1251     clear_ivec(loc);
1252     make_load_communicator(dd, 0, loc);
1253     if (dd->ndim > 1)
1254     {
1255         dim0 = dd->dim[0];
1256         for (i = 0; i < dd->nc[dim0]; i++)
1257         {
1258             loc[dim0] = i;
1259             make_load_communicator(dd, 1, loc);
1260         }
1261     }
1262     if (dd->ndim > 2)
1263     {
1264         dim0 = dd->dim[0];
1265         for (i = 0; i < dd->nc[dim0]; i++)
1266         {
1267             loc[dim0] = i;
1268             dim1      = dd->dim[1];
1269             for (j = 0; j < dd->nc[dim1]; j++)
1270             {
1271                 loc[dim1] = j;
1272                 make_load_communicator(dd, 2, loc);
1273             }
1274         }
1275     }
1276
1277     if (debug)
1278     {
1279         fprintf(debug, "Finished making load communicators\n");
1280     }
1281 #endif
1282 }
1283
1284 /*! \brief Sets up the relation between neighboring domains and zones */
1285 static void setup_neighbor_relations(gmx_domdec_t *dd)
1286 {
1287     int                     d, dim, i, j, m;
1288     ivec                    tmp, s;
1289     gmx_domdec_zones_t     *zones;
1290     gmx_domdec_ns_ranges_t *izone;
1291
1292     for (d = 0; d < dd->ndim; d++)
1293     {
1294         dim = dd->dim[d];
1295         copy_ivec(dd->ci, tmp);
1296         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1297         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1298         copy_ivec(dd->ci, tmp);
1299         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1300         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1301         if (debug)
1302         {
1303             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1304                     dd->rank, dim,
1305                     dd->neighbor[d][0],
1306                     dd->neighbor[d][1]);
1307         }
1308     }
1309
1310     int nzone  = (1 << dd->ndim);
1311     GMX_ASSERT(dd->ndim < DIM, "Invalid number of dimensions");
1312     int nizone = (1 << std::max(dd->ndim - 1, 0));
1313     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1314
1315     zones = &dd->comm->zones;
1316
1317     for (i = 0; i < nzone; i++)
1318     {
1319         m = 0;
1320         clear_ivec(zones->shift[i]);
1321         for (d = 0; d < dd->ndim; d++)
1322         {
1323             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1324         }
1325     }
1326
1327     zones->n = nzone;
1328     for (i = 0; i < nzone; i++)
1329     {
1330         for (d = 0; d < DIM; d++)
1331         {
1332             s[d] = dd->ci[d] - zones->shift[i][d];
1333             if (s[d] < 0)
1334             {
1335                 s[d] += dd->nc[d];
1336             }
1337             else if (s[d] >= dd->nc[d])
1338             {
1339                 s[d] -= dd->nc[d];
1340             }
1341         }
1342     }
1343     zones->nizone = nizone;
1344     for (i = 0; i < zones->nizone; i++)
1345     {
1346         assert(ddNonbondedZonePairRanges[i][0] == i);
1347
1348         izone     = &zones->izone[i];
1349         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1350          * j-zones up to nzone.
1351          */
1352         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1353         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1354         for (dim = 0; dim < DIM; dim++)
1355         {
1356             if (dd->nc[dim] == 1)
1357             {
1358                 /* All shifts should be allowed */
1359                 izone->shift0[dim] = -1;
1360                 izone->shift1[dim] = 1;
1361             }
1362             else
1363             {
1364                 /* Determine the min/max j-zone shift wrt the i-zone */
1365                 izone->shift0[dim] = 1;
1366                 izone->shift1[dim] = -1;
1367                 for (j = izone->j0; j < izone->j1; j++)
1368                 {
1369                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1370                     if (shift_diff < izone->shift0[dim])
1371                     {
1372                         izone->shift0[dim] = shift_diff;
1373                     }
1374                     if (shift_diff > izone->shift1[dim])
1375                     {
1376                         izone->shift1[dim] = shift_diff;
1377                     }
1378                 }
1379             }
1380         }
1381     }
1382
1383     if (!isDlbDisabled(dd->comm))
1384     {
1385         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1386     }
1387
1388     if (dd->comm->bRecordLoad)
1389     {
1390         make_load_communicators(dd);
1391     }
1392 }
1393
1394 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1395                                  gmx_domdec_t         *dd,
1396                                  t_commrec gmx_unused *cr,
1397                                  bool gmx_unused       reorder)
1398 {
1399 #if GMX_MPI
1400     gmx_domdec_comm_t *comm;
1401     int                rank, *buf;
1402     ivec               periods;
1403     MPI_Comm           comm_cart;
1404
1405     comm = dd->comm;
1406
1407     if (comm->bCartesianPP)
1408     {
1409         /* Set up cartesian communication for the particle-particle part */
1410         GMX_LOG(mdlog.info).appendTextFormatted(
1411                 "Will use a Cartesian communicator: %d x %d x %d",
1412                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1413
1414         for (int i = 0; i < DIM; i++)
1415         {
1416             periods[i] = TRUE;
1417         }
1418         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1419                         &comm_cart);
1420         /* We overwrite the old communicator with the new cartesian one */
1421         cr->mpi_comm_mygroup = comm_cart;
1422     }
1423
1424     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1425     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1426
1427     if (comm->bCartesianPP_PME)
1428     {
1429         /* Since we want to use the original cartesian setup for sim,
1430          * and not the one after split, we need to make an index.
1431          */
1432         snew(comm->ddindex2ddnodeid, dd->nnodes);
1433         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1434         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1435         /* Get the rank of the DD master,
1436          * above we made sure that the master node is a PP node.
1437          */
1438         if (MASTER(cr))
1439         {
1440             rank = dd->rank;
1441         }
1442         else
1443         {
1444             rank = 0;
1445         }
1446         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1447     }
1448     else if (comm->bCartesianPP)
1449     {
1450         if (cr->npmenodes == 0)
1451         {
1452             /* The PP communicator is also
1453              * the communicator for this simulation
1454              */
1455             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1456         }
1457         cr->nodeid = dd->rank;
1458
1459         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1460
1461         /* We need to make an index to go from the coordinates
1462          * to the nodeid of this simulation.
1463          */
1464         snew(comm->ddindex2simnodeid, dd->nnodes);
1465         snew(buf, dd->nnodes);
1466         if (thisRankHasDuty(cr, DUTY_PP))
1467         {
1468             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1469         }
1470         /* Communicate the ddindex to simulation nodeid index */
1471         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1472                       cr->mpi_comm_mysim);
1473         sfree(buf);
1474
1475         /* Determine the master coordinates and rank.
1476          * The DD master should be the same node as the master of this sim.
1477          */
1478         for (int i = 0; i < dd->nnodes; i++)
1479         {
1480             if (comm->ddindex2simnodeid[i] == 0)
1481             {
1482                 ddindex2xyz(dd->nc, i, dd->master_ci);
1483                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1484             }
1485         }
1486         if (debug)
1487         {
1488             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1489         }
1490     }
1491     else
1492     {
1493         /* No Cartesian communicators */
1494         /* We use the rank in dd->comm->all as DD index */
1495         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1496         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1497         dd->masterrank = 0;
1498         clear_ivec(dd->master_ci);
1499     }
1500 #endif
1501
1502     GMX_LOG(mdlog.info).appendTextFormatted(
1503             "Domain decomposition rank %d, coordinates %d %d %d\n",
1504             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1505     if (debug)
1506     {
1507         fprintf(debug,
1508                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1509                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1510     }
1511 }
1512
1513 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1514                                       t_commrec            *cr)
1515 {
1516 #if GMX_MPI
1517     gmx_domdec_comm_t *comm = dd->comm;
1518
1519     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1520     {
1521         int *buf;
1522         snew(comm->ddindex2simnodeid, dd->nnodes);
1523         snew(buf, dd->nnodes);
1524         if (thisRankHasDuty(cr, DUTY_PP))
1525         {
1526             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1527         }
1528         /* Communicate the ddindex to simulation nodeid index */
1529         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1530                       cr->mpi_comm_mysim);
1531         sfree(buf);
1532     }
1533 #else
1534     GMX_UNUSED_VALUE(dd);
1535     GMX_UNUSED_VALUE(cr);
1536 #endif
1537 }
1538
1539 static void split_communicator(const gmx::MDLogger &mdlog,
1540                                t_commrec *cr, gmx_domdec_t *dd,
1541                                DdRankOrder gmx_unused rankOrder,
1542                                bool gmx_unused reorder)
1543 {
1544     gmx_domdec_comm_t *comm;
1545     int                i;
1546     gmx_bool           bDiv[DIM];
1547 #if GMX_MPI
1548     MPI_Comm           comm_cart;
1549 #endif
1550
1551     comm = dd->comm;
1552
1553     if (comm->bCartesianPP)
1554     {
1555         for (i = 1; i < DIM; i++)
1556         {
1557             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1558         }
1559         if (bDiv[YY] || bDiv[ZZ])
1560         {
1561             comm->bCartesianPP_PME = TRUE;
1562             /* If we have 2D PME decomposition, which is always in x+y,
1563              * we stack the PME only nodes in z.
1564              * Otherwise we choose the direction that provides the thinnest slab
1565              * of PME only nodes as this will have the least effect
1566              * on the PP communication.
1567              * But for the PME communication the opposite might be better.
1568              */
1569             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1570                              !bDiv[YY] ||
1571                              dd->nc[YY] > dd->nc[ZZ]))
1572             {
1573                 comm->cartpmedim = ZZ;
1574             }
1575             else
1576             {
1577                 comm->cartpmedim = YY;
1578             }
1579             comm->ntot[comm->cartpmedim]
1580                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1581         }
1582         else
1583         {
1584             GMX_LOG(mdlog.info).appendTextFormatted(
1585                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1586                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1587             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1588         }
1589     }
1590
1591     if (comm->bCartesianPP_PME)
1592     {
1593 #if GMX_MPI
1594         int  rank;
1595         ivec periods;
1596
1597         GMX_LOG(mdlog.info).appendTextFormatted(
1598                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1599                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1600
1601         for (i = 0; i < DIM; i++)
1602         {
1603             periods[i] = TRUE;
1604         }
1605         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1606                         &comm_cart);
1607         MPI_Comm_rank(comm_cart, &rank);
1608         if (MASTER(cr) && rank != 0)
1609         {
1610             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1611         }
1612
1613         /* With this assigment we loose the link to the original communicator
1614          * which will usually be MPI_COMM_WORLD, unless have multisim.
1615          */
1616         cr->mpi_comm_mysim = comm_cart;
1617         cr->sim_nodeid     = rank;
1618
1619         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1620
1621         GMX_LOG(mdlog.info).appendTextFormatted(
1622                 "Cartesian rank %d, coordinates %d %d %d\n",
1623                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1624
1625         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1626         {
1627             cr->duty = DUTY_PP;
1628         }
1629         if (cr->npmenodes == 0 ||
1630             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1631         {
1632             cr->duty = DUTY_PME;
1633         }
1634
1635         /* Split the sim communicator into PP and PME only nodes */
1636         MPI_Comm_split(cr->mpi_comm_mysim,
1637                        getThisRankDuties(cr),
1638                        dd_index(comm->ntot, dd->ci),
1639                        &cr->mpi_comm_mygroup);
1640 #endif
1641     }
1642     else
1643     {
1644         switch (rankOrder)
1645         {
1646             case DdRankOrder::pp_pme:
1647                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1648                 break;
1649             case DdRankOrder::interleave:
1650                 /* Interleave the PP-only and PME-only ranks */
1651                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1652                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1653                 break;
1654             case DdRankOrder::cartesian:
1655                 break;
1656             default:
1657                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1658         }
1659
1660         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1661         {
1662             cr->duty = DUTY_PME;
1663         }
1664         else
1665         {
1666             cr->duty = DUTY_PP;
1667         }
1668 #if GMX_MPI
1669         /* Split the sim communicator into PP and PME only nodes */
1670         MPI_Comm_split(cr->mpi_comm_mysim,
1671                        getThisRankDuties(cr),
1672                        cr->nodeid,
1673                        &cr->mpi_comm_mygroup);
1674         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1675 #endif
1676     }
1677
1678     GMX_LOG(mdlog.info).appendTextFormatted(
1679             "This rank does only %s work.\n",
1680             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1681 }
1682
1683 /*! \brief Generates the MPI communicators for domain decomposition */
1684 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1685                                   t_commrec *cr,
1686                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1687 {
1688     gmx_domdec_comm_t *comm;
1689     bool               CartReorder;
1690
1691     comm = dd->comm;
1692
1693     copy_ivec(dd->nc, comm->ntot);
1694
1695     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1696     comm->bCartesianPP_PME = FALSE;
1697
1698     /* Reorder the nodes by default. This might change the MPI ranks.
1699      * Real reordering is only supported on very few architectures,
1700      * Blue Gene is one of them.
1701      */
1702     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1703
1704     if (cr->npmenodes > 0)
1705     {
1706         /* Split the communicator into a PP and PME part */
1707         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1708         if (comm->bCartesianPP_PME)
1709         {
1710             /* We (possibly) reordered the nodes in split_communicator,
1711              * so it is no longer required in make_pp_communicator.
1712              */
1713             CartReorder = FALSE;
1714         }
1715     }
1716     else
1717     {
1718         /* All nodes do PP and PME */
1719 #if GMX_MPI
1720         /* We do not require separate communicators */
1721         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1722 #endif
1723     }
1724
1725     if (thisRankHasDuty(cr, DUTY_PP))
1726     {
1727         /* Copy or make a new PP communicator */
1728         make_pp_communicator(mdlog, dd, cr, CartReorder);
1729     }
1730     else
1731     {
1732         receive_ddindex2simnodeid(dd, cr);
1733     }
1734
1735     if (!thisRankHasDuty(cr, DUTY_PME))
1736     {
1737         /* Set up the commnuication to our PME node */
1738         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1739         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1740         if (debug)
1741         {
1742             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1743                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1744         }
1745     }
1746     else
1747     {
1748         dd->pme_nodeid = -1;
1749     }
1750
1751     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1752     if (MASTER(cr))
1753     {
1754         dd->ma = std::make_unique<AtomDistribution>(dd->nc,
1755                                                     comm->cgs_gl.nr,
1756                                                     comm->cgs_gl.index[comm->cgs_gl.nr]);
1757     }
1758 }
1759
1760 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1761                           const char *dir, int nc, const char *size_string)
1762 {
1763     real  *slb_frac, tot;
1764     int    i, n;
1765     double dbl;
1766
1767     slb_frac = nullptr;
1768     if (nc > 1 && size_string != nullptr)
1769     {
1770         GMX_LOG(mdlog.info).appendTextFormatted(
1771                 "Using static load balancing for the %s direction", dir);
1772         snew(slb_frac, nc);
1773         tot = 0;
1774         for (i = 0; i < nc; i++)
1775         {
1776             dbl = 0;
1777             sscanf(size_string, "%20lf%n", &dbl, &n);
1778             if (dbl == 0)
1779             {
1780                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1781             }
1782             slb_frac[i]  = dbl;
1783             size_string += n;
1784             tot         += slb_frac[i];
1785         }
1786         /* Normalize */
1787         std::string relativeCellSizes = "Relative cell sizes:";
1788         for (i = 0; i < nc; i++)
1789         {
1790             slb_frac[i]       /= tot;
1791             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1792         }
1793         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1794     }
1795
1796     return slb_frac;
1797 }
1798
1799 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1800 {
1801     int                  n     = 0;
1802     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1803     int                  nmol;
1804     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1805     {
1806         for (auto &ilist : extractILists(*ilists, IF_BOND))
1807         {
1808             if (NRAL(ilist.functionType) >  2)
1809             {
1810                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1811             }
1812         }
1813     }
1814
1815     return n;
1816 }
1817
1818 static int dd_getenv(const gmx::MDLogger &mdlog,
1819                      const char *env_var, int def)
1820 {
1821     char *val;
1822     int   nst;
1823
1824     nst = def;
1825     val = getenv(env_var);
1826     if (val)
1827     {
1828         if (sscanf(val, "%20d", &nst) <= 0)
1829         {
1830             nst = 1;
1831         }
1832         GMX_LOG(mdlog.info).appendTextFormatted(
1833                 "Found env.var. %s = %s, using value %d",
1834                 env_var, val, nst);
1835     }
1836
1837     return nst;
1838 }
1839
1840 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1841                                   const t_inputrec    *ir,
1842                                   const gmx::MDLogger &mdlog)
1843 {
1844     if (ir->ePBC == epbcSCREW &&
1845         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1846     {
1847         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1848     }
1849
1850     if (ir->ns_type == ensSIMPLE)
1851     {
1852         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1853     }
1854
1855     if (ir->nstlist == 0)
1856     {
1857         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1858     }
1859
1860     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1861     {
1862         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1863     }
1864 }
1865
1866 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1867 {
1868     int  di, d;
1869     real r;
1870
1871     r = ddbox->box_size[XX];
1872     for (di = 0; di < dd->ndim; di++)
1873     {
1874         d = dd->dim[di];
1875         /* Check using the initial average cell size */
1876         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1877     }
1878
1879     return r;
1880 }
1881
1882 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1883  */
1884 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1885                                   const std::string   &reasonStr,
1886                                   const gmx::MDLogger &mdlog)
1887 {
1888     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1889     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1890
1891     if (cmdlineDlbState == DlbState::onUser)
1892     {
1893         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1894     }
1895     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1896     {
1897         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1898     }
1899     return DlbState::offForever;
1900 }
1901
1902 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1903  *
1904  * This function parses the parameters of "-dlb" command line option setting
1905  * corresponding state values. Then it checks the consistency of the determined
1906  * state with other run parameters and settings. As a result, the initial state
1907  * may be altered or an error may be thrown if incompatibility of options is detected.
1908  *
1909  * \param [in] mdlog       Logger.
1910  * \param [in] dlbOption   Enum value for the DLB option.
1911  * \param [in] bRecordLoad True if the load balancer is recording load information.
1912  * \param [in] mdrunOptions  Options for mdrun.
1913  * \param [in] ir          Pointer mdrun to input parameters.
1914  * \returns                DLB initial/startup state.
1915  */
1916 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1917                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1918                                          const gmx::MdrunOptions &mdrunOptions,
1919                                          const t_inputrec *ir)
1920 {
1921     DlbState dlbState = DlbState::offCanTurnOn;
1922
1923     switch (dlbOption)
1924     {
1925         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1926         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1927         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1928         default: gmx_incons("Invalid dlbOption enum value");
1929     }
1930
1931     /* Reruns don't support DLB: bail or override auto mode */
1932     if (mdrunOptions.rerun)
1933     {
1934         std::string reasonStr = "it is not supported in reruns.";
1935         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1936     }
1937
1938     /* Unsupported integrators */
1939     if (!EI_DYNAMICS(ir->eI))
1940     {
1941         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1942         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1943     }
1944
1945     /* Without cycle counters we can't time work to balance on */
1946     if (!bRecordLoad)
1947     {
1948         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1949         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1950     }
1951
1952     if (mdrunOptions.reproducible)
1953     {
1954         std::string reasonStr = "you started a reproducible run.";
1955         switch (dlbState)
1956         {
1957             case DlbState::offUser:
1958                 break;
1959             case DlbState::offForever:
1960                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1961                 break;
1962             case DlbState::offCanTurnOn:
1963                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1964             case DlbState::onCanTurnOff:
1965                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1966                 break;
1967             case DlbState::onUser:
1968                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1969             default:
1970                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1971         }
1972     }
1973
1974     return dlbState;
1975 }
1976
1977 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1978 {
1979     dd->ndim = 0;
1980     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
1981     {
1982         /* Decomposition order z,y,x */
1983         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
1984         for (int dim = DIM-1; dim >= 0; dim--)
1985         {
1986             if (dd->nc[dim] > 1)
1987             {
1988                 dd->dim[dd->ndim++] = dim;
1989             }
1990         }
1991     }
1992     else
1993     {
1994         /* Decomposition order x,y,z */
1995         for (int dim = 0; dim < DIM; dim++)
1996         {
1997             if (dd->nc[dim] > 1)
1998             {
1999                 dd->dim[dd->ndim++] = dim;
2000             }
2001         }
2002     }
2003
2004     if (dd->ndim == 0)
2005     {
2006         /* Set dim[0] to avoid extra checks on ndim in several places */
2007         dd->dim[0] = XX;
2008     }
2009 }
2010
2011 static gmx_domdec_comm_t *init_dd_comm()
2012 {
2013     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2014
2015     comm->n_load_have      = 0;
2016     comm->n_load_collect   = 0;
2017
2018     comm->haveTurnedOffDlb = false;
2019
2020     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2021     {
2022         comm->sum_nat[i] = 0;
2023     }
2024     comm->ndecomp   = 0;
2025     comm->nload     = 0;
2026     comm->load_step = 0;
2027     comm->load_sum  = 0;
2028     comm->load_max  = 0;
2029     clear_ivec(comm->load_lim);
2030     comm->load_mdf  = 0;
2031     comm->load_pme  = 0;
2032
2033     /* This should be replaced by a unique pointer */
2034     comm->balanceRegion = ddBalanceRegionAllocate();
2035
2036     return comm;
2037 }
2038
2039 /* Returns whether mtop contains constraints and/or vsites */
2040 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2041 {
2042     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2043     int  nmol;
2044     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2045     {
2046         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2047         {
2048             return true;
2049         }
2050     }
2051
2052     return false;
2053 }
2054
2055 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2056                               const gmx_mtop_t    &mtop,
2057                               const t_inputrec    &inputrec,
2058                               real                 cutoffMargin,
2059                               int                  numMpiRanksTotal,
2060                               gmx_domdec_comm_t   *comm)
2061 {
2062     /* When we have constraints and/or vsites, it is beneficial to use
2063      * update groups (when possible) to allow independent update of groups.
2064      */
2065     if (!systemHasConstraintsOrVsites(mtop))
2066     {
2067         /* No constraints or vsites, atoms can be updated independently */
2068         return;
2069     }
2070
2071     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2072     comm->useUpdateGroups               =
2073         (!comm->updateGroupingPerMoleculetype.empty() &&
2074          getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2075
2076     if (comm->useUpdateGroups)
2077     {
2078         int numUpdateGroups = 0;
2079         for (const auto &molblock : mtop.molblock)
2080         {
2081             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2082         }
2083
2084         /* Note: We would like to use dd->nnodes for the atom count estimate,
2085          *       but that is not yet available here. But this anyhow only
2086          *       affect performance up to the second dd_partition_system call.
2087          */
2088         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2089         comm->updateGroupsCog =
2090             std::make_unique<gmx::UpdateGroupsCog>(mtop,
2091                                                    comm->updateGroupingPerMoleculetype,
2092                                                    maxReferenceTemperature(inputrec),
2093                                                    homeAtomCountEstimate);
2094
2095         /* To use update groups, the large domain-to-domain cutoff distance
2096          * should be compatible with the box size.
2097          */
2098         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2099
2100         if (comm->useUpdateGroups)
2101         {
2102             GMX_LOG(mdlog.info).appendTextFormatted(
2103                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2104                     numUpdateGroups,
2105                     mtop.natoms/static_cast<double>(numUpdateGroups),
2106                     comm->updateGroupsCog->maxUpdateGroupRadius());
2107         }
2108         else
2109         {
2110             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2111             comm->updateGroupingPerMoleculetype.clear();
2112             comm->updateGroupsCog.reset(nullptr);
2113         }
2114     }
2115 }
2116
2117 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2118 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2119                                    t_commrec *cr, gmx_domdec_t *dd,
2120                                    const DomdecOptions &options,
2121                                    const gmx::MdrunOptions &mdrunOptions,
2122                                    const gmx_mtop_t *mtop,
2123                                    const t_inputrec *ir,
2124                                    const matrix box,
2125                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2126                                    gmx_ddbox_t *ddbox)
2127 {
2128     real               r_bonded         = -1;
2129     real               r_bonded_limit   = -1;
2130     const real         tenPercentMargin = 1.1;
2131     gmx_domdec_comm_t *comm             = dd->comm;
2132
2133     dd->npbcdim              = ePBC2npbcdim(ir->ePBC);
2134     dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2135     dd->haveDynamicBox       = inputrecDynamicBox(ir);
2136     dd->bScrewPBC            = (ir->ePBC == epbcSCREW);
2137
2138     dd->pme_recv_f_alloc = 0;
2139     dd->pme_recv_f_buf   = nullptr;
2140
2141     /* Initialize to GPU share count to 0, might change later */
2142     comm->nrank_gpu_shared = 0;
2143
2144     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2145     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2146     /* To consider turning DLB on after 2*nstlist steps we need to check
2147      * at partitioning count 3. Thus we need to increase the first count by 2.
2148      */
2149     comm->ddPartioningCountFirstDlbOff += 2;
2150
2151     GMX_LOG(mdlog.info).appendTextFormatted(
2152             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2153
2154     comm->bPMELoadBalDLBLimits = FALSE;
2155
2156     /* Allocate the charge group/atom sorting struct */
2157     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2158
2159     /* We need to decide on update groups early, as this affects communication distances */
2160     comm->useUpdateGroups = false;
2161     if (ir->cutoff_scheme == ecutsVERLET)
2162     {
2163         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2164         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2165     }
2166
2167     // TODO: Check whether all bondeds are within update groups
2168     comm->haveInterDomainBondeds          = (mtop->natoms > gmx_mtop_num_molecules(*mtop) ||
2169                                              mtop->bIntermolecularInteractions);
2170     comm->haveInterDomainMultiBodyBondeds = (multi_body_bondeds_count(mtop) > 0);
2171
2172     if (comm->useUpdateGroups)
2173     {
2174         dd->splitConstraints = false;
2175         dd->splitSettles     = false;
2176     }
2177     else
2178     {
2179         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2180         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2181     }
2182
2183     if (ir->rlist == 0)
2184     {
2185         /* Set the cut-off to some very large value,
2186          * so we don't need if statements everywhere in the code.
2187          * We use sqrt, since the cut-off is squared in some places.
2188          */
2189         comm->cutoff   = GMX_CUTOFF_INF;
2190     }
2191     else
2192     {
2193         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2194     }
2195     comm->cutoff_mbody = 0;
2196
2197     /* Determine the minimum cell size limit, affected by many factors */
2198     comm->cellsize_limit = 0;
2199     comm->bBondComm      = FALSE;
2200
2201     /* We do not allow home atoms to move beyond the neighboring domain
2202      * between domain decomposition steps, which limits the cell size.
2203      * Get an estimate of cell size limit due to atom displacement.
2204      * In most cases this is a large overestimate, because it assumes
2205      * non-interaction atoms.
2206      * We set the chance to 1 in a trillion steps.
2207      */
2208     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2209     const real     limitForAtomDisplacement          =
2210         minCellSizeForAtomDisplacement(*mtop, *ir,
2211                                        comm->updateGroupingPerMoleculetype,
2212                                        c_chanceThatAtomMovesBeyondDomain);
2213     GMX_LOG(mdlog.info).appendTextFormatted(
2214             "Minimum cell size due to atom displacement: %.3f nm",
2215             limitForAtomDisplacement);
2216
2217     comm->cellsize_limit = std::max(comm->cellsize_limit,
2218                                     limitForAtomDisplacement);
2219
2220     /* TODO: PME decomposition currently requires atoms not to be more than
2221      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2222      *       In nearly all cases, limitForAtomDisplacement will be smaller
2223      *       than 2/3*rlist, so the PME requirement is satisfied.
2224      *       But it would be better for both correctness and performance
2225      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2226      *       Note that we would need to improve the pairlist buffer case.
2227      */
2228
2229     if (comm->haveInterDomainBondeds)
2230     {
2231         if (options.minimumCommunicationRange > 0)
2232         {
2233             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2234             if (options.useBondedCommunication)
2235             {
2236                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2237             }
2238             else
2239             {
2240                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2241             }
2242             r_bonded_limit = comm->cutoff_mbody;
2243         }
2244         else if (ir->bPeriodicMols)
2245         {
2246             /* Can not easily determine the required cut-off */
2247             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2248             comm->cutoff_mbody = comm->cutoff/2;
2249             r_bonded_limit     = comm->cutoff_mbody;
2250         }
2251         else
2252         {
2253             real r_2b, r_mb;
2254
2255             if (MASTER(cr))
2256             {
2257                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2258                                       options.checkBondedInteractions,
2259                                       &r_2b, &r_mb);
2260             }
2261             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2262             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2263
2264             /* We use an initial margin of 10% for the minimum cell size,
2265              * except when we are just below the non-bonded cut-off.
2266              */
2267             if (options.useBondedCommunication)
2268             {
2269                 if (std::max(r_2b, r_mb) > comm->cutoff)
2270                 {
2271                     r_bonded        = std::max(r_2b, r_mb);
2272                     r_bonded_limit  = tenPercentMargin*r_bonded;
2273                     comm->bBondComm = TRUE;
2274                 }
2275                 else
2276                 {
2277                     r_bonded       = r_mb;
2278                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2279                 }
2280                 /* We determine cutoff_mbody later */
2281             }
2282             else
2283             {
2284                 /* No special bonded communication,
2285                  * simply increase the DD cut-off.
2286                  */
2287                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2288                 comm->cutoff_mbody = r_bonded_limit;
2289                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2290             }
2291         }
2292         GMX_LOG(mdlog.info).appendTextFormatted(
2293                 "Minimum cell size due to bonded interactions: %.3f nm",
2294                 r_bonded_limit);
2295
2296         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2297     }
2298
2299     real rconstr = 0;
2300     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2301     {
2302         /* There is a cell size limit due to the constraints (P-LINCS) */
2303         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2304         GMX_LOG(mdlog.info).appendTextFormatted(
2305                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2306                 rconstr);
2307         if (rconstr > comm->cellsize_limit)
2308         {
2309             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2310         }
2311     }
2312     else if (options.constraintCommunicationRange > 0)
2313     {
2314         /* Here we do not check for dd->splitConstraints.
2315          * because one can also set a cell size limit for virtual sites only
2316          * and at this point we don't know yet if there are intercg v-sites.
2317          */
2318         GMX_LOG(mdlog.info).appendTextFormatted(
2319                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2320                 options.constraintCommunicationRange);
2321         rconstr = options.constraintCommunicationRange;
2322     }
2323     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2324
2325     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2326
2327     if (options.numCells[XX] > 0)
2328     {
2329         copy_ivec(options.numCells, dd->nc);
2330         set_dd_dim(mdlog, dd);
2331         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2332
2333         if (options.numPmeRanks >= 0)
2334         {
2335             cr->npmenodes = options.numPmeRanks;
2336         }
2337         else
2338         {
2339             /* When the DD grid is set explicitly and -npme is set to auto,
2340              * don't use PME ranks. We check later if the DD grid is
2341              * compatible with the total number of ranks.
2342              */
2343             cr->npmenodes = 0;
2344         }
2345
2346         real acs = average_cellsize_min(dd, ddbox);
2347         if (acs < comm->cellsize_limit)
2348         {
2349             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2350                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2351                                  acs, comm->cellsize_limit);
2352         }
2353     }
2354     else
2355     {
2356         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2357
2358         /* We need to choose the optimal DD grid and possibly PME nodes */
2359         real limit =
2360             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2361                            options.numPmeRanks,
2362                            !isDlbDisabled(comm),
2363                            options.dlbScaling,
2364                            comm->cellsize_limit, comm->cutoff,
2365                            comm->haveInterDomainBondeds);
2366
2367         if (dd->nc[XX] == 0)
2368         {
2369             char     buf[STRLEN];
2370             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2371             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2372                     !bC ? "-rdd" : "-rcon",
2373                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2374                     bC ? " or your LINCS settings" : "");
2375
2376             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2377                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2378                                  "%s\n"
2379                                  "Look in the log file for details on the domain decomposition",
2380                                  cr->nnodes-cr->npmenodes, limit, buf);
2381         }
2382         set_dd_dim(mdlog, dd);
2383     }
2384
2385     GMX_LOG(mdlog.info).appendTextFormatted(
2386             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2387             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2388
2389     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2390     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2391     {
2392         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2393                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2394                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2395     }
2396     if (cr->npmenodes > dd->nnodes)
2397     {
2398         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2399                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2400     }
2401     if (cr->npmenodes > 0)
2402     {
2403         comm->npmenodes = cr->npmenodes;
2404     }
2405     else
2406     {
2407         comm->npmenodes = dd->nnodes;
2408     }
2409
2410     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2411     {
2412         /* The following choices should match those
2413          * in comm_cost_est in domdec_setup.c.
2414          * Note that here the checks have to take into account
2415          * that the decomposition might occur in a different order than xyz
2416          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2417          * in which case they will not match those in comm_cost_est,
2418          * but since that is mainly for testing purposes that's fine.
2419          */
2420         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2421             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2422             getenv("GMX_PMEONEDD") == nullptr)
2423         {
2424             comm->npmedecompdim = 2;
2425             comm->npmenodes_x   = dd->nc[XX];
2426             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2427         }
2428         else
2429         {
2430             /* In case nc is 1 in both x and y we could still choose to
2431              * decompose pme in y instead of x, but we use x for simplicity.
2432              */
2433             comm->npmedecompdim = 1;
2434             if (dd->dim[0] == YY)
2435             {
2436                 comm->npmenodes_x = 1;
2437                 comm->npmenodes_y = comm->npmenodes;
2438             }
2439             else
2440             {
2441                 comm->npmenodes_x = comm->npmenodes;
2442                 comm->npmenodes_y = 1;
2443             }
2444         }
2445         GMX_LOG(mdlog.info).appendTextFormatted(
2446                 "PME domain decomposition: %d x %d x %d",
2447                 comm->npmenodes_x, comm->npmenodes_y, 1);
2448     }
2449     else
2450     {
2451         comm->npmedecompdim = 0;
2452         comm->npmenodes_x   = 0;
2453         comm->npmenodes_y   = 0;
2454     }
2455
2456     snew(comm->slb_frac, DIM);
2457     if (isDlbDisabled(comm))
2458     {
2459         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2460         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2461         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2462     }
2463
2464     if (comm->haveInterDomainBondeds && comm->cutoff_mbody == 0)
2465     {
2466         if (comm->bBondComm || !isDlbDisabled(comm))
2467         {
2468             /* Set the bonded communication distance to halfway
2469              * the minimum and the maximum,
2470              * since the extra communication cost is nearly zero.
2471              */
2472             real acs           = average_cellsize_min(dd, ddbox);
2473             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2474             if (!isDlbDisabled(comm))
2475             {
2476                 /* Check if this does not limit the scaling */
2477                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2478                                               options.dlbScaling*acs);
2479             }
2480             if (!comm->bBondComm)
2481             {
2482                 /* Without bBondComm do not go beyond the n.b. cut-off */
2483                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2484                 if (comm->cellsize_limit >= comm->cutoff)
2485                 {
2486                     /* We don't loose a lot of efficieny
2487                      * when increasing it to the n.b. cut-off.
2488                      * It can even be slightly faster, because we need
2489                      * less checks for the communication setup.
2490                      */
2491                     comm->cutoff_mbody = comm->cutoff;
2492                 }
2493             }
2494             /* Check if we did not end up below our original limit */
2495             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2496
2497             if (comm->cutoff_mbody > comm->cellsize_limit)
2498             {
2499                 comm->cellsize_limit = comm->cutoff_mbody;
2500             }
2501         }
2502         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2503     }
2504
2505     if (debug)
2506     {
2507         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2508                 "cellsize limit %f\n",
2509                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2510     }
2511
2512     if (MASTER(cr))
2513     {
2514         check_dd_restrictions(dd, ir, mdlog);
2515     }
2516 }
2517
2518 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2519 {
2520     int   ncg, cg;
2521     char *bLocalCG;
2522
2523     ncg = ncg_mtop(mtop);
2524     snew(bLocalCG, ncg);
2525     for (cg = 0; cg < ncg; cg++)
2526     {
2527         bLocalCG[cg] = FALSE;
2528     }
2529
2530     return bLocalCG;
2531 }
2532
2533 void dd_init_bondeds(FILE *fplog,
2534                      gmx_domdec_t *dd,
2535                      const gmx_mtop_t *mtop,
2536                      const gmx_vsite_t *vsite,
2537                      const t_inputrec *ir,
2538                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2539 {
2540     gmx_domdec_comm_t *comm;
2541
2542     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2543
2544     comm = dd->comm;
2545
2546     if (comm->bBondComm)
2547     {
2548         /* Communicate atoms beyond the cut-off for bonded interactions */
2549         comm = dd->comm;
2550
2551         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2552
2553         comm->bLocalCG = init_bLocalCG(mtop);
2554     }
2555     else
2556     {
2557         /* Only communicate atoms based on cut-off */
2558         comm->cglink   = nullptr;
2559         comm->bLocalCG = nullptr;
2560     }
2561 }
2562
2563 static void writeSettings(gmx::TextWriter       *log,
2564                           gmx_domdec_t          *dd,
2565                           const gmx_mtop_t      *mtop,
2566                           const t_inputrec      *ir,
2567                           gmx_bool               bDynLoadBal,
2568                           real                   dlb_scale,
2569                           const gmx_ddbox_t     *ddbox)
2570 {
2571     gmx_domdec_comm_t *comm;
2572     int                d;
2573     ivec               np;
2574     real               limit, shrink;
2575
2576     comm = dd->comm;
2577
2578     if (bDynLoadBal)
2579     {
2580         log->writeString("The maximum number of communication pulses is:");
2581         for (d = 0; d < dd->ndim; d++)
2582         {
2583             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2584         }
2585         log->ensureLineBreak();
2586         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2587         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2588         log->writeString("The allowed shrink of domain decomposition cells is:");
2589         for (d = 0; d < DIM; d++)
2590         {
2591             if (dd->nc[d] > 1)
2592             {
2593                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2594                 {
2595                     shrink = 0;
2596                 }
2597                 else
2598                 {
2599                     shrink =
2600                         comm->cellsize_min_dlb[d]/
2601                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2602                 }
2603                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2604             }
2605         }
2606         log->ensureLineBreak();
2607     }
2608     else
2609     {
2610         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2611         log->writeString("The initial number of communication pulses is:");
2612         for (d = 0; d < dd->ndim; d++)
2613         {
2614             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2615         }
2616         log->ensureLineBreak();
2617         log->writeString("The initial domain decomposition cell size is:");
2618         for (d = 0; d < DIM; d++)
2619         {
2620             if (dd->nc[d] > 1)
2621             {
2622                 log->writeStringFormatted(" %c %.2f nm",
2623                                           dim2char(d), dd->comm->cellsize_min[d]);
2624             }
2625         }
2626         log->ensureLineBreak();
2627         log->writeLine();
2628     }
2629
2630     const bool haveInterDomainVsites =
2631         (countInterUpdategroupVsites(*mtop, comm->updateGroupingPerMoleculetype) != 0);
2632
2633     if (comm->haveInterDomainBondeds ||
2634         haveInterDomainVsites ||
2635         dd->splitConstraints || dd->splitSettles)
2636     {
2637         std::string decompUnits;
2638         if (comm->useUpdateGroups)
2639         {
2640             decompUnits = "atom groups";
2641         }
2642         else
2643         {
2644             decompUnits = "atoms";
2645         }
2646
2647         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2648         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2649
2650         if (bDynLoadBal)
2651         {
2652             limit = dd->comm->cellsize_limit;
2653         }
2654         else
2655         {
2656             if (dynamic_dd_box(*dd))
2657             {
2658                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2659             }
2660             limit = dd->comm->cellsize_min[XX];
2661             for (d = 1; d < DIM; d++)
2662             {
2663                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2664             }
2665         }
2666
2667         if (comm->haveInterDomainBondeds)
2668         {
2669             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2670                                     "two-body bonded interactions", "(-rdd)",
2671                                     std::max(comm->cutoff, comm->cutoff_mbody));
2672             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2673                                     "multi-body bonded interactions", "(-rdd)",
2674                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2675         }
2676         if (haveInterDomainVsites)
2677         {
2678             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2679                                     "virtual site constructions", "(-rcon)", limit);
2680         }
2681         if (dd->splitConstraints || dd->splitSettles)
2682         {
2683             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2684                                                        1+ir->nProjOrder);
2685             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2686                                     separation.c_str(), "(-rcon)", limit);
2687         }
2688         log->ensureLineBreak();
2689     }
2690 }
2691
2692 static void logSettings(const gmx::MDLogger &mdlog,
2693                         gmx_domdec_t        *dd,
2694                         const gmx_mtop_t    *mtop,
2695                         const t_inputrec    *ir,
2696                         real                 dlb_scale,
2697                         const gmx_ddbox_t   *ddbox)
2698 {
2699     gmx::StringOutputStream stream;
2700     gmx::TextWriter         log(&stream);
2701     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2702     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2703     {
2704         {
2705             log.ensureEmptyLine();
2706             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2707         }
2708         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2709     }
2710     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2711 }
2712
2713 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2714                                 gmx_domdec_t        *dd,
2715                                 real                 dlb_scale,
2716                                 const t_inputrec    *ir,
2717                                 const gmx_ddbox_t   *ddbox)
2718 {
2719     gmx_domdec_comm_t *comm;
2720     int                d, dim, npulse, npulse_d_max, npulse_d;
2721     gmx_bool           bNoCutOff;
2722
2723     comm = dd->comm;
2724
2725     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2726
2727     /* Determine the maximum number of comm. pulses in one dimension */
2728
2729     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2730
2731     /* Determine the maximum required number of grid pulses */
2732     if (comm->cellsize_limit >= comm->cutoff)
2733     {
2734         /* Only a single pulse is required */
2735         npulse = 1;
2736     }
2737     else if (!bNoCutOff && comm->cellsize_limit > 0)
2738     {
2739         /* We round down slightly here to avoid overhead due to the latency
2740          * of extra communication calls when the cut-off
2741          * would be only slightly longer than the cell size.
2742          * Later cellsize_limit is redetermined,
2743          * so we can not miss interactions due to this rounding.
2744          */
2745         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2746     }
2747     else
2748     {
2749         /* There is no cell size limit */
2750         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2751     }
2752
2753     if (!bNoCutOff && npulse > 1)
2754     {
2755         /* See if we can do with less pulses, based on dlb_scale */
2756         npulse_d_max = 0;
2757         for (d = 0; d < dd->ndim; d++)
2758         {
2759             dim      = dd->dim[d];
2760             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2761                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2762             npulse_d_max = std::max(npulse_d_max, npulse_d);
2763         }
2764         npulse = std::min(npulse, npulse_d_max);
2765     }
2766
2767     /* This env var can override npulse */
2768     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2769     if (d > 0)
2770     {
2771         npulse = d;
2772     }
2773
2774     comm->maxpulse       = 1;
2775     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2776     for (d = 0; d < dd->ndim; d++)
2777     {
2778         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2779         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2780         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2781         {
2782             comm->bVacDLBNoLimit = FALSE;
2783         }
2784     }
2785
2786     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2787     if (!comm->bVacDLBNoLimit)
2788     {
2789         comm->cellsize_limit = std::max(comm->cellsize_limit,
2790                                         comm->cutoff/comm->maxpulse);
2791     }
2792     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2793     /* Set the minimum cell size for each DD dimension */
2794     for (d = 0; d < dd->ndim; d++)
2795     {
2796         if (comm->bVacDLBNoLimit ||
2797             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2798         {
2799             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2800         }
2801         else
2802         {
2803             comm->cellsize_min_dlb[dd->dim[d]] =
2804                 comm->cutoff/comm->cd[d].np_dlb;
2805         }
2806     }
2807     if (comm->cutoff_mbody <= 0)
2808     {
2809         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2810     }
2811     if (isDlbOn(comm))
2812     {
2813         set_dlb_limits(dd);
2814     }
2815 }
2816
2817 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2818 {
2819     /* If each molecule is a single charge group
2820      * or we use domain decomposition for each periodic dimension,
2821      * we do not need to take pbc into account for the bonded interactions.
2822      */
2823     return (ePBC != epbcNONE && dd->comm->haveInterDomainBondeds &&
2824             !(dd->nc[XX] > 1 &&
2825               dd->nc[YY] > 1 &&
2826               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2827 }
2828
2829 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2830 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2831                                   gmx_domdec_t *dd, real dlb_scale,
2832                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2833                                   const gmx_ddbox_t *ddbox)
2834 {
2835     gmx_domdec_comm_t *comm;
2836     int                natoms_tot;
2837     real               vol_frac;
2838
2839     comm = dd->comm;
2840
2841     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2842     {
2843         init_ddpme(dd, &comm->ddpme[0], 0);
2844         if (comm->npmedecompdim >= 2)
2845         {
2846             init_ddpme(dd, &comm->ddpme[1], 1);
2847         }
2848     }
2849     else
2850     {
2851         comm->npmenodes = 0;
2852         if (dd->pme_nodeid >= 0)
2853         {
2854             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2855                                  "Can not have separate PME ranks without PME electrostatics");
2856         }
2857     }
2858
2859     if (debug)
2860     {
2861         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2862     }
2863     if (!isDlbDisabled(comm))
2864     {
2865         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2866     }
2867
2868     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2869
2870     if (ir->ePBC == epbcNONE)
2871     {
2872         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2873     }
2874     else
2875     {
2876         vol_frac =
2877             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2878     }
2879     if (debug)
2880     {
2881         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2882     }
2883     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2884
2885     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2886                                  static_cast<int>(vol_frac*natoms_tot));
2887 }
2888
2889 /*! \brief Set some important DD parameters that can be modified by env.vars */
2890 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2891                                   gmx_domdec_t *dd, int rank_mysim)
2892 {
2893     gmx_domdec_comm_t *comm = dd->comm;
2894
2895     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2896     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2897     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2898     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2899     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2900     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2901     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2902
2903     if (dd->bSendRecv2)
2904     {
2905         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2906     }
2907
2908     if (comm->eFlop)
2909     {
2910         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2911         if (comm->eFlop > 1)
2912         {
2913             srand(1 + rank_mysim);
2914         }
2915         comm->bRecordLoad = TRUE;
2916     }
2917     else
2918     {
2919         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2920     }
2921 }
2922
2923 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2924                                         t_commrec                     *cr,
2925                                         const DomdecOptions           &options,
2926                                         const gmx::MdrunOptions       &mdrunOptions,
2927                                         const gmx_mtop_t              *mtop,
2928                                         const t_inputrec              *ir,
2929                                         const matrix                   box,
2930                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2931                                         gmx::LocalAtomSetManager      *atomSets)
2932 {
2933     gmx_domdec_t      *dd;
2934
2935     GMX_LOG(mdlog.info).appendTextFormatted(
2936             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2937
2938     dd = new gmx_domdec_t;
2939
2940     dd->comm = init_dd_comm();
2941
2942     /* Initialize DD paritioning counters */
2943     dd->comm->partition_step = INT_MIN;
2944     dd->ddp_count            = 0;
2945
2946     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2947
2948     gmx_ddbox_t ddbox = {0};
2949     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2950                            mtop, ir,
2951                            box, xGlobal,
2952                            &ddbox);
2953
2954     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2955
2956     if (thisRankHasDuty(cr, DUTY_PP))
2957     {
2958         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2959
2960         setup_neighbor_relations(dd);
2961     }
2962
2963     /* Set overallocation to avoid frequent reallocation of arrays */
2964     set_over_alloc_dd(TRUE);
2965
2966     clear_dd_cycle_counts(dd);
2967
2968     dd->atomSets = atomSets;
2969
2970     return dd;
2971 }
2972
2973 static gmx_bool test_dd_cutoff(t_commrec     *cr,
2974                                const t_state &state,
2975                                real           cutoffRequested)
2976 {
2977     gmx_domdec_t *dd;
2978     gmx_ddbox_t   ddbox;
2979     int           d, dim, np;
2980     real          inv_cell_size;
2981     int           LocallyLimited;
2982
2983     dd = cr->dd;
2984
2985     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
2986
2987     LocallyLimited = 0;
2988
2989     for (d = 0; d < dd->ndim; d++)
2990     {
2991         dim = dd->dim[d];
2992
2993         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
2994         if (dynamic_dd_box(*dd))
2995         {
2996             inv_cell_size *= DD_PRES_SCALE_MARGIN;
2997         }
2998
2999         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3000
3001         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3002         {
3003             if (np > dd->comm->cd[d].np_dlb)
3004             {
3005                 return FALSE;
3006             }
3007
3008             /* If a current local cell size is smaller than the requested
3009              * cut-off, we could still fix it, but this gets very complicated.
3010              * Without fixing here, we might actually need more checks.
3011              */
3012             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3013             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3014             {
3015                 LocallyLimited = 1;
3016             }
3017         }
3018     }
3019
3020     if (!isDlbDisabled(dd->comm))
3021     {
3022         /* If DLB is not active yet, we don't need to check the grid jumps.
3023          * Actually we shouldn't, because then the grid jump data is not set.
3024          */
3025         if (isDlbOn(dd->comm) &&
3026             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3027         {
3028             LocallyLimited = 1;
3029         }
3030
3031         gmx_sumi(1, &LocallyLimited, cr);
3032
3033         if (LocallyLimited > 0)
3034         {
3035             return FALSE;
3036         }
3037     }
3038
3039     return TRUE;
3040 }
3041
3042 gmx_bool change_dd_cutoff(t_commrec     *cr,
3043                           const t_state &state,
3044                           real           cutoffRequested)
3045 {
3046     gmx_bool bCutoffAllowed;
3047
3048     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3049
3050     if (bCutoffAllowed)
3051     {
3052         cr->dd->comm->cutoff = cutoffRequested;
3053     }
3054
3055     return bCutoffAllowed;
3056 }