src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51 #include <memory>
  52
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/options.h"
  59 #include "gromacs/domdec/partition.h"
  60 #include "gromacs/gmxlib/network.h"
  61 #include "gromacs/gmxlib/nrnb.h"
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/hardware/hw_info.h"
  64 #include "gromacs/listed_forces/manage_threading.h"
  65 #include "gromacs/math/vec.h"
  66 #include "gromacs/math/vectypes.h"
  67 #include "gromacs/mdlib/calc_verletbuf.h"
  68 #include "gromacs/mdlib/constr.h"
  69 #include "gromacs/mdlib/constraintrange.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/inputrec.h"
  74 #include "gromacs/mdtypes/mdrunoptions.h"
  75 #include "gromacs/mdtypes/state.h"
  76 #include "gromacs/pbcutil/ishift.h"
  77 #include "gromacs/pbcutil/pbc.h"
  78 #include "gromacs/pulling/pull.h"
  79 #include "gromacs/timing/wallcycle.h"
  80 #include "gromacs/topology/block.h"
  81 #include "gromacs/topology/idef.h"
  82 #include "gromacs/topology/ifunc.h"
  83 #include "gromacs/topology/mtop_lookup.h"
  84 #include "gromacs/topology/mtop_util.h"
  85 #include "gromacs/topology/topology.h"
  86 #include "gromacs/utility/basedefinitions.h"
  87 #include "gromacs/utility/basenetwork.h"
  88 #include "gromacs/utility/cstringutil.h"
  89 #include "gromacs/utility/exceptions.h"
  90 #include "gromacs/utility/fatalerror.h"
  91 #include "gromacs/utility/gmxmpi.h"
  92 #include "gromacs/utility/logger.h"
  93 #include "gromacs/utility/real.h"
  94 #include "gromacs/utility/smalloc.h"
  95 #include "gromacs/utility/strconvert.h"
  96 #include "gromacs/utility/stringstream.h"
  97 #include "gromacs/utility/stringutil.h"
  98 #include "gromacs/utility/textwriter.h"
  99
 100 #include "atomdistribution.h"
 101 #include "box.h"
 102 #include "cellsizes.h"
 103 #include "distribute.h"
 104 #include "domdec_constraints.h"
 105 #include "domdec_internal.h"
 106 #include "domdec_vsite.h"
 107 #include "redistribute.h"
 108 #include "utility.h"
 109
 110 // TODO remove this when moving domdec into gmx namespace
 111 using gmx::DdRankOrder;
 112 using gmx::DlbOption;
 113 using gmx::DomdecOptions;
 114
 115 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 116
 117 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 118 #define DD_CGIBS 2
 119
 120 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 121 #define DD_FLAG_NRCG  65535
 122 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 123 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 124
 125 /* The DD zone order */
 126 static const ivec dd_zo[DD_MAXZONE] =
 127 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 128
 129 /* The non-bonded zone-pair setup for domain decomposition
 130  * The first number is the i-zone, the second number the first j-zone seen by
 131  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 132  * As is, this is for 3D decomposition, where there are 4 i-zones.
 133  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 134  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 135  */
 136 static const int
 137     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 138                                                  {1, 3, 6},
 139                                                  {2, 5, 6},
 140                                                  {3, 5, 7}};
 141
 142
 143 /*
 144    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 145
 146    static void index2xyz(ivec nc,int ind,ivec xyz)
 147    {
 148    xyz[XX] = ind % nc[XX];
 149    xyz[YY] = (ind / nc[XX]) % nc[YY];
 150    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 151    }
 152  */
 153
 154 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 155 {
 156     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 157     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 158     xyz[ZZ] = ind % nc[ZZ];
 159 }
 160
 161 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 162 {
 163     int ddindex;
 164     int ddnodeid = -1;
 165
 166     ddindex = dd_index(dd->nc, c);
 167     if (dd->comm->bCartesianPP_PME)
 168     {
 169         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 170     }
 171     else if (dd->comm->bCartesianPP)
 172     {
 173 #if GMX_MPI
 174         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 175 #endif
 176     }
 177     else
 178     {
 179         ddnodeid = ddindex;
 180     }
 181
 182     return ddnodeid;
 183 }
 184
 185 int ddglatnr(const gmx_domdec_t *dd, int i)
 186 {
 187     int atnr;
 188
 189     if (dd == nullptr)
 190     {
 191         atnr = i + 1;
 192     }
 193     else
 194     {
 195         if (i >= dd->comm->atomRanges.numAtomsTotal())
 196         {
 197             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 198         }
 199         atnr = dd->globalAtomIndices[i] + 1;
 200     }
 201
 202     return atnr;
 203 }
 204
 205 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 206 {
 207     return &dd->comm->cgs_gl;
 208 }
 209
 210 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t &dd)
 211 {
 212     GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 213     return dd.comm->updateGroupingPerMoleculetype;
 214 }
 215
 216 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 217 {
 218     int i;
 219
 220     if (state->ddp_count != dd->ddp_count)
 221     {
 222         gmx_incons("The MD state does not match the domain decomposition state");
 223     }
 224
 225     state->cg_gl.resize(dd->ncg_home);
 226     for (i = 0; i < dd->ncg_home; i++)
 227     {
 228         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 229     }
 230
 231     state->ddp_count_cg_gl = dd->ddp_count;
 232 }
 233
 234 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 235 {
 236     return &dd->comm->zones;
 237 }
 238
 239 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 240                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 241 {
 242     gmx_domdec_zones_t *zones;
 243     int                 izone, d, dim;
 244
 245     zones = &dd->comm->zones;
 246
 247     izone = 0;
 248     while (icg >= zones->izone[izone].cg1)
 249     {
 250         izone++;
 251     }
 252
 253     if (izone == 0)
 254     {
 255         *jcg0 = icg;
 256     }
 257     else if (izone < zones->nizone)
 258     {
 259         *jcg0 = zones->izone[izone].jcg0;
 260     }
 261     else
 262     {
 263         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 264                   icg, izone, zones->nizone);
 265     }
 266
 267     *jcg1 = zones->izone[izone].jcg1;
 268
 269     for (d = 0; d < dd->ndim; d++)
 270     {
 271         dim         = dd->dim[d];
 272         shift0[dim] = zones->izone[izone].shift0[dim];
 273         shift1[dim] = zones->izone[izone].shift1[dim];
 274         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 275         {
 276             /* A conservative approach, this can be optimized */
 277             shift0[dim] -= 1;
 278             shift1[dim] += 1;
 279         }
 280     }
 281 }
 282
 283 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 284 {
 285     return dd.comm->atomRanges.numHomeAtoms();
 286 }
 287
 288 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 289 {
 290     /* We currently set mdatoms entries for all atoms:
 291      * local + non-local + communicated for vsite + constraints
 292      */
 293
 294     return dd->comm->atomRanges.numAtomsTotal();
 295 }
 296
 297 int dd_natoms_vsite(const gmx_domdec_t *dd)
 298 {
 299     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 300 }
 301
 302 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 303 {
 304     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 305     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 306 }
 307
 308 void dd_move_x(gmx_domdec_t             *dd,
 309                matrix                    box,
 310                gmx::ArrayRef<gmx::RVec>  x,
 311                gmx_wallcycle            *wcycle)
 312 {
 313     wallcycle_start(wcycle, ewcMOVEX);
 314
 315     int                    nzone, nat_tot;
 316     gmx_domdec_comm_t     *comm;
 317     gmx_domdec_comm_dim_t *cd;
 318     rvec                   shift = {0, 0, 0};
 319     gmx_bool               bPBC, bScrew;
 320
 321     comm = dd->comm;
 322
 323     nzone   = 1;
 324     nat_tot = comm->atomRanges.numHomeAtoms();
 325     for (int d = 0; d < dd->ndim; d++)
 326     {
 327         bPBC   = (dd->ci[dd->dim[d]] == 0);
 328         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 329         if (bPBC)
 330         {
 331             copy_rvec(box[dd->dim[d]], shift);
 332         }
 333         cd = &comm->cd[d];
 334         for (const gmx_domdec_ind_t &ind : cd->ind)
 335         {
 336             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 337             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 338             int                        n          = 0;
 339             if (!bPBC)
 340             {
 341                 for (int j : ind.index)
 342                 {
 343                     sendBuffer[n] = x[j];
 344                     n++;
 345                 }
 346             }
 347             else if (!bScrew)
 348             {
 349                 for (int j : ind.index)
 350                 {
 351                     /* We need to shift the coordinates */
 352                     for (int d = 0; d < DIM; d++)
 353                     {
 354                         sendBuffer[n][d] = x[j][d] + shift[d];
 355                     }
 356                     n++;
 357                 }
 358             }
 359             else
 360             {
 361                 for (int j : ind.index)
 362                 {
 363                     /* Shift x */
 364                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 365                     /* Rotate y and z.
 366                      * This operation requires a special shift force
 367                      * treatment, which is performed in calc_vir.
 368                      */
 369                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 370                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 371                     n++;
 372                 }
 373             }
 374
 375             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 376
 377             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 378             if (cd->receiveInPlace)
 379             {
 380                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 381             }
 382             else
 383             {
 384                 receiveBuffer = receiveBufferAccess.buffer;
 385             }
 386             /* Send and receive the coordinates */
 387             ddSendrecv(dd, d, dddirBackward,
 388                        sendBuffer, receiveBuffer);
 389
 390             if (!cd->receiveInPlace)
 391             {
 392                 int j = 0;
 393                 for (int zone = 0; zone < nzone; zone++)
 394                 {
 395                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 396                     {
 397                         x[i] = receiveBuffer[j++];
 398                     }
 399                 }
 400             }
 401             nat_tot += ind.nrecv[nzone+1];
 402         }
 403         nzone += nzone;
 404     }
 405
 406     wallcycle_stop(wcycle, ewcMOVEX);
 407 }
 408
 409 void dd_move_f(gmx_domdec_t             *dd,
 410                gmx::ArrayRef<gmx::RVec>  f,
 411                rvec                     *fshift,
 412                gmx_wallcycle            *wcycle)
 413 {
 414     wallcycle_start(wcycle, ewcMOVEF);
 415
 416     int                    nzone, nat_tot;
 417     gmx_domdec_comm_t     *comm;
 418     gmx_domdec_comm_dim_t *cd;
 419     ivec                   vis;
 420     int                    is;
 421     gmx_bool               bShiftForcesNeedPbc, bScrew;
 422
 423     comm = dd->comm;
 424
 425     nzone   = comm->zones.n/2;
 426     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 427     for (int d = dd->ndim-1; d >= 0; d--)
 428     {
 429         /* Only forces in domains near the PBC boundaries need to
 430            consider PBC in the treatment of fshift */
 431         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 432         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 433         if (fshift == nullptr && !bScrew)
 434         {
 435             bShiftForcesNeedPbc = FALSE;
 436         }
 437         /* Determine which shift vector we need */
 438         clear_ivec(vis);
 439         vis[dd->dim[d]] = 1;
 440         is              = IVEC2IS(vis);
 441
 442         cd = &comm->cd[d];
 443         for (int p = cd->numPulses() - 1; p >= 0; p--)
 444         {
 445             const gmx_domdec_ind_t    &ind  = cd->ind[p];
 446             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 447             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 448
 449             nat_tot                        -= ind.nrecv[nzone+1];
 450
 451             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 452
 453             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 454             if (cd->receiveInPlace)
 455             {
 456                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 457             }
 458             else
 459             {
 460                 sendBuffer = sendBufferAccess.buffer;
 461                 int j = 0;
 462                 for (int zone = 0; zone < nzone; zone++)
 463                 {
 464                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 465                     {
 466                         sendBuffer[j++] = f[i];
 467                     }
 468                 }
 469             }
 470             /* Communicate the forces */
 471             ddSendrecv(dd, d, dddirForward,
 472                        sendBuffer, receiveBuffer);
 473             /* Add the received forces */
 474             int n = 0;
 475             if (!bShiftForcesNeedPbc)
 476             {
 477                 for (int j : ind.index)
 478                 {
 479                     for (int d = 0; d < DIM; d++)
 480                     {
 481                         f[j][d] += receiveBuffer[n][d];
 482                     }
 483                     n++;
 484                 }
 485             }
 486             else if (!bScrew)
 487             {
 488                 /* fshift should always be defined if this function is
 489                  * called when bShiftForcesNeedPbc is true */
 490                 assert(nullptr != fshift);
 491                 for (int j : ind.index)
 492                 {
 493                     for (int d = 0; d < DIM; d++)
 494                     {
 495                         f[j][d] += receiveBuffer[n][d];
 496                     }
 497                     /* Add this force to the shift force */
 498                     for (int d = 0; d < DIM; d++)
 499                     {
 500                         fshift[is][d] += receiveBuffer[n][d];
 501                     }
 502                     n++;
 503                 }
 504             }
 505             else
 506             {
 507                 for (int j : ind.index)
 508                 {
 509                     /* Rotate the force */
 510                     f[j][XX] += receiveBuffer[n][XX];
 511                     f[j][YY] -= receiveBuffer[n][YY];
 512                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 513                     if (fshift)
 514                     {
 515                         /* Add this force to the shift force */
 516                         for (int d = 0; d < DIM; d++)
 517                         {
 518                             fshift[is][d] += receiveBuffer[n][d];
 519                         }
 520                     }
 521                     n++;
 522                 }
 523             }
 524         }
 525         nzone /= 2;
 526     }
 527     wallcycle_stop(wcycle, ewcMOVEF);
 528 }
 529
 530 /* Convenience function for extracting a real buffer from an rvec buffer
 531  *
 532  * To reduce the number of temporary communication buffers and avoid
 533  * cache polution, we reuse gmx::RVec buffers for storing reals.
 534  * This functions return a real buffer reference with the same number
 535  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 536  */
 537 static gmx::ArrayRef<real>
 538 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 539 {
 540     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 541                                   arrayRef.size());
 542 }
 543
 544 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 545 {
 546     int                    nzone, nat_tot;
 547     gmx_domdec_comm_t     *comm;
 548     gmx_domdec_comm_dim_t *cd;
 549
 550     comm = dd->comm;
 551
 552     nzone   = 1;
 553     nat_tot = comm->atomRanges.numHomeAtoms();
 554     for (int d = 0; d < dd->ndim; d++)
 555     {
 556         cd = &comm->cd[d];
 557         for (const gmx_domdec_ind_t &ind : cd->ind)
 558         {
 559             /* Note: We provision for RVec instead of real, so a factor of 3
 560              * more than needed. The buffer actually already has this size
 561              * and we pass a plain pointer below, so this does not matter.
 562              */
 563             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 564             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 565             int                       n          = 0;
 566             for (int j : ind.index)
 567             {
 568                 sendBuffer[n++] = v[j];
 569             }
 570
 571             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 572
 573             gmx::ArrayRef<real>       receiveBuffer;
 574             if (cd->receiveInPlace)
 575             {
 576                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 577             }
 578             else
 579             {
 580                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 581             }
 582             /* Send and receive the data */
 583             ddSendrecv(dd, d, dddirBackward,
 584                        sendBuffer, receiveBuffer);
 585             if (!cd->receiveInPlace)
 586             {
 587                 int j = 0;
 588                 for (int zone = 0; zone < nzone; zone++)
 589                 {
 590                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 591                     {
 592                         v[i] = receiveBuffer[j++];
 593                     }
 594                 }
 595             }
 596             nat_tot += ind.nrecv[nzone+1];
 597         }
 598         nzone += nzone;
 599     }
 600 }
 601
 602 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 603 {
 604     int                    nzone, nat_tot;
 605     gmx_domdec_comm_t     *comm;
 606     gmx_domdec_comm_dim_t *cd;
 607
 608     comm = dd->comm;
 609
 610     nzone   = comm->zones.n/2;
 611     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 612     for (int d = dd->ndim-1; d >= 0; d--)
 613     {
 614         cd = &comm->cd[d];
 615         for (int p = cd->numPulses() - 1; p >= 0; p--)
 616         {
 617             const gmx_domdec_ind_t &ind = cd->ind[p];
 618
 619             /* Note: We provision for RVec instead of real, so a factor of 3
 620              * more than needed. The buffer actually already has this size
 621              * and we typecast, so this works as intended.
 622              */
 623             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 624             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 625             nat_tot -= ind.nrecv[nzone + 1];
 626
 627             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 628
 629             gmx::ArrayRef<real>       sendBuffer;
 630             if (cd->receiveInPlace)
 631             {
 632                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 633             }
 634             else
 635             {
 636                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 637                 int j = 0;
 638                 for (int zone = 0; zone < nzone; zone++)
 639                 {
 640                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 641                     {
 642                         sendBuffer[j++] = v[i];
 643                     }
 644                 }
 645             }
 646             /* Communicate the forces */
 647             ddSendrecv(dd, d, dddirForward,
 648                        sendBuffer, receiveBuffer);
 649             /* Add the received forces */
 650             int n = 0;
 651             for (int j : ind.index)
 652             {
 653                 v[j] += receiveBuffer[n];
 654                 n++;
 655             }
 656         }
 657         nzone /= 2;
 658     }
 659 }
 660
 661 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 662 {
 663     gmx_domdec_comm_t *comm;
 664     int                di;
 665     real               r;
 666
 667     comm = dd->comm;
 668
 669     r = -1;
 670     if (comm->haveInterDomainMultiBodyBondeds)
 671     {
 672         if (comm->cutoff_mbody > 0)
 673         {
 674             r = comm->cutoff_mbody;
 675         }
 676         else
 677         {
 678             /* cutoff_mbody=0 means we do not have DLB */
 679             r = comm->cellsize_min[dd->dim[0]];
 680             for (di = 1; di < dd->ndim; di++)
 681             {
 682                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 683             }
 684             if (comm->bBondComm)
 685             {
 686                 r = std::max(r, comm->cutoff_mbody);
 687             }
 688             else
 689             {
 690                 r = std::min(r, comm->cutoff);
 691             }
 692         }
 693     }
 694
 695     return r;
 696 }
 697
 698 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 699 {
 700     real r_mb;
 701
 702     r_mb = dd_cutoff_multibody(dd);
 703
 704     return std::max(dd->comm->cutoff, r_mb);
 705 }
 706
 707
 708 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 709                                    ivec coord_pme)
 710 {
 711     int nc, ntot;
 712
 713     nc   = dd->nc[dd->comm->cartpmedim];
 714     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 715     copy_ivec(coord, coord_pme);
 716     coord_pme[dd->comm->cartpmedim] =
 717         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 718 }
 719
 720 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 721 {
 722     int npp, npme;
 723
 724     npp  = dd->nnodes;
 725     npme = dd->comm->npmenodes;
 726
 727     /* Here we assign a PME node to communicate with this DD node
 728      * by assuming that the major index of both is x.
 729      * We add cr->npmenodes/2 to obtain an even distribution.
 730      */
 731     return (ddindex*npme + npme/2)/npp;
 732 }
 733
 734 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 735 {
 736     int *pme_rank;
 737     int  n, i, p0, p1;
 738
 739     snew(pme_rank, dd->comm->npmenodes);
 740     n = 0;
 741     for (i = 0; i < dd->nnodes; i++)
 742     {
 743         p0 = ddindex2pmeindex(dd, i);
 744         p1 = ddindex2pmeindex(dd, i+1);
 745         if (i+1 == dd->nnodes || p1 > p0)
 746         {
 747             if (debug)
 748             {
 749                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 750             }
 751             pme_rank[n] = i + 1 + n;
 752             n++;
 753         }
 754     }
 755
 756     return pme_rank;
 757 }
 758
 759 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 760 {
 761     gmx_domdec_t *dd;
 762     ivec          coords;
 763     int           slab;
 764
 765     dd = cr->dd;
 766     /*
 767        if (dd->comm->bCartesian) {
 768        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 769        dd_coords2pmecoords(dd,coords,coords_pme);
 770        copy_ivec(dd->ntot,nc);
 771        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 772        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 773
 774        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 775        } else {
 776        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 777        }
 778      */
 779     coords[XX] = x;
 780     coords[YY] = y;
 781     coords[ZZ] = z;
 782     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 783
 784     return slab;
 785 }
 786
 787 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 788 {
 789     gmx_domdec_comm_t *comm;
 790     ivec               coords;
 791     int                ddindex, nodeid = -1;
 792
 793     comm = cr->dd->comm;
 794
 795     coords[XX] = x;
 796     coords[YY] = y;
 797     coords[ZZ] = z;
 798     if (comm->bCartesianPP_PME)
 799     {
 800 #if GMX_MPI
 801         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 802 #endif
 803     }
 804     else
 805     {
 806         ddindex = dd_index(cr->dd->nc, coords);
 807         if (comm->bCartesianPP)
 808         {
 809             nodeid = comm->ddindex2simnodeid[ddindex];
 810         }
 811         else
 812         {
 813             if (comm->pmenodes)
 814             {
 815                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 816             }
 817             else
 818             {
 819                 nodeid = ddindex;
 820             }
 821         }
 822     }
 823
 824     return nodeid;
 825 }
 826
 827 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 828                               const t_commrec gmx_unused *cr,
 829                               int                         sim_nodeid)
 830 {
 831     int pmenode = -1;
 832
 833     const gmx_domdec_comm_t *comm = dd->comm;
 834
 835     /* This assumes a uniform x domain decomposition grid cell size */
 836     if (comm->bCartesianPP_PME)
 837     {
 838 #if GMX_MPI
 839         ivec coord, coord_pme;
 840         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 841         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 842         {
 843             /* This is a PP node */
 844             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 845             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 846         }
 847 #endif
 848     }
 849     else if (comm->bCartesianPP)
 850     {
 851         if (sim_nodeid < dd->nnodes)
 852         {
 853             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 854         }
 855     }
 856     else
 857     {
 858         /* This assumes DD cells with identical x coordinates
 859          * are numbered sequentially.
 860          */
 861         if (dd->comm->pmenodes == nullptr)
 862         {
 863             if (sim_nodeid < dd->nnodes)
 864             {
 865                 /* The DD index equals the nodeid */
 866                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 867             }
 868         }
 869         else
 870         {
 871             int i = 0;
 872             while (sim_nodeid > dd->comm->pmenodes[i])
 873             {
 874                 i++;
 875             }
 876             if (sim_nodeid < dd->comm->pmenodes[i])
 877             {
 878                 pmenode = dd->comm->pmenodes[i];
 879             }
 880         }
 881     }
 882
 883     return pmenode;
 884 }
 885
 886 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 887 {
 888     if (dd != nullptr)
 889     {
 890         return {
 891                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 892         };
 893     }
 894     else
 895     {
 896         return {
 897                    1, 1
 898         };
 899     }
 900 }
 901
 902 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 903 {
 904     gmx_domdec_t *dd;
 905     int           x, y, z;
 906     ivec          coord, coord_pme;
 907
 908     dd = cr->dd;
 909
 910     std::vector<int> ddranks;
 911     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 912
 913     for (x = 0; x < dd->nc[XX]; x++)
 914     {
 915         for (y = 0; y < dd->nc[YY]; y++)
 916         {
 917             for (z = 0; z < dd->nc[ZZ]; z++)
 918             {
 919                 if (dd->comm->bCartesianPP_PME)
 920                 {
 921                     coord[XX] = x;
 922                     coord[YY] = y;
 923                     coord[ZZ] = z;
 924                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 925                     if (dd->ci[XX] == coord_pme[XX] &&
 926                         dd->ci[YY] == coord_pme[YY] &&
 927                         dd->ci[ZZ] == coord_pme[ZZ])
 928                     {
 929                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 930                     }
 931                 }
 932                 else
 933                 {
 934                     /* The slab corresponds to the nodeid in the PME group */
 935                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 936                     {
 937                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 938                     }
 939                 }
 940             }
 941         }
 942     }
 943     return ddranks;
 944 }
 945
 946 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 947 {
 948     gmx_bool bReceive = TRUE;
 949
 950     if (cr->npmenodes < dd->nnodes)
 951     {
 952         gmx_domdec_comm_t *comm = dd->comm;
 953         if (comm->bCartesianPP_PME)
 954         {
 955 #if GMX_MPI
 956             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 957             ivec coords;
 958             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 959             coords[comm->cartpmedim]++;
 960             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 961             {
 962                 int rank;
 963                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 964                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 965                 {
 966                     /* This is not the last PP node for pmenode */
 967                     bReceive = FALSE;
 968                 }
 969             }
 970 #else
 971             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 972 #endif
 973         }
 974         else
 975         {
 976             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 977             if (cr->sim_nodeid+1 < cr->nnodes &&
 978                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 979             {
 980                 /* This is not the last PP node for pmenode */
 981                 bReceive = FALSE;
 982             }
 983         }
 984     }
 985
 986     return bReceive;
 987 }
 988
 989 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 990 {
 991     gmx_domdec_comm_t *comm;
 992     int                i;
 993
 994     comm = dd->comm;
 995
 996     snew(*dim_f, dd->nc[dim]+1);
 997     (*dim_f)[0] = 0;
 998     for (i = 1; i < dd->nc[dim]; i++)
 999     {
1000         if (comm->slb_frac[dim])
1001         {
1002             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1003         }
1004         else
1005         {
1006             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1007         }
1008     }
1009     (*dim_f)[dd->nc[dim]] = 1;
1010 }
1011
1012 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1013 {
1014     int  pmeindex, slab, nso, i;
1015     ivec xyz;
1016
1017     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1018     {
1019         ddpme->dim = YY;
1020     }
1021     else
1022     {
1023         ddpme->dim = dimind;
1024     }
1025     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1026
1027     ddpme->nslab = (ddpme->dim == 0 ?
1028                     dd->comm->npmenodes_x :
1029                     dd->comm->npmenodes_y);
1030
1031     if (ddpme->nslab <= 1)
1032     {
1033         return;
1034     }
1035
1036     nso = dd->comm->npmenodes/ddpme->nslab;
1037     /* Determine for each PME slab the PP location range for dimension dim */
1038     snew(ddpme->pp_min, ddpme->nslab);
1039     snew(ddpme->pp_max, ddpme->nslab);
1040     for (slab = 0; slab < ddpme->nslab; slab++)
1041     {
1042         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1043         ddpme->pp_max[slab] = 0;
1044     }
1045     for (i = 0; i < dd->nnodes; i++)
1046     {
1047         ddindex2xyz(dd->nc, i, xyz);
1048         /* For y only use our y/z slab.
1049          * This assumes that the PME x grid size matches the DD grid size.
1050          */
1051         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1052         {
1053             pmeindex = ddindex2pmeindex(dd, i);
1054             if (dimind == 0)
1055             {
1056                 slab = pmeindex/nso;
1057             }
1058             else
1059             {
1060                 slab = pmeindex % ddpme->nslab;
1061             }
1062             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1063             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1064         }
1065     }
1066
1067     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1068 }
1069
1070 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1071 {
1072     if (dd->comm->ddpme[0].dim == XX)
1073     {
1074         return dd->comm->ddpme[0].maxshift;
1075     }
1076     else
1077     {
1078         return 0;
1079     }
1080 }
1081
1082 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1083 {
1084     if (dd->comm->ddpme[0].dim == YY)
1085     {
1086         return dd->comm->ddpme[0].maxshift;
1087     }
1088     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1089     {
1090         return dd->comm->ddpme[1].maxshift;
1091     }
1092     else
1093     {
1094         return 0;
1095     }
1096 }
1097
1098 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1099 {
1100     /* Note that the cycles value can be incorrect, either 0 or some
1101      * extremely large value, when our thread migrated to another core
1102      * with an unsynchronized cycle counter. If this happens less often
1103      * that once per nstlist steps, this will not cause issues, since
1104      * we later subtract the maximum value from the sum over nstlist steps.
1105      * A zero count will slightly lower the total, but that's a small effect.
1106      * Note that the main purpose of the subtraction of the maximum value
1107      * is to avoid throwing off the load balancing when stalls occur due
1108      * e.g. system activity or network congestion.
1109      */
1110     dd->comm->cycl[ddCycl] += cycles;
1111     dd->comm->cycl_n[ddCycl]++;
1112     if (cycles > dd->comm->cycl_max[ddCycl])
1113     {
1114         dd->comm->cycl_max[ddCycl] = cycles;
1115     }
1116 }
1117
1118 #if GMX_MPI
1119 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1120 {
1121     MPI_Comm           c_row;
1122     int                dim, i, rank;
1123     ivec               loc_c;
1124     gmx_bool           bPartOfGroup = FALSE;
1125
1126     dim = dd->dim[dim_ind];
1127     copy_ivec(loc, loc_c);
1128     for (i = 0; i < dd->nc[dim]; i++)
1129     {
1130         loc_c[dim] = i;
1131         rank       = dd_index(dd->nc, loc_c);
1132         if (rank == dd->rank)
1133         {
1134             /* This process is part of the group */
1135             bPartOfGroup = TRUE;
1136         }
1137     }
1138     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1139                    &c_row);
1140     if (bPartOfGroup)
1141     {
1142         dd->comm->mpi_comm_load[dim_ind] = c_row;
1143         if (!isDlbDisabled(dd->comm))
1144         {
1145             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1146
1147             if (dd->ci[dim] == dd->master_ci[dim])
1148             {
1149                 /* This is the root process of this row */
1150                 cellsizes.rowMaster  = std::make_unique<RowMaster>();
1151
1152                 RowMaster &rowMaster = *cellsizes.rowMaster;
1153                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1154                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1155                 rowMaster.isCellMin.resize(dd->nc[dim]);
1156                 if (dim_ind > 0)
1157                 {
1158                     rowMaster.bounds.resize(dd->nc[dim]);
1159                 }
1160                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1161             }
1162             else
1163             {
1164                 /* This is not a root process, we only need to receive cell_f */
1165                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1166             }
1167         }
1168         if (dd->ci[dim] == dd->master_ci[dim])
1169         {
1170             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1171         }
1172     }
1173 }
1174 #endif
1175
1176 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1177                                    int                   gpu_id)
1178 {
1179 #if GMX_MPI
1180     int           physicalnode_id_hash;
1181     gmx_domdec_t *dd;
1182     MPI_Comm      mpi_comm_pp_physicalnode;
1183
1184     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1185     {
1186         /* Only ranks with short-ranged tasks (currently) use GPUs.
1187          * If we don't have GPUs assigned, there are no resources to share.
1188          */
1189         return;
1190     }
1191
1192     physicalnode_id_hash = gmx_physicalnode_id_hash();
1193
1194     dd = cr->dd;
1195
1196     if (debug)
1197     {
1198         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1199         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1200                 dd->rank, physicalnode_id_hash, gpu_id);
1201     }
1202     /* Split the PP communicator over the physical nodes */
1203     /* TODO: See if we should store this (before), as it's also used for
1204      * for the nodecomm summation.
1205      */
1206     // TODO PhysicalNodeCommunicator could be extended/used to handle
1207     // the need for per-node per-group communicators.
1208     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1209                    &mpi_comm_pp_physicalnode);
1210     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1211                    &dd->comm->mpi_comm_gpu_shared);
1212     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1213     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1214
1215     if (debug)
1216     {
1217         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1218     }
1219
1220     /* Note that some ranks could share a GPU, while others don't */
1221
1222     if (dd->comm->nrank_gpu_shared == 1)
1223     {
1224         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1225     }
1226 #else
1227     GMX_UNUSED_VALUE(cr);
1228     GMX_UNUSED_VALUE(gpu_id);
1229 #endif
1230 }
1231
1232 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1233 {
1234 #if GMX_MPI
1235     int  dim0, dim1, i, j;
1236     ivec loc;
1237
1238     if (debug)
1239     {
1240         fprintf(debug, "Making load communicators\n");
1241     }
1242
1243     snew(dd->comm->load,          std::max(dd->ndim, 1));
1244     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1245
1246     if (dd->ndim == 0)
1247     {
1248         return;
1249     }
1250
1251     clear_ivec(loc);
1252     make_load_communicator(dd, 0, loc);
1253     if (dd->ndim > 1)
1254     {
1255         dim0 = dd->dim[0];
1256         for (i = 0; i < dd->nc[dim0]; i++)
1257         {
1258             loc[dim0] = i;
1259             make_load_communicator(dd, 1, loc);
1260         }
1261     }
1262     if (dd->ndim > 2)
1263     {
1264         dim0 = dd->dim[0];
1265         for (i = 0; i < dd->nc[dim0]; i++)
1266         {
1267             loc[dim0] = i;
1268             dim1      = dd->dim[1];
1269             for (j = 0; j < dd->nc[dim1]; j++)
1270             {
1271                 loc[dim1] = j;
1272                 make_load_communicator(dd, 2, loc);
1273             }
1274         }
1275     }
1276
1277     if (debug)
1278     {
1279         fprintf(debug, "Finished making load communicators\n");
1280     }
1281 #endif
1282 }
1283
1284 /*! \brief Sets up the relation between neighboring domains and zones */
1285 static void setup_neighbor_relations(gmx_domdec_t *dd)
1286 {
1287     int                     d, dim, i, j, m;
1288     ivec                    tmp, s;
1289     gmx_domdec_zones_t     *zones;
1290     gmx_domdec_ns_ranges_t *izone;
1291
1292     for (d = 0; d < dd->ndim; d++)
1293     {
1294         dim = dd->dim[d];
1295         copy_ivec(dd->ci, tmp);
1296         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1297         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1298         copy_ivec(dd->ci, tmp);
1299         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1300         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1301         if (debug)
1302         {
1303             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1304                     dd->rank, dim,
1305                     dd->neighbor[d][0],
1306                     dd->neighbor[d][1]);
1307         }
1308     }
1309
1310     int nzone  = (1 << dd->ndim);
1311     int nizone = (1 << std::max(dd->ndim - 1, 0));
1312     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1313
1314     zones = &dd->comm->zones;
1315
1316     for (i = 0; i < nzone; i++)
1317     {
1318         m = 0;
1319         clear_ivec(zones->shift[i]);
1320         for (d = 0; d < dd->ndim; d++)
1321         {
1322             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1323         }
1324     }
1325
1326     zones->n = nzone;
1327     for (i = 0; i < nzone; i++)
1328     {
1329         for (d = 0; d < DIM; d++)
1330         {
1331             s[d] = dd->ci[d] - zones->shift[i][d];
1332             if (s[d] < 0)
1333             {
1334                 s[d] += dd->nc[d];
1335             }
1336             else if (s[d] >= dd->nc[d])
1337             {
1338                 s[d] -= dd->nc[d];
1339             }
1340         }
1341     }
1342     zones->nizone = nizone;
1343     for (i = 0; i < zones->nizone; i++)
1344     {
1345         assert(ddNonbondedZonePairRanges[i][0] == i);
1346
1347         izone     = &zones->izone[i];
1348         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1349          * j-zones up to nzone.
1350          */
1351         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1352         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1353         for (dim = 0; dim < DIM; dim++)
1354         {
1355             if (dd->nc[dim] == 1)
1356             {
1357                 /* All shifts should be allowed */
1358                 izone->shift0[dim] = -1;
1359                 izone->shift1[dim] = 1;
1360             }
1361             else
1362             {
1363                 /* Determine the min/max j-zone shift wrt the i-zone */
1364                 izone->shift0[dim] = 1;
1365                 izone->shift1[dim] = -1;
1366                 for (j = izone->j0; j < izone->j1; j++)
1367                 {
1368                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1369                     if (shift_diff < izone->shift0[dim])
1370                     {
1371                         izone->shift0[dim] = shift_diff;
1372                     }
1373                     if (shift_diff > izone->shift1[dim])
1374                     {
1375                         izone->shift1[dim] = shift_diff;
1376                     }
1377                 }
1378             }
1379         }
1380     }
1381
1382     if (!isDlbDisabled(dd->comm))
1383     {
1384         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1385     }
1386
1387     if (dd->comm->bRecordLoad)
1388     {
1389         make_load_communicators(dd);
1390     }
1391 }
1392
1393 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1394                                  gmx_domdec_t         *dd,
1395                                  t_commrec gmx_unused *cr,
1396                                  bool gmx_unused       reorder)
1397 {
1398 #if GMX_MPI
1399     gmx_domdec_comm_t *comm;
1400     int                rank, *buf;
1401     ivec               periods;
1402     MPI_Comm           comm_cart;
1403
1404     comm = dd->comm;
1405
1406     if (comm->bCartesianPP)
1407     {
1408         /* Set up cartesian communication for the particle-particle part */
1409         GMX_LOG(mdlog.info).appendTextFormatted(
1410                 "Will use a Cartesian communicator: %d x %d x %d",
1411                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1412
1413         for (int i = 0; i < DIM; i++)
1414         {
1415             periods[i] = TRUE;
1416         }
1417         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1418                         &comm_cart);
1419         /* We overwrite the old communicator with the new cartesian one */
1420         cr->mpi_comm_mygroup = comm_cart;
1421     }
1422
1423     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1424     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1425
1426     if (comm->bCartesianPP_PME)
1427     {
1428         /* Since we want to use the original cartesian setup for sim,
1429          * and not the one after split, we need to make an index.
1430          */
1431         snew(comm->ddindex2ddnodeid, dd->nnodes);
1432         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1433         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1434         /* Get the rank of the DD master,
1435          * above we made sure that the master node is a PP node.
1436          */
1437         if (MASTER(cr))
1438         {
1439             rank = dd->rank;
1440         }
1441         else
1442         {
1443             rank = 0;
1444         }
1445         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1446     }
1447     else if (comm->bCartesianPP)
1448     {
1449         if (cr->npmenodes == 0)
1450         {
1451             /* The PP communicator is also
1452              * the communicator for this simulation
1453              */
1454             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1455         }
1456         cr->nodeid = dd->rank;
1457
1458         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1459
1460         /* We need to make an index to go from the coordinates
1461          * to the nodeid of this simulation.
1462          */
1463         snew(comm->ddindex2simnodeid, dd->nnodes);
1464         snew(buf, dd->nnodes);
1465         if (thisRankHasDuty(cr, DUTY_PP))
1466         {
1467             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1468         }
1469         /* Communicate the ddindex to simulation nodeid index */
1470         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1471                       cr->mpi_comm_mysim);
1472         sfree(buf);
1473
1474         /* Determine the master coordinates and rank.
1475          * The DD master should be the same node as the master of this sim.
1476          */
1477         for (int i = 0; i < dd->nnodes; i++)
1478         {
1479             if (comm->ddindex2simnodeid[i] == 0)
1480             {
1481                 ddindex2xyz(dd->nc, i, dd->master_ci);
1482                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1483             }
1484         }
1485         if (debug)
1486         {
1487             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1488         }
1489     }
1490     else
1491     {
1492         /* No Cartesian communicators */
1493         /* We use the rank in dd->comm->all as DD index */
1494         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1495         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1496         dd->masterrank = 0;
1497         clear_ivec(dd->master_ci);
1498     }
1499 #endif
1500
1501     GMX_LOG(mdlog.info).appendTextFormatted(
1502             "Domain decomposition rank %d, coordinates %d %d %d\n",
1503             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1504     if (debug)
1505     {
1506         fprintf(debug,
1507                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1508                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1509     }
1510 }
1511
1512 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1513                                       t_commrec            *cr)
1514 {
1515 #if GMX_MPI
1516     gmx_domdec_comm_t *comm = dd->comm;
1517
1518     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1519     {
1520         int *buf;
1521         snew(comm->ddindex2simnodeid, dd->nnodes);
1522         snew(buf, dd->nnodes);
1523         if (thisRankHasDuty(cr, DUTY_PP))
1524         {
1525             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1526         }
1527         /* Communicate the ddindex to simulation nodeid index */
1528         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1529                       cr->mpi_comm_mysim);
1530         sfree(buf);
1531     }
1532 #else
1533     GMX_UNUSED_VALUE(dd);
1534     GMX_UNUSED_VALUE(cr);
1535 #endif
1536 }
1537
1538 static void split_communicator(const gmx::MDLogger &mdlog,
1539                                t_commrec *cr, gmx_domdec_t *dd,
1540                                DdRankOrder gmx_unused rankOrder,
1541                                bool gmx_unused reorder)
1542 {
1543     gmx_domdec_comm_t *comm;
1544     int                i;
1545     gmx_bool           bDiv[DIM];
1546 #if GMX_MPI
1547     MPI_Comm           comm_cart;
1548 #endif
1549
1550     comm = dd->comm;
1551
1552     if (comm->bCartesianPP)
1553     {
1554         for (i = 1; i < DIM; i++)
1555         {
1556             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1557         }
1558         if (bDiv[YY] || bDiv[ZZ])
1559         {
1560             comm->bCartesianPP_PME = TRUE;
1561             /* If we have 2D PME decomposition, which is always in x+y,
1562              * we stack the PME only nodes in z.
1563              * Otherwise we choose the direction that provides the thinnest slab
1564              * of PME only nodes as this will have the least effect
1565              * on the PP communication.
1566              * But for the PME communication the opposite might be better.
1567              */
1568             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1569                              !bDiv[YY] ||
1570                              dd->nc[YY] > dd->nc[ZZ]))
1571             {
1572                 comm->cartpmedim = ZZ;
1573             }
1574             else
1575             {
1576                 comm->cartpmedim = YY;
1577             }
1578             comm->ntot[comm->cartpmedim]
1579                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1580         }
1581         else
1582         {
1583             GMX_LOG(mdlog.info).appendTextFormatted(
1584                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1585                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1586             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1587         }
1588     }
1589
1590     if (comm->bCartesianPP_PME)
1591     {
1592 #if GMX_MPI
1593         int  rank;
1594         ivec periods;
1595
1596         GMX_LOG(mdlog.info).appendTextFormatted(
1597                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1598                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1599
1600         for (i = 0; i < DIM; i++)
1601         {
1602             periods[i] = TRUE;
1603         }
1604         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1605                         &comm_cart);
1606         MPI_Comm_rank(comm_cart, &rank);
1607         if (MASTER(cr) && rank != 0)
1608         {
1609             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1610         }
1611
1612         /* With this assigment we loose the link to the original communicator
1613          * which will usually be MPI_COMM_WORLD, unless have multisim.
1614          */
1615         cr->mpi_comm_mysim = comm_cart;
1616         cr->sim_nodeid     = rank;
1617
1618         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1619
1620         GMX_LOG(mdlog.info).appendTextFormatted(
1621                 "Cartesian rank %d, coordinates %d %d %d\n",
1622                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1623
1624         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1625         {
1626             cr->duty = DUTY_PP;
1627         }
1628         if (cr->npmenodes == 0 ||
1629             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1630         {
1631             cr->duty = DUTY_PME;
1632         }
1633
1634         /* Split the sim communicator into PP and PME only nodes */
1635         MPI_Comm_split(cr->mpi_comm_mysim,
1636                        getThisRankDuties(cr),
1637                        dd_index(comm->ntot, dd->ci),
1638                        &cr->mpi_comm_mygroup);
1639 #endif
1640     }
1641     else
1642     {
1643         switch (rankOrder)
1644         {
1645             case DdRankOrder::pp_pme:
1646                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1647                 break;
1648             case DdRankOrder::interleave:
1649                 /* Interleave the PP-only and PME-only ranks */
1650                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1651                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1652                 break;
1653             case DdRankOrder::cartesian:
1654                 break;
1655             default:
1656                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1657         }
1658
1659         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1660         {
1661             cr->duty = DUTY_PME;
1662         }
1663         else
1664         {
1665             cr->duty = DUTY_PP;
1666         }
1667 #if GMX_MPI
1668         /* Split the sim communicator into PP and PME only nodes */
1669         MPI_Comm_split(cr->mpi_comm_mysim,
1670                        getThisRankDuties(cr),
1671                        cr->nodeid,
1672                        &cr->mpi_comm_mygroup);
1673         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1674 #endif
1675     }
1676
1677     GMX_LOG(mdlog.info).appendTextFormatted(
1678             "This rank does only %s work.\n",
1679             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1680 }
1681
1682 /*! \brief Generates the MPI communicators for domain decomposition */
1683 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1684                                   t_commrec *cr,
1685                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1686 {
1687     gmx_domdec_comm_t *comm;
1688     bool               CartReorder;
1689
1690     comm = dd->comm;
1691
1692     copy_ivec(dd->nc, comm->ntot);
1693
1694     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1695     comm->bCartesianPP_PME = FALSE;
1696
1697     /* Reorder the nodes by default. This might change the MPI ranks.
1698      * Real reordering is only supported on very few architectures,
1699      * Blue Gene is one of them.
1700      */
1701     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1702
1703     if (cr->npmenodes > 0)
1704     {
1705         /* Split the communicator into a PP and PME part */
1706         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1707         if (comm->bCartesianPP_PME)
1708         {
1709             /* We (possibly) reordered the nodes in split_communicator,
1710              * so it is no longer required in make_pp_communicator.
1711              */
1712             CartReorder = FALSE;
1713         }
1714     }
1715     else
1716     {
1717         /* All nodes do PP and PME */
1718 #if GMX_MPI
1719         /* We do not require separate communicators */
1720         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1721 #endif
1722     }
1723
1724     if (thisRankHasDuty(cr, DUTY_PP))
1725     {
1726         /* Copy or make a new PP communicator */
1727         make_pp_communicator(mdlog, dd, cr, CartReorder);
1728     }
1729     else
1730     {
1731         receive_ddindex2simnodeid(dd, cr);
1732     }
1733
1734     if (!thisRankHasDuty(cr, DUTY_PME))
1735     {
1736         /* Set up the commnuication to our PME node */
1737         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1738         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1739         if (debug)
1740         {
1741             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1742                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1743         }
1744     }
1745     else
1746     {
1747         dd->pme_nodeid = -1;
1748     }
1749
1750     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1751     if (MASTER(cr))
1752     {
1753         dd->ma = std::make_unique<AtomDistribution>(dd->nc,
1754                                                     comm->cgs_gl.nr,
1755                                                     comm->cgs_gl.index[comm->cgs_gl.nr]);
1756     }
1757 }
1758
1759 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1760                           const char *dir, int nc, const char *size_string)
1761 {
1762     real  *slb_frac, tot;
1763     int    i, n;
1764     double dbl;
1765
1766     slb_frac = nullptr;
1767     if (nc > 1 && size_string != nullptr)
1768     {
1769         GMX_LOG(mdlog.info).appendTextFormatted(
1770                 "Using static load balancing for the %s direction", dir);
1771         snew(slb_frac, nc);
1772         tot = 0;
1773         for (i = 0; i < nc; i++)
1774         {
1775             dbl = 0;
1776             sscanf(size_string, "%20lf%n", &dbl, &n);
1777             if (dbl == 0)
1778             {
1779                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1780             }
1781             slb_frac[i]  = dbl;
1782             size_string += n;
1783             tot         += slb_frac[i];
1784         }
1785         /* Normalize */
1786         std::string relativeCellSizes = "Relative cell sizes:";
1787         for (i = 0; i < nc; i++)
1788         {
1789             slb_frac[i]       /= tot;
1790             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1791         }
1792         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1793     }
1794
1795     return slb_frac;
1796 }
1797
1798 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1799 {
1800     int                  n     = 0;
1801     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1802     int                  nmol;
1803     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1804     {
1805         for (auto &ilist : extractILists(*ilists, IF_BOND))
1806         {
1807             if (NRAL(ilist.functionType) >  2)
1808             {
1809                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1810             }
1811         }
1812     }
1813
1814     return n;
1815 }
1816
1817 static int dd_getenv(const gmx::MDLogger &mdlog,
1818                      const char *env_var, int def)
1819 {
1820     char *val;
1821     int   nst;
1822
1823     nst = def;
1824     val = getenv(env_var);
1825     if (val)
1826     {
1827         if (sscanf(val, "%20d", &nst) <= 0)
1828         {
1829             nst = 1;
1830         }
1831         GMX_LOG(mdlog.info).appendTextFormatted(
1832                 "Found env.var. %s = %s, using value %d",
1833                 env_var, val, nst);
1834     }
1835
1836     return nst;
1837 }
1838
1839 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1840                                   const t_inputrec    *ir,
1841                                   const gmx::MDLogger &mdlog)
1842 {
1843     if (ir->ePBC == epbcSCREW &&
1844         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1845     {
1846         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1847     }
1848
1849     if (ir->ns_type == ensSIMPLE)
1850     {
1851         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1852     }
1853
1854     if (ir->nstlist == 0)
1855     {
1856         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1857     }
1858
1859     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1860     {
1861         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1862     }
1863 }
1864
1865 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1866 {
1867     int  di, d;
1868     real r;
1869
1870     r = ddbox->box_size[XX];
1871     for (di = 0; di < dd->ndim; di++)
1872     {
1873         d = dd->dim[di];
1874         /* Check using the initial average cell size */
1875         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1876     }
1877
1878     return r;
1879 }
1880
1881 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1882  */
1883 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1884                                   const std::string   &reasonStr,
1885                                   const gmx::MDLogger &mdlog)
1886 {
1887     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1888     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1889
1890     if (cmdlineDlbState == DlbState::onUser)
1891     {
1892         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1893     }
1894     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1895     {
1896         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1897     }
1898     return DlbState::offForever;
1899 }
1900
1901 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1902  *
1903  * This function parses the parameters of "-dlb" command line option setting
1904  * corresponding state values. Then it checks the consistency of the determined
1905  * state with other run parameters and settings. As a result, the initial state
1906  * may be altered or an error may be thrown if incompatibility of options is detected.
1907  *
1908  * \param [in] mdlog       Logger.
1909  * \param [in] dlbOption   Enum value for the DLB option.
1910  * \param [in] bRecordLoad True if the load balancer is recording load information.
1911  * \param [in] mdrunOptions  Options for mdrun.
1912  * \param [in] ir          Pointer mdrun to input parameters.
1913  * \returns                DLB initial/startup state.
1914  */
1915 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1916                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1917                                          const gmx::MdrunOptions &mdrunOptions,
1918                                          const t_inputrec *ir)
1919 {
1920     DlbState dlbState = DlbState::offCanTurnOn;
1921
1922     switch (dlbOption)
1923     {
1924         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1925         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1926         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1927         default: gmx_incons("Invalid dlbOption enum value");
1928     }
1929
1930     /* Reruns don't support DLB: bail or override auto mode */
1931     if (mdrunOptions.rerun)
1932     {
1933         std::string reasonStr = "it is not supported in reruns.";
1934         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1935     }
1936
1937     /* Unsupported integrators */
1938     if (!EI_DYNAMICS(ir->eI))
1939     {
1940         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1941         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1942     }
1943
1944     /* Without cycle counters we can't time work to balance on */
1945     if (!bRecordLoad)
1946     {
1947         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1948         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1949     }
1950
1951     if (mdrunOptions.reproducible)
1952     {
1953         std::string reasonStr = "you started a reproducible run.";
1954         switch (dlbState)
1955         {
1956             case DlbState::offUser:
1957                 break;
1958             case DlbState::offForever:
1959                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1960                 break;
1961             case DlbState::offCanTurnOn:
1962                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1963             case DlbState::onCanTurnOff:
1964                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1965                 break;
1966             case DlbState::onUser:
1967                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1968             default:
1969                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1970         }
1971     }
1972
1973     return dlbState;
1974 }
1975
1976 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1977 {
1978     dd->ndim = 0;
1979     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
1980     {
1981         /* Decomposition order z,y,x */
1982         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
1983         for (int dim = DIM-1; dim >= 0; dim--)
1984         {
1985             if (dd->nc[dim] > 1)
1986             {
1987                 dd->dim[dd->ndim++] = dim;
1988             }
1989         }
1990     }
1991     else
1992     {
1993         /* Decomposition order x,y,z */
1994         for (int dim = 0; dim < DIM; dim++)
1995         {
1996             if (dd->nc[dim] > 1)
1997             {
1998                 dd->dim[dd->ndim++] = dim;
1999             }
2000         }
2001     }
2002
2003     if (dd->ndim == 0)
2004     {
2005         /* Set dim[0] to avoid extra checks on ndim in several places */
2006         dd->dim[0] = XX;
2007     }
2008 }
2009
2010 static gmx_domdec_comm_t *init_dd_comm()
2011 {
2012     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2013
2014     comm->n_load_have      = 0;
2015     comm->n_load_collect   = 0;
2016
2017     comm->haveTurnedOffDlb = false;
2018
2019     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2020     {
2021         comm->sum_nat[i] = 0;
2022     }
2023     comm->ndecomp   = 0;
2024     comm->nload     = 0;
2025     comm->load_step = 0;
2026     comm->load_sum  = 0;
2027     comm->load_max  = 0;
2028     clear_ivec(comm->load_lim);
2029     comm->load_mdf  = 0;
2030     comm->load_pme  = 0;
2031
2032     /* This should be replaced by a unique pointer */
2033     comm->balanceRegion = ddBalanceRegionAllocate();
2034
2035     return comm;
2036 }
2037
2038 /* Returns whether mtop contains constraints and/or vsites */
2039 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2040 {
2041     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2042     int  nmol;
2043     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2044     {
2045         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2046         {
2047             return true;
2048         }
2049     }
2050
2051     return false;
2052 }
2053
2054 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2055                               const gmx_mtop_t    &mtop,
2056                               const t_inputrec    &inputrec,
2057                               real                 cutoffMargin,
2058                               int                  numMpiRanksTotal,
2059                               gmx_domdec_comm_t   *comm)
2060 {
2061     /* When we have constraints and/or vsites, it is beneficial to use
2062      * update groups (when possible) to allow independent update of groups.
2063      */
2064     if (!systemHasConstraintsOrVsites(mtop))
2065     {
2066         /* No constraints or vsites, atoms can be updated independently */
2067         return;
2068     }
2069
2070     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2071     comm->useUpdateGroups               =
2072         (!comm->updateGroupingPerMoleculetype.empty() &&
2073          getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2074
2075     if (comm->useUpdateGroups)
2076     {
2077         int numUpdateGroups = 0;
2078         for (const auto &molblock : mtop.molblock)
2079         {
2080             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2081         }
2082
2083         /* Note: We would like to use dd->nnodes for the atom count estimate,
2084          *       but that is not yet available here. But this anyhow only
2085          *       affect performance up to the second dd_partition_system call.
2086          */
2087         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2088         comm->updateGroupsCog =
2089             std::make_unique<gmx::UpdateGroupsCog>(mtop,
2090                                                    comm->updateGroupingPerMoleculetype,
2091                                                    maxReferenceTemperature(inputrec),
2092                                                    homeAtomCountEstimate);
2093
2094         /* To use update groups, the large domain-to-domain cutoff distance
2095          * should be compatible with the box size.
2096          */
2097         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2098
2099         if (comm->useUpdateGroups)
2100         {
2101             GMX_LOG(mdlog.info).appendTextFormatted(
2102                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2103                     numUpdateGroups,
2104                     mtop.natoms/static_cast<double>(numUpdateGroups),
2105                     comm->updateGroupsCog->maxUpdateGroupRadius());
2106         }
2107         else
2108         {
2109             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2110             comm->updateGroupingPerMoleculetype.clear();
2111             comm->updateGroupsCog.reset(nullptr);
2112         }
2113     }
2114 }
2115
2116 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2117 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2118                                    t_commrec *cr, gmx_domdec_t *dd,
2119                                    const DomdecOptions &options,
2120                                    const gmx::MdrunOptions &mdrunOptions,
2121                                    const gmx_mtop_t *mtop,
2122                                    const t_inputrec *ir,
2123                                    const matrix box,
2124                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2125                                    gmx_ddbox_t *ddbox)
2126 {
2127     real               r_bonded         = -1;
2128     real               r_bonded_limit   = -1;
2129     const real         tenPercentMargin = 1.1;
2130     gmx_domdec_comm_t *comm             = dd->comm;
2131
2132     dd->npbcdim              = ePBC2npbcdim(ir->ePBC);
2133     dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2134     dd->haveDynamicBox       = inputrecDynamicBox(ir);
2135     dd->bScrewPBC            = (ir->ePBC == epbcSCREW);
2136
2137     dd->pme_recv_f_alloc = 0;
2138     dd->pme_recv_f_buf   = nullptr;
2139
2140     /* Initialize to GPU share count to 0, might change later */
2141     comm->nrank_gpu_shared = 0;
2142
2143     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2144     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2145     /* To consider turning DLB on after 2*nstlist steps we need to check
2146      * at partitioning count 3. Thus we need to increase the first count by 2.
2147      */
2148     comm->ddPartioningCountFirstDlbOff += 2;
2149
2150     GMX_LOG(mdlog.info).appendTextFormatted(
2151             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2152
2153     comm->bPMELoadBalDLBLimits = FALSE;
2154
2155     /* Allocate the charge group/atom sorting struct */
2156     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2157
2158     /* We need to decide on update groups early, as this affects communication distances */
2159     comm->useUpdateGroups = false;
2160     if (ir->cutoff_scheme == ecutsVERLET)
2161     {
2162         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2163         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2164     }
2165
2166     // TODO: Check whether all bondeds are within update groups
2167     comm->haveInterDomainBondeds          = (mtop->natoms > gmx_mtop_num_molecules(*mtop) ||
2168                                              mtop->bIntermolecularInteractions);
2169     comm->haveInterDomainMultiBodyBondeds = (multi_body_bondeds_count(mtop) > 0);
2170
2171     if (comm->useUpdateGroups)
2172     {
2173         dd->splitConstraints = false;
2174         dd->splitSettles     = false;
2175     }
2176     else
2177     {
2178         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2179         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2180     }
2181
2182     if (ir->rlist == 0)
2183     {
2184         /* Set the cut-off to some very large value,
2185          * so we don't need if statements everywhere in the code.
2186          * We use sqrt, since the cut-off is squared in some places.
2187          */
2188         comm->cutoff   = GMX_CUTOFF_INF;
2189     }
2190     else
2191     {
2192         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2193     }
2194     comm->cutoff_mbody = 0;
2195
2196     /* Determine the minimum cell size limit, affected by many factors */
2197     comm->cellsize_limit = 0;
2198     comm->bBondComm      = FALSE;
2199
2200     /* We do not allow home atoms to move beyond the neighboring domain
2201      * between domain decomposition steps, which limits the cell size.
2202      * Get an estimate of cell size limit due to atom displacement.
2203      * In most cases this is a large overestimate, because it assumes
2204      * non-interaction atoms.
2205      * We set the chance to 1 in a trillion steps.
2206      */
2207     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2208     const real     limitForAtomDisplacement          =
2209         minCellSizeForAtomDisplacement(*mtop, *ir,
2210                                        comm->updateGroupingPerMoleculetype,
2211                                        c_chanceThatAtomMovesBeyondDomain);
2212     GMX_LOG(mdlog.info).appendTextFormatted(
2213             "Minimum cell size due to atom displacement: %.3f nm",
2214             limitForAtomDisplacement);
2215
2216     comm->cellsize_limit = std::max(comm->cellsize_limit,
2217                                     limitForAtomDisplacement);
2218
2219     /* TODO: PME decomposition currently requires atoms not to be more than
2220      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2221      *       In nearly all cases, limitForAtomDisplacement will be smaller
2222      *       than 2/3*rlist, so the PME requirement is satisfied.
2223      *       But it would be better for both correctness and performance
2224      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2225      *       Note that we would need to improve the pairlist buffer case.
2226      */
2227
2228     if (comm->haveInterDomainBondeds)
2229     {
2230         if (options.minimumCommunicationRange > 0)
2231         {
2232             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2233             if (options.useBondedCommunication)
2234             {
2235                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2236             }
2237             else
2238             {
2239                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2240             }
2241             r_bonded_limit = comm->cutoff_mbody;
2242         }
2243         else if (ir->bPeriodicMols)
2244         {
2245             /* Can not easily determine the required cut-off */
2246             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2247             comm->cutoff_mbody = comm->cutoff/2;
2248             r_bonded_limit     = comm->cutoff_mbody;
2249         }
2250         else
2251         {
2252             real r_2b, r_mb;
2253
2254             if (MASTER(cr))
2255             {
2256                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2257                                       options.checkBondedInteractions,
2258                                       &r_2b, &r_mb);
2259             }
2260             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2261             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2262
2263             /* We use an initial margin of 10% for the minimum cell size,
2264              * except when we are just below the non-bonded cut-off.
2265              */
2266             if (options.useBondedCommunication)
2267             {
2268                 if (std::max(r_2b, r_mb) > comm->cutoff)
2269                 {
2270                     r_bonded        = std::max(r_2b, r_mb);
2271                     r_bonded_limit  = tenPercentMargin*r_bonded;
2272                     comm->bBondComm = TRUE;
2273                 }
2274                 else
2275                 {
2276                     r_bonded       = r_mb;
2277                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2278                 }
2279                 /* We determine cutoff_mbody later */
2280             }
2281             else
2282             {
2283                 /* No special bonded communication,
2284                  * simply increase the DD cut-off.
2285                  */
2286                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2287                 comm->cutoff_mbody = r_bonded_limit;
2288                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2289             }
2290         }
2291         GMX_LOG(mdlog.info).appendTextFormatted(
2292                 "Minimum cell size due to bonded interactions: %.3f nm",
2293                 r_bonded_limit);
2294
2295         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2296     }
2297
2298     real rconstr = 0;
2299     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2300     {
2301         /* There is a cell size limit due to the constraints (P-LINCS) */
2302         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2303         GMX_LOG(mdlog.info).appendTextFormatted(
2304                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2305                 rconstr);
2306         if (rconstr > comm->cellsize_limit)
2307         {
2308             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2309         }
2310     }
2311     else if (options.constraintCommunicationRange > 0)
2312     {
2313         /* Here we do not check for dd->splitConstraints.
2314          * because one can also set a cell size limit for virtual sites only
2315          * and at this point we don't know yet if there are intercg v-sites.
2316          */
2317         GMX_LOG(mdlog.info).appendTextFormatted(
2318                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2319                 options.constraintCommunicationRange);
2320         rconstr = options.constraintCommunicationRange;
2321     }
2322     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2323
2324     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2325
2326     if (options.numCells[XX] > 0)
2327     {
2328         copy_ivec(options.numCells, dd->nc);
2329         set_dd_dim(mdlog, dd);
2330         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2331
2332         if (options.numPmeRanks >= 0)
2333         {
2334             cr->npmenodes = options.numPmeRanks;
2335         }
2336         else
2337         {
2338             /* When the DD grid is set explicitly and -npme is set to auto,
2339              * don't use PME ranks. We check later if the DD grid is
2340              * compatible with the total number of ranks.
2341              */
2342             cr->npmenodes = 0;
2343         }
2344
2345         real acs = average_cellsize_min(dd, ddbox);
2346         if (acs < comm->cellsize_limit)
2347         {
2348             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2349                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2350                                  acs, comm->cellsize_limit);
2351         }
2352     }
2353     else
2354     {
2355         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2356
2357         /* We need to choose the optimal DD grid and possibly PME nodes */
2358         real limit =
2359             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2360                            options.numPmeRanks,
2361                            !isDlbDisabled(comm),
2362                            options.dlbScaling,
2363                            comm->cellsize_limit, comm->cutoff,
2364                            comm->haveInterDomainBondeds);
2365
2366         if (dd->nc[XX] == 0)
2367         {
2368             char     buf[STRLEN];
2369             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2370             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2371                     !bC ? "-rdd" : "-rcon",
2372                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2373                     bC ? " or your LINCS settings" : "");
2374
2375             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2376                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2377                                  "%s\n"
2378                                  "Look in the log file for details on the domain decomposition",
2379                                  cr->nnodes-cr->npmenodes, limit, buf);
2380         }
2381         set_dd_dim(mdlog, dd);
2382     }
2383
2384     GMX_LOG(mdlog.info).appendTextFormatted(
2385             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2386             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2387
2388     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2389     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2390     {
2391         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2392                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2393                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2394     }
2395     if (cr->npmenodes > dd->nnodes)
2396     {
2397         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2398                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2399     }
2400     if (cr->npmenodes > 0)
2401     {
2402         comm->npmenodes = cr->npmenodes;
2403     }
2404     else
2405     {
2406         comm->npmenodes = dd->nnodes;
2407     }
2408
2409     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2410     {
2411         /* The following choices should match those
2412          * in comm_cost_est in domdec_setup.c.
2413          * Note that here the checks have to take into account
2414          * that the decomposition might occur in a different order than xyz
2415          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2416          * in which case they will not match those in comm_cost_est,
2417          * but since that is mainly for testing purposes that's fine.
2418          */
2419         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2420             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2421             getenv("GMX_PMEONEDD") == nullptr)
2422         {
2423             comm->npmedecompdim = 2;
2424             comm->npmenodes_x   = dd->nc[XX];
2425             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2426         }
2427         else
2428         {
2429             /* In case nc is 1 in both x and y we could still choose to
2430              * decompose pme in y instead of x, but we use x for simplicity.
2431              */
2432             comm->npmedecompdim = 1;
2433             if (dd->dim[0] == YY)
2434             {
2435                 comm->npmenodes_x = 1;
2436                 comm->npmenodes_y = comm->npmenodes;
2437             }
2438             else
2439             {
2440                 comm->npmenodes_x = comm->npmenodes;
2441                 comm->npmenodes_y = 1;
2442             }
2443         }
2444         GMX_LOG(mdlog.info).appendTextFormatted(
2445                 "PME domain decomposition: %d x %d x %d",
2446                 comm->npmenodes_x, comm->npmenodes_y, 1);
2447     }
2448     else
2449     {
2450         comm->npmedecompdim = 0;
2451         comm->npmenodes_x   = 0;
2452         comm->npmenodes_y   = 0;
2453     }
2454
2455     snew(comm->slb_frac, DIM);
2456     if (isDlbDisabled(comm))
2457     {
2458         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2459         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2460         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2461     }
2462
2463     if (comm->haveInterDomainBondeds && comm->cutoff_mbody == 0)
2464     {
2465         if (comm->bBondComm || !isDlbDisabled(comm))
2466         {
2467             /* Set the bonded communication distance to halfway
2468              * the minimum and the maximum,
2469              * since the extra communication cost is nearly zero.
2470              */
2471             real acs           = average_cellsize_min(dd, ddbox);
2472             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2473             if (!isDlbDisabled(comm))
2474             {
2475                 /* Check if this does not limit the scaling */
2476                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2477                                               options.dlbScaling*acs);
2478             }
2479             if (!comm->bBondComm)
2480             {
2481                 /* Without bBondComm do not go beyond the n.b. cut-off */
2482                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2483                 if (comm->cellsize_limit >= comm->cutoff)
2484                 {
2485                     /* We don't loose a lot of efficieny
2486                      * when increasing it to the n.b. cut-off.
2487                      * It can even be slightly faster, because we need
2488                      * less checks for the communication setup.
2489                      */
2490                     comm->cutoff_mbody = comm->cutoff;
2491                 }
2492             }
2493             /* Check if we did not end up below our original limit */
2494             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2495
2496             if (comm->cutoff_mbody > comm->cellsize_limit)
2497             {
2498                 comm->cellsize_limit = comm->cutoff_mbody;
2499             }
2500         }
2501         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2502     }
2503
2504     if (debug)
2505     {
2506         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2507                 "cellsize limit %f\n",
2508                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2509     }
2510
2511     if (MASTER(cr))
2512     {
2513         check_dd_restrictions(dd, ir, mdlog);
2514     }
2515 }
2516
2517 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2518 {
2519     int   ncg, cg;
2520     char *bLocalCG;
2521
2522     ncg = ncg_mtop(mtop);
2523     snew(bLocalCG, ncg);
2524     for (cg = 0; cg < ncg; cg++)
2525     {
2526         bLocalCG[cg] = FALSE;
2527     }
2528
2529     return bLocalCG;
2530 }
2531
2532 void dd_init_bondeds(FILE *fplog,
2533                      gmx_domdec_t *dd,
2534                      const gmx_mtop_t *mtop,
2535                      const gmx_vsite_t *vsite,
2536                      const t_inputrec *ir,
2537                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2538 {
2539     gmx_domdec_comm_t *comm;
2540
2541     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2542
2543     comm = dd->comm;
2544
2545     if (comm->bBondComm)
2546     {
2547         /* Communicate atoms beyond the cut-off for bonded interactions */
2548         comm = dd->comm;
2549
2550         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2551
2552         comm->bLocalCG = init_bLocalCG(mtop);
2553     }
2554     else
2555     {
2556         /* Only communicate atoms based on cut-off */
2557         comm->cglink   = nullptr;
2558         comm->bLocalCG = nullptr;
2559     }
2560 }
2561
2562 static void writeSettings(gmx::TextWriter       *log,
2563                           gmx_domdec_t          *dd,
2564                           const gmx_mtop_t      *mtop,
2565                           const t_inputrec      *ir,
2566                           gmx_bool               bDynLoadBal,
2567                           real                   dlb_scale,
2568                           const gmx_ddbox_t     *ddbox)
2569 {
2570     gmx_domdec_comm_t *comm;
2571     int                d;
2572     ivec               np;
2573     real               limit, shrink;
2574
2575     comm = dd->comm;
2576
2577     if (bDynLoadBal)
2578     {
2579         log->writeString("The maximum number of communication pulses is:");
2580         for (d = 0; d < dd->ndim; d++)
2581         {
2582             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2583         }
2584         log->ensureLineBreak();
2585         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2586         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2587         log->writeString("The allowed shrink of domain decomposition cells is:");
2588         for (d = 0; d < DIM; d++)
2589         {
2590             if (dd->nc[d] > 1)
2591             {
2592                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2593                 {
2594                     shrink = 0;
2595                 }
2596                 else
2597                 {
2598                     shrink =
2599                         comm->cellsize_min_dlb[d]/
2600                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2601                 }
2602                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2603             }
2604         }
2605         log->ensureLineBreak();
2606     }
2607     else
2608     {
2609         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2610         log->writeString("The initial number of communication pulses is:");
2611         for (d = 0; d < dd->ndim; d++)
2612         {
2613             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2614         }
2615         log->ensureLineBreak();
2616         log->writeString("The initial domain decomposition cell size is:");
2617         for (d = 0; d < DIM; d++)
2618         {
2619             if (dd->nc[d] > 1)
2620             {
2621                 log->writeStringFormatted(" %c %.2f nm",
2622                                           dim2char(d), dd->comm->cellsize_min[d]);
2623             }
2624         }
2625         log->ensureLineBreak();
2626         log->writeLine();
2627     }
2628
2629     const bool haveInterDomainVsites =
2630         (countInterUpdategroupVsites(*mtop, comm->updateGroupingPerMoleculetype) != 0);
2631
2632     if (comm->haveInterDomainBondeds ||
2633         haveInterDomainVsites ||
2634         dd->splitConstraints || dd->splitSettles)
2635     {
2636         std::string decompUnits;
2637         if (comm->useUpdateGroups)
2638         {
2639             decompUnits = "atom groups";
2640         }
2641         else
2642         {
2643             decompUnits = "atoms";
2644         }
2645
2646         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2647         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2648
2649         if (bDynLoadBal)
2650         {
2651             limit = dd->comm->cellsize_limit;
2652         }
2653         else
2654         {
2655             if (dynamic_dd_box(*dd))
2656             {
2657                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2658             }
2659             limit = dd->comm->cellsize_min[XX];
2660             for (d = 1; d < DIM; d++)
2661             {
2662                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2663             }
2664         }
2665
2666         if (comm->haveInterDomainBondeds)
2667         {
2668             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2669                                     "two-body bonded interactions", "(-rdd)",
2670                                     std::max(comm->cutoff, comm->cutoff_mbody));
2671             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2672                                     "multi-body bonded interactions", "(-rdd)",
2673                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2674         }
2675         if (haveInterDomainVsites)
2676         {
2677             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2678                                     "virtual site constructions", "(-rcon)", limit);
2679         }
2680         if (dd->splitConstraints || dd->splitSettles)
2681         {
2682             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2683                                                        1+ir->nProjOrder);
2684             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2685                                     separation.c_str(), "(-rcon)", limit);
2686         }
2687         log->ensureLineBreak();
2688     }
2689 }
2690
2691 static void logSettings(const gmx::MDLogger &mdlog,
2692                         gmx_domdec_t        *dd,
2693                         const gmx_mtop_t    *mtop,
2694                         const t_inputrec    *ir,
2695                         real                 dlb_scale,
2696                         const gmx_ddbox_t   *ddbox)
2697 {
2698     gmx::StringOutputStream stream;
2699     gmx::TextWriter         log(&stream);
2700     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2701     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2702     {
2703         {
2704             log.ensureEmptyLine();
2705             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2706         }
2707         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2708     }
2709     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2710 }
2711
2712 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2713                                 gmx_domdec_t        *dd,
2714                                 real                 dlb_scale,
2715                                 const t_inputrec    *ir,
2716                                 const gmx_ddbox_t   *ddbox)
2717 {
2718     gmx_domdec_comm_t *comm;
2719     int                d, dim, npulse, npulse_d_max, npulse_d;
2720     gmx_bool           bNoCutOff;
2721
2722     comm = dd->comm;
2723
2724     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2725
2726     /* Determine the maximum number of comm. pulses in one dimension */
2727
2728     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2729
2730     /* Determine the maximum required number of grid pulses */
2731     if (comm->cellsize_limit >= comm->cutoff)
2732     {
2733         /* Only a single pulse is required */
2734         npulse = 1;
2735     }
2736     else if (!bNoCutOff && comm->cellsize_limit > 0)
2737     {
2738         /* We round down slightly here to avoid overhead due to the latency
2739          * of extra communication calls when the cut-off
2740          * would be only slightly longer than the cell size.
2741          * Later cellsize_limit is redetermined,
2742          * so we can not miss interactions due to this rounding.
2743          */
2744         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2745     }
2746     else
2747     {
2748         /* There is no cell size limit */
2749         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2750     }
2751
2752     if (!bNoCutOff && npulse > 1)
2753     {
2754         /* See if we can do with less pulses, based on dlb_scale */
2755         npulse_d_max = 0;
2756         for (d = 0; d < dd->ndim; d++)
2757         {
2758             dim      = dd->dim[d];
2759             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2760                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2761             npulse_d_max = std::max(npulse_d_max, npulse_d);
2762         }
2763         npulse = std::min(npulse, npulse_d_max);
2764     }
2765
2766     /* This env var can override npulse */
2767     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2768     if (d > 0)
2769     {
2770         npulse = d;
2771     }
2772
2773     comm->maxpulse       = 1;
2774     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2775     for (d = 0; d < dd->ndim; d++)
2776     {
2777         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2778         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2779         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2780         {
2781             comm->bVacDLBNoLimit = FALSE;
2782         }
2783     }
2784
2785     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2786     if (!comm->bVacDLBNoLimit)
2787     {
2788         comm->cellsize_limit = std::max(comm->cellsize_limit,
2789                                         comm->cutoff/comm->maxpulse);
2790     }
2791     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2792     /* Set the minimum cell size for each DD dimension */
2793     for (d = 0; d < dd->ndim; d++)
2794     {
2795         if (comm->bVacDLBNoLimit ||
2796             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2797         {
2798             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2799         }
2800         else
2801         {
2802             comm->cellsize_min_dlb[dd->dim[d]] =
2803                 comm->cutoff/comm->cd[d].np_dlb;
2804         }
2805     }
2806     if (comm->cutoff_mbody <= 0)
2807     {
2808         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2809     }
2810     if (isDlbOn(comm))
2811     {
2812         set_dlb_limits(dd);
2813     }
2814 }
2815
2816 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2817 {
2818     /* If each molecule is a single charge group
2819      * or we use domain decomposition for each periodic dimension,
2820      * we do not need to take pbc into account for the bonded interactions.
2821      */
2822     return (ePBC != epbcNONE && dd->comm->haveInterDomainBondeds &&
2823             !(dd->nc[XX] > 1 &&
2824               dd->nc[YY] > 1 &&
2825               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2826 }
2827
2828 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2829 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2830                                   gmx_domdec_t *dd, real dlb_scale,
2831                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2832                                   const gmx_ddbox_t *ddbox)
2833 {
2834     gmx_domdec_comm_t *comm;
2835     int                natoms_tot;
2836     real               vol_frac;
2837
2838     comm = dd->comm;
2839
2840     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2841     {
2842         init_ddpme(dd, &comm->ddpme[0], 0);
2843         if (comm->npmedecompdim >= 2)
2844         {
2845             init_ddpme(dd, &comm->ddpme[1], 1);
2846         }
2847     }
2848     else
2849     {
2850         comm->npmenodes = 0;
2851         if (dd->pme_nodeid >= 0)
2852         {
2853             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2854                                  "Can not have separate PME ranks without PME electrostatics");
2855         }
2856     }
2857
2858     if (debug)
2859     {
2860         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2861     }
2862     if (!isDlbDisabled(comm))
2863     {
2864         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2865     }
2866
2867     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2868
2869     if (ir->ePBC == epbcNONE)
2870     {
2871         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2872     }
2873     else
2874     {
2875         vol_frac =
2876             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2877     }
2878     if (debug)
2879     {
2880         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2881     }
2882     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2883
2884     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2885                                  static_cast<int>(vol_frac*natoms_tot));
2886 }
2887
2888 /*! \brief Set some important DD parameters that can be modified by env.vars */
2889 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2890                                   gmx_domdec_t *dd, int rank_mysim)
2891 {
2892     gmx_domdec_comm_t *comm = dd->comm;
2893
2894     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2895     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2896     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2897     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2898     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2899     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2900     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2901
2902     if (dd->bSendRecv2)
2903     {
2904         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2905     }
2906
2907     if (comm->eFlop)
2908     {
2909         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2910         if (comm->eFlop > 1)
2911         {
2912             srand(1 + rank_mysim);
2913         }
2914         comm->bRecordLoad = TRUE;
2915     }
2916     else
2917     {
2918         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2919     }
2920 }
2921
2922 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2923                                         t_commrec                     *cr,
2924                                         const DomdecOptions           &options,
2925                                         const gmx::MdrunOptions       &mdrunOptions,
2926                                         const gmx_mtop_t              *mtop,
2927                                         const t_inputrec              *ir,
2928                                         const matrix                   box,
2929                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2930                                         gmx::LocalAtomSetManager      *atomSets)
2931 {
2932     gmx_domdec_t      *dd;
2933
2934     GMX_LOG(mdlog.info).appendTextFormatted(
2935             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2936
2937     dd = new gmx_domdec_t;
2938
2939     dd->comm = init_dd_comm();
2940
2941     /* Initialize DD paritioning counters */
2942     dd->comm->partition_step = INT_MIN;
2943     dd->ddp_count            = 0;
2944
2945     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2946
2947     gmx_ddbox_t ddbox = {0};
2948     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2949                            mtop, ir,
2950                            box, xGlobal,
2951                            &ddbox);
2952
2953     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2954
2955     if (thisRankHasDuty(cr, DUTY_PP))
2956     {
2957         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2958
2959         setup_neighbor_relations(dd);
2960     }
2961
2962     /* Set overallocation to avoid frequent reallocation of arrays */
2963     set_over_alloc_dd(TRUE);
2964
2965     clear_dd_cycle_counts(dd);
2966
2967     dd->atomSets = atomSets;
2968
2969     return dd;
2970 }
2971
2972 static gmx_bool test_dd_cutoff(t_commrec     *cr,
2973                                const t_state &state,
2974                                real           cutoffRequested)
2975 {
2976     gmx_domdec_t *dd;
2977     gmx_ddbox_t   ddbox;
2978     int           d, dim, np;
2979     real          inv_cell_size;
2980     int           LocallyLimited;
2981
2982     dd = cr->dd;
2983
2984     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
2985
2986     LocallyLimited = 0;
2987
2988     for (d = 0; d < dd->ndim; d++)
2989     {
2990         dim = dd->dim[d];
2991
2992         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
2993         if (dynamic_dd_box(*dd))
2994         {
2995             inv_cell_size *= DD_PRES_SCALE_MARGIN;
2996         }
2997
2998         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
2999
3000         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3001         {
3002             if (np > dd->comm->cd[d].np_dlb)
3003             {
3004                 return FALSE;
3005             }
3006
3007             /* If a current local cell size is smaller than the requested
3008              * cut-off, we could still fix it, but this gets very complicated.
3009              * Without fixing here, we might actually need more checks.
3010              */
3011             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3012             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3013             {
3014                 LocallyLimited = 1;
3015             }
3016         }
3017     }
3018
3019     if (!isDlbDisabled(dd->comm))
3020     {
3021         /* If DLB is not active yet, we don't need to check the grid jumps.
3022          * Actually we shouldn't, because then the grid jump data is not set.
3023          */
3024         if (isDlbOn(dd->comm) &&
3025             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3026         {
3027             LocallyLimited = 1;
3028         }
3029
3030         gmx_sumi(1, &LocallyLimited, cr);
3031
3032         if (LocallyLimited > 0)
3033         {
3034             return FALSE;
3035         }
3036     }
3037
3038     return TRUE;
3039 }
3040
3041 gmx_bool change_dd_cutoff(t_commrec     *cr,
3042                           const t_state &state,
3043                           real           cutoffRequested)
3044 {
3045     gmx_bool bCutoffAllowed;
3046
3047     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3048
3049     if (bCutoffAllowed)
3050     {
3051         cr->dd->comm->cutoff = cutoffRequested;
3052     }
3053
3054     return bCutoffAllowed;
3055 }