src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51 #include <memory>
  52
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/options.h"
  59 #include "gromacs/domdec/partition.h"
  60 #include "gromacs/gmxlib/network.h"
  61 #include "gromacs/gmxlib/nrnb.h"
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/hardware/hw_info.h"
  64 #include "gromacs/listed_forces/manage_threading.h"
  65 #include "gromacs/math/vec.h"
  66 #include "gromacs/math/vectypes.h"
  67 #include "gromacs/mdlib/calc_verletbuf.h"
  68 #include "gromacs/mdlib/constr.h"
  69 #include "gromacs/mdlib/constraintrange.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/forceoutput.h"
  74 #include "gromacs/mdtypes/inputrec.h"
  75 #include "gromacs/mdtypes/mdrunoptions.h"
  76 #include "gromacs/mdtypes/state.h"
  77 #include "gromacs/pbcutil/ishift.h"
  78 #include "gromacs/pbcutil/pbc.h"
  79 #include "gromacs/pulling/pull.h"
  80 #include "gromacs/timing/wallcycle.h"
  81 #include "gromacs/topology/block.h"
  82 #include "gromacs/topology/idef.h"
  83 #include "gromacs/topology/ifunc.h"
  84 #include "gromacs/topology/mtop_lookup.h"
  85 #include "gromacs/topology/mtop_util.h"
  86 #include "gromacs/topology/topology.h"
  87 #include "gromacs/utility/basedefinitions.h"
  88 #include "gromacs/utility/basenetwork.h"
  89 #include "gromacs/utility/cstringutil.h"
  90 #include "gromacs/utility/exceptions.h"
  91 #include "gromacs/utility/fatalerror.h"
  92 #include "gromacs/utility/gmxmpi.h"
  93 #include "gromacs/utility/logger.h"
  94 #include "gromacs/utility/real.h"
  95 #include "gromacs/utility/smalloc.h"
  96 #include "gromacs/utility/strconvert.h"
  97 #include "gromacs/utility/stringstream.h"
  98 #include "gromacs/utility/stringutil.h"
  99 #include "gromacs/utility/textwriter.h"
 100
 101 #include "atomdistribution.h"
 102 #include "box.h"
 103 #include "cellsizes.h"
 104 #include "distribute.h"
 105 #include "domdec_constraints.h"
 106 #include "domdec_internal.h"
 107 #include "domdec_vsite.h"
 108 #include "redistribute.h"
 109 #include "utility.h"
 110
 111 // TODO remove this when moving domdec into gmx namespace
 112 using gmx::DdRankOrder;
 113 using gmx::DlbOption;
 114 using gmx::DomdecOptions;
 115
 116 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 117
 118 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 119 #define DD_CGIBS 2
 120
 121 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 122 #define DD_FLAG_NRCG  65535
 123 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 124 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 125
 126 /* The DD zone order */
 127 static const ivec dd_zo[DD_MAXZONE] =
 128 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 129
 130 /* The non-bonded zone-pair setup for domain decomposition
 131  * The first number is the i-zone, the second number the first j-zone seen by
 132  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 133  * As is, this is for 3D decomposition, where there are 4 i-zones.
 134  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 135  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 136  */
 137 static const int
 138     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 139                                                  {1, 3, 6},
 140                                                  {2, 5, 6},
 141                                                  {3, 5, 7}};
 142
 143
 144 /*
 145    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 146
 147    static void index2xyz(ivec nc,int ind,ivec xyz)
 148    {
 149    xyz[XX] = ind % nc[XX];
 150    xyz[YY] = (ind / nc[XX]) % nc[YY];
 151    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 152    }
 153  */
 154
 155 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 156 {
 157     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 158     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 159     xyz[ZZ] = ind % nc[ZZ];
 160 }
 161
 162 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 163 {
 164     int ddindex;
 165     int ddnodeid = -1;
 166
 167     ddindex = dd_index(dd->nc, c);
 168     if (dd->comm->bCartesianPP_PME)
 169     {
 170         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 171     }
 172     else if (dd->comm->bCartesianPP)
 173     {
 174 #if GMX_MPI
 175         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 176 #endif
 177     }
 178     else
 179     {
 180         ddnodeid = ddindex;
 181     }
 182
 183     return ddnodeid;
 184 }
 185
 186 int ddglatnr(const gmx_domdec_t *dd, int i)
 187 {
 188     int atnr;
 189
 190     if (dd == nullptr)
 191     {
 192         atnr = i + 1;
 193     }
 194     else
 195     {
 196         if (i >= dd->comm->atomRanges.numAtomsTotal())
 197         {
 198             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 199         }
 200         atnr = dd->globalAtomIndices[i] + 1;
 201     }
 202
 203     return atnr;
 204 }
 205
 206 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 207 {
 208     return &dd->comm->cgs_gl;
 209 }
 210
 211 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t &dd)
 212 {
 213     GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 214     return dd.comm->systemInfo.updateGroupingPerMoleculetype;
 215 }
 216
 217 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 218 {
 219     int i;
 220
 221     if (state->ddp_count != dd->ddp_count)
 222     {
 223         gmx_incons("The MD state does not match the domain decomposition state");
 224     }
 225
 226     state->cg_gl.resize(dd->ncg_home);
 227     for (i = 0; i < dd->ncg_home; i++)
 228     {
 229         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 230     }
 231
 232     state->ddp_count_cg_gl = dd->ddp_count;
 233 }
 234
 235 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 236 {
 237     return &dd->comm->zones;
 238 }
 239
 240 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 241                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 242 {
 243     gmx_domdec_zones_t *zones;
 244     int                 izone, d, dim;
 245
 246     zones = &dd->comm->zones;
 247
 248     izone = 0;
 249     while (icg >= zones->izone[izone].cg1)
 250     {
 251         izone++;
 252     }
 253
 254     if (izone == 0)
 255     {
 256         *jcg0 = icg;
 257     }
 258     else if (izone < zones->nizone)
 259     {
 260         *jcg0 = zones->izone[izone].jcg0;
 261     }
 262     else
 263     {
 264         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 265                   icg, izone, zones->nizone);
 266     }
 267
 268     *jcg1 = zones->izone[izone].jcg1;
 269
 270     for (d = 0; d < dd->ndim; d++)
 271     {
 272         dim         = dd->dim[d];
 273         shift0[dim] = zones->izone[izone].shift0[dim];
 274         shift1[dim] = zones->izone[izone].shift1[dim];
 275         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 276         {
 277             /* A conservative approach, this can be optimized */
 278             shift0[dim] -= 1;
 279             shift1[dim] += 1;
 280         }
 281     }
 282 }
 283
 284 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 285 {
 286     return dd.comm->atomRanges.numHomeAtoms();
 287 }
 288
 289 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 290 {
 291     /* We currently set mdatoms entries for all atoms:
 292      * local + non-local + communicated for vsite + constraints
 293      */
 294
 295     return dd->comm->atomRanges.numAtomsTotal();
 296 }
 297
 298 int dd_natoms_vsite(const gmx_domdec_t *dd)
 299 {
 300     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 301 }
 302
 303 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 304 {
 305     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 306     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 307 }
 308
 309 void dd_move_x(gmx_domdec_t             *dd,
 310                const matrix              box,
 311                gmx::ArrayRef<gmx::RVec>  x,
 312                gmx_wallcycle            *wcycle)
 313 {
 314     wallcycle_start(wcycle, ewcMOVEX);
 315
 316     int                    nzone, nat_tot;
 317     gmx_domdec_comm_t     *comm;
 318     gmx_domdec_comm_dim_t *cd;
 319     rvec                   shift = {0, 0, 0};
 320     gmx_bool               bPBC, bScrew;
 321
 322     comm = dd->comm;
 323
 324     nzone   = 1;
 325     nat_tot = comm->atomRanges.numHomeAtoms();
 326     for (int d = 0; d < dd->ndim; d++)
 327     {
 328         bPBC   = (dd->ci[dd->dim[d]] == 0);
 329         bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 330         if (bPBC)
 331         {
 332             copy_rvec(box[dd->dim[d]], shift);
 333         }
 334         cd = &comm->cd[d];
 335         for (const gmx_domdec_ind_t &ind : cd->ind)
 336         {
 337             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 338             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 339             int                        n          = 0;
 340             if (!bPBC)
 341             {
 342                 for (int j : ind.index)
 343                 {
 344                     sendBuffer[n] = x[j];
 345                     n++;
 346                 }
 347             }
 348             else if (!bScrew)
 349             {
 350                 for (int j : ind.index)
 351                 {
 352                     /* We need to shift the coordinates */
 353                     for (int d = 0; d < DIM; d++)
 354                     {
 355                         sendBuffer[n][d] = x[j][d] + shift[d];
 356                     }
 357                     n++;
 358                 }
 359             }
 360             else
 361             {
 362                 for (int j : ind.index)
 363                 {
 364                     /* Shift x */
 365                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 366                     /* Rotate y and z.
 367                      * This operation requires a special shift force
 368                      * treatment, which is performed in calc_vir.
 369                      */
 370                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 371                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 372                     n++;
 373                 }
 374             }
 375
 376             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 377
 378             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 379             if (cd->receiveInPlace)
 380             {
 381                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 382             }
 383             else
 384             {
 385                 receiveBuffer = receiveBufferAccess.buffer;
 386             }
 387             /* Send and receive the coordinates */
 388             ddSendrecv(dd, d, dddirBackward,
 389                        sendBuffer, receiveBuffer);
 390
 391             if (!cd->receiveInPlace)
 392             {
 393                 int j = 0;
 394                 for (int zone = 0; zone < nzone; zone++)
 395                 {
 396                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 397                     {
 398                         x[i] = receiveBuffer[j++];
 399                     }
 400                 }
 401             }
 402             nat_tot += ind.nrecv[nzone+1];
 403         }
 404         nzone += nzone;
 405     }
 406
 407     wallcycle_stop(wcycle, ewcMOVEX);
 408 }
 409
 410 void dd_move_f(gmx_domdec_t              *dd,
 411                gmx::ForceWithShiftForces *forceWithShiftForces,
 412                gmx_wallcycle             *wcycle)
 413 {
 414     wallcycle_start(wcycle, ewcMOVEF);
 415
 416     gmx::ArrayRef<gmx::RVec> f       = forceWithShiftForces->force();
 417     gmx::ArrayRef<gmx::RVec> fshift  = forceWithShiftForces->shiftForces();
 418
 419     gmx_domdec_comm_t       &comm    = *dd->comm;
 420     int                      nzone   = comm.zones.n/2;
 421     int                      nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
 422     for (int d = dd->ndim-1; d >= 0; d--)
 423     {
 424         /* Only forces in domains near the PBC boundaries need to
 425            consider PBC in the treatment of fshift */
 426         const bool shiftForcesNeedPbc = (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
 427         const bool applyScrewPbc      = (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 428         /* Determine which shift vector we need */
 429         ivec       vis   = { 0, 0, 0 };
 430         vis[dd->dim[d]]  = 1;
 431         const int  is    = IVEC2IS(vis);
 432
 433         /* Loop over the pulses */
 434         const gmx_domdec_comm_dim_t &cd = comm.cd[d];
 435         for (int p = cd.numPulses() - 1; p >= 0; p--)
 436         {
 437             const gmx_domdec_ind_t    &ind  = cd.ind[p];
 438             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
 439             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 440
 441             nat_tot                                 -= ind.nrecv[nzone+1];
 442
 443             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 444
 445             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 446             if (cd.receiveInPlace)
 447             {
 448                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 449             }
 450             else
 451             {
 452                 sendBuffer = sendBufferAccess.buffer;
 453                 int j = 0;
 454                 for (int zone = 0; zone < nzone; zone++)
 455                 {
 456                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 457                     {
 458                         sendBuffer[j++] = f[i];
 459                     }
 460                 }
 461             }
 462             /* Communicate the forces */
 463             ddSendrecv(dd, d, dddirForward,
 464                        sendBuffer, receiveBuffer);
 465             /* Add the received forces */
 466             int n = 0;
 467             if (!shiftForcesNeedPbc)
 468             {
 469                 for (int j : ind.index)
 470                 {
 471                     for (int d = 0; d < DIM; d++)
 472                     {
 473                         f[j][d] += receiveBuffer[n][d];
 474                     }
 475                     n++;
 476                 }
 477             }
 478             else if (!applyScrewPbc)
 479             {
 480                 for (int j : ind.index)
 481                 {
 482                     for (int d = 0; d < DIM; d++)
 483                     {
 484                         f[j][d] += receiveBuffer[n][d];
 485                     }
 486                     /* Add this force to the shift force */
 487                     for (int d = 0; d < DIM; d++)
 488                     {
 489                         fshift[is][d] += receiveBuffer[n][d];
 490                     }
 491                     n++;
 492                 }
 493             }
 494             else
 495             {
 496                 for (int j : ind.index)
 497                 {
 498                     /* Rotate the force */
 499                     f[j][XX] += receiveBuffer[n][XX];
 500                     f[j][YY] -= receiveBuffer[n][YY];
 501                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 502                     if (shiftForcesNeedPbc)
 503                     {
 504                         /* Add this force to the shift force */
 505                         for (int d = 0; d < DIM; d++)
 506                         {
 507                             fshift[is][d] += receiveBuffer[n][d];
 508                         }
 509                     }
 510                     n++;
 511                 }
 512             }
 513         }
 514         nzone /= 2;
 515     }
 516     wallcycle_stop(wcycle, ewcMOVEF);
 517 }
 518
 519 /* Convenience function for extracting a real buffer from an rvec buffer
 520  *
 521  * To reduce the number of temporary communication buffers and avoid
 522  * cache polution, we reuse gmx::RVec buffers for storing reals.
 523  * This functions return a real buffer reference with the same number
 524  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 525  */
 526 static gmx::ArrayRef<real>
 527 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 528 {
 529     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 530                                   arrayRef.size());
 531 }
 532
 533 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 534 {
 535     int                    nzone, nat_tot;
 536     gmx_domdec_comm_t     *comm;
 537     gmx_domdec_comm_dim_t *cd;
 538
 539     comm = dd->comm;
 540
 541     nzone   = 1;
 542     nat_tot = comm->atomRanges.numHomeAtoms();
 543     for (int d = 0; d < dd->ndim; d++)
 544     {
 545         cd = &comm->cd[d];
 546         for (const gmx_domdec_ind_t &ind : cd->ind)
 547         {
 548             /* Note: We provision for RVec instead of real, so a factor of 3
 549              * more than needed. The buffer actually already has this size
 550              * and we pass a plain pointer below, so this does not matter.
 551              */
 552             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 553             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 554             int                       n          = 0;
 555             for (int j : ind.index)
 556             {
 557                 sendBuffer[n++] = v[j];
 558             }
 559
 560             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 561
 562             gmx::ArrayRef<real>       receiveBuffer;
 563             if (cd->receiveInPlace)
 564             {
 565                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 566             }
 567             else
 568             {
 569                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 570             }
 571             /* Send and receive the data */
 572             ddSendrecv(dd, d, dddirBackward,
 573                        sendBuffer, receiveBuffer);
 574             if (!cd->receiveInPlace)
 575             {
 576                 int j = 0;
 577                 for (int zone = 0; zone < nzone; zone++)
 578                 {
 579                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 580                     {
 581                         v[i] = receiveBuffer[j++];
 582                     }
 583                 }
 584             }
 585             nat_tot += ind.nrecv[nzone+1];
 586         }
 587         nzone += nzone;
 588     }
 589 }
 590
 591 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 592 {
 593     int                    nzone, nat_tot;
 594     gmx_domdec_comm_t     *comm;
 595     gmx_domdec_comm_dim_t *cd;
 596
 597     comm = dd->comm;
 598
 599     nzone   = comm->zones.n/2;
 600     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 601     for (int d = dd->ndim-1; d >= 0; d--)
 602     {
 603         cd = &comm->cd[d];
 604         for (int p = cd->numPulses() - 1; p >= 0; p--)
 605         {
 606             const gmx_domdec_ind_t &ind = cd->ind[p];
 607
 608             /* Note: We provision for RVec instead of real, so a factor of 3
 609              * more than needed. The buffer actually already has this size
 610              * and we typecast, so this works as intended.
 611              */
 612             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 613             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 614             nat_tot -= ind.nrecv[nzone + 1];
 615
 616             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 617
 618             gmx::ArrayRef<real>       sendBuffer;
 619             if (cd->receiveInPlace)
 620             {
 621                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 622             }
 623             else
 624             {
 625                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 626                 int j = 0;
 627                 for (int zone = 0; zone < nzone; zone++)
 628                 {
 629                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 630                     {
 631                         sendBuffer[j++] = v[i];
 632                     }
 633                 }
 634             }
 635             /* Communicate the forces */
 636             ddSendrecv(dd, d, dddirForward,
 637                        sendBuffer, receiveBuffer);
 638             /* Add the received forces */
 639             int n = 0;
 640             for (int j : ind.index)
 641             {
 642                 v[j] += receiveBuffer[n];
 643                 n++;
 644             }
 645         }
 646         nzone /= 2;
 647     }
 648 }
 649
 650 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 651 {
 652     const gmx_domdec_comm_t &comm       = *dd->comm;
 653     const DDSystemInfo      &systemInfo = comm.systemInfo;
 654
 655     real                     r = -1;
 656     if (systemInfo.haveInterDomainMultiBodyBondeds)
 657     {
 658         if (comm.cutoff_mbody > 0)
 659         {
 660             r = comm.cutoff_mbody;
 661         }
 662         else
 663         {
 664             /* cutoff_mbody=0 means we do not have DLB */
 665             r = comm.cellsize_min[dd->dim[0]];
 666             for (int di = 1; di < dd->ndim; di++)
 667             {
 668                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
 669             }
 670             if (comm.bBondComm)
 671             {
 672                 r = std::max(r, comm.cutoff_mbody);
 673             }
 674             else
 675             {
 676                 r = std::min(r, systemInfo.cutoff);
 677             }
 678         }
 679     }
 680
 681     return r;
 682 }
 683
 684 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 685 {
 686     real r_mb;
 687
 688     r_mb = dd_cutoff_multibody(dd);
 689
 690     return std::max(dd->comm->systemInfo.cutoff, r_mb);
 691 }
 692
 693
 694 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 695                                    ivec coord_pme)
 696 {
 697     int nc, ntot;
 698
 699     nc   = dd->nc[dd->comm->cartpmedim];
 700     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 701     copy_ivec(coord, coord_pme);
 702     coord_pme[dd->comm->cartpmedim] =
 703         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 704 }
 705
 706 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 707 {
 708     int npp, npme;
 709
 710     npp  = dd->nnodes;
 711     npme = dd->comm->npmenodes;
 712
 713     /* Here we assign a PME node to communicate with this DD node
 714      * by assuming that the major index of both is x.
 715      * We add cr->npmenodes/2 to obtain an even distribution.
 716      */
 717     return (ddindex*npme + npme/2)/npp;
 718 }
 719
 720 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 721 {
 722     int *pme_rank;
 723     int  n, i, p0, p1;
 724
 725     snew(pme_rank, dd->comm->npmenodes);
 726     n = 0;
 727     for (i = 0; i < dd->nnodes; i++)
 728     {
 729         p0 = ddindex2pmeindex(dd, i);
 730         p1 = ddindex2pmeindex(dd, i+1);
 731         if (i+1 == dd->nnodes || p1 > p0)
 732         {
 733             if (debug)
 734             {
 735                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 736             }
 737             pme_rank[n] = i + 1 + n;
 738             n++;
 739         }
 740     }
 741
 742     return pme_rank;
 743 }
 744
 745 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 746 {
 747     gmx_domdec_t *dd;
 748     ivec          coords;
 749     int           slab;
 750
 751     dd = cr->dd;
 752     /*
 753        if (dd->comm->bCartesian) {
 754        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 755        dd_coords2pmecoords(dd,coords,coords_pme);
 756        copy_ivec(dd->ntot,nc);
 757        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 758        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 759
 760        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 761        } else {
 762        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 763        }
 764      */
 765     coords[XX] = x;
 766     coords[YY] = y;
 767     coords[ZZ] = z;
 768     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 769
 770     return slab;
 771 }
 772
 773 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 774 {
 775     gmx_domdec_comm_t *comm;
 776     ivec               coords;
 777     int                ddindex, nodeid = -1;
 778
 779     comm = cr->dd->comm;
 780
 781     coords[XX] = x;
 782     coords[YY] = y;
 783     coords[ZZ] = z;
 784     if (comm->bCartesianPP_PME)
 785     {
 786 #if GMX_MPI
 787         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 788 #endif
 789     }
 790     else
 791     {
 792         ddindex = dd_index(cr->dd->nc, coords);
 793         if (comm->bCartesianPP)
 794         {
 795             nodeid = comm->ddindex2simnodeid[ddindex];
 796         }
 797         else
 798         {
 799             if (comm->pmenodes)
 800             {
 801                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 802             }
 803             else
 804             {
 805                 nodeid = ddindex;
 806             }
 807         }
 808     }
 809
 810     return nodeid;
 811 }
 812
 813 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 814                               const t_commrec gmx_unused *cr,
 815                               int                         sim_nodeid)
 816 {
 817     int pmenode = -1;
 818
 819     const gmx_domdec_comm_t *comm = dd->comm;
 820
 821     /* This assumes a uniform x domain decomposition grid cell size */
 822     if (comm->bCartesianPP_PME)
 823     {
 824 #if GMX_MPI
 825         ivec coord, coord_pme;
 826         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 827         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 828         {
 829             /* This is a PP node */
 830             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 831             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 832         }
 833 #endif
 834     }
 835     else if (comm->bCartesianPP)
 836     {
 837         if (sim_nodeid < dd->nnodes)
 838         {
 839             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 840         }
 841     }
 842     else
 843     {
 844         /* This assumes DD cells with identical x coordinates
 845          * are numbered sequentially.
 846          */
 847         if (dd->comm->pmenodes == nullptr)
 848         {
 849             if (sim_nodeid < dd->nnodes)
 850             {
 851                 /* The DD index equals the nodeid */
 852                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 853             }
 854         }
 855         else
 856         {
 857             int i = 0;
 858             while (sim_nodeid > dd->comm->pmenodes[i])
 859             {
 860                 i++;
 861             }
 862             if (sim_nodeid < dd->comm->pmenodes[i])
 863             {
 864                 pmenode = dd->comm->pmenodes[i];
 865             }
 866         }
 867     }
 868
 869     return pmenode;
 870 }
 871
 872 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 873 {
 874     if (dd != nullptr)
 875     {
 876         return {
 877                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 878         };
 879     }
 880     else
 881     {
 882         return {
 883                    1, 1
 884         };
 885     }
 886 }
 887
 888 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 889 {
 890     gmx_domdec_t *dd;
 891     int           x, y, z;
 892     ivec          coord, coord_pme;
 893
 894     dd = cr->dd;
 895
 896     std::vector<int> ddranks;
 897     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 898
 899     for (x = 0; x < dd->nc[XX]; x++)
 900     {
 901         for (y = 0; y < dd->nc[YY]; y++)
 902         {
 903             for (z = 0; z < dd->nc[ZZ]; z++)
 904             {
 905                 if (dd->comm->bCartesianPP_PME)
 906                 {
 907                     coord[XX] = x;
 908                     coord[YY] = y;
 909                     coord[ZZ] = z;
 910                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 911                     if (dd->ci[XX] == coord_pme[XX] &&
 912                         dd->ci[YY] == coord_pme[YY] &&
 913                         dd->ci[ZZ] == coord_pme[ZZ])
 914                     {
 915                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 916                     }
 917                 }
 918                 else
 919                 {
 920                     /* The slab corresponds to the nodeid in the PME group */
 921                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 922                     {
 923                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 924                     }
 925                 }
 926             }
 927         }
 928     }
 929     return ddranks;
 930 }
 931
 932 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 933 {
 934     gmx_bool bReceive = TRUE;
 935
 936     if (cr->npmenodes < dd->nnodes)
 937     {
 938         gmx_domdec_comm_t *comm = dd->comm;
 939         if (comm->bCartesianPP_PME)
 940         {
 941 #if GMX_MPI
 942             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 943             ivec coords;
 944             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 945             coords[comm->cartpmedim]++;
 946             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 947             {
 948                 int rank;
 949                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 950                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 951                 {
 952                     /* This is not the last PP node for pmenode */
 953                     bReceive = FALSE;
 954                 }
 955             }
 956 #else
 957             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 958 #endif
 959         }
 960         else
 961         {
 962             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 963             if (cr->sim_nodeid+1 < cr->nnodes &&
 964                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 965             {
 966                 /* This is not the last PP node for pmenode */
 967                 bReceive = FALSE;
 968             }
 969         }
 970     }
 971
 972     return bReceive;
 973 }
 974
 975 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 976 {
 977     gmx_domdec_comm_t *comm;
 978     int                i;
 979
 980     comm = dd->comm;
 981
 982     snew(*dim_f, dd->nc[dim]+1);
 983     (*dim_f)[0] = 0;
 984     for (i = 1; i < dd->nc[dim]; i++)
 985     {
 986         if (comm->slb_frac[dim])
 987         {
 988             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 989         }
 990         else
 991         {
 992             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
 993         }
 994     }
 995     (*dim_f)[dd->nc[dim]] = 1;
 996 }
 997
 998 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 999 {
1000     int  pmeindex, slab, nso, i;
1001     ivec xyz;
1002
1003     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1004     {
1005         ddpme->dim = YY;
1006     }
1007     else
1008     {
1009         ddpme->dim = dimind;
1010     }
1011     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1012
1013     ddpme->nslab = (ddpme->dim == 0 ?
1014                     dd->comm->npmenodes_x :
1015                     dd->comm->npmenodes_y);
1016
1017     if (ddpme->nslab <= 1)
1018     {
1019         return;
1020     }
1021
1022     nso = dd->comm->npmenodes/ddpme->nslab;
1023     /* Determine for each PME slab the PP location range for dimension dim */
1024     snew(ddpme->pp_min, ddpme->nslab);
1025     snew(ddpme->pp_max, ddpme->nslab);
1026     for (slab = 0; slab < ddpme->nslab; slab++)
1027     {
1028         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1029         ddpme->pp_max[slab] = 0;
1030     }
1031     for (i = 0; i < dd->nnodes; i++)
1032     {
1033         ddindex2xyz(dd->nc, i, xyz);
1034         /* For y only use our y/z slab.
1035          * This assumes that the PME x grid size matches the DD grid size.
1036          */
1037         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1038         {
1039             pmeindex = ddindex2pmeindex(dd, i);
1040             if (dimind == 0)
1041             {
1042                 slab = pmeindex/nso;
1043             }
1044             else
1045             {
1046                 slab = pmeindex % ddpme->nslab;
1047             }
1048             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1049             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1050         }
1051     }
1052
1053     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1054 }
1055
1056 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1057 {
1058     if (dd->comm->ddpme[0].dim == XX)
1059     {
1060         return dd->comm->ddpme[0].maxshift;
1061     }
1062     else
1063     {
1064         return 0;
1065     }
1066 }
1067
1068 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1069 {
1070     if (dd->comm->ddpme[0].dim == YY)
1071     {
1072         return dd->comm->ddpme[0].maxshift;
1073     }
1074     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1075     {
1076         return dd->comm->ddpme[1].maxshift;
1077     }
1078     else
1079     {
1080         return 0;
1081     }
1082 }
1083
1084 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1085 {
1086     /* Note that the cycles value can be incorrect, either 0 or some
1087      * extremely large value, when our thread migrated to another core
1088      * with an unsynchronized cycle counter. If this happens less often
1089      * that once per nstlist steps, this will not cause issues, since
1090      * we later subtract the maximum value from the sum over nstlist steps.
1091      * A zero count will slightly lower the total, but that's a small effect.
1092      * Note that the main purpose of the subtraction of the maximum value
1093      * is to avoid throwing off the load balancing when stalls occur due
1094      * e.g. system activity or network congestion.
1095      */
1096     dd->comm->cycl[ddCycl] += cycles;
1097     dd->comm->cycl_n[ddCycl]++;
1098     if (cycles > dd->comm->cycl_max[ddCycl])
1099     {
1100         dd->comm->cycl_max[ddCycl] = cycles;
1101     }
1102 }
1103
1104 #if GMX_MPI
1105 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1106 {
1107     MPI_Comm           c_row;
1108     int                dim, i, rank;
1109     ivec               loc_c;
1110     gmx_bool           bPartOfGroup = FALSE;
1111
1112     dim = dd->dim[dim_ind];
1113     copy_ivec(loc, loc_c);
1114     for (i = 0; i < dd->nc[dim]; i++)
1115     {
1116         loc_c[dim] = i;
1117         rank       = dd_index(dd->nc, loc_c);
1118         if (rank == dd->rank)
1119         {
1120             /* This process is part of the group */
1121             bPartOfGroup = TRUE;
1122         }
1123     }
1124     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1125                    &c_row);
1126     if (bPartOfGroup)
1127     {
1128         dd->comm->mpi_comm_load[dim_ind] = c_row;
1129         if (!isDlbDisabled(dd->comm))
1130         {
1131             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1132
1133             if (dd->ci[dim] == dd->master_ci[dim])
1134             {
1135                 /* This is the root process of this row */
1136                 cellsizes.rowMaster  = std::make_unique<RowMaster>();
1137
1138                 RowMaster &rowMaster = *cellsizes.rowMaster;
1139                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1140                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1141                 rowMaster.isCellMin.resize(dd->nc[dim]);
1142                 if (dim_ind > 0)
1143                 {
1144                     rowMaster.bounds.resize(dd->nc[dim]);
1145                 }
1146                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1147             }
1148             else
1149             {
1150                 /* This is not a root process, we only need to receive cell_f */
1151                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1152             }
1153         }
1154         if (dd->ci[dim] == dd->master_ci[dim])
1155         {
1156             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1157         }
1158     }
1159 }
1160 #endif
1161
1162 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1163                                    int                   gpu_id)
1164 {
1165 #if GMX_MPI
1166     int           physicalnode_id_hash;
1167     gmx_domdec_t *dd;
1168     MPI_Comm      mpi_comm_pp_physicalnode;
1169
1170     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1171     {
1172         /* Only ranks with short-ranged tasks (currently) use GPUs.
1173          * If we don't have GPUs assigned, there are no resources to share.
1174          */
1175         return;
1176     }
1177
1178     physicalnode_id_hash = gmx_physicalnode_id_hash();
1179
1180     dd = cr->dd;
1181
1182     if (debug)
1183     {
1184         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1185         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1186                 dd->rank, physicalnode_id_hash, gpu_id);
1187     }
1188     /* Split the PP communicator over the physical nodes */
1189     /* TODO: See if we should store this (before), as it's also used for
1190      * for the nodecomm summation.
1191      */
1192     // TODO PhysicalNodeCommunicator could be extended/used to handle
1193     // the need for per-node per-group communicators.
1194     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1195                    &mpi_comm_pp_physicalnode);
1196     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1197                    &dd->comm->mpi_comm_gpu_shared);
1198     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1199     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1200
1201     if (debug)
1202     {
1203         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1204     }
1205
1206     /* Note that some ranks could share a GPU, while others don't */
1207
1208     if (dd->comm->nrank_gpu_shared == 1)
1209     {
1210         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1211     }
1212 #else
1213     GMX_UNUSED_VALUE(cr);
1214     GMX_UNUSED_VALUE(gpu_id);
1215 #endif
1216 }
1217
1218 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1219 {
1220 #if GMX_MPI
1221     int  dim0, dim1, i, j;
1222     ivec loc;
1223
1224     if (debug)
1225     {
1226         fprintf(debug, "Making load communicators\n");
1227     }
1228
1229     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1230     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1231
1232     if (dd->ndim == 0)
1233     {
1234         return;
1235     }
1236
1237     clear_ivec(loc);
1238     make_load_communicator(dd, 0, loc);
1239     if (dd->ndim > 1)
1240     {
1241         dim0 = dd->dim[0];
1242         for (i = 0; i < dd->nc[dim0]; i++)
1243         {
1244             loc[dim0] = i;
1245             make_load_communicator(dd, 1, loc);
1246         }
1247     }
1248     if (dd->ndim > 2)
1249     {
1250         dim0 = dd->dim[0];
1251         for (i = 0; i < dd->nc[dim0]; i++)
1252         {
1253             loc[dim0] = i;
1254             dim1      = dd->dim[1];
1255             for (j = 0; j < dd->nc[dim1]; j++)
1256             {
1257                 loc[dim1] = j;
1258                 make_load_communicator(dd, 2, loc);
1259             }
1260         }
1261     }
1262
1263     if (debug)
1264     {
1265         fprintf(debug, "Finished making load communicators\n");
1266     }
1267 #endif
1268 }
1269
1270 /*! \brief Sets up the relation between neighboring domains and zones */
1271 static void setup_neighbor_relations(gmx_domdec_t *dd)
1272 {
1273     int                     d, dim, i, j, m;
1274     ivec                    tmp, s;
1275     gmx_domdec_zones_t     *zones;
1276     gmx_domdec_ns_ranges_t *izone;
1277     GMX_ASSERT(dd->ndim >= 0, "Must have non-negative number of dimensions for DD");
1278
1279     for (d = 0; d < dd->ndim; d++)
1280     {
1281         dim = dd->dim[d];
1282         copy_ivec(dd->ci, tmp);
1283         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1284         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1285         copy_ivec(dd->ci, tmp);
1286         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1287         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1288         if (debug)
1289         {
1290             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1291                     dd->rank, dim,
1292                     dd->neighbor[d][0],
1293                     dd->neighbor[d][1]);
1294         }
1295     }
1296
1297     int nzone  = (1 << dd->ndim);
1298     GMX_ASSERT(dd->ndim < DIM, "Invalid number of dimensions");
1299     int nizone = (1 << std::max(dd->ndim - 1, 0));
1300     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1301
1302     zones = &dd->comm->zones;
1303
1304     for (i = 0; i < nzone; i++)
1305     {
1306         m = 0;
1307         clear_ivec(zones->shift[i]);
1308         for (d = 0; d < dd->ndim; d++)
1309         {
1310             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1311         }
1312     }
1313
1314     zones->n = nzone;
1315     for (i = 0; i < nzone; i++)
1316     {
1317         for (d = 0; d < DIM; d++)
1318         {
1319             s[d] = dd->ci[d] - zones->shift[i][d];
1320             if (s[d] < 0)
1321             {
1322                 s[d] += dd->nc[d];
1323             }
1324             else if (s[d] >= dd->nc[d])
1325             {
1326                 s[d] -= dd->nc[d];
1327             }
1328         }
1329     }
1330     zones->nizone = nizone;
1331     for (i = 0; i < zones->nizone; i++)
1332     {
1333         assert(ddNonbondedZonePairRanges[i][0] == i);
1334
1335         izone     = &zones->izone[i];
1336         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1337          * j-zones up to nzone.
1338          */
1339         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1340         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1341         for (dim = 0; dim < DIM; dim++)
1342         {
1343             if (dd->nc[dim] == 1)
1344             {
1345                 /* All shifts should be allowed */
1346                 izone->shift0[dim] = -1;
1347                 izone->shift1[dim] = 1;
1348             }
1349             else
1350             {
1351                 /* Determine the min/max j-zone shift wrt the i-zone */
1352                 izone->shift0[dim] = 1;
1353                 izone->shift1[dim] = -1;
1354                 for (j = izone->j0; j < izone->j1; j++)
1355                 {
1356                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1357                     if (shift_diff < izone->shift0[dim])
1358                     {
1359                         izone->shift0[dim] = shift_diff;
1360                     }
1361                     if (shift_diff > izone->shift1[dim])
1362                     {
1363                         izone->shift1[dim] = shift_diff;
1364                     }
1365                 }
1366             }
1367         }
1368     }
1369
1370     if (!isDlbDisabled(dd->comm))
1371     {
1372         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1373     }
1374
1375     if (dd->comm->bRecordLoad)
1376     {
1377         make_load_communicators(dd);
1378     }
1379 }
1380
1381 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1382                                  gmx_domdec_t         *dd,
1383                                  t_commrec gmx_unused *cr,
1384                                  bool gmx_unused       reorder)
1385 {
1386 #if GMX_MPI
1387     gmx_domdec_comm_t *comm;
1388     int                rank, *buf;
1389     ivec               periods;
1390     MPI_Comm           comm_cart;
1391
1392     comm = dd->comm;
1393
1394     if (comm->bCartesianPP)
1395     {
1396         /* Set up cartesian communication for the particle-particle part */
1397         GMX_LOG(mdlog.info).appendTextFormatted(
1398                 "Will use a Cartesian communicator: %d x %d x %d",
1399                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1400
1401         for (int i = 0; i < DIM; i++)
1402         {
1403             periods[i] = TRUE;
1404         }
1405         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1406                         &comm_cart);
1407         /* We overwrite the old communicator with the new cartesian one */
1408         cr->mpi_comm_mygroup = comm_cart;
1409     }
1410
1411     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1412     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1413
1414     if (comm->bCartesianPP_PME)
1415     {
1416         /* Since we want to use the original cartesian setup for sim,
1417          * and not the one after split, we need to make an index.
1418          */
1419         snew(comm->ddindex2ddnodeid, dd->nnodes);
1420         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1421         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1422         /* Get the rank of the DD master,
1423          * above we made sure that the master node is a PP node.
1424          */
1425         if (MASTER(cr))
1426         {
1427             rank = dd->rank;
1428         }
1429         else
1430         {
1431             rank = 0;
1432         }
1433         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1434     }
1435     else if (comm->bCartesianPP)
1436     {
1437         if (cr->npmenodes == 0)
1438         {
1439             /* The PP communicator is also
1440              * the communicator for this simulation
1441              */
1442             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1443         }
1444         cr->nodeid = dd->rank;
1445
1446         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1447
1448         /* We need to make an index to go from the coordinates
1449          * to the nodeid of this simulation.
1450          */
1451         snew(comm->ddindex2simnodeid, dd->nnodes);
1452         snew(buf, dd->nnodes);
1453         if (thisRankHasDuty(cr, DUTY_PP))
1454         {
1455             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1456         }
1457         /* Communicate the ddindex to simulation nodeid index */
1458         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1459                       cr->mpi_comm_mysim);
1460         sfree(buf);
1461
1462         /* Determine the master coordinates and rank.
1463          * The DD master should be the same node as the master of this sim.
1464          */
1465         for (int i = 0; i < dd->nnodes; i++)
1466         {
1467             if (comm->ddindex2simnodeid[i] == 0)
1468             {
1469                 ddindex2xyz(dd->nc, i, dd->master_ci);
1470                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1471             }
1472         }
1473         if (debug)
1474         {
1475             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1476         }
1477     }
1478     else
1479     {
1480         /* No Cartesian communicators */
1481         /* We use the rank in dd->comm->all as DD index */
1482         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1483         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1484         dd->masterrank = 0;
1485         clear_ivec(dd->master_ci);
1486     }
1487 #endif
1488
1489     GMX_LOG(mdlog.info).appendTextFormatted(
1490             "Domain decomposition rank %d, coordinates %d %d %d\n",
1491             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1492     if (debug)
1493     {
1494         fprintf(debug,
1495                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1496                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1497     }
1498 }
1499
1500 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1501                                       t_commrec            *cr)
1502 {
1503 #if GMX_MPI
1504     gmx_domdec_comm_t *comm = dd->comm;
1505
1506     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1507     {
1508         int *buf;
1509         snew(comm->ddindex2simnodeid, dd->nnodes);
1510         snew(buf, dd->nnodes);
1511         if (thisRankHasDuty(cr, DUTY_PP))
1512         {
1513             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1514         }
1515         /* Communicate the ddindex to simulation nodeid index */
1516         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1517                       cr->mpi_comm_mysim);
1518         sfree(buf);
1519     }
1520 #else
1521     GMX_UNUSED_VALUE(dd);
1522     GMX_UNUSED_VALUE(cr);
1523 #endif
1524 }
1525
1526 static void split_communicator(const gmx::MDLogger &mdlog,
1527                                t_commrec *cr, gmx_domdec_t *dd,
1528                                DdRankOrder gmx_unused rankOrder,
1529                                bool gmx_unused reorder)
1530 {
1531     gmx_domdec_comm_t *comm;
1532     int                i;
1533     gmx_bool           bDiv[DIM];
1534 #if GMX_MPI
1535     MPI_Comm           comm_cart;
1536 #endif
1537
1538     comm = dd->comm;
1539
1540     if (comm->bCartesianPP)
1541     {
1542         for (i = 1; i < DIM; i++)
1543         {
1544             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1545         }
1546         if (bDiv[YY] || bDiv[ZZ])
1547         {
1548             comm->bCartesianPP_PME = TRUE;
1549             /* If we have 2D PME decomposition, which is always in x+y,
1550              * we stack the PME only nodes in z.
1551              * Otherwise we choose the direction that provides the thinnest slab
1552              * of PME only nodes as this will have the least effect
1553              * on the PP communication.
1554              * But for the PME communication the opposite might be better.
1555              */
1556             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1557                              !bDiv[YY] ||
1558                              dd->nc[YY] > dd->nc[ZZ]))
1559             {
1560                 comm->cartpmedim = ZZ;
1561             }
1562             else
1563             {
1564                 comm->cartpmedim = YY;
1565             }
1566             comm->ntot[comm->cartpmedim]
1567                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1568         }
1569         else
1570         {
1571             GMX_LOG(mdlog.info).appendTextFormatted(
1572                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1573                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1574             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1575         }
1576     }
1577
1578     if (comm->bCartesianPP_PME)
1579     {
1580 #if GMX_MPI
1581         int  rank;
1582         ivec periods;
1583
1584         GMX_LOG(mdlog.info).appendTextFormatted(
1585                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1586                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1587
1588         for (i = 0; i < DIM; i++)
1589         {
1590             periods[i] = TRUE;
1591         }
1592         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1593                         &comm_cart);
1594         MPI_Comm_rank(comm_cart, &rank);
1595         if (MASTER(cr) && rank != 0)
1596         {
1597             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1598         }
1599
1600         /* With this assigment we loose the link to the original communicator
1601          * which will usually be MPI_COMM_WORLD, unless have multisim.
1602          */
1603         cr->mpi_comm_mysim = comm_cart;
1604         cr->sim_nodeid     = rank;
1605
1606         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1607
1608         GMX_LOG(mdlog.info).appendTextFormatted(
1609                 "Cartesian rank %d, coordinates %d %d %d\n",
1610                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1611
1612         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1613         {
1614             cr->duty = DUTY_PP;
1615         }
1616         if (cr->npmenodes == 0 ||
1617             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1618         {
1619             cr->duty = DUTY_PME;
1620         }
1621
1622         /* Split the sim communicator into PP and PME only nodes */
1623         MPI_Comm_split(cr->mpi_comm_mysim,
1624                        getThisRankDuties(cr),
1625                        dd_index(comm->ntot, dd->ci),
1626                        &cr->mpi_comm_mygroup);
1627 #endif
1628     }
1629     else
1630     {
1631         switch (rankOrder)
1632         {
1633             case DdRankOrder::pp_pme:
1634                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1635                 break;
1636             case DdRankOrder::interleave:
1637                 /* Interleave the PP-only and PME-only ranks */
1638                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1639                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1640                 break;
1641             case DdRankOrder::cartesian:
1642                 break;
1643             default:
1644                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1645         }
1646
1647         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1648         {
1649             cr->duty = DUTY_PME;
1650         }
1651         else
1652         {
1653             cr->duty = DUTY_PP;
1654         }
1655 #if GMX_MPI
1656         /* Split the sim communicator into PP and PME only nodes */
1657         MPI_Comm_split(cr->mpi_comm_mysim,
1658                        getThisRankDuties(cr),
1659                        cr->nodeid,
1660                        &cr->mpi_comm_mygroup);
1661         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1662 #endif
1663     }
1664
1665     GMX_LOG(mdlog.info).appendTextFormatted(
1666             "This rank does only %s work.\n",
1667             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1668 }
1669
1670 /*! \brief Generates the MPI communicators for domain decomposition */
1671 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1672                                   t_commrec *cr,
1673                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1674 {
1675     gmx_domdec_comm_t *comm;
1676     bool               CartReorder;
1677
1678     comm = dd->comm;
1679
1680     copy_ivec(dd->nc, comm->ntot);
1681
1682     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1683     comm->bCartesianPP_PME = FALSE;
1684
1685     /* Reorder the nodes by default. This might change the MPI ranks.
1686      * Real reordering is only supported on very few architectures,
1687      * Blue Gene is one of them.
1688      */
1689     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1690
1691     if (cr->npmenodes > 0)
1692     {
1693         /* Split the communicator into a PP and PME part */
1694         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1695         if (comm->bCartesianPP_PME)
1696         {
1697             /* We (possibly) reordered the nodes in split_communicator,
1698              * so it is no longer required in make_pp_communicator.
1699              */
1700             CartReorder = FALSE;
1701         }
1702     }
1703     else
1704     {
1705         /* All nodes do PP and PME */
1706 #if GMX_MPI
1707         /* We do not require separate communicators */
1708         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1709 #endif
1710     }
1711
1712     if (thisRankHasDuty(cr, DUTY_PP))
1713     {
1714         /* Copy or make a new PP communicator */
1715         make_pp_communicator(mdlog, dd, cr, CartReorder);
1716     }
1717     else
1718     {
1719         receive_ddindex2simnodeid(dd, cr);
1720     }
1721
1722     if (!thisRankHasDuty(cr, DUTY_PME))
1723     {
1724         /* Set up the commnuication to our PME node */
1725         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1726         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1727         if (debug)
1728         {
1729             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1730                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1731         }
1732     }
1733     else
1734     {
1735         dd->pme_nodeid = -1;
1736     }
1737
1738     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1739     if (MASTER(cr))
1740     {
1741         dd->ma = std::make_unique<AtomDistribution>(dd->nc,
1742                                                     comm->cgs_gl.nr,
1743                                                     comm->cgs_gl.index[comm->cgs_gl.nr]);
1744     }
1745 }
1746
1747 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1748                           const char *dir, int nc, const char *size_string)
1749 {
1750     real  *slb_frac, tot;
1751     int    i, n;
1752     double dbl;
1753
1754     slb_frac = nullptr;
1755     if (nc > 1 && size_string != nullptr)
1756     {
1757         GMX_LOG(mdlog.info).appendTextFormatted(
1758                 "Using static load balancing for the %s direction", dir);
1759         snew(slb_frac, nc);
1760         tot = 0;
1761         for (i = 0; i < nc; i++)
1762         {
1763             dbl = 0;
1764             sscanf(size_string, "%20lf%n", &dbl, &n);
1765             if (dbl == 0)
1766             {
1767                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1768             }
1769             slb_frac[i]  = dbl;
1770             size_string += n;
1771             tot         += slb_frac[i];
1772         }
1773         /* Normalize */
1774         std::string relativeCellSizes = "Relative cell sizes:";
1775         for (i = 0; i < nc; i++)
1776         {
1777             slb_frac[i]       /= tot;
1778             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1779         }
1780         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1781     }
1782
1783     return slb_frac;
1784 }
1785
1786 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1787 {
1788     int                  n     = 0;
1789     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1790     int                  nmol;
1791     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1792     {
1793         for (auto &ilist : extractILists(*ilists, IF_BOND))
1794         {
1795             if (NRAL(ilist.functionType) >  2)
1796             {
1797                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1798             }
1799         }
1800     }
1801
1802     return n;
1803 }
1804
1805 static int dd_getenv(const gmx::MDLogger &mdlog,
1806                      const char *env_var, int def)
1807 {
1808     char *val;
1809     int   nst;
1810
1811     nst = def;
1812     val = getenv(env_var);
1813     if (val)
1814     {
1815         if (sscanf(val, "%20d", &nst) <= 0)
1816         {
1817             nst = 1;
1818         }
1819         GMX_LOG(mdlog.info).appendTextFormatted(
1820                 "Found env.var. %s = %s, using value %d",
1821                 env_var, val, nst);
1822     }
1823
1824     return nst;
1825 }
1826
1827 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1828                                   const t_inputrec    *ir,
1829                                   const gmx::MDLogger &mdlog)
1830 {
1831     if (ir->ePBC == epbcSCREW &&
1832         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1833     {
1834         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1835     }
1836
1837     if (ir->ns_type == ensSIMPLE)
1838     {
1839         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1840     }
1841
1842     if (ir->nstlist == 0)
1843     {
1844         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1845     }
1846
1847     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1848     {
1849         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1850     }
1851 }
1852
1853 static real average_cellsize_min(const gmx_ddbox_t &ddbox,
1854                                  const ivec         numDomains)
1855 {
1856     real r = ddbox.box_size[XX];
1857     for (int d = 0; d < DIM; d++)
1858     {
1859         if (numDomains[d] > 1)
1860         {
1861             /* Check using the initial average cell size */
1862             r = std::min(r, ddbox.box_size[d]*ddbox.skew_fac[d]/numDomains[d]);
1863         }
1864     }
1865
1866     return r;
1867 }
1868
1869 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1870  */
1871 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1872                                   const std::string   &reasonStr,
1873                                   const gmx::MDLogger &mdlog)
1874 {
1875     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1876     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1877
1878     if (cmdlineDlbState == DlbState::onUser)
1879     {
1880         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1881     }
1882     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1883     {
1884         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1885     }
1886     return DlbState::offForever;
1887 }
1888
1889 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1890  *
1891  * This function parses the parameters of "-dlb" command line option setting
1892  * corresponding state values. Then it checks the consistency of the determined
1893  * state with other run parameters and settings. As a result, the initial state
1894  * may be altered or an error may be thrown if incompatibility of options is detected.
1895  *
1896  * \param [in] mdlog       Logger.
1897  * \param [in] dlbOption   Enum value for the DLB option.
1898  * \param [in] bRecordLoad True if the load balancer is recording load information.
1899  * \param [in] mdrunOptions  Options for mdrun.
1900  * \param [in] ir          Pointer mdrun to input parameters.
1901  * \returns                DLB initial/startup state.
1902  */
1903 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1904                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1905                                          const gmx::MdrunOptions &mdrunOptions,
1906                                          const t_inputrec *ir)
1907 {
1908     DlbState dlbState = DlbState::offCanTurnOn;
1909
1910     switch (dlbOption)
1911     {
1912         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1913         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1914         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1915         default: gmx_incons("Invalid dlbOption enum value");
1916     }
1917
1918     /* Reruns don't support DLB: bail or override auto mode */
1919     if (mdrunOptions.rerun)
1920     {
1921         std::string reasonStr = "it is not supported in reruns.";
1922         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1923     }
1924
1925     /* Unsupported integrators */
1926     if (!EI_DYNAMICS(ir->eI))
1927     {
1928         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1929         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1930     }
1931
1932     /* Without cycle counters we can't time work to balance on */
1933     if (!bRecordLoad)
1934     {
1935         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1936         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1937     }
1938
1939     if (mdrunOptions.reproducible)
1940     {
1941         std::string reasonStr = "you started a reproducible run.";
1942         switch (dlbState)
1943         {
1944             case DlbState::offUser:
1945                 break;
1946             case DlbState::offForever:
1947                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1948                 break;
1949             case DlbState::offCanTurnOn:
1950                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1951             case DlbState::onCanTurnOff:
1952                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1953                 break;
1954             case DlbState::onUser:
1955                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1956             default:
1957                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1958         }
1959     }
1960
1961     return dlbState;
1962 }
1963
1964 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1965 {
1966     dd->ndim = 0;
1967     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
1968     {
1969         /* Decomposition order z,y,x */
1970         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
1971         for (int dim = DIM-1; dim >= 0; dim--)
1972         {
1973             if (dd->nc[dim] > 1)
1974             {
1975                 dd->dim[dd->ndim++] = dim;
1976             }
1977         }
1978     }
1979     else
1980     {
1981         /* Decomposition order x,y,z */
1982         for (int dim = 0; dim < DIM; dim++)
1983         {
1984             if (dd->nc[dim] > 1)
1985             {
1986                 dd->dim[dd->ndim++] = dim;
1987             }
1988         }
1989     }
1990
1991     if (dd->ndim == 0)
1992     {
1993         /* Set dim[0] to avoid extra checks on ndim in several places */
1994         dd->dim[0] = XX;
1995     }
1996 }
1997
1998 static gmx_domdec_comm_t *init_dd_comm()
1999 {
2000     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2001
2002     comm->n_load_have      = 0;
2003     comm->n_load_collect   = 0;
2004
2005     comm->haveTurnedOffDlb = false;
2006
2007     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2008     {
2009         comm->sum_nat[i] = 0;
2010     }
2011     comm->ndecomp   = 0;
2012     comm->nload     = 0;
2013     comm->load_step = 0;
2014     comm->load_sum  = 0;
2015     comm->load_max  = 0;
2016     clear_ivec(comm->load_lim);
2017     comm->load_mdf  = 0;
2018     comm->load_pme  = 0;
2019
2020     /* This should be replaced by a unique pointer */
2021     comm->balanceRegion = ddBalanceRegionAllocate();
2022
2023     return comm;
2024 }
2025
2026 /* Returns whether mtop contains constraints and/or vsites */
2027 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2028 {
2029     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2030     int  nmol;
2031     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2032     {
2033         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2034         {
2035             return true;
2036         }
2037     }
2038
2039     return false;
2040 }
2041
2042 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2043                               const gmx_mtop_t    &mtop,
2044                               const t_inputrec    &inputrec,
2045                               const real           cutoffMargin,
2046                               DDSystemInfo        *systemInfo)
2047 {
2048     /* When we have constraints and/or vsites, it is beneficial to use
2049      * update groups (when possible) to allow independent update of groups.
2050      */
2051     if (!systemHasConstraintsOrVsites(mtop))
2052     {
2053         /* No constraints or vsites, atoms can be updated independently */
2054         return;
2055     }
2056
2057     systemInfo->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2058     systemInfo->useUpdateGroups               =
2059         (!systemInfo->updateGroupingPerMoleculetype.empty() &&
2060          getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2061
2062     if (systemInfo->useUpdateGroups)
2063     {
2064         int numUpdateGroups = 0;
2065         for (const auto &molblock : mtop.molblock)
2066         {
2067             numUpdateGroups += molblock.nmol*systemInfo->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2068         }
2069
2070         systemInfo->maxUpdateGroupRadius =
2071             computeMaxUpdateGroupRadius(mtop,
2072                                         systemInfo->updateGroupingPerMoleculetype,
2073                                         maxReferenceTemperature(inputrec));
2074
2075         /* To use update groups, the large domain-to-domain cutoff distance
2076          * should be compatible with the box size.
2077          */
2078         systemInfo->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*systemInfo, 0) < cutoffMargin);
2079
2080         if (systemInfo->useUpdateGroups)
2081         {
2082             GMX_LOG(mdlog.info).appendTextFormatted(
2083                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2084                     numUpdateGroups,
2085                     mtop.natoms/static_cast<double>(numUpdateGroups),
2086                     systemInfo->maxUpdateGroupRadius);
2087         }
2088         else
2089         {
2090             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2091             systemInfo->updateGroupingPerMoleculetype.clear();
2092         }
2093     }
2094 }
2095
2096 UnitCellInfo::UnitCellInfo(const t_inputrec &ir) :
2097     npbcdim(ePBC2npbcdim(ir.ePBC)),
2098     numBoundedDimensions(inputrec2nboundeddim(&ir)),
2099     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
2100     haveScrewPBC(ir.ePBC == epbcSCREW)
2101 {
2102 }
2103
2104 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2105 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2106                                    t_commrec *cr, gmx_domdec_t *dd,
2107                                    const DomdecOptions &options,
2108                                    const gmx::MdrunOptions &mdrunOptions,
2109                                    const gmx_mtop_t *mtop,
2110                                    const t_inputrec *ir,
2111                                    const matrix box,
2112                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2113                                    gmx_ddbox_t *ddbox)
2114 {
2115     real               r_bonded         = -1;
2116     real               r_bonded_limit   = -1;
2117     const real         tenPercentMargin = 1.1;
2118     gmx_domdec_comm_t *comm             = dd->comm;
2119
2120     /* Initialize to GPU share count to 0, might change later */
2121     comm->nrank_gpu_shared = 0;
2122
2123     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2124     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2125     /* To consider turning DLB on after 2*nstlist steps we need to check
2126      * at partitioning count 3. Thus we need to increase the first count by 2.
2127      */
2128     comm->ddPartioningCountFirstDlbOff += 2;
2129
2130     GMX_LOG(mdlog.info).appendTextFormatted(
2131             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2132
2133     comm->bPMELoadBalDLBLimits = FALSE;
2134
2135     /* Allocate the charge group/atom sorting struct */
2136     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2137
2138     /* Generate the simulation system information */
2139     DDSystemInfo &systemInfo = comm->systemInfo;
2140
2141     /* We need to decide on update groups early, as this affects communication distances */
2142     systemInfo.useUpdateGroups = false;
2143     if (ir->cutoff_scheme == ecutsVERLET)
2144     {
2145         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2146         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, &systemInfo);
2147
2148         if (systemInfo.useUpdateGroups)
2149         {
2150             /* Note: We would like to use dd->nnodes for the atom count estimate,
2151              *       but that is not yet available here. But this anyhow only
2152              *       affect performance up to the second dd_partition_system call.
2153              */
2154             const int homeAtomCountEstimate =  mtop->natoms/cr->nnodes;
2155             comm->updateGroupsCog =
2156                 std::make_unique<gmx::UpdateGroupsCog>(*mtop,
2157                                                        systemInfo.updateGroupingPerMoleculetype,
2158                                                        maxReferenceTemperature(*ir),
2159                                                        homeAtomCountEstimate);
2160         }
2161     }
2162
2163     // TODO: Check whether all bondeds are within update groups
2164     systemInfo.haveInterDomainBondeds          = (mtop->natoms > gmx_mtop_num_molecules(*mtop) ||
2165                                                   mtop->bIntermolecularInteractions);
2166     systemInfo.haveInterDomainMultiBodyBondeds = (multi_body_bondeds_count(mtop) > 0);
2167
2168     if (systemInfo.useUpdateGroups)
2169     {
2170         dd->splitConstraints = false;
2171         dd->splitSettles     = false;
2172     }
2173     else
2174     {
2175         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2176         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2177     }
2178
2179     if (ir->rlist == 0)
2180     {
2181         /* Set the cut-off to some very large value,
2182          * so we don't need if statements everywhere in the code.
2183          * We use sqrt, since the cut-off is squared in some places.
2184          */
2185         systemInfo.cutoff = GMX_CUTOFF_INF;
2186     }
2187     else
2188     {
2189         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir->rlist);
2190     }
2191     systemInfo.minCutoffForMultiBody = 0;
2192
2193     /* Determine the minimum cell size limit, affected by many factors */
2194     systemInfo.cellsizeLimit = 0;
2195     comm->bBondComm          = FALSE;
2196
2197     /* We do not allow home atoms to move beyond the neighboring domain
2198      * between domain decomposition steps, which limits the cell size.
2199      * Get an estimate of cell size limit due to atom displacement.
2200      * In most cases this is a large overestimate, because it assumes
2201      * non-interaction atoms.
2202      * We set the chance to 1 in a trillion steps.
2203      */
2204     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2205     const real     limitForAtomDisplacement          =
2206         minCellSizeForAtomDisplacement(*mtop, *ir,
2207                                        systemInfo.updateGroupingPerMoleculetype,
2208                                        c_chanceThatAtomMovesBeyondDomain);
2209     GMX_LOG(mdlog.info).appendTextFormatted(
2210             "Minimum cell size due to atom displacement: %.3f nm",
2211             limitForAtomDisplacement);
2212
2213     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit,
2214                                         limitForAtomDisplacement);
2215
2216     /* TODO: PME decomposition currently requires atoms not to be more than
2217      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2218      *       In nearly all cases, limitForAtomDisplacement will be smaller
2219      *       than 2/3*rlist, so the PME requirement is satisfied.
2220      *       But it would be better for both correctness and performance
2221      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2222      *       Note that we would need to improve the pairlist buffer case.
2223      */
2224
2225     if (systemInfo.haveInterDomainBondeds)
2226     {
2227         if (options.minimumCommunicationRange > 0)
2228         {
2229             systemInfo.minCutoffForMultiBody =
2230                 atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2231             if (options.useBondedCommunication)
2232             {
2233                 comm->bBondComm = (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2234             }
2235             else
2236             {
2237                 systemInfo.cutoff = std::max(systemInfo.cutoff,
2238                                              systemInfo.minCutoffForMultiBody);
2239             }
2240             r_bonded_limit = systemInfo.minCutoffForMultiBody;
2241         }
2242         else if (ir->bPeriodicMols)
2243         {
2244             /* Can not easily determine the required cut-off */
2245             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2246             systemInfo.minCutoffForMultiBody = systemInfo.cutoff/2;
2247             r_bonded_limit                   = systemInfo.minCutoffForMultiBody;
2248         }
2249         else
2250         {
2251             real r_2b, r_mb;
2252
2253             if (MASTER(cr))
2254             {
2255                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2256                                       options.checkBondedInteractions,
2257                                       &r_2b, &r_mb);
2258             }
2259             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2260             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2261
2262             /* We use an initial margin of 10% for the minimum cell size,
2263              * except when we are just below the non-bonded cut-off.
2264              */
2265             if (options.useBondedCommunication)
2266             {
2267                 if (std::max(r_2b, r_mb) > comm->systemInfo.cutoff)
2268                 {
2269                     r_bonded        = std::max(r_2b, r_mb);
2270                     r_bonded_limit  = tenPercentMargin*r_bonded;
2271                     comm->bBondComm = TRUE;
2272                 }
2273                 else
2274                 {
2275                     r_bonded       = r_mb;
2276                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, systemInfo.cutoff);
2277                 }
2278                 /* We determine cutoff_mbody later */
2279             }
2280             else
2281             {
2282                 /* No special bonded communication,
2283                  * simply increase the DD cut-off.
2284                  */
2285                 r_bonded_limit                   = tenPercentMargin*std::max(r_2b, r_mb);
2286                 systemInfo.minCutoffForMultiBody = r_bonded_limit;
2287                 systemInfo.cutoff                = std::max(systemInfo.cutoff,
2288                                                             systemInfo.minCutoffForMultiBody);
2289             }
2290         }
2291         GMX_LOG(mdlog.info).appendTextFormatted(
2292                 "Minimum cell size due to bonded interactions: %.3f nm",
2293                 r_bonded_limit);
2294
2295         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, r_bonded_limit);
2296     }
2297
2298     real rconstr = 0;
2299     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2300     {
2301         /* There is a cell size limit due to the constraints (P-LINCS) */
2302         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2303         GMX_LOG(mdlog.info).appendTextFormatted(
2304                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2305                 rconstr);
2306         if (rconstr > systemInfo.cellsizeLimit)
2307         {
2308             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2309         }
2310     }
2311     else if (options.constraintCommunicationRange > 0)
2312     {
2313         /* Here we do not check for dd->splitConstraints.
2314          * because one can also set a cell size limit for virtual sites only
2315          * and at this point we don't know yet if there are intercg v-sites.
2316          */
2317         GMX_LOG(mdlog.info).appendTextFormatted(
2318                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2319                 options.constraintCommunicationRange);
2320         rconstr = options.constraintCommunicationRange;
2321     }
2322     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, rconstr);
2323
2324     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2325
2326     DDSetup ddSetup;
2327     if (options.numCells[XX] > 0)
2328     {
2329         copy_ivec(options.numCells, ddSetup.numDomains);
2330         set_ddbox_cr(*cr, &ddSetup.numDomains, *ir, box, xGlobal, ddbox);
2331
2332         if (options.numPmeRanks >= 0)
2333         {
2334             ddSetup.numPmeRanks = options.numPmeRanks;
2335         }
2336         else
2337         {
2338             /* When the DD grid is set explicitly and -npme is set to auto,
2339              * don't use PME ranks. We check later if the DD grid is
2340              * compatible with the total number of ranks.
2341              */
2342             ddSetup.numPmeRanks = 0;
2343         }
2344     }
2345     else
2346     {
2347         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2348
2349         /* We need to choose the optimal DD grid and possibly PME nodes */
2350         ddSetup =
2351             dd_choose_grid(mdlog, cr, ir, mtop, box, ddbox,
2352                            options.numPmeRanks,
2353                            !isDlbDisabled(comm),
2354                            options.dlbScaling,
2355                            systemInfo);
2356
2357         if (ddSetup.numDomains[XX] == 0)
2358         {
2359             char     buf[STRLEN];
2360             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2361             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2362                     !bC ? "-rdd" : "-rcon",
2363                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2364                     bC ? " or your LINCS settings" : "");
2365
2366             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2367                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2368                                  "%s\n"
2369                                  "Look in the log file for details on the domain decomposition",
2370                                  cr->nnodes - ddSetup.numPmeRanks,
2371                                  ddSetup.cellsizeLimit,
2372                                  buf);
2373         }
2374     }
2375
2376     const real acs = average_cellsize_min(*ddbox, ddSetup.numDomains);
2377     if (acs < systemInfo.cellsizeLimit)
2378     {
2379         if (options.numCells[XX] <= 0)
2380         {
2381             GMX_RELEASE_ASSERT(false, "dd_choose_grid() should return a grid that satisfies the cell size limits");
2382         }
2383         else
2384         {
2385             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2386                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2387                                  acs, systemInfo.cellsizeLimit);
2388         }
2389     }
2390
2391     /* Set the DD setup given by ddSetup */
2392     cr->npmenodes = ddSetup.numPmeRanks;
2393     copy_ivec(ddSetup.numDomains, dd->nc);
2394     set_dd_dim(mdlog, dd);
2395
2396     GMX_LOG(mdlog.info).appendTextFormatted(
2397             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2398             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2399
2400     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2401     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2402     {
2403         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2404                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2405                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2406     }
2407     if (cr->npmenodes > dd->nnodes)
2408     {
2409         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2410                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2411     }
2412     if (cr->npmenodes > 0)
2413     {
2414         comm->npmenodes = cr->npmenodes;
2415     }
2416     else
2417     {
2418         comm->npmenodes = dd->nnodes;
2419     }
2420
2421     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2422     {
2423         /* The following choices should match those
2424          * in comm_cost_est in domdec_setup.c.
2425          * Note that here the checks have to take into account
2426          * that the decomposition might occur in a different order than xyz
2427          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2428          * in which case they will not match those in comm_cost_est,
2429          * but since that is mainly for testing purposes that's fine.
2430          */
2431         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2432             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2433             getenv("GMX_PMEONEDD") == nullptr)
2434         {
2435             comm->npmedecompdim = 2;
2436             comm->npmenodes_x   = dd->nc[XX];
2437             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2438         }
2439         else
2440         {
2441             /* In case nc is 1 in both x and y we could still choose to
2442              * decompose pme in y instead of x, but we use x for simplicity.
2443              */
2444             comm->npmedecompdim = 1;
2445             if (dd->dim[0] == YY)
2446             {
2447                 comm->npmenodes_x = 1;
2448                 comm->npmenodes_y = comm->npmenodes;
2449             }
2450             else
2451             {
2452                 comm->npmenodes_x = comm->npmenodes;
2453                 comm->npmenodes_y = 1;
2454             }
2455         }
2456         GMX_LOG(mdlog.info).appendTextFormatted(
2457                 "PME domain decomposition: %d x %d x %d",
2458                 comm->npmenodes_x, comm->npmenodes_y, 1);
2459     }
2460     else
2461     {
2462         comm->npmedecompdim = 0;
2463         comm->npmenodes_x   = 0;
2464         comm->npmenodes_y   = 0;
2465     }
2466
2467     snew(comm->slb_frac, DIM);
2468     if (isDlbDisabled(comm))
2469     {
2470         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2471         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2472         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2473     }
2474
2475     /* Set the multi-body cut-off and cellsize limit for DLB */
2476     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2477     comm->cellsize_limit = systemInfo.cellsizeLimit;
2478     if (systemInfo.haveInterDomainBondeds && comm->cutoff_mbody == 0)
2479     {
2480         if (comm->bBondComm || !isDlbDisabled(comm))
2481         {
2482             /* Set the bonded communication distance to halfway
2483              * the minimum and the maximum,
2484              * since the extra communication cost is nearly zero.
2485              */
2486             real acs           = average_cellsize_min(*ddbox, dd->nc);
2487             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2488             if (!isDlbDisabled(comm))
2489             {
2490                 /* Check if this does not limit the scaling */
2491                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2492                                               options.dlbScaling*acs);
2493             }
2494             if (!comm->bBondComm)
2495             {
2496                 /* Without bBondComm do not go beyond the n.b. cut-off */
2497                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2498                 if (comm->cellsize_limit >= systemInfo.cutoff)
2499                 {
2500                     /* We don't loose a lot of efficieny
2501                      * when increasing it to the n.b. cut-off.
2502                      * It can even be slightly faster, because we need
2503                      * less checks for the communication setup.
2504                      */
2505                     comm->cutoff_mbody = systemInfo.cutoff;
2506                 }
2507             }
2508             /* Check if we did not end up below our original limit */
2509             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2510
2511             if (comm->cutoff_mbody > comm->cellsize_limit)
2512             {
2513                 comm->cellsize_limit = comm->cutoff_mbody;
2514             }
2515         }
2516         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2517     }
2518
2519     if (debug)
2520     {
2521         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2522                 "cellsize limit %f\n",
2523                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2524     }
2525
2526     if (MASTER(cr))
2527     {
2528         check_dd_restrictions(dd, ir, mdlog);
2529     }
2530 }
2531
2532 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2533 {
2534     int   ncg, cg;
2535     char *bLocalCG;
2536
2537     ncg = ncg_mtop(mtop);
2538     snew(bLocalCG, ncg);
2539     for (cg = 0; cg < ncg; cg++)
2540     {
2541         bLocalCG[cg] = FALSE;
2542     }
2543
2544     return bLocalCG;
2545 }
2546
2547 void dd_init_bondeds(FILE *fplog,
2548                      gmx_domdec_t *dd,
2549                      const gmx_mtop_t *mtop,
2550                      const gmx_vsite_t *vsite,
2551                      const t_inputrec *ir,
2552                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2553 {
2554     gmx_domdec_comm_t *comm;
2555
2556     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2557
2558     comm = dd->comm;
2559
2560     if (comm->bBondComm)
2561     {
2562         /* Communicate atoms beyond the cut-off for bonded interactions */
2563         comm = dd->comm;
2564
2565         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2566
2567         comm->bLocalCG = init_bLocalCG(mtop);
2568     }
2569     else
2570     {
2571         /* Only communicate atoms based on cut-off */
2572         comm->cglink   = nullptr;
2573         comm->bLocalCG = nullptr;
2574     }
2575 }
2576
2577 static void writeSettings(gmx::TextWriter       *log,
2578                           gmx_domdec_t          *dd,
2579                           const gmx_mtop_t      *mtop,
2580                           const t_inputrec      *ir,
2581                           gmx_bool               bDynLoadBal,
2582                           real                   dlb_scale,
2583                           const gmx_ddbox_t     *ddbox)
2584 {
2585     gmx_domdec_comm_t *comm;
2586     int                d;
2587     ivec               np;
2588     real               limit, shrink;
2589
2590     comm = dd->comm;
2591
2592     if (bDynLoadBal)
2593     {
2594         log->writeString("The maximum number of communication pulses is:");
2595         for (d = 0; d < dd->ndim; d++)
2596         {
2597             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2598         }
2599         log->ensureLineBreak();
2600         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2601         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2602         log->writeString("The allowed shrink of domain decomposition cells is:");
2603         for (d = 0; d < DIM; d++)
2604         {
2605             if (dd->nc[d] > 1)
2606             {
2607                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2608                 {
2609                     shrink = 0;
2610                 }
2611                 else
2612                 {
2613                     shrink =
2614                         comm->cellsize_min_dlb[d]/
2615                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2616                 }
2617                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2618             }
2619         }
2620         log->ensureLineBreak();
2621     }
2622     else
2623     {
2624         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2625         log->writeString("The initial number of communication pulses is:");
2626         for (d = 0; d < dd->ndim; d++)
2627         {
2628             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2629         }
2630         log->ensureLineBreak();
2631         log->writeString("The initial domain decomposition cell size is:");
2632         for (d = 0; d < DIM; d++)
2633         {
2634             if (dd->nc[d] > 1)
2635             {
2636                 log->writeStringFormatted(" %c %.2f nm",
2637                                           dim2char(d), dd->comm->cellsize_min[d]);
2638             }
2639         }
2640         log->ensureLineBreak();
2641         log->writeLine();
2642     }
2643
2644     const bool haveInterDomainVsites =
2645         (countInterUpdategroupVsites(*mtop, comm->systemInfo.updateGroupingPerMoleculetype) != 0);
2646
2647     if (comm->systemInfo.haveInterDomainBondeds ||
2648         haveInterDomainVsites ||
2649         dd->splitConstraints || dd->splitSettles)
2650     {
2651         std::string decompUnits;
2652         if (comm->systemInfo.useUpdateGroups)
2653         {
2654             decompUnits = "atom groups";
2655         }
2656         else
2657         {
2658             decompUnits = "atoms";
2659         }
2660
2661         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2662         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2663
2664         if (bDynLoadBal)
2665         {
2666             limit = dd->comm->cellsize_limit;
2667         }
2668         else
2669         {
2670             if (dd->unitCellInfo.ddBoxIsDynamic)
2671             {
2672                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2673             }
2674             limit = dd->comm->cellsize_min[XX];
2675             for (d = 1; d < DIM; d++)
2676             {
2677                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2678             }
2679         }
2680
2681         if (comm->systemInfo.haveInterDomainBondeds)
2682         {
2683             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2684                                     "two-body bonded interactions", "(-rdd)",
2685                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2686             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2687                                     "multi-body bonded interactions", "(-rdd)",
2688                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->systemInfo.cutoff, limit));
2689         }
2690         if (haveInterDomainVsites)
2691         {
2692             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2693                                     "virtual site constructions", "(-rcon)", limit);
2694         }
2695         if (dd->splitConstraints || dd->splitSettles)
2696         {
2697             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2698                                                        1+ir->nProjOrder);
2699             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2700                                     separation.c_str(), "(-rcon)", limit);
2701         }
2702         log->ensureLineBreak();
2703     }
2704 }
2705
2706 static void logSettings(const gmx::MDLogger &mdlog,
2707                         gmx_domdec_t        *dd,
2708                         const gmx_mtop_t    *mtop,
2709                         const t_inputrec    *ir,
2710                         real                 dlb_scale,
2711                         const gmx_ddbox_t   *ddbox)
2712 {
2713     gmx::StringOutputStream stream;
2714     gmx::TextWriter         log(&stream);
2715     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2716     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2717     {
2718         {
2719             log.ensureEmptyLine();
2720             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2721         }
2722         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2723     }
2724     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2725 }
2726
2727 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2728                                 gmx_domdec_t        *dd,
2729                                 real                 dlb_scale,
2730                                 const t_inputrec    *ir,
2731                                 const gmx_ddbox_t   *ddbox)
2732 {
2733     gmx_domdec_comm_t *comm;
2734     int                d, dim, npulse, npulse_d_max, npulse_d;
2735     gmx_bool           bNoCutOff;
2736
2737     comm = dd->comm;
2738
2739     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2740
2741     /* Determine the maximum number of comm. pulses in one dimension */
2742
2743     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2744
2745     /* Determine the maximum required number of grid pulses */
2746     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2747     {
2748         /* Only a single pulse is required */
2749         npulse = 1;
2750     }
2751     else if (!bNoCutOff && comm->cellsize_limit > 0)
2752     {
2753         /* We round down slightly here to avoid overhead due to the latency
2754          * of extra communication calls when the cut-off
2755          * would be only slightly longer than the cell size.
2756          * Later cellsize_limit is redetermined,
2757          * so we can not miss interactions due to this rounding.
2758          */
2759         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff/comm->cellsize_limit);
2760     }
2761     else
2762     {
2763         /* There is no cell size limit */
2764         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2765     }
2766
2767     if (!bNoCutOff && npulse > 1)
2768     {
2769         /* See if we can do with less pulses, based on dlb_scale */
2770         npulse_d_max = 0;
2771         for (d = 0; d < dd->ndim; d++)
2772         {
2773             dim      = dd->dim[d];
2774             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->systemInfo.cutoff
2775                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2776             npulse_d_max = std::max(npulse_d_max, npulse_d);
2777         }
2778         npulse = std::min(npulse, npulse_d_max);
2779     }
2780
2781     /* This env var can override npulse */
2782     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2783     if (d > 0)
2784     {
2785         npulse = d;
2786     }
2787
2788     comm->maxpulse       = 1;
2789     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2790     for (d = 0; d < dd->ndim; d++)
2791     {
2792         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2793         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2794         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2795         {
2796             comm->bVacDLBNoLimit = FALSE;
2797         }
2798     }
2799
2800     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2801     if (!comm->bVacDLBNoLimit)
2802     {
2803         comm->cellsize_limit = std::max(comm->cellsize_limit,
2804                                         comm->systemInfo.cutoff/comm->maxpulse);
2805     }
2806     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2807     /* Set the minimum cell size for each DD dimension */
2808     for (d = 0; d < dd->ndim; d++)
2809     {
2810         if (comm->bVacDLBNoLimit ||
2811             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->systemInfo.cutoff)
2812         {
2813             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2814         }
2815         else
2816         {
2817             comm->cellsize_min_dlb[dd->dim[d]] =
2818                 comm->systemInfo.cutoff/comm->cd[d].np_dlb;
2819         }
2820     }
2821     if (comm->cutoff_mbody <= 0)
2822     {
2823         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2824     }
2825     if (isDlbOn(comm))
2826     {
2827         set_dlb_limits(dd);
2828     }
2829 }
2830
2831 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2832 {
2833     /* If each molecule is a single charge group
2834      * or we use domain decomposition for each periodic dimension,
2835      * we do not need to take pbc into account for the bonded interactions.
2836      */
2837     return (ePBC != epbcNONE && dd->comm->systemInfo.haveInterDomainBondeds &&
2838             !(dd->nc[XX] > 1 &&
2839               dd->nc[YY] > 1 &&
2840               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2841 }
2842
2843 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2844 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2845                                   gmx_domdec_t *dd, real dlb_scale,
2846                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2847                                   const gmx_ddbox_t *ddbox)
2848 {
2849     gmx_domdec_comm_t *comm;
2850     int                natoms_tot;
2851     real               vol_frac;
2852
2853     comm = dd->comm;
2854
2855     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2856     {
2857         init_ddpme(dd, &comm->ddpme[0], 0);
2858         if (comm->npmedecompdim >= 2)
2859         {
2860             init_ddpme(dd, &comm->ddpme[1], 1);
2861         }
2862     }
2863     else
2864     {
2865         comm->npmenodes = 0;
2866         if (dd->pme_nodeid >= 0)
2867         {
2868             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2869                                  "Can not have separate PME ranks without PME electrostatics");
2870         }
2871     }
2872
2873     if (debug)
2874     {
2875         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2876     }
2877     if (!isDlbDisabled(comm))
2878     {
2879         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2880     }
2881
2882     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2883
2884     if (ir->ePBC == epbcNONE)
2885     {
2886         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2887     }
2888     else
2889     {
2890         vol_frac =
2891             (1 + comm_box_frac(dd->nc, comm->systemInfo.cutoff, ddbox))/static_cast<double>(dd->nnodes);
2892     }
2893     if (debug)
2894     {
2895         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2896     }
2897     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2898
2899     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2900                                  static_cast<int>(vol_frac*natoms_tot));
2901 }
2902
2903 /*! \brief Set some important DD parameters that can be modified by env.vars */
2904 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2905                                   gmx_domdec_t *dd, int rank_mysim)
2906 {
2907     gmx_domdec_comm_t *comm = dd->comm;
2908
2909     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2910     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2911     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2912     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2913     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2914     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2915     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2916
2917     if (dd->bSendRecv2)
2918     {
2919         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2920     }
2921
2922     if (comm->eFlop)
2923     {
2924         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2925         if (comm->eFlop > 1)
2926         {
2927             srand(1 + rank_mysim);
2928         }
2929         comm->bRecordLoad = TRUE;
2930     }
2931     else
2932     {
2933         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2934     }
2935 }
2936
2937 gmx_domdec_t::gmx_domdec_t(const t_inputrec &ir) :
2938     unitCellInfo(ir)
2939 {
2940 }
2941
2942 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2943                                         t_commrec                     *cr,
2944                                         const DomdecOptions           &options,
2945                                         const gmx::MdrunOptions       &mdrunOptions,
2946                                         const gmx_mtop_t              *mtop,
2947                                         const t_inputrec              *ir,
2948                                         const matrix                   box,
2949                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2950                                         gmx::LocalAtomSetManager      *atomSets)
2951 {
2952     gmx_domdec_t      *dd;
2953
2954     GMX_LOG(mdlog.info).appendTextFormatted(
2955             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2956
2957     dd = new gmx_domdec_t(*ir);
2958
2959     dd->comm = init_dd_comm();
2960
2961     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2962
2963     gmx_ddbox_t ddbox = {0};
2964     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2965                            mtop, ir,
2966                            box, xGlobal,
2967                            &ddbox);
2968
2969     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2970
2971     if (thisRankHasDuty(cr, DUTY_PP))
2972     {
2973         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2974
2975         setup_neighbor_relations(dd);
2976     }
2977
2978     /* Set overallocation to avoid frequent reallocation of arrays */
2979     set_over_alloc_dd(TRUE);
2980
2981     dd->atomSets = atomSets;
2982
2983     return dd;
2984 }
2985
2986 static gmx_bool test_dd_cutoff(t_commrec     *cr,
2987                                const t_state &state,
2988                                real           cutoffRequested)
2989 {
2990     gmx_domdec_t *dd;
2991     gmx_ddbox_t   ddbox;
2992     int           d, dim, np;
2993     real          inv_cell_size;
2994     int           LocallyLimited;
2995
2996     dd = cr->dd;
2997
2998     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
2999
3000     LocallyLimited = 0;
3001
3002     for (d = 0; d < dd->ndim; d++)
3003     {
3004         dim = dd->dim[d];
3005
3006         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3007         if (dd->unitCellInfo.ddBoxIsDynamic)
3008         {
3009             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3010         }
3011
3012         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3013
3014         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3015         {
3016             if (np > dd->comm->cd[d].np_dlb)
3017             {
3018                 return FALSE;
3019             }
3020
3021             /* If a current local cell size is smaller than the requested
3022              * cut-off, we could still fix it, but this gets very complicated.
3023              * Without fixing here, we might actually need more checks.
3024              */
3025             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3026             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3027             {
3028                 LocallyLimited = 1;
3029             }
3030         }
3031     }
3032
3033     if (!isDlbDisabled(dd->comm))
3034     {
3035         /* If DLB is not active yet, we don't need to check the grid jumps.
3036          * Actually we shouldn't, because then the grid jump data is not set.
3037          */
3038         if (isDlbOn(dd->comm) &&
3039             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3040         {
3041             LocallyLimited = 1;
3042         }
3043
3044         gmx_sumi(1, &LocallyLimited, cr);
3045
3046         if (LocallyLimited > 0)
3047         {
3048             return FALSE;
3049         }
3050     }
3051
3052     return TRUE;
3053 }
3054
3055 gmx_bool change_dd_cutoff(t_commrec     *cr,
3056                           const t_state &state,
3057                           real           cutoffRequested)
3058 {
3059     gmx_bool bCutoffAllowed;
3060
3061     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3062
3063     if (bCutoffAllowed)
3064     {
3065         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3066     }
3067
3068     return bCutoffAllowed;
3069 }