src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cinttypes>
  44 #include <climits>
  45 #include <cmath>
  46 #include <cstdio>
  47 #include <cstdlib>
  48 #include <cstring>
  49
  50 #include <algorithm>
  51
  52 #include "gromacs/compat/make_unique.h"
  53 #include "gromacs/domdec/collect.h"
  54 #include "gromacs/domdec/dlb.h"
  55 #include "gromacs/domdec/dlbtiming.h"
  56 #include "gromacs/domdec/domdec_network.h"
  57 #include "gromacs/domdec/ga2la.h"
  58 #include "gromacs/domdec/partition.h"
  59 #include "gromacs/gmxlib/network.h"
  60 #include "gromacs/gmxlib/nrnb.h"
  61 #include "gromacs/gpu_utils/gpu_utils.h"
  62 #include "gromacs/hardware/hw_info.h"
  63 #include "gromacs/listed-forces/manage-threading.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdlib/calc_verletbuf.h"
  67 #include "gromacs/mdlib/constr.h"
  68 #include "gromacs/mdlib/constraintrange.h"
  69 #include "gromacs/mdlib/mdrun.h"
  70 #include "gromacs/mdlib/updategroups.h"
  71 #include "gromacs/mdlib/vsite.h"
  72 #include "gromacs/mdtypes/commrec.h"
  73 #include "gromacs/mdtypes/inputrec.h"
  74 #include "gromacs/mdtypes/state.h"
  75 #include "gromacs/pbcutil/ishift.h"
  76 #include "gromacs/pbcutil/pbc.h"
  77 #include "gromacs/pulling/pull.h"
  78 #include "gromacs/timing/wallcycle.h"
  79 #include "gromacs/topology/block.h"
  80 #include "gromacs/topology/idef.h"
  81 #include "gromacs/topology/ifunc.h"
  82 #include "gromacs/topology/mtop_lookup.h"
  83 #include "gromacs/topology/mtop_util.h"
  84 #include "gromacs/topology/topology.h"
  85 #include "gromacs/utility/basedefinitions.h"
  86 #include "gromacs/utility/basenetwork.h"
  87 #include "gromacs/utility/cstringutil.h"
  88 #include "gromacs/utility/exceptions.h"
  89 #include "gromacs/utility/fatalerror.h"
  90 #include "gromacs/utility/gmxmpi.h"
  91 #include "gromacs/utility/logger.h"
  92 #include "gromacs/utility/real.h"
  93 #include "gromacs/utility/smalloc.h"
  94 #include "gromacs/utility/strconvert.h"
  95 #include "gromacs/utility/stringstream.h"
  96 #include "gromacs/utility/stringutil.h"
  97 #include "gromacs/utility/textwriter.h"
  98
  99 #include "atomdistribution.h"
 100 #include "box.h"
 101 #include "cellsizes.h"
 102 #include "distribute.h"
 103 #include "domdec_constraints.h"
 104 #include "domdec_internal.h"
 105 #include "domdec_vsite.h"
 106 #include "redistribute.h"
 107 #include "utility.h"
 108
 109 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
 110
 111 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 112 #define DD_CGIBS 2
 113
 114 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 115 #define DD_FLAG_NRCG  65535
 116 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 117 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 118
 119 /* The DD zone order */
 120 static const ivec dd_zo[DD_MAXZONE] =
 121 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 122
 123 /* The non-bonded zone-pair setup for domain decomposition
 124  * The first number is the i-zone, the second number the first j-zone seen by
 125  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 126  * As is, this is for 3D decomposition, where there are 4 i-zones.
 127  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 128  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 129  */
 130 static const int
 131     ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
 132                                                  {1, 3, 6},
 133                                                  {2, 5, 6},
 134                                                  {3, 5, 7}};
 135
 136
 137 /*
 138    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 139
 140    static void index2xyz(ivec nc,int ind,ivec xyz)
 141    {
 142    xyz[XX] = ind % nc[XX];
 143    xyz[YY] = (ind / nc[XX]) % nc[YY];
 144    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 145    }
 146  */
 147
 148 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 149 {
 150     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 151     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 152     xyz[ZZ] = ind % nc[ZZ];
 153 }
 154
 155 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 156 {
 157     int ddindex;
 158     int ddnodeid = -1;
 159
 160     ddindex = dd_index(dd->nc, c);
 161     if (dd->comm->bCartesianPP_PME)
 162     {
 163         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 164     }
 165     else if (dd->comm->bCartesianPP)
 166     {
 167 #if GMX_MPI
 168         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 169 #endif
 170     }
 171     else
 172     {
 173         ddnodeid = ddindex;
 174     }
 175
 176     return ddnodeid;
 177 }
 178
 179 int ddglatnr(const gmx_domdec_t *dd, int i)
 180 {
 181     int atnr;
 182
 183     if (dd == nullptr)
 184     {
 185         atnr = i + 1;
 186     }
 187     else
 188     {
 189         if (i >= dd->comm->atomRanges.numAtomsTotal())
 190         {
 191             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
 192         }
 193         atnr = dd->globalAtomIndices[i] + 1;
 194     }
 195
 196     return atnr;
 197 }
 198
 199 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 200 {
 201     return &dd->comm->cgs_gl;
 202 }
 203
 204 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 205 {
 206     int i;
 207
 208     if (state->ddp_count != dd->ddp_count)
 209     {
 210         gmx_incons("The MD state does not match the domain decomposition state");
 211     }
 212
 213     state->cg_gl.resize(dd->ncg_home);
 214     for (i = 0; i < dd->ncg_home; i++)
 215     {
 216         state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 217     }
 218
 219     state->ddp_count_cg_gl = dd->ddp_count;
 220 }
 221
 222 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 223 {
 224     return &dd->comm->zones;
 225 }
 226
 227 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
 228                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 229 {
 230     gmx_domdec_zones_t *zones;
 231     int                 izone, d, dim;
 232
 233     zones = &dd->comm->zones;
 234
 235     izone = 0;
 236     while (icg >= zones->izone[izone].cg1)
 237     {
 238         izone++;
 239     }
 240
 241     if (izone == 0)
 242     {
 243         *jcg0 = icg;
 244     }
 245     else if (izone < zones->nizone)
 246     {
 247         *jcg0 = zones->izone[izone].jcg0;
 248     }
 249     else
 250     {
 251         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 252                   icg, izone, zones->nizone);
 253     }
 254
 255     *jcg1 = zones->izone[izone].jcg1;
 256
 257     for (d = 0; d < dd->ndim; d++)
 258     {
 259         dim         = dd->dim[d];
 260         shift0[dim] = zones->izone[izone].shift0[dim];
 261         shift1[dim] = zones->izone[izone].shift1[dim];
 262         if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
 263         {
 264             /* A conservative approach, this can be optimized */
 265             shift0[dim] -= 1;
 266             shift1[dim] += 1;
 267         }
 268     }
 269 }
 270
 271 int dd_numHomeAtoms(const gmx_domdec_t &dd)
 272 {
 273     return dd.comm->atomRanges.numHomeAtoms();
 274 }
 275
 276 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 277 {
 278     /* We currently set mdatoms entries for all atoms:
 279      * local + non-local + communicated for vsite + constraints
 280      */
 281
 282     return dd->comm->atomRanges.numAtomsTotal();
 283 }
 284
 285 int dd_natoms_vsite(const gmx_domdec_t *dd)
 286 {
 287     return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 288 }
 289
 290 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 291 {
 292     *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 293     *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 294 }
 295
 296 void dd_move_x(gmx_domdec_t             *dd,
 297                matrix                    box,
 298                gmx::ArrayRef<gmx::RVec>  x,
 299                gmx_wallcycle            *wcycle)
 300 {
 301     wallcycle_start(wcycle, ewcMOVEX);
 302
 303     int                    nzone, nat_tot;
 304     gmx_domdec_comm_t     *comm;
 305     gmx_domdec_comm_dim_t *cd;
 306     rvec                   shift = {0, 0, 0};
 307     gmx_bool               bPBC, bScrew;
 308
 309     comm = dd->comm;
 310
 311     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 312
 313     nzone   = 1;
 314     nat_tot = comm->atomRanges.numHomeAtoms();
 315     for (int d = 0; d < dd->ndim; d++)
 316     {
 317         bPBC   = (dd->ci[dd->dim[d]] == 0);
 318         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 319         if (bPBC)
 320         {
 321             copy_rvec(box[dd->dim[d]], shift);
 322         }
 323         cd = &comm->cd[d];
 324         for (const gmx_domdec_ind_t &ind : cd->ind)
 325         {
 326             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 327             gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
 328             int                        n          = 0;
 329             if (!bPBC)
 330             {
 331                 for (int g : ind.index)
 332                 {
 333                     for (int j : atomGrouping.block(g))
 334                     {
 335                         sendBuffer[n] = x[j];
 336                         n++;
 337                     }
 338                 }
 339             }
 340             else if (!bScrew)
 341             {
 342                 for (int g : ind.index)
 343                 {
 344                     for (int j : atomGrouping.block(g))
 345                     {
 346                         /* We need to shift the coordinates */
 347                         for (int d = 0; d < DIM; d++)
 348                         {
 349                             sendBuffer[n][d] = x[j][d] + shift[d];
 350                         }
 351                         n++;
 352                     }
 353                 }
 354             }
 355             else
 356             {
 357                 for (int g : ind.index)
 358                 {
 359                     for (int j : atomGrouping.block(g))
 360                     {
 361                         /* Shift x */
 362                         sendBuffer[n][XX] = x[j][XX] + shift[XX];
 363                         /* Rotate y and z.
 364                          * This operation requires a special shift force
 365                          * treatment, which is performed in calc_vir.
 366                          */
 367                         sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 368                         sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 369                         n++;
 370                     }
 371                 }
 372             }
 373
 374             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 375
 376             gmx::ArrayRef<gmx::RVec>   receiveBuffer;
 377             if (cd->receiveInPlace)
 378             {
 379                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 380             }
 381             else
 382             {
 383                 receiveBuffer = receiveBufferAccess.buffer;
 384             }
 385             /* Send and receive the coordinates */
 386             ddSendrecv(dd, d, dddirBackward,
 387                        sendBuffer, receiveBuffer);
 388
 389             if (!cd->receiveInPlace)
 390             {
 391                 int j = 0;
 392                 for (int zone = 0; zone < nzone; zone++)
 393                 {
 394                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 395                     {
 396                         x[i] = receiveBuffer[j++];
 397                     }
 398                 }
 399             }
 400             nat_tot += ind.nrecv[nzone+1];
 401         }
 402         nzone += nzone;
 403     }
 404
 405     wallcycle_stop(wcycle, ewcMOVEX);
 406 }
 407
 408 void dd_move_f(gmx_domdec_t             *dd,
 409                gmx::ArrayRef<gmx::RVec>  f,
 410                rvec                     *fshift,
 411                gmx_wallcycle            *wcycle)
 412 {
 413     wallcycle_start(wcycle, ewcMOVEF);
 414
 415     int                    nzone, nat_tot;
 416     gmx_domdec_comm_t     *comm;
 417     gmx_domdec_comm_dim_t *cd;
 418     ivec                   vis;
 419     int                    is;
 420     gmx_bool               bShiftForcesNeedPbc, bScrew;
 421
 422     comm = dd->comm;
 423
 424     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 425
 426     nzone   = comm->zones.n/2;
 427     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 428     for (int d = dd->ndim-1; d >= 0; d--)
 429     {
 430         /* Only forces in domains near the PBC boundaries need to
 431            consider PBC in the treatment of fshift */
 432         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 433         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 434         if (fshift == nullptr && !bScrew)
 435         {
 436             bShiftForcesNeedPbc = FALSE;
 437         }
 438         /* Determine which shift vector we need */
 439         clear_ivec(vis);
 440         vis[dd->dim[d]] = 1;
 441         is              = IVEC2IS(vis);
 442
 443         cd = &comm->cd[d];
 444         for (int p = cd->numPulses() - 1; p >= 0; p--)
 445         {
 446             const gmx_domdec_ind_t    &ind  = cd->ind[p];
 447             DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 448             gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
 449
 450             nat_tot                        -= ind.nrecv[nzone+1];
 451
 452             DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 453
 454             gmx::ArrayRef<gmx::RVec>   sendBuffer;
 455             if (cd->receiveInPlace)
 456             {
 457                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 458             }
 459             else
 460             {
 461                 sendBuffer = sendBufferAccess.buffer;
 462                 int j = 0;
 463                 for (int zone = 0; zone < nzone; zone++)
 464                 {
 465                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 466                     {
 467                         sendBuffer[j++] = f[i];
 468                     }
 469                 }
 470             }
 471             /* Communicate the forces */
 472             ddSendrecv(dd, d, dddirForward,
 473                        sendBuffer, receiveBuffer);
 474             /* Add the received forces */
 475             int n = 0;
 476             if (!bShiftForcesNeedPbc)
 477             {
 478                 for (int g : ind.index)
 479                 {
 480                     for (int j : atomGrouping.block(g))
 481                     {
 482                         for (int d = 0; d < DIM; d++)
 483                         {
 484                             f[j][d] += receiveBuffer[n][d];
 485                         }
 486                         n++;
 487                     }
 488                 }
 489             }
 490             else if (!bScrew)
 491             {
 492                 /* fshift should always be defined if this function is
 493                  * called when bShiftForcesNeedPbc is true */
 494                 assert(nullptr != fshift);
 495                 for (int g : ind.index)
 496                 {
 497                     for (int j : atomGrouping.block(g))
 498                     {
 499                         for (int d = 0; d < DIM; d++)
 500                         {
 501                             f[j][d] += receiveBuffer[n][d];
 502                         }
 503                         /* Add this force to the shift force */
 504                         for (int d = 0; d < DIM; d++)
 505                         {
 506                             fshift[is][d] += receiveBuffer[n][d];
 507                         }
 508                         n++;
 509                     }
 510                 }
 511             }
 512             else
 513             {
 514                 for (int g : ind.index)
 515                 {
 516                     for (int j : atomGrouping.block(g))
 517                     {
 518                         /* Rotate the force */
 519                         f[j][XX] += receiveBuffer[n][XX];
 520                         f[j][YY] -= receiveBuffer[n][YY];
 521                         f[j][ZZ] -= receiveBuffer[n][ZZ];
 522                         if (fshift)
 523                         {
 524                             /* Add this force to the shift force */
 525                             for (int d = 0; d < DIM; d++)
 526                             {
 527                                 fshift[is][d] += receiveBuffer[n][d];
 528                             }
 529                         }
 530                         n++;
 531                     }
 532                 }
 533             }
 534         }
 535         nzone /= 2;
 536     }
 537     wallcycle_stop(wcycle, ewcMOVEF);
 538 }
 539
 540 /* Convenience function for extracting a real buffer from an rvec buffer
 541  *
 542  * To reduce the number of temporary communication buffers and avoid
 543  * cache polution, we reuse gmx::RVec buffers for storing reals.
 544  * This functions return a real buffer reference with the same number
 545  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 546  */
 547 static gmx::ArrayRef<real>
 548 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 549 {
 550     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
 551                                   arrayRef.size());
 552 }
 553
 554 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 555 {
 556     int                    nzone, nat_tot;
 557     gmx_domdec_comm_t     *comm;
 558     gmx_domdec_comm_dim_t *cd;
 559
 560     comm = dd->comm;
 561
 562     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 563
 564     nzone   = 1;
 565     nat_tot = comm->atomRanges.numHomeAtoms();
 566     for (int d = 0; d < dd->ndim; d++)
 567     {
 568         cd = &comm->cd[d];
 569         for (const gmx_domdec_ind_t &ind : cd->ind)
 570         {
 571             /* Note: We provision for RVec instead of real, so a factor of 3
 572              * more than needed. The buffer actually already has this size
 573              * and we pass a plain pointer below, so this does not matter.
 574              */
 575             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 576             gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 577             int                       n          = 0;
 578             for (int g : ind.index)
 579             {
 580                 for (int j : atomGrouping.block(g))
 581                 {
 582                     sendBuffer[n++] = v[j];
 583                 }
 584             }
 585
 586             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 587
 588             gmx::ArrayRef<real>       receiveBuffer;
 589             if (cd->receiveInPlace)
 590             {
 591                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 592             }
 593             else
 594             {
 595                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 596             }
 597             /* Send and receive the data */
 598             ddSendrecv(dd, d, dddirBackward,
 599                        sendBuffer, receiveBuffer);
 600             if (!cd->receiveInPlace)
 601             {
 602                 int j = 0;
 603                 for (int zone = 0; zone < nzone; zone++)
 604                 {
 605                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 606                     {
 607                         v[i] = receiveBuffer[j++];
 608                     }
 609                 }
 610             }
 611             nat_tot += ind.nrecv[nzone+1];
 612         }
 613         nzone += nzone;
 614     }
 615 }
 616
 617 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 618 {
 619     int                    nzone, nat_tot;
 620     gmx_domdec_comm_t     *comm;
 621     gmx_domdec_comm_dim_t *cd;
 622
 623     comm = dd->comm;
 624
 625     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 626
 627     nzone   = comm->zones.n/2;
 628     nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 629     for (int d = dd->ndim-1; d >= 0; d--)
 630     {
 631         cd = &comm->cd[d];
 632         for (int p = cd->numPulses() - 1; p >= 0; p--)
 633         {
 634             const gmx_domdec_ind_t &ind = cd->ind[p];
 635
 636             /* Note: We provision for RVec instead of real, so a factor of 3
 637              * more than needed. The buffer actually already has this size
 638              * and we typecast, so this works as intended.
 639              */
 640             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 641             gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 642             nat_tot -= ind.nrecv[nzone + 1];
 643
 644             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 645
 646             gmx::ArrayRef<real>       sendBuffer;
 647             if (cd->receiveInPlace)
 648             {
 649                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 650             }
 651             else
 652             {
 653                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 654                 int j = 0;
 655                 for (int zone = 0; zone < nzone; zone++)
 656                 {
 657                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 658                     {
 659                         sendBuffer[j++] = v[i];
 660                     }
 661                 }
 662             }
 663             /* Communicate the forces */
 664             ddSendrecv(dd, d, dddirForward,
 665                        sendBuffer, receiveBuffer);
 666             /* Add the received forces */
 667             int n = 0;
 668             for (int g : ind.index)
 669             {
 670                 for (int j : atomGrouping.block(g))
 671                 {
 672                     v[j] += receiveBuffer[n];
 673                     n++;
 674                 }
 675             }
 676         }
 677         nzone /= 2;
 678     }
 679 }
 680
 681 real dd_cutoff_multibody(const gmx_domdec_t *dd)
 682 {
 683     gmx_domdec_comm_t *comm;
 684     int                di;
 685     real               r;
 686
 687     comm = dd->comm;
 688
 689     r = -1;
 690     if (comm->bInterCGBondeds)
 691     {
 692         if (comm->cutoff_mbody > 0)
 693         {
 694             r = comm->cutoff_mbody;
 695         }
 696         else
 697         {
 698             /* cutoff_mbody=0 means we do not have DLB */
 699             r = comm->cellsize_min[dd->dim[0]];
 700             for (di = 1; di < dd->ndim; di++)
 701             {
 702                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
 703             }
 704             if (comm->bBondComm)
 705             {
 706                 r = std::max(r, comm->cutoff_mbody);
 707             }
 708             else
 709             {
 710                 r = std::min(r, comm->cutoff);
 711             }
 712         }
 713     }
 714
 715     return r;
 716 }
 717
 718 real dd_cutoff_twobody(const gmx_domdec_t *dd)
 719 {
 720     real r_mb;
 721
 722     r_mb = dd_cutoff_multibody(dd);
 723
 724     return std::max(dd->comm->cutoff, r_mb);
 725 }
 726
 727
 728 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
 729                                    ivec coord_pme)
 730 {
 731     int nc, ntot;
 732
 733     nc   = dd->nc[dd->comm->cartpmedim];
 734     ntot = dd->comm->ntot[dd->comm->cartpmedim];
 735     copy_ivec(coord, coord_pme);
 736     coord_pme[dd->comm->cartpmedim] =
 737         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 738 }
 739
 740 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 741 {
 742     int npp, npme;
 743
 744     npp  = dd->nnodes;
 745     npme = dd->comm->npmenodes;
 746
 747     /* Here we assign a PME node to communicate with this DD node
 748      * by assuming that the major index of both is x.
 749      * We add cr->npmenodes/2 to obtain an even distribution.
 750      */
 751     return (ddindex*npme + npme/2)/npp;
 752 }
 753
 754 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 755 {
 756     int *pme_rank;
 757     int  n, i, p0, p1;
 758
 759     snew(pme_rank, dd->comm->npmenodes);
 760     n = 0;
 761     for (i = 0; i < dd->nnodes; i++)
 762     {
 763         p0 = ddindex2pmeindex(dd, i);
 764         p1 = ddindex2pmeindex(dd, i+1);
 765         if (i+1 == dd->nnodes || p1 > p0)
 766         {
 767             if (debug)
 768             {
 769                 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
 770             }
 771             pme_rank[n] = i + 1 + n;
 772             n++;
 773         }
 774     }
 775
 776     return pme_rank;
 777 }
 778
 779 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 780 {
 781     gmx_domdec_t *dd;
 782     ivec          coords;
 783     int           slab;
 784
 785     dd = cr->dd;
 786     /*
 787        if (dd->comm->bCartesian) {
 788        gmx_ddindex2xyz(dd->nc,ddindex,coords);
 789        dd_coords2pmecoords(dd,coords,coords_pme);
 790        copy_ivec(dd->ntot,nc);
 791        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 792        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 793
 794        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 795        } else {
 796        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 797        }
 798      */
 799     coords[XX] = x;
 800     coords[YY] = y;
 801     coords[ZZ] = z;
 802     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 803
 804     return slab;
 805 }
 806
 807 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 808 {
 809     gmx_domdec_comm_t *comm;
 810     ivec               coords;
 811     int                ddindex, nodeid = -1;
 812
 813     comm = cr->dd->comm;
 814
 815     coords[XX] = x;
 816     coords[YY] = y;
 817     coords[ZZ] = z;
 818     if (comm->bCartesianPP_PME)
 819     {
 820 #if GMX_MPI
 821         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 822 #endif
 823     }
 824     else
 825     {
 826         ddindex = dd_index(cr->dd->nc, coords);
 827         if (comm->bCartesianPP)
 828         {
 829             nodeid = comm->ddindex2simnodeid[ddindex];
 830         }
 831         else
 832         {
 833             if (comm->pmenodes)
 834             {
 835                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 836             }
 837             else
 838             {
 839                 nodeid = ddindex;
 840             }
 841         }
 842     }
 843
 844     return nodeid;
 845 }
 846
 847 static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
 848                               const t_commrec gmx_unused *cr,
 849                               int                         sim_nodeid)
 850 {
 851     int pmenode = -1;
 852
 853     const gmx_domdec_comm_t *comm = dd->comm;
 854
 855     /* This assumes a uniform x domain decomposition grid cell size */
 856     if (comm->bCartesianPP_PME)
 857     {
 858 #if GMX_MPI
 859         ivec coord, coord_pme;
 860         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 861         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 862         {
 863             /* This is a PP node */
 864             dd_cart_coord2pmecoord(dd, coord, coord_pme);
 865             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 866         }
 867 #endif
 868     }
 869     else if (comm->bCartesianPP)
 870     {
 871         if (sim_nodeid < dd->nnodes)
 872         {
 873             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 874         }
 875     }
 876     else
 877     {
 878         /* This assumes DD cells with identical x coordinates
 879          * are numbered sequentially.
 880          */
 881         if (dd->comm->pmenodes == nullptr)
 882         {
 883             if (sim_nodeid < dd->nnodes)
 884             {
 885                 /* The DD index equals the nodeid */
 886                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 887             }
 888         }
 889         else
 890         {
 891             int i = 0;
 892             while (sim_nodeid > dd->comm->pmenodes[i])
 893             {
 894                 i++;
 895             }
 896             if (sim_nodeid < dd->comm->pmenodes[i])
 897             {
 898                 pmenode = dd->comm->pmenodes[i];
 899             }
 900         }
 901     }
 902
 903     return pmenode;
 904 }
 905
 906 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 907 {
 908     if (dd != nullptr)
 909     {
 910         return {
 911                    dd->comm->npmenodes_x, dd->comm->npmenodes_y
 912         };
 913     }
 914     else
 915     {
 916         return {
 917                    1, 1
 918         };
 919     }
 920 }
 921
 922 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 923 {
 924     gmx_domdec_t *dd;
 925     int           x, y, z;
 926     ivec          coord, coord_pme;
 927
 928     dd = cr->dd;
 929
 930     std::vector<int> ddranks;
 931     ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 932
 933     for (x = 0; x < dd->nc[XX]; x++)
 934     {
 935         for (y = 0; y < dd->nc[YY]; y++)
 936         {
 937             for (z = 0; z < dd->nc[ZZ]; z++)
 938             {
 939                 if (dd->comm->bCartesianPP_PME)
 940                 {
 941                     coord[XX] = x;
 942                     coord[YY] = y;
 943                     coord[ZZ] = z;
 944                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
 945                     if (dd->ci[XX] == coord_pme[XX] &&
 946                         dd->ci[YY] == coord_pme[YY] &&
 947                         dd->ci[ZZ] == coord_pme[ZZ])
 948                     {
 949                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 950                     }
 951                 }
 952                 else
 953                 {
 954                     /* The slab corresponds to the nodeid in the PME group */
 955                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 956                     {
 957                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 958                     }
 959                 }
 960             }
 961         }
 962     }
 963     return ddranks;
 964 }
 965
 966 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
 967 {
 968     gmx_bool bReceive = TRUE;
 969
 970     if (cr->npmenodes < dd->nnodes)
 971     {
 972         gmx_domdec_comm_t *comm = dd->comm;
 973         if (comm->bCartesianPP_PME)
 974         {
 975 #if GMX_MPI
 976             int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 977             ivec coords;
 978             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 979             coords[comm->cartpmedim]++;
 980             if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 981             {
 982                 int rank;
 983                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 984                 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
 985                 {
 986                     /* This is not the last PP node for pmenode */
 987                     bReceive = FALSE;
 988                 }
 989             }
 990 #else
 991             GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 992 #endif
 993         }
 994         else
 995         {
 996             int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
 997             if (cr->sim_nodeid+1 < cr->nnodes &&
 998                 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
 999             {
1000                 /* This is not the last PP node for pmenode */
1001                 bReceive = FALSE;
1002             }
1003         }
1004     }
1005
1006     return bReceive;
1007 }
1008
1009 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
1010 {
1011     gmx_domdec_comm_t *comm;
1012     int                i;
1013
1014     comm = dd->comm;
1015
1016     snew(*dim_f, dd->nc[dim]+1);
1017     (*dim_f)[0] = 0;
1018     for (i = 1; i < dd->nc[dim]; i++)
1019     {
1020         if (comm->slb_frac[dim])
1021         {
1022             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1023         }
1024         else
1025         {
1026             (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1027         }
1028     }
1029     (*dim_f)[dd->nc[dim]] = 1;
1030 }
1031
1032 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1033 {
1034     int  pmeindex, slab, nso, i;
1035     ivec xyz;
1036
1037     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1038     {
1039         ddpme->dim = YY;
1040     }
1041     else
1042     {
1043         ddpme->dim = dimind;
1044     }
1045     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1046
1047     ddpme->nslab = (ddpme->dim == 0 ?
1048                     dd->comm->npmenodes_x :
1049                     dd->comm->npmenodes_y);
1050
1051     if (ddpme->nslab <= 1)
1052     {
1053         return;
1054     }
1055
1056     nso = dd->comm->npmenodes/ddpme->nslab;
1057     /* Determine for each PME slab the PP location range for dimension dim */
1058     snew(ddpme->pp_min, ddpme->nslab);
1059     snew(ddpme->pp_max, ddpme->nslab);
1060     for (slab = 0; slab < ddpme->nslab; slab++)
1061     {
1062         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1063         ddpme->pp_max[slab] = 0;
1064     }
1065     for (i = 0; i < dd->nnodes; i++)
1066     {
1067         ddindex2xyz(dd->nc, i, xyz);
1068         /* For y only use our y/z slab.
1069          * This assumes that the PME x grid size matches the DD grid size.
1070          */
1071         if (dimind == 0 || xyz[XX] == dd->ci[XX])
1072         {
1073             pmeindex = ddindex2pmeindex(dd, i);
1074             if (dimind == 0)
1075             {
1076                 slab = pmeindex/nso;
1077             }
1078             else
1079             {
1080                 slab = pmeindex % ddpme->nslab;
1081             }
1082             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1083             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1084         }
1085     }
1086
1087     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1088 }
1089
1090 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1091 {
1092     if (dd->comm->ddpme[0].dim == XX)
1093     {
1094         return dd->comm->ddpme[0].maxshift;
1095     }
1096     else
1097     {
1098         return 0;
1099     }
1100 }
1101
1102 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1103 {
1104     if (dd->comm->ddpme[0].dim == YY)
1105     {
1106         return dd->comm->ddpme[0].maxshift;
1107     }
1108     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1109     {
1110         return dd->comm->ddpme[1].maxshift;
1111     }
1112     else
1113     {
1114         return 0;
1115     }
1116 }
1117
1118 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1119 {
1120     /* Note that the cycles value can be incorrect, either 0 or some
1121      * extremely large value, when our thread migrated to another core
1122      * with an unsynchronized cycle counter. If this happens less often
1123      * that once per nstlist steps, this will not cause issues, since
1124      * we later subtract the maximum value from the sum over nstlist steps.
1125      * A zero count will slightly lower the total, but that's a small effect.
1126      * Note that the main purpose of the subtraction of the maximum value
1127      * is to avoid throwing off the load balancing when stalls occur due
1128      * e.g. system activity or network congestion.
1129      */
1130     dd->comm->cycl[ddCycl] += cycles;
1131     dd->comm->cycl_n[ddCycl]++;
1132     if (cycles > dd->comm->cycl_max[ddCycl])
1133     {
1134         dd->comm->cycl_max[ddCycl] = cycles;
1135     }
1136 }
1137
1138 #if GMX_MPI
1139 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1140 {
1141     MPI_Comm           c_row;
1142     int                dim, i, rank;
1143     ivec               loc_c;
1144     gmx_bool           bPartOfGroup = FALSE;
1145
1146     dim = dd->dim[dim_ind];
1147     copy_ivec(loc, loc_c);
1148     for (i = 0; i < dd->nc[dim]; i++)
1149     {
1150         loc_c[dim] = i;
1151         rank       = dd_index(dd->nc, loc_c);
1152         if (rank == dd->rank)
1153         {
1154             /* This process is part of the group */
1155             bPartOfGroup = TRUE;
1156         }
1157     }
1158     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1159                    &c_row);
1160     if (bPartOfGroup)
1161     {
1162         dd->comm->mpi_comm_load[dim_ind] = c_row;
1163         if (!isDlbDisabled(dd->comm))
1164         {
1165             DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1166
1167             if (dd->ci[dim] == dd->master_ci[dim])
1168             {
1169                 /* This is the root process of this row */
1170                 cellsizes.rowMaster  = gmx::compat::make_unique<RowMaster>();
1171
1172                 RowMaster &rowMaster = *cellsizes.rowMaster;
1173                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1174                 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1175                 rowMaster.isCellMin.resize(dd->nc[dim]);
1176                 if (dim_ind > 0)
1177                 {
1178                     rowMaster.bounds.resize(dd->nc[dim]);
1179                 }
1180                 rowMaster.buf_ncd.resize(dd->nc[dim]);
1181             }
1182             else
1183             {
1184                 /* This is not a root process, we only need to receive cell_f */
1185                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1186             }
1187         }
1188         if (dd->ci[dim] == dd->master_ci[dim])
1189         {
1190             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1191         }
1192     }
1193 }
1194 #endif
1195
1196 void dd_setup_dlb_resource_sharing(t_commrec            *cr,
1197                                    int                   gpu_id)
1198 {
1199 #if GMX_MPI
1200     int           physicalnode_id_hash;
1201     gmx_domdec_t *dd;
1202     MPI_Comm      mpi_comm_pp_physicalnode;
1203
1204     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1205     {
1206         /* Only ranks with short-ranged tasks (currently) use GPUs.
1207          * If we don't have GPUs assigned, there are no resources to share.
1208          */
1209         return;
1210     }
1211
1212     physicalnode_id_hash = gmx_physicalnode_id_hash();
1213
1214     dd = cr->dd;
1215
1216     if (debug)
1217     {
1218         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1219         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1220                 dd->rank, physicalnode_id_hash, gpu_id);
1221     }
1222     /* Split the PP communicator over the physical nodes */
1223     /* TODO: See if we should store this (before), as it's also used for
1224      * for the nodecomm summation.
1225      */
1226     // TODO PhysicalNodeCommunicator could be extended/used to handle
1227     // the need for per-node per-group communicators.
1228     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1229                    &mpi_comm_pp_physicalnode);
1230     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1231                    &dd->comm->mpi_comm_gpu_shared);
1232     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1233     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1234
1235     if (debug)
1236     {
1237         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1238     }
1239
1240     /* Note that some ranks could share a GPU, while others don't */
1241
1242     if (dd->comm->nrank_gpu_shared == 1)
1243     {
1244         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1245     }
1246 #else
1247     GMX_UNUSED_VALUE(cr);
1248     GMX_UNUSED_VALUE(gpu_id);
1249 #endif
1250 }
1251
1252 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1253 {
1254 #if GMX_MPI
1255     int  dim0, dim1, i, j;
1256     ivec loc;
1257
1258     if (debug)
1259     {
1260         fprintf(debug, "Making load communicators\n");
1261     }
1262
1263     snew(dd->comm->load,          std::max(dd->ndim, 1));
1264     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1265
1266     if (dd->ndim == 0)
1267     {
1268         return;
1269     }
1270
1271     clear_ivec(loc);
1272     make_load_communicator(dd, 0, loc);
1273     if (dd->ndim > 1)
1274     {
1275         dim0 = dd->dim[0];
1276         for (i = 0; i < dd->nc[dim0]; i++)
1277         {
1278             loc[dim0] = i;
1279             make_load_communicator(dd, 1, loc);
1280         }
1281     }
1282     if (dd->ndim > 2)
1283     {
1284         dim0 = dd->dim[0];
1285         for (i = 0; i < dd->nc[dim0]; i++)
1286         {
1287             loc[dim0] = i;
1288             dim1      = dd->dim[1];
1289             for (j = 0; j < dd->nc[dim1]; j++)
1290             {
1291                 loc[dim1] = j;
1292                 make_load_communicator(dd, 2, loc);
1293             }
1294         }
1295     }
1296
1297     if (debug)
1298     {
1299         fprintf(debug, "Finished making load communicators\n");
1300     }
1301 #endif
1302 }
1303
1304 /*! \brief Sets up the relation between neighboring domains and zones */
1305 static void setup_neighbor_relations(gmx_domdec_t *dd)
1306 {
1307     int                     d, dim, i, j, m;
1308     ivec                    tmp, s;
1309     gmx_domdec_zones_t     *zones;
1310     gmx_domdec_ns_ranges_t *izone;
1311
1312     for (d = 0; d < dd->ndim; d++)
1313     {
1314         dim = dd->dim[d];
1315         copy_ivec(dd->ci, tmp);
1316         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
1317         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1318         copy_ivec(dd->ci, tmp);
1319         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1320         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1321         if (debug)
1322         {
1323             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1324                     dd->rank, dim,
1325                     dd->neighbor[d][0],
1326                     dd->neighbor[d][1]);
1327         }
1328     }
1329
1330     int nzone  = (1 << dd->ndim);
1331     int nizone = (1 << std::max(dd->ndim - 1, 0));
1332     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1333
1334     zones = &dd->comm->zones;
1335
1336     for (i = 0; i < nzone; i++)
1337     {
1338         m = 0;
1339         clear_ivec(zones->shift[i]);
1340         for (d = 0; d < dd->ndim; d++)
1341         {
1342             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1343         }
1344     }
1345
1346     zones->n = nzone;
1347     for (i = 0; i < nzone; i++)
1348     {
1349         for (d = 0; d < DIM; d++)
1350         {
1351             s[d] = dd->ci[d] - zones->shift[i][d];
1352             if (s[d] < 0)
1353             {
1354                 s[d] += dd->nc[d];
1355             }
1356             else if (s[d] >= dd->nc[d])
1357             {
1358                 s[d] -= dd->nc[d];
1359             }
1360         }
1361     }
1362     zones->nizone = nizone;
1363     for (i = 0; i < zones->nizone; i++)
1364     {
1365         assert(ddNonbondedZonePairRanges[i][0] == i);
1366
1367         izone     = &zones->izone[i];
1368         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1369          * j-zones up to nzone.
1370          */
1371         izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1372         izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1373         for (dim = 0; dim < DIM; dim++)
1374         {
1375             if (dd->nc[dim] == 1)
1376             {
1377                 /* All shifts should be allowed */
1378                 izone->shift0[dim] = -1;
1379                 izone->shift1[dim] = 1;
1380             }
1381             else
1382             {
1383                 /* Determine the min/max j-zone shift wrt the i-zone */
1384                 izone->shift0[dim] = 1;
1385                 izone->shift1[dim] = -1;
1386                 for (j = izone->j0; j < izone->j1; j++)
1387                 {
1388                     int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1389                     if (shift_diff < izone->shift0[dim])
1390                     {
1391                         izone->shift0[dim] = shift_diff;
1392                     }
1393                     if (shift_diff > izone->shift1[dim])
1394                     {
1395                         izone->shift1[dim] = shift_diff;
1396                     }
1397                 }
1398             }
1399         }
1400     }
1401
1402     if (!isDlbDisabled(dd->comm))
1403     {
1404         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1405     }
1406
1407     if (dd->comm->bRecordLoad)
1408     {
1409         make_load_communicators(dd);
1410     }
1411 }
1412
1413 static void make_pp_communicator(const gmx::MDLogger  &mdlog,
1414                                  gmx_domdec_t         *dd,
1415                                  t_commrec gmx_unused *cr,
1416                                  bool gmx_unused       reorder)
1417 {
1418 #if GMX_MPI
1419     gmx_domdec_comm_t *comm;
1420     int                rank, *buf;
1421     ivec               periods;
1422     MPI_Comm           comm_cart;
1423
1424     comm = dd->comm;
1425
1426     if (comm->bCartesianPP)
1427     {
1428         /* Set up cartesian communication for the particle-particle part */
1429         GMX_LOG(mdlog.info).appendTextFormatted(
1430                 "Will use a Cartesian communicator: %d x %d x %d",
1431                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1432
1433         for (int i = 0; i < DIM; i++)
1434         {
1435             periods[i] = TRUE;
1436         }
1437         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1438                         &comm_cart);
1439         /* We overwrite the old communicator with the new cartesian one */
1440         cr->mpi_comm_mygroup = comm_cart;
1441     }
1442
1443     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1444     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1445
1446     if (comm->bCartesianPP_PME)
1447     {
1448         /* Since we want to use the original cartesian setup for sim,
1449          * and not the one after split, we need to make an index.
1450          */
1451         snew(comm->ddindex2ddnodeid, dd->nnodes);
1452         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1453         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1454         /* Get the rank of the DD master,
1455          * above we made sure that the master node is a PP node.
1456          */
1457         if (MASTER(cr))
1458         {
1459             rank = dd->rank;
1460         }
1461         else
1462         {
1463             rank = 0;
1464         }
1465         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1466     }
1467     else if (comm->bCartesianPP)
1468     {
1469         if (cr->npmenodes == 0)
1470         {
1471             /* The PP communicator is also
1472              * the communicator for this simulation
1473              */
1474             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1475         }
1476         cr->nodeid = dd->rank;
1477
1478         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1479
1480         /* We need to make an index to go from the coordinates
1481          * to the nodeid of this simulation.
1482          */
1483         snew(comm->ddindex2simnodeid, dd->nnodes);
1484         snew(buf, dd->nnodes);
1485         if (thisRankHasDuty(cr, DUTY_PP))
1486         {
1487             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1488         }
1489         /* Communicate the ddindex to simulation nodeid index */
1490         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1491                       cr->mpi_comm_mysim);
1492         sfree(buf);
1493
1494         /* Determine the master coordinates and rank.
1495          * The DD master should be the same node as the master of this sim.
1496          */
1497         for (int i = 0; i < dd->nnodes; i++)
1498         {
1499             if (comm->ddindex2simnodeid[i] == 0)
1500             {
1501                 ddindex2xyz(dd->nc, i, dd->master_ci);
1502                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1503             }
1504         }
1505         if (debug)
1506         {
1507             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1508         }
1509     }
1510     else
1511     {
1512         /* No Cartesian communicators */
1513         /* We use the rank in dd->comm->all as DD index */
1514         ddindex2xyz(dd->nc, dd->rank, dd->ci);
1515         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1516         dd->masterrank = 0;
1517         clear_ivec(dd->master_ci);
1518     }
1519 #endif
1520
1521     GMX_LOG(mdlog.info).appendTextFormatted(
1522             "Domain decomposition rank %d, coordinates %d %d %d\n",
1523             dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1524     if (debug)
1525     {
1526         fprintf(debug,
1527                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1528                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1529     }
1530 }
1531
1532 static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
1533                                       t_commrec            *cr)
1534 {
1535 #if GMX_MPI
1536     gmx_domdec_comm_t *comm = dd->comm;
1537
1538     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1539     {
1540         int *buf;
1541         snew(comm->ddindex2simnodeid, dd->nnodes);
1542         snew(buf, dd->nnodes);
1543         if (thisRankHasDuty(cr, DUTY_PP))
1544         {
1545             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1546         }
1547         /* Communicate the ddindex to simulation nodeid index */
1548         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1549                       cr->mpi_comm_mysim);
1550         sfree(buf);
1551     }
1552 #else
1553     GMX_UNUSED_VALUE(dd);
1554     GMX_UNUSED_VALUE(cr);
1555 #endif
1556 }
1557
1558 static void split_communicator(const gmx::MDLogger &mdlog,
1559                                t_commrec *cr, gmx_domdec_t *dd,
1560                                DdRankOrder gmx_unused rankOrder,
1561                                bool gmx_unused reorder)
1562 {
1563     gmx_domdec_comm_t *comm;
1564     int                i;
1565     gmx_bool           bDiv[DIM];
1566 #if GMX_MPI
1567     MPI_Comm           comm_cart;
1568 #endif
1569
1570     comm = dd->comm;
1571
1572     if (comm->bCartesianPP)
1573     {
1574         for (i = 1; i < DIM; i++)
1575         {
1576             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1577         }
1578         if (bDiv[YY] || bDiv[ZZ])
1579         {
1580             comm->bCartesianPP_PME = TRUE;
1581             /* If we have 2D PME decomposition, which is always in x+y,
1582              * we stack the PME only nodes in z.
1583              * Otherwise we choose the direction that provides the thinnest slab
1584              * of PME only nodes as this will have the least effect
1585              * on the PP communication.
1586              * But for the PME communication the opposite might be better.
1587              */
1588             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1589                              !bDiv[YY] ||
1590                              dd->nc[YY] > dd->nc[ZZ]))
1591             {
1592                 comm->cartpmedim = ZZ;
1593             }
1594             else
1595             {
1596                 comm->cartpmedim = YY;
1597             }
1598             comm->ntot[comm->cartpmedim]
1599                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1600         }
1601         else
1602         {
1603             GMX_LOG(mdlog.info).appendTextFormatted(
1604                     "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1605                     cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1606             GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1607         }
1608     }
1609
1610     if (comm->bCartesianPP_PME)
1611     {
1612 #if GMX_MPI
1613         int  rank;
1614         ivec periods;
1615
1616         GMX_LOG(mdlog.info).appendTextFormatted(
1617                 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1618                 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1619
1620         for (i = 0; i < DIM; i++)
1621         {
1622             periods[i] = TRUE;
1623         }
1624         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1625                         &comm_cart);
1626         MPI_Comm_rank(comm_cart, &rank);
1627         if (MASTER(cr) && rank != 0)
1628         {
1629             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1630         }
1631
1632         /* With this assigment we loose the link to the original communicator
1633          * which will usually be MPI_COMM_WORLD, unless have multisim.
1634          */
1635         cr->mpi_comm_mysim = comm_cart;
1636         cr->sim_nodeid     = rank;
1637
1638         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1639
1640         GMX_LOG(mdlog.info).appendTextFormatted(
1641                 "Cartesian rank %d, coordinates %d %d %d\n",
1642                 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1643
1644         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1645         {
1646             cr->duty = DUTY_PP;
1647         }
1648         if (cr->npmenodes == 0 ||
1649             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1650         {
1651             cr->duty = DUTY_PME;
1652         }
1653
1654         /* Split the sim communicator into PP and PME only nodes */
1655         MPI_Comm_split(cr->mpi_comm_mysim,
1656                        getThisRankDuties(cr),
1657                        dd_index(comm->ntot, dd->ci),
1658                        &cr->mpi_comm_mygroup);
1659 #endif
1660     }
1661     else
1662     {
1663         switch (rankOrder)
1664         {
1665             case DdRankOrder::pp_pme:
1666                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1667                 break;
1668             case DdRankOrder::interleave:
1669                 /* Interleave the PP-only and PME-only ranks */
1670                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1671                 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1672                 break;
1673             case DdRankOrder::cartesian:
1674                 break;
1675             default:
1676                 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1677         }
1678
1679         if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1680         {
1681             cr->duty = DUTY_PME;
1682         }
1683         else
1684         {
1685             cr->duty = DUTY_PP;
1686         }
1687 #if GMX_MPI
1688         /* Split the sim communicator into PP and PME only nodes */
1689         MPI_Comm_split(cr->mpi_comm_mysim,
1690                        getThisRankDuties(cr),
1691                        cr->nodeid,
1692                        &cr->mpi_comm_mygroup);
1693         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1694 #endif
1695     }
1696
1697     GMX_LOG(mdlog.info).appendTextFormatted(
1698             "This rank does only %s work.\n",
1699             thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1700 }
1701
1702 /*! \brief Generates the MPI communicators for domain decomposition */
1703 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1704                                   t_commrec *cr,
1705                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1706 {
1707     gmx_domdec_comm_t *comm;
1708     bool               CartReorder;
1709
1710     comm = dd->comm;
1711
1712     copy_ivec(dd->nc, comm->ntot);
1713
1714     comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1715     comm->bCartesianPP_PME = FALSE;
1716
1717     /* Reorder the nodes by default. This might change the MPI ranks.
1718      * Real reordering is only supported on very few architectures,
1719      * Blue Gene is one of them.
1720      */
1721     CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1722
1723     if (cr->npmenodes > 0)
1724     {
1725         /* Split the communicator into a PP and PME part */
1726         split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1727         if (comm->bCartesianPP_PME)
1728         {
1729             /* We (possibly) reordered the nodes in split_communicator,
1730              * so it is no longer required in make_pp_communicator.
1731              */
1732             CartReorder = FALSE;
1733         }
1734     }
1735     else
1736     {
1737         /* All nodes do PP and PME */
1738 #if GMX_MPI
1739         /* We do not require separate communicators */
1740         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1741 #endif
1742     }
1743
1744     if (thisRankHasDuty(cr, DUTY_PP))
1745     {
1746         /* Copy or make a new PP communicator */
1747         make_pp_communicator(mdlog, dd, cr, CartReorder);
1748     }
1749     else
1750     {
1751         receive_ddindex2simnodeid(dd, cr);
1752     }
1753
1754     if (!thisRankHasDuty(cr, DUTY_PME))
1755     {
1756         /* Set up the commnuication to our PME node */
1757         dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1758         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1759         if (debug)
1760         {
1761             fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1762                     dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1763         }
1764     }
1765     else
1766     {
1767         dd->pme_nodeid = -1;
1768     }
1769
1770     if (DDMASTER(dd))
1771     {
1772         dd->ma = gmx::compat::make_unique<AtomDistribution>(dd->nc,
1773                                                             comm->cgs_gl.nr,
1774                                                             comm->cgs_gl.index[comm->cgs_gl.nr]);
1775     }
1776 }
1777
1778 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1779                           const char *dir, int nc, const char *size_string)
1780 {
1781     real  *slb_frac, tot;
1782     int    i, n;
1783     double dbl;
1784
1785     slb_frac = nullptr;
1786     if (nc > 1 && size_string != nullptr)
1787     {
1788         GMX_LOG(mdlog.info).appendTextFormatted(
1789                 "Using static load balancing for the %s direction", dir);
1790         snew(slb_frac, nc);
1791         tot = 0;
1792         for (i = 0; i < nc; i++)
1793         {
1794             dbl = 0;
1795             sscanf(size_string, "%20lf%n", &dbl, &n);
1796             if (dbl == 0)
1797             {
1798                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1799             }
1800             slb_frac[i]  = dbl;
1801             size_string += n;
1802             tot         += slb_frac[i];
1803         }
1804         /* Normalize */
1805         std::string relativeCellSizes = "Relative cell sizes:";
1806         for (i = 0; i < nc; i++)
1807         {
1808             slb_frac[i]       /= tot;
1809             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1810         }
1811         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1812     }
1813
1814     return slb_frac;
1815 }
1816
1817 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1818 {
1819     int                  n     = 0;
1820     gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1821     int                  nmol;
1822     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1823     {
1824         for (auto &ilist : extractILists(*ilists, IF_BOND))
1825         {
1826             if (NRAL(ilist.functionType) >  2)
1827             {
1828                 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1829             }
1830         }
1831     }
1832
1833     return n;
1834 }
1835
1836 static int dd_getenv(const gmx::MDLogger &mdlog,
1837                      const char *env_var, int def)
1838 {
1839     char *val;
1840     int   nst;
1841
1842     nst = def;
1843     val = getenv(env_var);
1844     if (val)
1845     {
1846         if (sscanf(val, "%20d", &nst) <= 0)
1847         {
1848             nst = 1;
1849         }
1850         GMX_LOG(mdlog.info).appendTextFormatted(
1851                 "Found env.var. %s = %s, using value %d",
1852                 env_var, val, nst);
1853     }
1854
1855     return nst;
1856 }
1857
1858 static void check_dd_restrictions(const gmx_domdec_t  *dd,
1859                                   const t_inputrec    *ir,
1860                                   const gmx::MDLogger &mdlog)
1861 {
1862     if (ir->ePBC == epbcSCREW &&
1863         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1864     {
1865         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1866     }
1867
1868     if (ir->ns_type == ensSIMPLE)
1869     {
1870         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1871     }
1872
1873     if (ir->nstlist == 0)
1874     {
1875         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1876     }
1877
1878     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1879     {
1880         GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1881     }
1882 }
1883
1884 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1885 {
1886     int  di, d;
1887     real r;
1888
1889     r = ddbox->box_size[XX];
1890     for (di = 0; di < dd->ndim; di++)
1891     {
1892         d = dd->dim[di];
1893         /* Check using the initial average cell size */
1894         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1895     }
1896
1897     return r;
1898 }
1899
1900 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1901  */
1902 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1903                                   const std::string   &reasonStr,
1904                                   const gmx::MDLogger &mdlog)
1905 {
1906     std::string dlbNotSupportedErr  = "Dynamic load balancing requested, but ";
1907     std::string dlbDisableNote      = "NOTE: disabling dynamic load balancing as ";
1908
1909     if (cmdlineDlbState == DlbState::onUser)
1910     {
1911         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1912     }
1913     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1914     {
1915         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1916     }
1917     return DlbState::offForever;
1918 }
1919
1920 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1921  *
1922  * This function parses the parameters of "-dlb" command line option setting
1923  * corresponding state values. Then it checks the consistency of the determined
1924  * state with other run parameters and settings. As a result, the initial state
1925  * may be altered or an error may be thrown if incompatibility of options is detected.
1926  *
1927  * \param [in] mdlog       Logger.
1928  * \param [in] dlbOption   Enum value for the DLB option.
1929  * \param [in] bRecordLoad True if the load balancer is recording load information.
1930  * \param [in] mdrunOptions  Options for mdrun.
1931  * \param [in] ir          Pointer mdrun to input parameters.
1932  * \returns                DLB initial/startup state.
1933  */
1934 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1935                                          DlbOption dlbOption, gmx_bool bRecordLoad,
1936                                          const MdrunOptions &mdrunOptions,
1937                                          const t_inputrec *ir)
1938 {
1939     DlbState dlbState = DlbState::offCanTurnOn;
1940
1941     switch (dlbOption)
1942     {
1943         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1944         case DlbOption::no:               dlbState = DlbState::offUser;      break;
1945         case DlbOption::yes:              dlbState = DlbState::onUser;       break;
1946         default: gmx_incons("Invalid dlbOption enum value");
1947     }
1948
1949     /* Reruns don't support DLB: bail or override auto mode */
1950     if (mdrunOptions.rerun)
1951     {
1952         std::string reasonStr = "it is not supported in reruns.";
1953         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1954     }
1955
1956     /* Unsupported integrators */
1957     if (!EI_DYNAMICS(ir->eI))
1958     {
1959         auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1960         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1961     }
1962
1963     /* Without cycle counters we can't time work to balance on */
1964     if (!bRecordLoad)
1965     {
1966         std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1967         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1968     }
1969
1970     if (mdrunOptions.reproducible)
1971     {
1972         std::string reasonStr = "you started a reproducible run.";
1973         switch (dlbState)
1974         {
1975             case DlbState::offUser:
1976                 break;
1977             case DlbState::offForever:
1978                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1979                 break;
1980             case DlbState::offCanTurnOn:
1981                 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1982             case DlbState::onCanTurnOff:
1983                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1984                 break;
1985             case DlbState::onUser:
1986                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1987             default:
1988                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1989         }
1990     }
1991
1992     return dlbState;
1993 }
1994
1995 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1996 {
1997     dd->ndim = 0;
1998     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
1999     {
2000         /* Decomposition order z,y,x */
2001         GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
2002         for (int dim = DIM-1; dim >= 0; dim--)
2003         {
2004             if (dd->nc[dim] > 1)
2005             {
2006                 dd->dim[dd->ndim++] = dim;
2007             }
2008         }
2009     }
2010     else
2011     {
2012         /* Decomposition order x,y,z */
2013         for (int dim = 0; dim < DIM; dim++)
2014         {
2015             if (dd->nc[dim] > 1)
2016             {
2017                 dd->dim[dd->ndim++] = dim;
2018             }
2019         }
2020     }
2021
2022     if (dd->ndim == 0)
2023     {
2024         /* Set dim[0] to avoid extra checks on ndim in several places */
2025         dd->dim[0] = XX;
2026     }
2027 }
2028
2029 static gmx_domdec_comm_t *init_dd_comm()
2030 {
2031     gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2032
2033     comm->n_load_have      = 0;
2034     comm->n_load_collect   = 0;
2035
2036     comm->haveTurnedOffDlb = false;
2037
2038     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2039     {
2040         comm->sum_nat[i] = 0;
2041     }
2042     comm->ndecomp   = 0;
2043     comm->nload     = 0;
2044     comm->load_step = 0;
2045     comm->load_sum  = 0;
2046     comm->load_max  = 0;
2047     clear_ivec(comm->load_lim);
2048     comm->load_mdf  = 0;
2049     comm->load_pme  = 0;
2050
2051     /* This should be replaced by a unique pointer */
2052     comm->balanceRegion = ddBalanceRegionAllocate();
2053
2054     return comm;
2055 }
2056
2057 /* Returns whether mtop contains constraints and/or vsites */
2058 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2059 {
2060     auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2061     int  nmol;
2062     while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2063     {
2064         if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2065         {
2066             return true;
2067         }
2068     }
2069
2070     return false;
2071 }
2072
2073 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2074                               const gmx_mtop_t    &mtop,
2075                               const t_inputrec    &inputrec,
2076                               real                 cutoffMargin,
2077                               int                  numMpiRanksTotal,
2078                               gmx_domdec_comm_t   *comm)
2079 {
2080     /* When we have constraints and/or vsites, it is beneficial to use
2081      * update groups (when possible) to allow independent update of groups.
2082      */
2083     if (!systemHasConstraintsOrVsites(mtop))
2084     {
2085         /* No constraints or vsites, atoms can be updated independently */
2086         return;
2087     }
2088
2089     comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2090     comm->useUpdateGroups               = !comm->updateGroupingPerMoleculetype.empty();
2091
2092     if (comm->useUpdateGroups)
2093     {
2094         int numUpdateGroups = 0;
2095         for (const auto &molblock : mtop.molblock)
2096         {
2097             numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2098         }
2099
2100         /* Note: We would like to use dd->nnodes for the atom count estimate,
2101          *       but that is not yet available here. But this anyhow only
2102          *       affect performance up to the second dd_partition_system call.
2103          */
2104         int homeAtomCountEstimate =  mtop.natoms/numMpiRanksTotal;
2105         comm->updateGroupsCog =
2106             gmx::compat::make_unique<gmx::UpdateGroupsCog>(mtop,
2107                                                            comm->updateGroupingPerMoleculetype,
2108                                                            maxReferenceTemperature(inputrec),
2109                                                            homeAtomCountEstimate);
2110
2111         /* To use update groups, the large domain-to-domain cutoff distance
2112          * should be compatible with the box size.
2113          */
2114         comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2115
2116         if (comm->useUpdateGroups)
2117         {
2118             GMX_LOG(mdlog.info).appendTextFormatted(
2119                     "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2120                     numUpdateGroups,
2121                     mtop.natoms/static_cast<double>(numUpdateGroups),
2122                     comm->updateGroupsCog->maxUpdateGroupRadius());
2123         }
2124         else
2125         {
2126             GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2127             comm->updateGroupingPerMoleculetype.clear();
2128             comm->updateGroupsCog.reset(nullptr);
2129         }
2130     }
2131 }
2132
2133 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2134 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2135                                    t_commrec *cr, gmx_domdec_t *dd,
2136                                    const DomdecOptions &options,
2137                                    const MdrunOptions &mdrunOptions,
2138                                    const gmx_mtop_t *mtop,
2139                                    const t_inputrec *ir,
2140                                    const matrix box,
2141                                    gmx::ArrayRef<const gmx::RVec> xGlobal,
2142                                    gmx_ddbox_t *ddbox)
2143 {
2144     real               r_bonded         = -1;
2145     real               r_bonded_limit   = -1;
2146     const real         tenPercentMargin = 1.1;
2147     gmx_domdec_comm_t *comm             = dd->comm;
2148
2149     dd->npbcdim              = ePBC2npbcdim(ir->ePBC);
2150     dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2151     dd->haveDynamicBox       = inputrecDynamicBox(ir);
2152     dd->bScrewPBC            = (ir->ePBC == epbcSCREW);
2153
2154     dd->pme_recv_f_alloc = 0;
2155     dd->pme_recv_f_buf   = nullptr;
2156
2157     /* Initialize to GPU share count to 0, might change later */
2158     comm->nrank_gpu_shared = 0;
2159
2160     comm->dlbState         = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2161     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2162     /* To consider turning DLB on after 2*nstlist steps we need to check
2163      * at partitioning count 3. Thus we need to increase the first count by 2.
2164      */
2165     comm->ddPartioningCountFirstDlbOff += 2;
2166
2167     GMX_LOG(mdlog.info).appendTextFormatted(
2168             "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2169
2170     comm->bPMELoadBalDLBLimits = FALSE;
2171
2172     /* Allocate the charge group/atom sorting struct */
2173     comm->sort = gmx::compat::make_unique<gmx_domdec_sort_t>();
2174
2175     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
2176
2177     /* We need to decide on update groups early, as this affects communication distances */
2178     comm->useUpdateGroups = false;
2179     if (ir->cutoff_scheme == ecutsVERLET)
2180     {
2181         real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2182         setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2183     }
2184
2185     comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
2186                              mtop->bIntermolecularInteractions);
2187     if (comm->bInterCGBondeds)
2188     {
2189         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
2190     }
2191     else
2192     {
2193         comm->bInterCGMultiBody = FALSE;
2194     }
2195
2196     if (comm->useUpdateGroups)
2197     {
2198         dd->splitConstraints = false;
2199         dd->splitSettles     = false;
2200     }
2201     else
2202     {
2203         dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2204         dd->splitSettles     = gmx::inter_charge_group_settles(*mtop);
2205     }
2206
2207     if (ir->rlist == 0)
2208     {
2209         /* Set the cut-off to some very large value,
2210          * so we don't need if statements everywhere in the code.
2211          * We use sqrt, since the cut-off is squared in some places.
2212          */
2213         comm->cutoff   = GMX_CUTOFF_INF;
2214     }
2215     else
2216     {
2217         comm->cutoff   = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2218     }
2219     comm->cutoff_mbody = 0;
2220
2221     /* Determine the minimum cell size limit, affected by many factors */
2222     comm->cellsize_limit = 0;
2223     comm->bBondComm      = FALSE;
2224
2225     /* We do not allow home atoms to move beyond the neighboring domain
2226      * between domain decomposition steps, which limits the cell size.
2227      * Get an estimate of cell size limit due to atom displacement.
2228      * In most cases this is a large overestimate, because it assumes
2229      * non-interaction atoms.
2230      * We set the chance to 1 in a trillion steps.
2231      */
2232     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2233     const real     limitForAtomDisplacement          =
2234         minCellSizeForAtomDisplacement(*mtop, *ir,
2235                                        comm->updateGroupingPerMoleculetype,
2236                                        c_chanceThatAtomMovesBeyondDomain);
2237     GMX_LOG(mdlog.info).appendTextFormatted(
2238             "Minimum cell size due to atom displacement: %.3f nm",
2239             limitForAtomDisplacement);
2240
2241     comm->cellsize_limit = std::max(comm->cellsize_limit,
2242                                     limitForAtomDisplacement);
2243
2244     /* TODO: PME decomposition currently requires atoms not to be more than
2245      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2246      *       In nearly all cases, limitForAtomDisplacement will be smaller
2247      *       than 2/3*rlist, so the PME requirement is satisfied.
2248      *       But it would be better for both correctness and performance
2249      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2250      *       Note that we would need to improve the pairlist buffer case.
2251      */
2252
2253     if (comm->bInterCGBondeds)
2254     {
2255         if (options.minimumCommunicationRange > 0)
2256         {
2257             comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2258             if (options.useBondedCommunication)
2259             {
2260                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2261             }
2262             else
2263             {
2264                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2265             }
2266             r_bonded_limit = comm->cutoff_mbody;
2267         }
2268         else if (ir->bPeriodicMols)
2269         {
2270             /* Can not easily determine the required cut-off */
2271             GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2272             comm->cutoff_mbody = comm->cutoff/2;
2273             r_bonded_limit     = comm->cutoff_mbody;
2274         }
2275         else
2276         {
2277             real r_2b, r_mb;
2278
2279             if (MASTER(cr))
2280             {
2281                 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2282                                       options.checkBondedInteractions,
2283                                       &r_2b, &r_mb);
2284             }
2285             gmx_bcast(sizeof(r_2b), &r_2b, cr);
2286             gmx_bcast(sizeof(r_mb), &r_mb, cr);
2287
2288             /* We use an initial margin of 10% for the minimum cell size,
2289              * except when we are just below the non-bonded cut-off.
2290              */
2291             if (options.useBondedCommunication)
2292             {
2293                 if (std::max(r_2b, r_mb) > comm->cutoff)
2294                 {
2295                     r_bonded        = std::max(r_2b, r_mb);
2296                     r_bonded_limit  = tenPercentMargin*r_bonded;
2297                     comm->bBondComm = TRUE;
2298                 }
2299                 else
2300                 {
2301                     r_bonded       = r_mb;
2302                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2303                 }
2304                 /* We determine cutoff_mbody later */
2305             }
2306             else
2307             {
2308                 /* No special bonded communication,
2309                  * simply increase the DD cut-off.
2310                  */
2311                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
2312                 comm->cutoff_mbody = r_bonded_limit;
2313                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
2314             }
2315         }
2316         GMX_LOG(mdlog.info).appendTextFormatted(
2317                 "Minimum cell size due to bonded interactions: %.3f nm",
2318                 r_bonded_limit);
2319
2320         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2321     }
2322
2323     real rconstr = 0;
2324     if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2325     {
2326         /* There is a cell size limit due to the constraints (P-LINCS) */
2327         rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2328         GMX_LOG(mdlog.info).appendTextFormatted(
2329                 "Estimated maximum distance required for P-LINCS: %.3f nm",
2330                 rconstr);
2331         if (rconstr > comm->cellsize_limit)
2332         {
2333             GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2334         }
2335     }
2336     else if (options.constraintCommunicationRange > 0)
2337     {
2338         /* Here we do not check for dd->splitConstraints.
2339          * because one can also set a cell size limit for virtual sites only
2340          * and at this point we don't know yet if there are intercg v-sites.
2341          */
2342         GMX_LOG(mdlog.info).appendTextFormatted(
2343                 "User supplied maximum distance required for P-LINCS: %.3f nm",
2344                 options.constraintCommunicationRange);
2345         rconstr = options.constraintCommunicationRange;
2346     }
2347     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2348
2349     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2350
2351     if (options.numCells[XX] > 0)
2352     {
2353         copy_ivec(options.numCells, dd->nc);
2354         set_dd_dim(mdlog, dd);
2355         set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2356
2357         if (options.numPmeRanks >= 0)
2358         {
2359             cr->npmenodes = options.numPmeRanks;
2360         }
2361         else
2362         {
2363             /* When the DD grid is set explicitly and -npme is set to auto,
2364              * don't use PME ranks. We check later if the DD grid is
2365              * compatible with the total number of ranks.
2366              */
2367             cr->npmenodes = 0;
2368         }
2369
2370         real acs = average_cellsize_min(dd, ddbox);
2371         if (acs < comm->cellsize_limit)
2372         {
2373             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2374                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2375                                  acs, comm->cellsize_limit);
2376         }
2377     }
2378     else
2379     {
2380         set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2381
2382         /* We need to choose the optimal DD grid and possibly PME nodes */
2383         real limit =
2384             dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2385                            options.numPmeRanks,
2386                            !isDlbDisabled(comm),
2387                            options.dlbScaling,
2388                            comm->cellsize_limit, comm->cutoff,
2389                            comm->bInterCGBondeds);
2390
2391         if (dd->nc[XX] == 0)
2392         {
2393             char     buf[STRLEN];
2394             gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2395             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2396                     !bC ? "-rdd" : "-rcon",
2397                     comm->dlbState != DlbState::offUser ? " or -dds" : "",
2398                     bC ? " or your LINCS settings" : "");
2399
2400             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2401                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2402                                  "%s\n"
2403                                  "Look in the log file for details on the domain decomposition",
2404                                  cr->nnodes-cr->npmenodes, limit, buf);
2405         }
2406         set_dd_dim(mdlog, dd);
2407     }
2408
2409     GMX_LOG(mdlog.info).appendTextFormatted(
2410             "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2411             dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2412
2413     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2414     if (cr->nnodes - dd->nnodes != cr->npmenodes)
2415     {
2416         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2417                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2418                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2419     }
2420     if (cr->npmenodes > dd->nnodes)
2421     {
2422         gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2423                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2424     }
2425     if (cr->npmenodes > 0)
2426     {
2427         comm->npmenodes = cr->npmenodes;
2428     }
2429     else
2430     {
2431         comm->npmenodes = dd->nnodes;
2432     }
2433
2434     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2435     {
2436         /* The following choices should match those
2437          * in comm_cost_est in domdec_setup.c.
2438          * Note that here the checks have to take into account
2439          * that the decomposition might occur in a different order than xyz
2440          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2441          * in which case they will not match those in comm_cost_est,
2442          * but since that is mainly for testing purposes that's fine.
2443          */
2444         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2445             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2446             getenv("GMX_PMEONEDD") == nullptr)
2447         {
2448             comm->npmedecompdim = 2;
2449             comm->npmenodes_x   = dd->nc[XX];
2450             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
2451         }
2452         else
2453         {
2454             /* In case nc is 1 in both x and y we could still choose to
2455              * decompose pme in y instead of x, but we use x for simplicity.
2456              */
2457             comm->npmedecompdim = 1;
2458             if (dd->dim[0] == YY)
2459             {
2460                 comm->npmenodes_x = 1;
2461                 comm->npmenodes_y = comm->npmenodes;
2462             }
2463             else
2464             {
2465                 comm->npmenodes_x = comm->npmenodes;
2466                 comm->npmenodes_y = 1;
2467             }
2468         }
2469         GMX_LOG(mdlog.info).appendTextFormatted(
2470                 "PME domain decomposition: %d x %d x %d",
2471                 comm->npmenodes_x, comm->npmenodes_y, 1);
2472     }
2473     else
2474     {
2475         comm->npmedecompdim = 0;
2476         comm->npmenodes_x   = 0;
2477         comm->npmenodes_y   = 0;
2478     }
2479
2480     snew(comm->slb_frac, DIM);
2481     if (isDlbDisabled(comm))
2482     {
2483         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2484         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2485         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2486     }
2487
2488     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
2489     {
2490         if (comm->bBondComm || !isDlbDisabled(comm))
2491         {
2492             /* Set the bonded communication distance to halfway
2493              * the minimum and the maximum,
2494              * since the extra communication cost is nearly zero.
2495              */
2496             real acs           = average_cellsize_min(dd, ddbox);
2497             comm->cutoff_mbody = 0.5*(r_bonded + acs);
2498             if (!isDlbDisabled(comm))
2499             {
2500                 /* Check if this does not limit the scaling */
2501                 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2502                                               options.dlbScaling*acs);
2503             }
2504             if (!comm->bBondComm)
2505             {
2506                 /* Without bBondComm do not go beyond the n.b. cut-off */
2507                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2508                 if (comm->cellsize_limit >= comm->cutoff)
2509                 {
2510                     /* We don't loose a lot of efficieny
2511                      * when increasing it to the n.b. cut-off.
2512                      * It can even be slightly faster, because we need
2513                      * less checks for the communication setup.
2514                      */
2515                     comm->cutoff_mbody = comm->cutoff;
2516                 }
2517             }
2518             /* Check if we did not end up below our original limit */
2519             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2520
2521             if (comm->cutoff_mbody > comm->cellsize_limit)
2522             {
2523                 comm->cellsize_limit = comm->cutoff_mbody;
2524             }
2525         }
2526         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2527     }
2528
2529     if (debug)
2530     {
2531         fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2532                 "cellsize limit %f\n",
2533                 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2534     }
2535
2536     if (MASTER(cr))
2537     {
2538         check_dd_restrictions(dd, ir, mdlog);
2539     }
2540 }
2541
2542 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2543 {
2544     int   ncg, cg;
2545     char *bLocalCG;
2546
2547     ncg = ncg_mtop(mtop);
2548     snew(bLocalCG, ncg);
2549     for (cg = 0; cg < ncg; cg++)
2550     {
2551         bLocalCG[cg] = FALSE;
2552     }
2553
2554     return bLocalCG;
2555 }
2556
2557 void dd_init_bondeds(FILE *fplog,
2558                      gmx_domdec_t *dd,
2559                      const gmx_mtop_t *mtop,
2560                      const gmx_vsite_t *vsite,
2561                      const t_inputrec *ir,
2562                      gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2563 {
2564     gmx_domdec_comm_t *comm;
2565
2566     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2567
2568     comm = dd->comm;
2569
2570     if (comm->bBondComm)
2571     {
2572         /* Communicate atoms beyond the cut-off for bonded interactions */
2573         comm = dd->comm;
2574
2575         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2576
2577         comm->bLocalCG = init_bLocalCG(mtop);
2578     }
2579     else
2580     {
2581         /* Only communicate atoms based on cut-off */
2582         comm->cglink   = nullptr;
2583         comm->bLocalCG = nullptr;
2584     }
2585 }
2586
2587 static void writeSettings(gmx::TextWriter       *log,
2588                           gmx_domdec_t          *dd,
2589                           const gmx_mtop_t      *mtop,
2590                           const t_inputrec      *ir,
2591                           gmx_bool               bDynLoadBal,
2592                           real                   dlb_scale,
2593                           const gmx_ddbox_t     *ddbox)
2594 {
2595     gmx_domdec_comm_t *comm;
2596     int                d;
2597     ivec               np;
2598     real               limit, shrink;
2599
2600     comm = dd->comm;
2601
2602     if (bDynLoadBal)
2603     {
2604         log->writeString("The maximum number of communication pulses is:");
2605         for (d = 0; d < dd->ndim; d++)
2606         {
2607             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2608         }
2609         log->ensureLineBreak();
2610         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2611         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2612         log->writeString("The allowed shrink of domain decomposition cells is:");
2613         for (d = 0; d < DIM; d++)
2614         {
2615             if (dd->nc[d] > 1)
2616             {
2617                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2618                 {
2619                     shrink = 0;
2620                 }
2621                 else
2622                 {
2623                     shrink =
2624                         comm->cellsize_min_dlb[d]/
2625                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2626                 }
2627                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2628             }
2629         }
2630         log->ensureLineBreak();
2631     }
2632     else
2633     {
2634         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2635         log->writeString("The initial number of communication pulses is:");
2636         for (d = 0; d < dd->ndim; d++)
2637         {
2638             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2639         }
2640         log->ensureLineBreak();
2641         log->writeString("The initial domain decomposition cell size is:");
2642         for (d = 0; d < DIM; d++)
2643         {
2644             if (dd->nc[d] > 1)
2645             {
2646                 log->writeStringFormatted(" %c %.2f nm",
2647                                           dim2char(d), dd->comm->cellsize_min[d]);
2648             }
2649         }
2650         log->ensureLineBreak();
2651         log->writeLine();
2652     }
2653
2654     gmx_bool bInterCGVsites = count_intercg_vsites(mtop) != 0;
2655
2656     if (comm->bInterCGBondeds ||
2657         bInterCGVsites ||
2658         dd->splitConstraints || dd->splitSettles)
2659     {
2660         std::string decompUnits;
2661         if (comm->bCGs)
2662         {
2663             decompUnits = "charge groups";
2664         }
2665         else if (comm->useUpdateGroups)
2666         {
2667             decompUnits = "atom groups";
2668         }
2669         else
2670         {
2671             decompUnits = "atoms";
2672         }
2673
2674         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2675         log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2676
2677         if (bDynLoadBal)
2678         {
2679             limit = dd->comm->cellsize_limit;
2680         }
2681         else
2682         {
2683             if (dynamic_dd_box(*dd))
2684             {
2685                 log->writeLine("(the following are initial values, they could change due to box deformation)");
2686             }
2687             limit = dd->comm->cellsize_min[XX];
2688             for (d = 1; d < DIM; d++)
2689             {
2690                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2691             }
2692         }
2693
2694         if (comm->bInterCGBondeds)
2695         {
2696             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2697                                     "two-body bonded interactions", "(-rdd)",
2698                                     std::max(comm->cutoff, comm->cutoff_mbody));
2699             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2700                                     "multi-body bonded interactions", "(-rdd)",
2701                                     (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2702         }
2703         if (bInterCGVsites)
2704         {
2705             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2706                                     "virtual site constructions", "(-rcon)", limit);
2707         }
2708         if (dd->splitConstraints || dd->splitSettles)
2709         {
2710             std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2711                                                        1+ir->nProjOrder);
2712             log->writeLineFormatted("%40s  %-7s %6.3f nm\n",
2713                                     separation.c_str(), "(-rcon)", limit);
2714         }
2715         log->ensureLineBreak();
2716     }
2717 }
2718
2719 static void logSettings(const gmx::MDLogger &mdlog,
2720                         gmx_domdec_t        *dd,
2721                         const gmx_mtop_t    *mtop,
2722                         const t_inputrec    *ir,
2723                         real                 dlb_scale,
2724                         const gmx_ddbox_t   *ddbox)
2725 {
2726     gmx::StringOutputStream stream;
2727     gmx::TextWriter         log(&stream);
2728     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2729     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2730     {
2731         {
2732             log.ensureEmptyLine();
2733             log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2734         }
2735         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2736     }
2737     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2738 }
2739
2740 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2741                                 gmx_domdec_t        *dd,
2742                                 real                 dlb_scale,
2743                                 const t_inputrec    *ir,
2744                                 const gmx_ddbox_t   *ddbox)
2745 {
2746     gmx_domdec_comm_t *comm;
2747     int                d, dim, npulse, npulse_d_max, npulse_d;
2748     gmx_bool           bNoCutOff;
2749
2750     comm = dd->comm;
2751
2752     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2753
2754     /* Determine the maximum number of comm. pulses in one dimension */
2755
2756     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2757
2758     /* Determine the maximum required number of grid pulses */
2759     if (comm->cellsize_limit >= comm->cutoff)
2760     {
2761         /* Only a single pulse is required */
2762         npulse = 1;
2763     }
2764     else if (!bNoCutOff && comm->cellsize_limit > 0)
2765     {
2766         /* We round down slightly here to avoid overhead due to the latency
2767          * of extra communication calls when the cut-off
2768          * would be only slightly longer than the cell size.
2769          * Later cellsize_limit is redetermined,
2770          * so we can not miss interactions due to this rounding.
2771          */
2772         npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2773     }
2774     else
2775     {
2776         /* There is no cell size limit */
2777         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2778     }
2779
2780     if (!bNoCutOff && npulse > 1)
2781     {
2782         /* See if we can do with less pulses, based on dlb_scale */
2783         npulse_d_max = 0;
2784         for (d = 0; d < dd->ndim; d++)
2785         {
2786             dim      = dd->dim[d];
2787             npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2788                                         /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2789             npulse_d_max = std::max(npulse_d_max, npulse_d);
2790         }
2791         npulse = std::min(npulse, npulse_d_max);
2792     }
2793
2794     /* This env var can override npulse */
2795     d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2796     if (d > 0)
2797     {
2798         npulse = d;
2799     }
2800
2801     comm->maxpulse       = 1;
2802     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2803     for (d = 0; d < dd->ndim; d++)
2804     {
2805         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
2806         comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2807         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2808         {
2809             comm->bVacDLBNoLimit = FALSE;
2810         }
2811     }
2812
2813     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2814     if (!comm->bVacDLBNoLimit)
2815     {
2816         comm->cellsize_limit = std::max(comm->cellsize_limit,
2817                                         comm->cutoff/comm->maxpulse);
2818     }
2819     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2820     /* Set the minimum cell size for each DD dimension */
2821     for (d = 0; d < dd->ndim; d++)
2822     {
2823         if (comm->bVacDLBNoLimit ||
2824             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2825         {
2826             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2827         }
2828         else
2829         {
2830             comm->cellsize_min_dlb[dd->dim[d]] =
2831                 comm->cutoff/comm->cd[d].np_dlb;
2832         }
2833     }
2834     if (comm->cutoff_mbody <= 0)
2835     {
2836         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2837     }
2838     if (isDlbOn(comm))
2839     {
2840         set_dlb_limits(dd);
2841     }
2842 }
2843
2844 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2845 {
2846     /* If each molecule is a single charge group
2847      * or we use domain decomposition for each periodic dimension,
2848      * we do not need to take pbc into account for the bonded interactions.
2849      */
2850     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
2851             !(dd->nc[XX] > 1 &&
2852               dd->nc[YY] > 1 &&
2853               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2854 }
2855
2856 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2857 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2858                                   gmx_domdec_t *dd, real dlb_scale,
2859                                   const gmx_mtop_t *mtop, const t_inputrec *ir,
2860                                   const gmx_ddbox_t *ddbox)
2861 {
2862     gmx_domdec_comm_t *comm;
2863     int                natoms_tot;
2864     real               vol_frac;
2865
2866     comm = dd->comm;
2867
2868     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2869     {
2870         init_ddpme(dd, &comm->ddpme[0], 0);
2871         if (comm->npmedecompdim >= 2)
2872         {
2873             init_ddpme(dd, &comm->ddpme[1], 1);
2874         }
2875     }
2876     else
2877     {
2878         comm->npmenodes = 0;
2879         if (dd->pme_nodeid >= 0)
2880         {
2881             gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2882                                  "Can not have separate PME ranks without PME electrostatics");
2883         }
2884     }
2885
2886     if (debug)
2887     {
2888         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2889     }
2890     if (!isDlbDisabled(comm))
2891     {
2892         set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2893     }
2894
2895     logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2896
2897     if (ir->ePBC == epbcNONE)
2898     {
2899         vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2900     }
2901     else
2902     {
2903         vol_frac =
2904             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2905     }
2906     if (debug)
2907     {
2908         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2909     }
2910     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2911
2912     dd->ga2la  = new gmx_ga2la_t(natoms_tot,
2913                                  static_cast<int>(vol_frac*natoms_tot));
2914 }
2915
2916 /*! \brief Set some important DD parameters that can be modified by env.vars */
2917 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2918                                   gmx_domdec_t *dd, int rank_mysim)
2919 {
2920     gmx_domdec_comm_t *comm = dd->comm;
2921
2922     dd->bSendRecv2      = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2923     comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2924     comm->eFlop         = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2925     int recload         = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2926     comm->nstDDDump     = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2927     comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2928     comm->DD_debug      = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2929
2930     if (dd->bSendRecv2)
2931     {
2932         GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2933     }
2934
2935     if (comm->eFlop)
2936     {
2937         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2938         if (comm->eFlop > 1)
2939         {
2940             srand(1 + rank_mysim);
2941         }
2942         comm->bRecordLoad = TRUE;
2943     }
2944     else
2945     {
2946         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2947     }
2948 }
2949
2950 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger           &mdlog,
2951                                         t_commrec                     *cr,
2952                                         const DomdecOptions           &options,
2953                                         const MdrunOptions            &mdrunOptions,
2954                                         const gmx_mtop_t              *mtop,
2955                                         const t_inputrec              *ir,
2956                                         const matrix                   box,
2957                                         gmx::ArrayRef<const gmx::RVec> xGlobal,
2958                                         gmx::LocalAtomSetManager      *atomSets)
2959 {
2960     gmx_domdec_t      *dd;
2961
2962     GMX_LOG(mdlog.info).appendTextFormatted(
2963             "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2964
2965     dd = new gmx_domdec_t;
2966
2967     dd->comm = init_dd_comm();
2968
2969     /* Initialize DD paritioning counters */
2970     dd->comm->partition_step = INT_MIN;
2971     dd->ddp_count            = 0;
2972
2973     set_dd_envvar_options(mdlog, dd, cr->nodeid);
2974
2975     gmx_ddbox_t ddbox = {0};
2976     set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2977                            mtop, ir,
2978                            box, xGlobal,
2979                            &ddbox);
2980
2981     make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2982
2983     if (thisRankHasDuty(cr, DUTY_PP))
2984     {
2985         set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2986
2987         setup_neighbor_relations(dd);
2988     }
2989
2990     /* Set overallocation to avoid frequent reallocation of arrays */
2991     set_over_alloc_dd(TRUE);
2992
2993     clear_dd_cycle_counts(dd);
2994
2995     dd->atomSets = atomSets;
2996
2997     return dd;
2998 }
2999
3000 static gmx_bool test_dd_cutoff(t_commrec     *cr,
3001                                const t_state &state,
3002                                real           cutoffRequested)
3003 {
3004     gmx_domdec_t *dd;
3005     gmx_ddbox_t   ddbox;
3006     int           d, dim, np;
3007     real          inv_cell_size;
3008     int           LocallyLimited;
3009
3010     dd = cr->dd;
3011
3012     set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
3013
3014     LocallyLimited = 0;
3015
3016     for (d = 0; d < dd->ndim; d++)
3017     {
3018         dim = dd->dim[d];
3019
3020         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3021         if (dynamic_dd_box(*dd))
3022         {
3023             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3024         }
3025
3026         np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3027
3028         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3029         {
3030             if (np > dd->comm->cd[d].np_dlb)
3031             {
3032                 return FALSE;
3033             }
3034
3035             /* If a current local cell size is smaller than the requested
3036              * cut-off, we could still fix it, but this gets very complicated.
3037              * Without fixing here, we might actually need more checks.
3038              */
3039             real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3040             if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3041             {
3042                 LocallyLimited = 1;
3043             }
3044         }
3045     }
3046
3047     if (!isDlbDisabled(dd->comm))
3048     {
3049         /* If DLB is not active yet, we don't need to check the grid jumps.
3050          * Actually we shouldn't, because then the grid jump data is not set.
3051          */
3052         if (isDlbOn(dd->comm) &&
3053             check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3054         {
3055             LocallyLimited = 1;
3056         }
3057
3058         gmx_sumi(1, &LocallyLimited, cr);
3059
3060         if (LocallyLimited > 0)
3061         {
3062             return FALSE;
3063         }
3064     }
3065
3066     return TRUE;
3067 }
3068
3069 gmx_bool change_dd_cutoff(t_commrec     *cr,
3070                           const t_state &state,
3071                           real           cutoffRequested)
3072 {
3073     gmx_bool bCutoffAllowed;
3074
3075     bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3076
3077     if (bCutoffAllowed)
3078     {
3079         cr->dd->comm->cutoff = cutoffRequested;
3080     }
3081
3082     return bCutoffAllowed;
3083 }