src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
   5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
   6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
   7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include "gmxpre.h"
  40
  41 #include "domdec.h"
  42
  43 #include "config.h"
  44
  45 #include <cassert>
  46 #include <cinttypes>
  47 #include <climits>
  48 #include <cmath>
  49 #include <cstdio>
  50 #include <cstdlib>
  51 #include <cstring>
  52
  53 #include <algorithm>
  54 #include <memory>
  55
  56 #include "gromacs/domdec/builder.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/dlb.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/ga2la.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/options.h"
  64 #include "gromacs/domdec/partition.h"
  65 #include "gromacs/gmxlib/network.h"
  66 #include "gromacs/gmxlib/nrnb.h"
  67 #include "gromacs/gpu_utils/device_stream_manager.h"
  68 #include "gromacs/gpu_utils/gpu_utils.h"
  69 #include "gromacs/hardware/hw_info.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/math/vectypes.h"
  72 #include "gromacs/mdlib/calc_verletbuf.h"
  73 #include "gromacs/mdlib/constr.h"
  74 #include "gromacs/mdlib/constraintrange.h"
  75 #include "gromacs/mdlib/updategroups.h"
  76 #include "gromacs/mdlib/vsite.h"
  77 #include "gromacs/mdtypes/commrec.h"
  78 #include "gromacs/mdtypes/forceoutput.h"
  79 #include "gromacs/mdtypes/inputrec.h"
  80 #include "gromacs/mdtypes/mdrunoptions.h"
  81 #include "gromacs/mdtypes/state.h"
  82 #include "gromacs/pbcutil/ishift.h"
  83 #include "gromacs/pbcutil/pbc.h"
  84 #include "gromacs/pulling/pull.h"
  85 #include "gromacs/timing/wallcycle.h"
  86 #include "gromacs/topology/block.h"
  87 #include "gromacs/topology/idef.h"
  88 #include "gromacs/topology/ifunc.h"
  89 #include "gromacs/topology/mtop_lookup.h"
  90 #include "gromacs/topology/mtop_util.h"
  91 #include "gromacs/topology/topology.h"
  92 #include "gromacs/utility/basedefinitions.h"
  93 #include "gromacs/utility/basenetwork.h"
  94 #include "gromacs/utility/cstringutil.h"
  95 #include "gromacs/utility/enumerationhelpers.h"
  96 #include "gromacs/utility/exceptions.h"
  97 #include "gromacs/utility/fatalerror.h"
  98 #include "gromacs/utility/gmxmpi.h"
  99 #include "gromacs/utility/logger.h"
 100 #include "gromacs/utility/real.h"
 101 #include "gromacs/utility/smalloc.h"
 102 #include "gromacs/utility/strconvert.h"
 103 #include "gromacs/utility/stringstream.h"
 104 #include "gromacs/utility/stringutil.h"
 105 #include "gromacs/utility/textwriter.h"
 106
 107 #include "atomdistribution.h"
 108 #include "box.h"
 109 #include "cellsizes.h"
 110 #include "distribute.h"
 111 #include "domdec_constraints.h"
 112 #include "domdec_internal.h"
 113 #include "domdec_setup.h"
 114 #include "domdec_vsite.h"
 115 #include "redistribute.h"
 116 #include "utility.h"
 117
 118 // TODO remove this when moving domdec into gmx namespace
 119 using gmx::DdRankOrder;
 120 using gmx::DlbOption;
 121 using gmx::DomdecOptions;
 122
 123 static const char* enumValueToString(DlbState enumValue)
 124 {
 125     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
 126         "off", "off", "auto", "locked", "on", "on"
 127     };
 128     return dlbStateNames[enumValue];
 129 }
 130
 131 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 132 #define DD_CGIBS 2
 133
 134 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 135 #define DD_FLAG_NRCG 65535
 136 #define DD_FLAG_FW(d) (1 << (16 + (d)*2))
 137 #define DD_FLAG_BW(d) (1 << (16 + (d)*2 + 1))
 138
 139 /* The DD zone order */
 140 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
 141                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
 142
 143 /* The non-bonded zone-pair setup for domain decomposition
 144  * The first number is the i-zone, the second number the first j-zone seen by
 145  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 146  * As is, this is for 3D decomposition, where there are 4 i-zones.
 147  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 148  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 149  */
 150 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
 151                                                                { 1, 3, 6 },
 152                                                                { 2, 5, 6 },
 153                                                                { 3, 5, 7 } };
 154
 155 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 156 {
 157     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
 158     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 159     xyz[ZZ] = ind % nc[ZZ];
 160 }
 161
 162 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
 163 {
 164     int ddnodeid = -1;
 165
 166     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 167     const int                 ddindex   = dd_index(dd->numCells, c);
 168     if (cartSetup.bCartesianPP_PME)
 169     {
 170         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
 171     }
 172     else if (cartSetup.bCartesianPP)
 173     {
 174 #if GMX_MPI
 175         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 176 #endif
 177     }
 178     else
 179     {
 180         ddnodeid = ddindex;
 181     }
 182
 183     return ddnodeid;
 184 }
 185
 186 int ddglatnr(const gmx_domdec_t* dd, int i)
 187 {
 188     int atnr = 0;
 189
 190     if (dd == nullptr)
 191     {
 192         atnr = i + 1;
 193     }
 194     else
 195     {
 196         if (i >= dd->comm->atomRanges.numAtomsTotal())
 197         {
 198             gmx_fatal(FARGS,
 199                       "glatnr called with %d, which is larger than the local number of atoms (%d)",
 200                       i,
 201                       dd->comm->atomRanges.numAtomsTotal());
 202         }
 203         atnr = dd->globalAtomIndices[i] + 1;
 204     }
 205
 206     return atnr;
 207 }
 208
 209 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingsPerMoleculeType(const gmx_domdec_t& dd)
 210 {
 211     GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 212     return dd.comm->systemInfo.updateGroupingsPerMoleculeType;
 213 }
 214
 215 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
 216 {
 217     if (state->ddp_count != dd.ddp_count)
 218     {
 219         gmx_incons("The MD state does not match the domain decomposition state");
 220     }
 221
 222     state->cg_gl.resize(dd.ncg_home);
 223     for (int i = 0; i < dd.ncg_home; i++)
 224     {
 225         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
 226     }
 227
 228     state->ddp_count_cg_gl = dd.ddp_count;
 229 }
 230
 231 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
 232 {
 233     return &dd->comm->zones;
 234 }
 235
 236 int dd_numAtomsZones(const gmx_domdec_t& dd)
 237 {
 238     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
 239 }
 240
 241 int dd_numHomeAtoms(const gmx_domdec_t& dd)
 242 {
 243     return dd.comm->atomRanges.numHomeAtoms();
 244 }
 245
 246 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
 247 {
 248     /* We currently set mdatoms entries for all atoms:
 249      * local + non-local + communicated for vsite + constraints
 250      */
 251
 252     return dd.comm->atomRanges.numAtomsTotal();
 253 }
 254
 255 int dd_natoms_vsite(const gmx_domdec_t& dd)
 256 {
 257     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 258 }
 259
 260 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
 261 {
 262     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 263     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 264 }
 265
 266 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
 267 {
 268     wallcycle_start(wcycle, WallCycleCounter::MoveX);
 269
 270     rvec shift = { 0, 0, 0 };
 271
 272     gmx_domdec_comm_t* comm = dd->comm;
 273
 274     int nzone   = 1;
 275     int nat_tot = comm->atomRanges.numHomeAtoms();
 276     for (int d = 0; d < dd->ndim; d++)
 277     {
 278         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
 279         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 280         if (bPBC)
 281         {
 282             copy_rvec(box[dd->dim[d]], shift);
 283         }
 284         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 285         for (const gmx_domdec_ind_t& ind : cd->ind)
 286         {
 287             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 288             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
 289             int                       n          = 0;
 290             if (!bPBC)
 291             {
 292                 for (int j : ind.index)
 293                 {
 294                     sendBuffer[n] = x[j];
 295                     n++;
 296                 }
 297             }
 298             else if (!bScrew)
 299             {
 300                 for (int j : ind.index)
 301                 {
 302                     /* We need to shift the coordinates */
 303                     for (int d = 0; d < DIM; d++)
 304                     {
 305                         sendBuffer[n][d] = x[j][d] + shift[d];
 306                     }
 307                     n++;
 308                 }
 309             }
 310             else
 311             {
 312                 for (int j : ind.index)
 313                 {
 314                     /* Shift x */
 315                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 316                     /* Rotate y and z.
 317                      * This operation requires a special shift force
 318                      * treatment, which is performed in calc_vir.
 319                      */
 320                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 321                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 322                     n++;
 323                 }
 324             }
 325
 326             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 327                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 328
 329             gmx::ArrayRef<gmx::RVec> receiveBuffer;
 330             if (cd->receiveInPlace)
 331             {
 332                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 333             }
 334             else
 335             {
 336                 receiveBuffer = receiveBufferAccess.buffer;
 337             }
 338             /* Send and receive the coordinates */
 339             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 340
 341             if (!cd->receiveInPlace)
 342             {
 343                 int j = 0;
 344                 for (int zone = 0; zone < nzone; zone++)
 345                 {
 346                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 347                     {
 348                         x[i] = receiveBuffer[j++];
 349                     }
 350                 }
 351             }
 352             nat_tot += ind.nrecv[nzone + 1];
 353         }
 354         nzone += nzone;
 355     }
 356
 357     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
 358 }
 359
 360 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
 361 {
 362     wallcycle_start(wcycle, WallCycleCounter::MoveF);
 363
 364     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
 365     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
 366
 367     gmx_domdec_comm_t& comm    = *dd->comm;
 368     int                nzone   = comm.zones.n / 2;
 369     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
 370     for (int d = dd->ndim - 1; d >= 0; d--)
 371     {
 372         /* Only forces in domains near the PBC boundaries need to
 373            consider PBC in the treatment of fshift */
 374         const bool shiftForcesNeedPbc =
 375                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
 376         const bool applyScrewPbc =
 377                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 378         /* Determine which shift vector we need */
 379         ivec vis        = { 0, 0, 0 };
 380         vis[dd->dim[d]] = 1;
 381         const int is    = gmx::ivecToShiftIndex(vis);
 382
 383         /* Loop over the pulses */
 384         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
 385         for (int p = cd.numPulses() - 1; p >= 0; p--)
 386         {
 387             const gmx_domdec_ind_t&   ind = cd.ind[p];
 388             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
 389             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
 390
 391             nat_tot -= ind.nrecv[nzone + 1];
 392
 393             DDBufferAccess<gmx::RVec> sendBufferAccess(
 394                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 395
 396             gmx::ArrayRef<gmx::RVec> sendBuffer;
 397             if (cd.receiveInPlace)
 398             {
 399                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 400             }
 401             else
 402             {
 403                 sendBuffer = sendBufferAccess.buffer;
 404                 int j      = 0;
 405                 for (int zone = 0; zone < nzone; zone++)
 406                 {
 407                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 408                     {
 409                         sendBuffer[j++] = f[i];
 410                     }
 411                 }
 412             }
 413             /* Communicate the forces */
 414             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 415             /* Add the received forces */
 416             int n = 0;
 417             if (!shiftForcesNeedPbc)
 418             {
 419                 for (int j : ind.index)
 420                 {
 421                     for (int d = 0; d < DIM; d++)
 422                     {
 423                         f[j][d] += receiveBuffer[n][d];
 424                     }
 425                     n++;
 426                 }
 427             }
 428             else if (!applyScrewPbc)
 429             {
 430                 for (int j : ind.index)
 431                 {
 432                     for (int d = 0; d < DIM; d++)
 433                     {
 434                         f[j][d] += receiveBuffer[n][d];
 435                     }
 436                     /* Add this force to the shift force */
 437                     for (int d = 0; d < DIM; d++)
 438                     {
 439                         fshift[is][d] += receiveBuffer[n][d];
 440                     }
 441                     n++;
 442                 }
 443             }
 444             else
 445             {
 446                 for (int j : ind.index)
 447                 {
 448                     /* Rotate the force */
 449                     f[j][XX] += receiveBuffer[n][XX];
 450                     f[j][YY] -= receiveBuffer[n][YY];
 451                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 452                     if (shiftForcesNeedPbc)
 453                     {
 454                         /* Add this force to the shift force */
 455                         for (int d = 0; d < DIM; d++)
 456                         {
 457                             fshift[is][d] += receiveBuffer[n][d];
 458                         }
 459                     }
 460                     n++;
 461                 }
 462             }
 463         }
 464         nzone /= 2;
 465     }
 466     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
 467 }
 468
 469 /* Convenience function for extracting a real buffer from an rvec buffer
 470  *
 471  * To reduce the number of temporary communication buffers and avoid
 472  * cache polution, we reuse gmx::RVec buffers for storing reals.
 473  * This functions return a real buffer reference with the same number
 474  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 475  */
 476 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 477 {
 478     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
 479 }
 480
 481 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
 482 {
 483     gmx_domdec_comm_t* comm = dd->comm;
 484
 485     int nzone   = 1;
 486     int nat_tot = comm->atomRanges.numHomeAtoms();
 487     for (int d = 0; d < dd->ndim; d++)
 488     {
 489         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 490         for (const gmx_domdec_ind_t& ind : cd->ind)
 491         {
 492             /* Note: We provision for RVec instead of real, so a factor of 3
 493              * more than needed. The buffer actually already has this size
 494              * and we pass a plain pointer below, so this does not matter.
 495              */
 496             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 497             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 498             int                 n          = 0;
 499             for (int j : ind.index)
 500             {
 501                 sendBuffer[n++] = v[j];
 502             }
 503
 504             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 505                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 506
 507             gmx::ArrayRef<real> receiveBuffer;
 508             if (cd->receiveInPlace)
 509             {
 510                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 511             }
 512             else
 513             {
 514                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 515             }
 516             /* Send and receive the data */
 517             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 518             if (!cd->receiveInPlace)
 519             {
 520                 int j = 0;
 521                 for (int zone = 0; zone < nzone; zone++)
 522                 {
 523                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 524                     {
 525                         v[i] = receiveBuffer[j++];
 526                     }
 527                 }
 528             }
 529             nat_tot += ind.nrecv[nzone + 1];
 530         }
 531         nzone += nzone;
 532     }
 533 }
 534
 535 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
 536 {
 537     gmx_domdec_comm_t* comm = dd->comm;
 538
 539     int nzone   = comm->zones.n / 2;
 540     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 541     for (int d = dd->ndim - 1; d >= 0; d--)
 542     {
 543         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 544         for (int p = cd->numPulses() - 1; p >= 0; p--)
 545         {
 546             const gmx_domdec_ind_t& ind = cd->ind[p];
 547
 548             /* Note: We provision for RVec instead of real, so a factor of 3
 549              * more than needed. The buffer actually already has this size
 550              * and we typecast, so this works as intended.
 551              */
 552             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 553             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 554             nat_tot -= ind.nrecv[nzone + 1];
 555
 556             DDBufferAccess<gmx::RVec> sendBufferAccess(
 557                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 558
 559             gmx::ArrayRef<real> sendBuffer;
 560             if (cd->receiveInPlace)
 561             {
 562                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 563             }
 564             else
 565             {
 566                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 567                 int j      = 0;
 568                 for (int zone = 0; zone < nzone; zone++)
 569                 {
 570                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 571                     {
 572                         sendBuffer[j++] = v[i];
 573                     }
 574                 }
 575             }
 576             /* Communicate the forces */
 577             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 578             /* Add the received forces */
 579             int n = 0;
 580             for (int j : ind.index)
 581             {
 582                 v[j] += receiveBuffer[n];
 583                 n++;
 584             }
 585         }
 586         nzone /= 2;
 587     }
 588 }
 589
 590 real dd_cutoff_multibody(const gmx_domdec_t* dd)
 591 {
 592     const gmx_domdec_comm_t& comm       = *dd->comm;
 593     const DDSystemInfo&      systemInfo = comm.systemInfo;
 594
 595     real r = -1;
 596     if (systemInfo.haveInterDomainMultiBodyBondeds)
 597     {
 598         if (comm.cutoff_mbody > 0)
 599         {
 600             r = comm.cutoff_mbody;
 601         }
 602         else
 603         {
 604             /* cutoff_mbody=0 means we do not have DLB */
 605             r = comm.cellsize_min[dd->dim[0]];
 606             for (int di = 1; di < dd->ndim; di++)
 607             {
 608                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
 609             }
 610             if (comm.systemInfo.filterBondedCommunication)
 611             {
 612                 r = std::max(r, comm.cutoff_mbody);
 613             }
 614             else
 615             {
 616                 r = std::min(r, systemInfo.cutoff);
 617             }
 618         }
 619     }
 620
 621     return r;
 622 }
 623
 624 real dd_cutoff_twobody(const gmx_domdec_t* dd)
 625 {
 626     const real r_mb = dd_cutoff_multibody(dd);
 627
 628     return std::max(dd->comm->systemInfo.cutoff, r_mb);
 629 }
 630
 631
 632 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
 633                                    const CartesianRankSetup& cartSetup,
 634                                    const ivec                coord,
 635                                    ivec                      coord_pme)
 636 {
 637     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
 638     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
 639     copy_ivec(coord, coord_pme);
 640     coord_pme[cartSetup.cartpmedim] =
 641             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
 642 }
 643
 644 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
 645 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
 646 {
 647     const int npp  = ddRankSetup.numPPRanks;
 648     const int npme = ddRankSetup.numRanksDoingPme;
 649
 650     /* Here we assign a PME node to communicate with this DD node
 651      * by assuming that the major index of both is x.
 652      * We add npme/2 to obtain an even distribution.
 653      */
 654     return (ddCellIndex * npme + npme / 2) / npp;
 655 }
 656
 657 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
 658 {
 659     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
 660
 661     int n = 0;
 662     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
 663     {
 664         const int p0 = ddindex2pmeindex(ddRankSetup, i);
 665         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
 666         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
 667         {
 668             if (debug)
 669             {
 670                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
 671             }
 672             pmeRanks[n] = i + 1 + n;
 673             n++;
 674         }
 675     }
 676
 677     return pmeRanks;
 678 }
 679
 680 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
 681 {
 682     ivec coords;
 683
 684     coords[XX]     = x;
 685     coords[YY]     = y;
 686     coords[ZZ]     = z;
 687     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
 688
 689     return slab;
 690 }
 691
 692 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
 693 {
 694     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
 695     ivec                      coords    = { x, y, z };
 696     int                       nodeid    = -1;
 697
 698     if (cartSetup.bCartesianPP_PME)
 699     {
 700 #if GMX_MPI
 701         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 702 #endif
 703     }
 704     else
 705     {
 706         int ddindex = dd_index(cr->dd->numCells, coords);
 707         if (cartSetup.bCartesianPP)
 708         {
 709             nodeid = cartSetup.ddindex2simnodeid[ddindex];
 710         }
 711         else
 712         {
 713             if (cr->dd->comm->ddRankSetup.usePmeOnlyRanks)
 714             {
 715                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
 716             }
 717             else
 718             {
 719                 nodeid = ddindex;
 720             }
 721         }
 722     }
 723
 724     return nodeid;
 725 }
 726
 727 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
 728                               const CartesianRankSetup& cartSetup,
 729                               gmx::ArrayRef<const int>  pmeRanks,
 730                               const t_commrec gmx_unused* cr,
 731                               const int                   sim_nodeid)
 732 {
 733     int pmenode = -1;
 734
 735     /* This assumes a uniform x domain decomposition grid cell size */
 736     if (cartSetup.bCartesianPP_PME)
 737     {
 738 #if GMX_MPI
 739         ivec coord, coord_pme;
 740         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 741         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
 742         {
 743             /* This is a PP rank */
 744             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 745             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 746         }
 747 #endif
 748     }
 749     else if (cartSetup.bCartesianPP)
 750     {
 751         if (sim_nodeid < ddRankSetup.numPPRanks)
 752         {
 753             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 754         }
 755     }
 756     else
 757     {
 758         /* This assumes DD cells with identical x coordinates
 759          * are numbered sequentially.
 760          */
 761         if (pmeRanks.empty())
 762         {
 763             if (sim_nodeid < ddRankSetup.numPPRanks)
 764             {
 765                 /* The DD index equals the nodeid */
 766                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 767             }
 768         }
 769         else
 770         {
 771             int i = 0;
 772             while (sim_nodeid > pmeRanks[i])
 773             {
 774                 i++;
 775             }
 776             if (sim_nodeid < pmeRanks[i])
 777             {
 778                 pmenode = pmeRanks[i];
 779             }
 780         }
 781     }
 782
 783     return pmenode;
 784 }
 785
 786 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
 787 {
 788     if (dd != nullptr)
 789     {
 790         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
 791     }
 792     else
 793     {
 794         return { 1, 1 };
 795     }
 796 }
 797
 798 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
 799 {
 800     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
 801     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
 802     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
 803                        "This function should only be called when PME-only ranks are in use");
 804     std::vector<int> ddranks;
 805     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
 806
 807     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
 808     {
 809         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
 810         {
 811             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
 812             {
 813                 if (cartSetup.bCartesianPP_PME)
 814                 {
 815                     ivec coord = { x, y, z };
 816                     ivec coord_pme;
 817                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 818                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
 819                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
 820                     {
 821                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 822                     }
 823                 }
 824                 else
 825                 {
 826                     /* The slab corresponds to the nodeid in the PME group */
 827                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
 828                     {
 829                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 830                     }
 831                 }
 832             }
 833         }
 834     }
 835     return ddranks;
 836 }
 837
 838 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
 839 {
 840     bool bReceive = true;
 841
 842     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 843     if (ddRankSetup.usePmeOnlyRanks)
 844     {
 845         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 846         if (cartSetup.bCartesianPP_PME)
 847         {
 848 #if GMX_MPI
 849             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 850             ivec coords;
 851             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 852             coords[cartSetup.cartpmedim]++;
 853             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
 854             {
 855                 int rank = 0;
 856                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 857                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
 858                 {
 859                     /* This is not the last PP node for pmenode */
 860                     bReceive = FALSE;
 861                 }
 862             }
 863 #else
 864             GMX_RELEASE_ASSERT(
 865                     false,
 866                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 867 #endif
 868         }
 869         else
 870         {
 871             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 872             if (cr->sim_nodeid + 1 < cr->nnodes
 873                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
 874             {
 875                 /* This is not the last PP node for pmenode */
 876                 bReceive = FALSE;
 877             }
 878         }
 879     }
 880
 881     return bReceive;
 882 }
 883
 884 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
 885 {
 886     gmx_domdec_comm_t* comm = dd->comm;
 887
 888     snew(*dim_f, dd->numCells[dim] + 1);
 889     (*dim_f)[0] = 0;
 890     for (int i = 1; i < dd->numCells[dim]; i++)
 891     {
 892         if (comm->slb_frac[dim])
 893         {
 894             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
 895         }
 896         else
 897         {
 898             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
 899         }
 900     }
 901     (*dim_f)[dd->numCells[dim]] = 1;
 902 }
 903
 904 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
 905 {
 906     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 907
 908     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
 909     {
 910         ddpme->dim = YY;
 911     }
 912     else
 913     {
 914         ddpme->dim = dimind;
 915     }
 916     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 917
 918     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
 919
 920     if (ddpme->nslab <= 1)
 921     {
 922         return;
 923     }
 924
 925     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
 926     /* Determine for each PME slab the PP location range for dimension dim */
 927     snew(ddpme->pp_min, ddpme->nslab);
 928     snew(ddpme->pp_max, ddpme->nslab);
 929     for (int slab = 0; slab < ddpme->nslab; slab++)
 930     {
 931         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
 932         ddpme->pp_max[slab] = 0;
 933     }
 934     for (int i = 0; i < dd->nnodes; i++)
 935     {
 936         ivec xyz;
 937         ddindex2xyz(dd->numCells, i, xyz);
 938         /* For y only use our y/z slab.
 939          * This assumes that the PME x grid size matches the DD grid size.
 940          */
 941         if (dimind == 0 || xyz[XX] == dd->ci[XX])
 942         {
 943             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
 944             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
 945             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
 946             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
 947         }
 948     }
 949
 950     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 951 }
 952
 953 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
 954 {
 955     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 956
 957     if (ddRankSetup.ddpme[0].dim == XX)
 958     {
 959         return ddRankSetup.ddpme[0].maxshift;
 960     }
 961     else
 962     {
 963         return 0;
 964     }
 965 }
 966
 967 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
 968 {
 969     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 970
 971     if (ddRankSetup.ddpme[0].dim == YY)
 972     {
 973         return ddRankSetup.ddpme[0].maxshift;
 974     }
 975     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
 976     {
 977         return ddRankSetup.ddpme[1].maxshift;
 978     }
 979     else
 980     {
 981         return 0;
 982     }
 983 }
 984
 985 bool ddMayHaveSplitConstraints(const gmx_domdec_t& dd)
 986 {
 987     return dd.comm->systemInfo.mayHaveSplitConstraints;
 988 }
 989
 990 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
 991 {
 992     return dd.comm->systemInfo.useUpdateGroups;
 993 }
 994
 995 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
 996 {
 997     /* Note that the cycles value can be incorrect, either 0 or some
 998      * extremely large value, when our thread migrated to another core
 999      * with an unsynchronized cycle counter. If this happens less often
1000      * that once per nstlist steps, this will not cause issues, since
1001      * we later subtract the maximum value from the sum over nstlist steps.
1002      * A zero count will slightly lower the total, but that's a small effect.
1003      * Note that the main purpose of the subtraction of the maximum value
1004      * is to avoid throwing off the load balancing when stalls occur due
1005      * e.g. system activity or network congestion.
1006      */
1007     dd->comm->cycl[ddCycl] += cycles;
1008     dd->comm->cycl_n[ddCycl]++;
1009     if (cycles > dd->comm->cycl_max[ddCycl])
1010     {
1011         dd->comm->cycl_max[ddCycl] = cycles;
1012     }
1013 }
1014
1015 #if GMX_MPI
1016 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1017 {
1018     MPI_Comm c_row = MPI_COMM_NULL;
1019     ivec     loc_c;
1020     bool     bPartOfGroup = false;
1021
1022     const int dim = dd->dim[dim_ind];
1023     copy_ivec(loc, loc_c);
1024     for (int i = 0; i < dd->numCells[dim]; i++)
1025     {
1026         loc_c[dim]     = i;
1027         const int rank = dd_index(dd->numCells, loc_c);
1028         if (rank == dd->rank)
1029         {
1030             /* This process is part of the group */
1031             bPartOfGroup = TRUE;
1032         }
1033     }
1034     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1035     if (bPartOfGroup)
1036     {
1037         dd->comm->mpi_comm_load[dim_ind] = c_row;
1038         if (!isDlbDisabled(dd->comm))
1039         {
1040             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1041
1042             if (dd->ci[dim] == dd->master_ci[dim])
1043             {
1044                 /* This is the root process of this row */
1045                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1046
1047                 RowMaster& rowMaster = *cellsizes.rowMaster;
1048                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1049                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1050                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1051                 if (dim_ind > 0)
1052                 {
1053                     rowMaster.bounds.resize(dd->numCells[dim]);
1054                 }
1055                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1056             }
1057             else
1058             {
1059                 /* This is not a root process, we only need to receive cell_f */
1060                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1061             }
1062         }
1063         if (dd->ci[dim] == dd->master_ci[dim])
1064         {
1065             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1066         }
1067     }
1068 }
1069 #endif
1070
1071 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1072 {
1073 #if GMX_MPI
1074     MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
1075
1076     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1077     {
1078         /* Only ranks with short-ranged tasks (currently) use GPUs.
1079          * If we don't have GPUs assigned, there are no resources to share.
1080          */
1081         return;
1082     }
1083
1084     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1085
1086     gmx_domdec_t* dd = cr->dd;
1087
1088     if (debug)
1089     {
1090         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1091         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1092     }
1093     /* Split the PP communicator over the physical nodes */
1094     /* TODO: See if we should store this (before), as it's also used for
1095      * for the nodecomm summation.
1096      */
1097     // TODO PhysicalNodeCommunicator could be extended/used to handle
1098     // the need for per-node per-group communicators.
1099     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1100     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1101     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1102     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1103
1104     if (debug)
1105     {
1106         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1107     }
1108
1109     /* Note that some ranks could share a GPU, while others don't */
1110
1111     if (dd->comm->nrank_gpu_shared == 1)
1112     {
1113         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1114     }
1115 #else
1116     GMX_UNUSED_VALUE(cr);
1117     GMX_UNUSED_VALUE(gpu_id);
1118 #endif
1119 }
1120
1121 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1122 {
1123 #if GMX_MPI
1124     ivec loc;
1125
1126     if (debug)
1127     {
1128         fprintf(debug, "Making load communicators\n");
1129     }
1130
1131     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1132     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1133
1134     if (dd->ndim == 0)
1135     {
1136         return;
1137     }
1138
1139     clear_ivec(loc);
1140     make_load_communicator(dd, 0, loc);
1141     if (dd->ndim > 1)
1142     {
1143         const int dim0 = dd->dim[0];
1144         for (int i = 0; i < dd->numCells[dim0]; i++)
1145         {
1146             loc[dim0] = i;
1147             make_load_communicator(dd, 1, loc);
1148         }
1149     }
1150     if (dd->ndim > 2)
1151     {
1152         const int dim0 = dd->dim[0];
1153         for (int i = 0; i < dd->numCells[dim0]; i++)
1154         {
1155             loc[dim0]      = i;
1156             const int dim1 = dd->dim[1];
1157             for (int j = 0; j < dd->numCells[dim1]; j++)
1158             {
1159                 loc[dim1] = j;
1160                 make_load_communicator(dd, 2, loc);
1161             }
1162         }
1163     }
1164
1165     if (debug)
1166     {
1167         fprintf(debug, "Finished making load communicators\n");
1168     }
1169 #endif
1170 }
1171
1172 /*! \brief Sets up the relation between neighboring domains and zones */
1173 static void setup_neighbor_relations(gmx_domdec_t* dd)
1174 {
1175     ivec tmp, s;
1176     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1177
1178     for (int d = 0; d < dd->ndim; d++)
1179     {
1180         const int dim = dd->dim[d];
1181         copy_ivec(dd->ci, tmp);
1182         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1183         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1184         copy_ivec(dd->ci, tmp);
1185         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1186         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1187         if (debug)
1188         {
1189             fprintf(debug,
1190                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1191                     dd->rank,
1192                     dim,
1193                     dd->neighbor[d][0],
1194                     dd->neighbor[d][1]);
1195         }
1196     }
1197
1198     int nzone  = (1 << dd->ndim);
1199     int nizone = (1 << std::max(dd->ndim - 1, 0));
1200     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1201
1202     gmx_domdec_zones_t* zones = &dd->comm->zones;
1203
1204     for (int i = 0; i < nzone; i++)
1205     {
1206         int m = 0;
1207         clear_ivec(zones->shift[i]);
1208         for (int d = 0; d < dd->ndim; d++)
1209         {
1210             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1211         }
1212     }
1213
1214     zones->n = nzone;
1215     for (int i = 0; i < nzone; i++)
1216     {
1217         for (int d = 0; d < DIM; d++)
1218         {
1219             s[d] = dd->ci[d] - zones->shift[i][d];
1220             if (s[d] < 0)
1221             {
1222                 s[d] += dd->numCells[d];
1223             }
1224             else if (s[d] >= dd->numCells[d])
1225             {
1226                 s[d] -= dd->numCells[d];
1227             }
1228         }
1229     }
1230     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1231     {
1232         GMX_RELEASE_ASSERT(
1233                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1234                 "The first element for each ddNonbondedZonePairRanges should match its index");
1235
1236         DDPairInteractionRanges iZone;
1237         iZone.iZoneIndex = iZoneIndex;
1238         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1239          * j-zones up to nzone.
1240          */
1241         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1242                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1243         for (int dim = 0; dim < DIM; dim++)
1244         {
1245             if (dd->numCells[dim] == 1)
1246             {
1247                 /* All shifts should be allowed */
1248                 iZone.shift0[dim] = -1;
1249                 iZone.shift1[dim] = 1;
1250             }
1251             else
1252             {
1253                 /* Determine the min/max j-zone shift wrt the i-zone */
1254                 iZone.shift0[dim] = 1;
1255                 iZone.shift1[dim] = -1;
1256                 for (int jZone : iZone.jZoneRange)
1257                 {
1258                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1259                     if (shift_diff < iZone.shift0[dim])
1260                     {
1261                         iZone.shift0[dim] = shift_diff;
1262                     }
1263                     if (shift_diff > iZone.shift1[dim])
1264                     {
1265                         iZone.shift1[dim] = shift_diff;
1266                     }
1267                 }
1268             }
1269         }
1270
1271         zones->iZones.push_back(iZone);
1272     }
1273
1274     if (!isDlbDisabled(dd->comm))
1275     {
1276         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1277     }
1278
1279     if (dd->comm->ddSettings.recordLoad)
1280     {
1281         make_load_communicators(dd);
1282     }
1283 }
1284
1285 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1286                                  gmx_domdec_t*        dd,
1287                                  t_commrec gmx_unused* cr,
1288                                  bool gmx_unused reorder)
1289 {
1290 #if GMX_MPI
1291     gmx_domdec_comm_t*  comm      = dd->comm;
1292     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1293
1294     if (cartSetup.bCartesianPP)
1295     {
1296         /* Set up cartesian communication for the particle-particle part */
1297         GMX_LOG(mdlog.info)
1298                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1299                                      dd->numCells[XX],
1300                                      dd->numCells[YY],
1301                                      dd->numCells[ZZ]);
1302
1303         ivec periods;
1304         for (int i = 0; i < DIM; i++)
1305         {
1306             periods[i] = TRUE;
1307         }
1308         MPI_Comm comm_cart = MPI_COMM_NULL;
1309         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1310         /* We overwrite the old communicator with the new cartesian one */
1311         cr->mpi_comm_mygroup = comm_cart;
1312     }
1313
1314     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1315     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1316
1317     if (cartSetup.bCartesianPP_PME)
1318     {
1319         /* Since we want to use the original cartesian setup for sim,
1320          * and not the one after split, we need to make an index.
1321          */
1322         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1323         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1324         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1325         /* Get the rank of the DD master,
1326          * above we made sure that the master node is a PP node.
1327          */
1328         int rank = MASTER(cr) ? dd->rank : 0;
1329         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1330     }
1331     else if (cartSetup.bCartesianPP)
1332     {
1333         if (!comm->ddRankSetup.usePmeOnlyRanks)
1334         {
1335             /* The PP communicator is also
1336              * the communicator for this simulation
1337              */
1338             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1339         }
1340         cr->nodeid = dd->rank;
1341
1342         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1343
1344         /* We need to make an index to go from the coordinates
1345          * to the nodeid of this simulation.
1346          */
1347         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1348         std::vector<int> buf(dd->nnodes);
1349         if (thisRankHasDuty(cr, DUTY_PP))
1350         {
1351             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1352         }
1353         /* Communicate the ddindex to simulation nodeid index */
1354         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1355
1356         /* Determine the master coordinates and rank.
1357          * The DD master should be the same node as the master of this sim.
1358          */
1359         for (int i = 0; i < dd->nnodes; i++)
1360         {
1361             if (cartSetup.ddindex2simnodeid[i] == 0)
1362             {
1363                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1364                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1365             }
1366         }
1367         if (debug)
1368         {
1369             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1370         }
1371     }
1372     else
1373     {
1374         /* No Cartesian communicators */
1375         /* We use the rank in dd->comm->all as DD index */
1376         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1377         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1378         dd->masterrank = 0;
1379         clear_ivec(dd->master_ci);
1380     }
1381 #endif
1382
1383     GMX_LOG(mdlog.info)
1384             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1385                                  dd->rank,
1386                                  dd->ci[XX],
1387                                  dd->ci[YY],
1388                                  dd->ci[ZZ]);
1389     if (debug)
1390     {
1391         fprintf(debug,
1392                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1393                 dd->rank,
1394                 dd->ci[XX],
1395                 dd->ci[YY],
1396                 dd->ci[ZZ]);
1397     }
1398 }
1399
1400 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1401 {
1402 #if GMX_MPI
1403     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1404
1405     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1406     {
1407         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1408         std::vector<int> buf(dd->nnodes);
1409         if (thisRankHasDuty(cr, DUTY_PP))
1410         {
1411             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1412         }
1413         /* Communicate the ddindex to simulation nodeid index */
1414         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1415     }
1416 #else
1417     GMX_UNUSED_VALUE(dd);
1418     GMX_UNUSED_VALUE(cr);
1419 #endif
1420 }
1421
1422 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1423                                              t_commrec*           cr,
1424                                              const DdRankOrder    ddRankOrder,
1425                                              bool gmx_unused    reorder,
1426                                              const DDRankSetup& ddRankSetup,
1427                                              ivec               ddCellIndex,
1428                                              std::vector<int>*  pmeRanks)
1429 {
1430     CartesianRankSetup cartSetup;
1431
1432     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1433     cartSetup.bCartesianPP_PME = false;
1434
1435     const ivec& numDDCells = ddRankSetup.numPPCells;
1436     /* Initially we set ntot to the number of PP cells */
1437     copy_ivec(numDDCells, cartSetup.ntot);
1438
1439     if (cartSetup.bCartesianPP)
1440     {
1441         const int numDDCellsTot = ddRankSetup.numPPRanks;
1442         bool      bDiv[DIM];
1443         for (int i = 1; i < DIM; i++)
1444         {
1445             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1446         }
1447         if (bDiv[YY] || bDiv[ZZ])
1448         {
1449             cartSetup.bCartesianPP_PME = TRUE;
1450             /* If we have 2D PME decomposition, which is always in x+y,
1451              * we stack the PME only nodes in z.
1452              * Otherwise we choose the direction that provides the thinnest slab
1453              * of PME only nodes as this will have the least effect
1454              * on the PP communication.
1455              * But for the PME communication the opposite might be better.
1456              */
1457             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1458             {
1459                 cartSetup.cartpmedim = ZZ;
1460             }
1461             else
1462             {
1463                 cartSetup.cartpmedim = YY;
1464             }
1465             cartSetup.ntot[cartSetup.cartpmedim] +=
1466                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1467         }
1468         else
1469         {
1470             GMX_LOG(mdlog.info)
1471                     .appendTextFormatted(
1472                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1473                             "nx*nz (%d*%d)",
1474                             ddRankSetup.numRanksDoingPme,
1475                             numDDCells[XX],
1476                             numDDCells[YY],
1477                             numDDCells[XX],
1478                             numDDCells[ZZ]);
1479             GMX_LOG(mdlog.info)
1480                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1481         }
1482     }
1483
1484     if (cartSetup.bCartesianPP_PME)
1485     {
1486 #if GMX_MPI
1487         int  rank = 0;
1488         ivec periods;
1489
1490         GMX_LOG(mdlog.info)
1491                 .appendTextFormatted(
1492                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1493                         cartSetup.ntot[XX],
1494                         cartSetup.ntot[YY],
1495                         cartSetup.ntot[ZZ]);
1496
1497         for (int i = 0; i < DIM; i++)
1498         {
1499             periods[i] = TRUE;
1500         }
1501         MPI_Comm comm_cart = MPI_COMM_NULL;
1502         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1503         MPI_Comm_rank(comm_cart, &rank);
1504         if (MASTER(cr) && rank != 0)
1505         {
1506             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1507         }
1508
1509         /* With this assigment we loose the link to the original communicator
1510          * which will usually be MPI_COMM_WORLD, unless have multisim.
1511          */
1512         cr->mpi_comm_mysim = comm_cart;
1513         cr->sim_nodeid     = rank;
1514
1515         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1516
1517         GMX_LOG(mdlog.info)
1518                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1519                                      cr->sim_nodeid,
1520                                      ddCellIndex[XX],
1521                                      ddCellIndex[YY],
1522                                      ddCellIndex[ZZ]);
1523
1524         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1525         {
1526             cr->duty = DUTY_PP;
1527         }
1528         if (!ddRankSetup.usePmeOnlyRanks
1529             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1530         {
1531             cr->duty = DUTY_PME;
1532         }
1533
1534         /* Split the sim communicator into PP and PME only nodes */
1535         MPI_Comm_split(cr->mpi_comm_mysim,
1536                        getThisRankDuties(cr),
1537                        dd_index(cartSetup.ntot, ddCellIndex),
1538                        &cr->mpi_comm_mygroup);
1539 #else
1540         GMX_UNUSED_VALUE(ddCellIndex);
1541 #endif
1542     }
1543     else
1544     {
1545         switch (ddRankOrder)
1546         {
1547             case DdRankOrder::pp_pme:
1548                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1549                 break;
1550             case DdRankOrder::interleave:
1551                 /* Interleave the PP-only and PME-only ranks */
1552                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1553                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1554                 break;
1555             case DdRankOrder::cartesian: break;
1556             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1557         }
1558
1559         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1560         {
1561             cr->duty = DUTY_PME;
1562         }
1563         else
1564         {
1565             cr->duty = DUTY_PP;
1566         }
1567 #if GMX_MPI
1568         /* Split the sim communicator into PP and PME only nodes */
1569         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1570         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1571 #endif
1572     }
1573
1574     GMX_LOG(mdlog.info)
1575             .appendTextFormatted("This rank does only %s work.\n",
1576                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1577
1578     return cartSetup;
1579 }
1580
1581 /*! \brief Makes the PP communicator and the PME communicator, when needed
1582  *
1583  * Returns the Cartesian rank setup.
1584  * Sets \p cr->mpi_comm_mygroup
1585  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1586  * With separate PME ranks in interleaved order, set the PME ranks in \p pmeRanks.
1587  */
1588 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1589                                                  const DDSettings&    ddSettings,
1590                                                  const DdRankOrder    ddRankOrder,
1591                                                  const DDRankSetup&   ddRankSetup,
1592                                                  t_commrec*           cr,
1593                                                  ivec                 ddCellIndex,
1594                                                  std::vector<int>*    pmeRanks)
1595 {
1596     CartesianRankSetup cartSetup;
1597
1598     // As a default, both group and sim communicators are equal to the default communicator
1599     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1600     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1601     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1602     cr->nodeid           = cr->rankInDefaultCommunicator;
1603     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1604
1605     if (ddRankSetup.usePmeOnlyRanks)
1606     {
1607         /* Split the communicator into a PP and PME part */
1608         cartSetup = split_communicator(
1609                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1610     }
1611     else
1612     {
1613         /* All nodes do PP and PME */
1614         /* We do not require separate communicators */
1615         cartSetup.bCartesianPP     = false;
1616         cartSetup.bCartesianPP_PME = false;
1617     }
1618
1619     return cartSetup;
1620 }
1621
1622 /*! \brief For PP ranks, sets or makes the communicator
1623  *
1624  * For PME ranks get the rank id.
1625  * For PP only ranks, sets the PME-only rank.
1626  */
1627 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1628                                     const DDSettings&        ddSettings,
1629                                     gmx::ArrayRef<const int> pmeRanks,
1630                                     t_commrec*               cr,
1631                                     const int                numAtomsInSystem,
1632                                     gmx_domdec_t*            dd)
1633 {
1634     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1635     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1636
1637     if (thisRankHasDuty(cr, DUTY_PP))
1638     {
1639         /* Copy or make a new PP communicator */
1640
1641         /* We (possibly) reordered the nodes in split_communicator,
1642          * so it is no longer required in make_pp_communicator.
1643          */
1644         const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1645
1646         make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1647     }
1648     else
1649     {
1650         receive_ddindex2simnodeid(dd, cr);
1651     }
1652
1653     if (!thisRankHasDuty(cr, DUTY_PME))
1654     {
1655         /* Set up the commnuication to our PME node */
1656         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1657         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1658         if (debug)
1659         {
1660             fprintf(debug,
1661                     "My pme_nodeid %d receive ener %s\n",
1662                     dd->pme_nodeid,
1663                     gmx::boolToString(dd->pme_receive_vir_ener));
1664         }
1665     }
1666     else
1667     {
1668         dd->pme_nodeid = -1;
1669     }
1670
1671     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1672     if (MASTER(cr))
1673     {
1674         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1675     }
1676 }
1677
1678 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1679 {
1680     real* slb_frac = nullptr;
1681     if (nc > 1 && size_string != nullptr)
1682     {
1683         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1684         snew(slb_frac, nc);
1685         real tot = 0;
1686         for (int i = 0; i < nc; i++)
1687         {
1688             double dbl = 0;
1689             int    n   = 0;
1690             sscanf(size_string, "%20lf%n", &dbl, &n);
1691             if (dbl == 0)
1692             {
1693                 gmx_fatal(FARGS,
1694                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1695                           dir,
1696                           size_string);
1697             }
1698             slb_frac[i] = dbl;
1699             size_string += n;
1700             tot += slb_frac[i];
1701         }
1702         /* Normalize */
1703         std::string relativeCellSizes = "Relative cell sizes:";
1704         for (int i = 0; i < nc; i++)
1705         {
1706             slb_frac[i] /= tot;
1707             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1708         }
1709         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1710     }
1711
1712     return slb_frac;
1713 }
1714
1715 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1716 {
1717     int n = 0;
1718     for (const auto ilists : IListRange(mtop))
1719     {
1720         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1721         {
1722             if (NRAL(ilist.functionType) > 2)
1723             {
1724                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1725             }
1726         }
1727     }
1728
1729     return n;
1730 }
1731
1732 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1733 {
1734     int   nst = def;
1735     char* val = getenv(env_var);
1736     if (val)
1737     {
1738         if (sscanf(val, "%20d", &nst) <= 0)
1739         {
1740             nst = 1;
1741         }
1742         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1743     }
1744
1745     return nst;
1746 }
1747
1748 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1749 {
1750     if (inputrec.pbcType == PbcType::Screw
1751         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1752     {
1753         gmx_fatal(FARGS,
1754                   "With pbc=%s can only do domain decomposition in the x-direction",
1755                   c_pbcTypeNames[inputrec.pbcType].c_str());
1756     }
1757
1758     if (inputrec.nstlist == 0)
1759     {
1760         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1761     }
1762
1763     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1764     {
1765         GMX_LOG(mdlog.warning)
1766                 .appendText(
1767                         "comm-mode angular will give incorrect results when the comm group "
1768                         "partially crosses a periodic boundary");
1769     }
1770 }
1771
1772 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1773 {
1774     real r = ddbox.box_size[XX];
1775     for (int d = 0; d < DIM; d++)
1776     {
1777         if (numDomains[d] > 1)
1778         {
1779             /* Check using the initial average cell size */
1780             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1781         }
1782     }
1783
1784     return r;
1785 }
1786
1787 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1788  */
1789 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1790                                   const std::string&   reasonStr,
1791                                   const gmx::MDLogger& mdlog)
1792 {
1793     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1794     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1795
1796     if (cmdlineDlbState == DlbState::onUser)
1797     {
1798         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1799     }
1800     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1801     {
1802         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1803     }
1804     return DlbState::offForever;
1805 }
1806
1807 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1808  *
1809  * This function parses the parameters of "-dlb" command line option setting
1810  * corresponding state values. Then it checks the consistency of the determined
1811  * state with other run parameters and settings. As a result, the initial state
1812  * may be altered or an error may be thrown if incompatibility of options is detected.
1813  *
1814  * \param [in] mdlog       Logger.
1815  * \param [in] dlbOption   Enum value for the DLB option.
1816  * \param [in] bRecordLoad True if the load balancer is recording load information.
1817  * \param [in] mdrunOptions  Options for mdrun.
1818  * \param [in] inputrec    Pointer mdrun to input parameters.
1819  * \returns                DLB initial/startup state.
1820  */
1821 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1822                                          DlbOption                dlbOption,
1823                                          gmx_bool                 bRecordLoad,
1824                                          const gmx::MdrunOptions& mdrunOptions,
1825                                          const t_inputrec&        inputrec)
1826 {
1827     DlbState dlbState = DlbState::offCanTurnOn;
1828
1829     switch (dlbOption)
1830     {
1831         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1832         case DlbOption::no: dlbState = DlbState::offUser; break;
1833         case DlbOption::yes: dlbState = DlbState::onUser; break;
1834         default: gmx_incons("Invalid dlbOption enum value");
1835     }
1836
1837     /* Reruns don't support DLB: bail or override auto mode */
1838     if (mdrunOptions.rerun)
1839     {
1840         std::string reasonStr = "it is not supported in reruns.";
1841         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1842     }
1843
1844     /* Unsupported integrators */
1845     if (!EI_DYNAMICS(inputrec.eI))
1846     {
1847         auto reasonStr =
1848                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1849                                   enumValueToString(inputrec.eI));
1850         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1851     }
1852
1853     /* Without cycle counters we can't time work to balance on */
1854     if (!bRecordLoad)
1855     {
1856         std::string reasonStr =
1857                 "cycle counters unsupported or not enabled in the operating system kernel.";
1858         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1859     }
1860
1861     if (mdrunOptions.reproducible)
1862     {
1863         std::string reasonStr = "you started a reproducible run.";
1864         switch (dlbState)
1865         {
1866             case DlbState::offUser: break;
1867             case DlbState::offForever:
1868                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1869                 break;
1870             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1871             case DlbState::onCanTurnOff:
1872                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1873                 break;
1874             case DlbState::onUser:
1875                 return forceDlbOffOrBail(
1876                         dlbState,
1877                         reasonStr
1878                                 + " In load balanced runs binary reproducibility cannot be "
1879                                   "ensured.",
1880                         mdlog);
1881             default:
1882                 gmx_fatal(FARGS,
1883                           "Death horror: undefined case (%d) for load balancing choice",
1884                           static_cast<int>(dlbState));
1885         }
1886     }
1887
1888     return dlbState;
1889 }
1890
1891 static gmx_domdec_comm_t* init_dd_comm()
1892 {
1893     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1894
1895     comm->n_load_have    = 0;
1896     comm->n_load_collect = 0;
1897
1898     comm->haveTurnedOffDlb = false;
1899
1900     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1901     {
1902         comm->sum_nat[i] = 0;
1903     }
1904     comm->ndecomp   = 0;
1905     comm->nload     = 0;
1906     comm->load_step = 0;
1907     comm->load_sum  = 0;
1908     comm->load_max  = 0;
1909     clear_ivec(comm->load_lim);
1910     comm->load_mdf = 0;
1911     comm->load_pme = 0;
1912
1913     /* This should be replaced by a unique pointer */
1914     comm->balanceRegion = ddBalanceRegionAllocate();
1915
1916     return comm;
1917 }
1918
1919 /* Returns whether mtop contains constraints and/or vsites */
1920 static bool systemHasConstraintsOrVsites(const gmx_mtop_t& mtop)
1921 {
1922     return std::any_of(IListRange(mtop).begin(), IListRange(mtop).end(), [](const auto ilist) {
1923         return !extractILists(ilist.list(), IF_CONSTRAINT | IF_VSITE).empty();
1924     });
1925 }
1926
1927 static void setupUpdateGroups(const gmx::MDLogger& mdlog,
1928                               const gmx_mtop_t&    mtop,
1929                               const t_inputrec&    inputrec,
1930                               const real           cutoffMargin,
1931                               DDSystemInfo*        systemInfo)
1932 {
1933     /* When we have constraints and/or vsites, it is beneficial to use
1934      * update groups (when possible) to allow independent update of groups.
1935      */
1936     if (!systemHasConstraintsOrVsites(mtop))
1937     {
1938         /* No constraints or vsites, atoms can be updated independently */
1939         return;
1940     }
1941
1942     systemInfo->updateGroupingsPerMoleculeType = gmx::makeUpdateGroupingsPerMoleculeType(mtop);
1943     systemInfo->useUpdateGroups = (!systemInfo->updateGroupingsPerMoleculeType.empty()
1944                                    && getenv("GMX_NO_UPDATEGROUPS") == nullptr);
1945
1946     if (systemInfo->useUpdateGroups)
1947     {
1948         int numUpdateGroups = 0;
1949         for (const auto& molblock : mtop.molblock)
1950         {
1951             numUpdateGroups += molblock.nmol
1952                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1953         }
1954
1955         systemInfo->maxUpdateGroupRadius = computeMaxUpdateGroupRadius(
1956                 mtop, systemInfo->updateGroupingsPerMoleculeType, maxReferenceTemperature(inputrec));
1957
1958         /* To use update groups, the large domain-to-domain cutoff distance
1959          * should be compatible with the box size.
1960          */
1961         systemInfo->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*systemInfo, 0) < cutoffMargin);
1962
1963         if (systemInfo->useUpdateGroups)
1964         {
1965             GMX_LOG(mdlog.info)
1966                     .appendTextFormatted(
1967                             "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1968                             "nm\n",
1969                             numUpdateGroups,
1970                             mtop.natoms / static_cast<double>(numUpdateGroups),
1971                             systemInfo->maxUpdateGroupRadius);
1972         }
1973         else
1974         {
1975             GMX_LOG(mdlog.info)
1976                     .appendTextFormatted(
1977                             "The combination of rlist and box size prohibits the use of update "
1978                             "groups\n");
1979             systemInfo->updateGroupingsPerMoleculeType.clear();
1980         }
1981     }
1982 }
1983
1984 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1985     npbcdim(numPbcDimensions(ir.pbcType)),
1986     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1987     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1988     haveScrewPBC(ir.pbcType == PbcType::Screw)
1989 {
1990 }
1991
1992 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1993 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1994                                     const bool                                  useUpdateGroups,
1995                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1996 {
1997     if (useUpdateGroups)
1998     {
1999         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
2000                            "Need one grouping per moltype");
2001         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
2002         {
2003             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
2004             {
2005                 return false;
2006             }
2007         }
2008     }
2009     else
2010     {
2011         for (const auto& moltype : mtop.moltype)
2012         {
2013             if (moltype.atoms.nr > 1)
2014             {
2015                 return false;
2016             }
2017         }
2018     }
2019
2020     return true;
2021 }
2022
2023 /*! \brief Generate the simulation system information */
2024 static DDSystemInfo getSystemInfo(const gmx::MDLogger&           mdlog,
2025                                   DDRole                         ddRole,
2026                                   MPI_Comm                       communicator,
2027                                   const DomdecOptions&           options,
2028                                   const gmx_mtop_t&              mtop,
2029                                   const t_inputrec&              ir,
2030                                   const matrix                   box,
2031                                   gmx::ArrayRef<const gmx::RVec> xGlobal)
2032 {
2033     const real tenPercentMargin = 1.1;
2034
2035     DDSystemInfo systemInfo;
2036
2037     /* We need to decide on update groups early, as this affects communication distances */
2038     systemInfo.useUpdateGroups = false;
2039     if (ir.cutoff_scheme == CutoffScheme::Verlet)
2040     {
2041         real cutoffMargin = std::sqrt(max_cutoff2(ir.pbcType, box)) - ir.rlist;
2042         setupUpdateGroups(mdlog, mtop, ir, cutoffMargin, &systemInfo);
2043     }
2044
2045     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2046             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2047     systemInfo.haveInterDomainBondeds =
2048             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2049     systemInfo.haveInterDomainMultiBodyBondeds =
2050             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2051
2052     if (systemInfo.useUpdateGroups)
2053     {
2054         systemInfo.mayHaveSplitConstraints = false;
2055         systemInfo.mayHaveSplitSettles     = false;
2056     }
2057     else
2058     {
2059         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2060                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2061         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2062     }
2063
2064     if (ir.rlist == 0)
2065     {
2066         /* Set the cut-off to some very large value,
2067          * so we don't need if statements everywhere in the code.
2068          * We use sqrt, since the cut-off is squared in some places.
2069          */
2070         systemInfo.cutoff = GMX_CUTOFF_INF;
2071     }
2072     else
2073     {
2074         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2075     }
2076     systemInfo.minCutoffForMultiBody = 0;
2077
2078     /* Determine the minimum cell size limit, affected by many factors */
2079     systemInfo.cellsizeLimit             = 0;
2080     systemInfo.filterBondedCommunication = false;
2081
2082     /* We do not allow home atoms to move beyond the neighboring domain
2083      * between domain decomposition steps, which limits the cell size.
2084      * Get an estimate of cell size limit due to atom displacement.
2085      * In most cases this is a large overestimate, because it assumes
2086      * non-interaction atoms.
2087      * We set the chance to 1 in a trillion steps.
2088      */
2089     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2090     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2091             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2092     GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2093
2094     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2095
2096     /* TODO: PME decomposition currently requires atoms not to be more than
2097      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2098      *       In nearly all cases, limitForAtomDisplacement will be smaller
2099      *       than 2/3*rlist, so the PME requirement is satisfied.
2100      *       But it would be better for both correctness and performance
2101      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2102      *       Note that we would need to improve the pairlist buffer case.
2103      */
2104
2105     if (systemInfo.haveInterDomainBondeds)
2106     {
2107         if (options.minimumCommunicationRange > 0)
2108         {
2109             systemInfo.minCutoffForMultiBody =
2110                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2111             if (options.useBondedCommunication)
2112             {
2113                 systemInfo.filterBondedCommunication =
2114                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2115             }
2116             else
2117             {
2118                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2119             }
2120         }
2121         else if (ir.bPeriodicMols)
2122         {
2123             /* Can not easily determine the required cut-off */
2124             GMX_LOG(mdlog.warning)
2125                     .appendText(
2126                             "NOTE: Periodic molecules are present in this system. Because of this, "
2127                             "the domain decomposition algorithm cannot easily determine the "
2128                             "minimum cell size that it requires for treating bonded interactions. "
2129                             "Instead, domain decomposition will assume that half the non-bonded "
2130                             "cut-off will be a suitable lower bound.");
2131             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2132         }
2133         else
2134         {
2135             real r_2b = 0;
2136             real r_mb = 0;
2137
2138             if (ddRole == DDRole::Master)
2139             {
2140                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2141             }
2142             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2143             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2144
2145             /* We use an initial margin of 10% for the minimum cell size,
2146              * except when we are just below the non-bonded cut-off.
2147              */
2148             if (options.useBondedCommunication)
2149             {
2150                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2151                 {
2152                     const real r_bonded              = std::max(r_2b, r_mb);
2153                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2154                     /* This is the (only) place where we turn on the filtering */
2155                     systemInfo.filterBondedCommunication = true;
2156                 }
2157                 else
2158                 {
2159                     const real r_bonded = r_mb;
2160                     systemInfo.minCutoffForMultiBody =
2161                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2162                 }
2163                 /* We determine cutoff_mbody later */
2164                 systemInfo.increaseMultiBodyCutoff = true;
2165             }
2166             else
2167             {
2168                 /* No special bonded communication,
2169                  * simply increase the DD cut-off.
2170                  */
2171                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2172                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2173             }
2174         }
2175         GMX_LOG(mdlog.info)
2176                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2177                                      systemInfo.minCutoffForMultiBody);
2178
2179         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2180     }
2181
2182     systemInfo.constraintCommunicationRange = 0;
2183     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2184     {
2185         /* There is a cell size limit due to the constraints (P-LINCS) */
2186         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2187         GMX_LOG(mdlog.info)
2188                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2189                                      systemInfo.constraintCommunicationRange);
2190         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2191         {
2192             GMX_LOG(mdlog.info)
2193                     .appendText(
2194                             "This distance will limit the DD cell size, you can override this with "
2195                             "-rcon");
2196         }
2197     }
2198     else if (options.constraintCommunicationRange > 0)
2199     {
2200         /* Here we do not check for dd->splitConstraints.
2201          * because one can also set a cell size limit for virtual sites only
2202          * and at this point we don't know yet if there are intercg v-sites.
2203          */
2204         GMX_LOG(mdlog.info)
2205                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2206                                      options.constraintCommunicationRange);
2207         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2208     }
2209     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2210
2211     return systemInfo;
2212 }
2213
2214 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2215  * implemented. */
2216 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2217                              DDRole               ddRole,
2218                              MPI_Comm             communicator,
2219                              int                  numNodes,
2220                              const DomdecOptions& options,
2221                              const DDSettings&    ddSettings,
2222                              const DDSystemInfo&  systemInfo,
2223                              const real           cellsizeLimit,
2224                              const gmx_ddbox_t&   ddbox)
2225 {
2226     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2227     {
2228         const bool  bC = (systemInfo.mayHaveSplitConstraints
2229                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2230         std::string message =
2231                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2232                                   !bC ? "-rdd" : "-rcon",
2233                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2234                                   bC ? " or your LINCS settings" : "");
2235
2236         gmx_fatal_collective(FARGS,
2237                              communicator,
2238                              ddRole == DDRole::Master,
2239                              "There is no domain decomposition for %d ranks that is compatible "
2240                              "with the given box and a minimum cell size of %g nm\n"
2241                              "%s\n"
2242                              "Look in the log file for details on the domain decomposition",
2243                              numNodes - ddGridSetup.numPmeOnlyRanks,
2244                              cellsizeLimit,
2245                              message.c_str());
2246     }
2247
2248     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2249     if (acs < cellsizeLimit)
2250     {
2251         if (options.numCells[XX] <= 0)
2252         {
2253             GMX_RELEASE_ASSERT(
2254                     false,
2255                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2256         }
2257         else
2258         {
2259             gmx_fatal_collective(
2260                     FARGS,
2261                     communicator,
2262                     ddRole == DDRole::Master,
2263                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2264                     "options -dd, -rdd or -rcon, see the log file for details",
2265                     acs,
2266                     cellsizeLimit);
2267         }
2268     }
2269
2270     const int numPPRanks =
2271             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2272     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2273     {
2274         gmx_fatal_collective(FARGS,
2275                              communicator,
2276                              ddRole == DDRole::Master,
2277                              "The size of the domain decomposition grid (%d) does not match the "
2278                              "number of PP ranks (%d). The total number of ranks is %d",
2279                              numPPRanks,
2280                              numNodes - ddGridSetup.numPmeOnlyRanks,
2281                              numNodes);
2282     }
2283     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2284     {
2285         gmx_fatal_collective(FARGS,
2286                              communicator,
2287                              ddRole == DDRole::Master,
2288                              "The number of separate PME ranks (%d) is larger than the number of "
2289                              "PP ranks (%d), this is not supported.",
2290                              ddGridSetup.numPmeOnlyRanks,
2291                              numPPRanks);
2292     }
2293 }
2294
2295 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2296 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2297                                   int                  numNodes,
2298                                   const DDGridSetup&   ddGridSetup,
2299                                   const t_inputrec&    ir)
2300 {
2301     GMX_LOG(mdlog.info)
2302             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2303                                  ddGridSetup.numDomains[XX],
2304                                  ddGridSetup.numDomains[YY],
2305                                  ddGridSetup.numDomains[ZZ],
2306                                  ddGridSetup.numPmeOnlyRanks);
2307
2308     DDRankSetup ddRankSetup;
2309
2310     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2311     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2312
2313     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2314     if (ddRankSetup.usePmeOnlyRanks)
2315     {
2316         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2317     }
2318     else
2319     {
2320         ddRankSetup.numRanksDoingPme =
2321                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2322     }
2323
2324     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2325     {
2326         /* The following choices should match those
2327          * in comm_cost_est in domdec_setup.c.
2328          * Note that here the checks have to take into account
2329          * that the decomposition might occur in a different order than xyz
2330          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2331          * in which case they will not match those in comm_cost_est,
2332          * but since that is mainly for testing purposes that's fine.
2333          */
2334         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2335             && ddGridSetup.ddDimensions[1] == YY
2336             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2337             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2338             && getenv("GMX_PMEONEDD") == nullptr)
2339         {
2340             ddRankSetup.npmedecompdim = 2;
2341             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2342             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2343         }
2344         else
2345         {
2346             /* In case nc is 1 in both x and y we could still choose to
2347              * decompose pme in y instead of x, but we use x for simplicity.
2348              */
2349             ddRankSetup.npmedecompdim = 1;
2350             if (ddGridSetup.ddDimensions[0] == YY)
2351             {
2352                 ddRankSetup.npmenodes_x = 1;
2353                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2354             }
2355             else
2356             {
2357                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2358                 ddRankSetup.npmenodes_y = 1;
2359             }
2360         }
2361         GMX_LOG(mdlog.info)
2362                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2363                                      ddRankSetup.npmenodes_x,
2364                                      ddRankSetup.npmenodes_y,
2365                                      1);
2366     }
2367     else
2368     {
2369         ddRankSetup.npmedecompdim = 0;
2370         ddRankSetup.npmenodes_x   = 0;
2371         ddRankSetup.npmenodes_y   = 0;
2372     }
2373
2374     return ddRankSetup;
2375 }
2376
2377 /*! \brief Set the cell size and interaction limits */
2378 static void set_dd_limits(const gmx::MDLogger& mdlog,
2379                           DDRole               ddRole,
2380                           gmx_domdec_t*        dd,
2381                           const DomdecOptions& options,
2382                           const DDSettings&    ddSettings,
2383                           const DDSystemInfo&  systemInfo,
2384                           const DDGridSetup&   ddGridSetup,
2385                           const int            numPPRanks,
2386                           const gmx_mtop_t&    mtop,
2387                           const t_inputrec&    ir,
2388                           const gmx_ddbox_t&   ddbox)
2389 {
2390     gmx_domdec_comm_t* comm = dd->comm;
2391     comm->ddSettings        = ddSettings;
2392
2393     /* Initialize to GPU share count to 0, might change later */
2394     comm->nrank_gpu_shared = 0;
2395
2396     comm->dlbState = comm->ddSettings.initialDlbState;
2397     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2398     /* To consider turning DLB on after 2*nstlist steps we need to check
2399      * at partitioning count 3. Thus we need to increase the first count by 2.
2400      */
2401     comm->ddPartioningCountFirstDlbOff += 2;
2402
2403     comm->bPMELoadBalDLBLimits = FALSE;
2404
2405     /* Allocate the charge group/atom sorting struct */
2406     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2407
2408     comm->systemInfo = systemInfo;
2409
2410     if (systemInfo.useUpdateGroups)
2411     {
2412         /* Note: We would like to use dd->nnodes for the atom count estimate,
2413          *       but that is not yet available here. But this anyhow only
2414          *       affect performance up to the second dd_partition_system call.
2415          */
2416         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2417         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2418                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2419     }
2420
2421     /* Set the DD setup given by ddGridSetup */
2422     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2423     dd->ndim = ddGridSetup.numDDDimensions;
2424     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2425
2426     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2427
2428     snew(comm->slb_frac, DIM);
2429     if (isDlbDisabled(comm))
2430     {
2431         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2432         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2433         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2434     }
2435
2436     /* Set the multi-body cut-off and cellsize limit for DLB */
2437     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2438     comm->cellsize_limit = systemInfo.cellsizeLimit;
2439     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2440     {
2441         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2442         {
2443             /* Set the bonded communication distance to halfway
2444              * the minimum and the maximum,
2445              * since the extra communication cost is nearly zero.
2446              */
2447             real acs           = average_cellsize_min(ddbox, dd->numCells);
2448             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2449             if (!isDlbDisabled(comm))
2450             {
2451                 /* Check if this does not limit the scaling */
2452                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2453             }
2454             if (!systemInfo.filterBondedCommunication)
2455             {
2456                 /* Without bBondComm do not go beyond the n.b. cut-off */
2457                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2458                 if (comm->cellsize_limit >= systemInfo.cutoff)
2459                 {
2460                     /* We don't loose a lot of efficieny
2461                      * when increasing it to the n.b. cut-off.
2462                      * It can even be slightly faster, because we need
2463                      * less checks for the communication setup.
2464                      */
2465                     comm->cutoff_mbody = systemInfo.cutoff;
2466                 }
2467             }
2468             /* Check if we did not end up below our original limit */
2469             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2470
2471             if (comm->cutoff_mbody > comm->cellsize_limit)
2472             {
2473                 comm->cellsize_limit = comm->cutoff_mbody;
2474             }
2475         }
2476         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2477     }
2478
2479     if (debug)
2480     {
2481         fprintf(debug,
2482                 "Bonded atom communication beyond the cut-off: %s\n"
2483                 "cellsize limit %f\n",
2484                 gmx::boolToString(systemInfo.filterBondedCommunication),
2485                 comm->cellsize_limit);
2486     }
2487
2488     if (ddRole == DDRole::Master)
2489     {
2490         check_dd_restrictions(dd, ir, mdlog);
2491     }
2492 }
2493
2494 static void writeSettings(gmx::TextWriter*   log,
2495                           gmx_domdec_t*      dd,
2496                           const gmx_mtop_t&  mtop,
2497                           const t_inputrec&  ir,
2498                           gmx_bool           bDynLoadBal,
2499                           real               dlb_scale,
2500                           const gmx_ddbox_t* ddbox)
2501 {
2502     gmx_domdec_comm_t* comm = dd->comm;
2503
2504     if (bDynLoadBal)
2505     {
2506         log->writeString("The maximum number of communication pulses is:");
2507         for (int d = 0; d < dd->ndim; d++)
2508         {
2509             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2510         }
2511         log->ensureLineBreak();
2512         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2513                                 comm->cellsize_limit);
2514         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2515         log->writeString("The allowed shrink of domain decomposition cells is:");
2516         for (int d = 0; d < DIM; d++)
2517         {
2518             if (dd->numCells[d] > 1)
2519             {
2520                 const real shrink =
2521                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2522                                 ? 0
2523                                 : comm->cellsize_min_dlb[d]
2524                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2525                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2526             }
2527         }
2528         log->ensureLineBreak();
2529     }
2530     else
2531     {
2532         ivec np;
2533         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2534         log->writeString("The initial number of communication pulses is:");
2535         for (int d = 0; d < dd->ndim; d++)
2536         {
2537             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2538         }
2539         log->ensureLineBreak();
2540         log->writeString("The initial domain decomposition cell size is:");
2541         for (int d = 0; d < DIM; d++)
2542         {
2543             if (dd->numCells[d] > 1)
2544             {
2545                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2546             }
2547         }
2548         log->ensureLineBreak();
2549         log->writeLine();
2550     }
2551
2552     const bool haveInterDomainVsites =
2553             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2554
2555     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2556         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2557     {
2558         std::string decompUnits;
2559         if (comm->systemInfo.useUpdateGroups)
2560         {
2561             decompUnits = "atom groups";
2562         }
2563         else
2564         {
2565             decompUnits = "atoms";
2566         }
2567
2568         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2569                                 decompUnits.c_str());
2570         log->writeLineFormatted(
2571                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2572
2573         real limit = 0;
2574         if (bDynLoadBal)
2575         {
2576             limit = dd->comm->cellsize_limit;
2577         }
2578         else
2579         {
2580             if (dd->unitCellInfo.ddBoxIsDynamic)
2581             {
2582                 log->writeLine(
2583                         "(the following are initial values, they could change due to box "
2584                         "deformation)");
2585             }
2586             limit = dd->comm->cellsize_min[XX];
2587             for (int d = 1; d < DIM; d++)
2588             {
2589                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2590             }
2591         }
2592
2593         if (comm->systemInfo.haveInterDomainBondeds)
2594         {
2595             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2596                                     "two-body bonded interactions",
2597                                     "(-rdd)",
2598                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2599             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2600                                     "multi-body bonded interactions",
2601                                     "(-rdd)",
2602                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2603                                             ? comm->cutoff_mbody
2604                                             : std::min(comm->systemInfo.cutoff, limit));
2605         }
2606         if (haveInterDomainVsites)
2607         {
2608             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2609         }
2610         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2611         {
2612             std::string separation =
2613                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2614             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2615         }
2616         log->ensureLineBreak();
2617     }
2618 }
2619
2620 static void logSettings(const gmx::MDLogger& mdlog,
2621                         gmx_domdec_t*        dd,
2622                         const gmx_mtop_t&    mtop,
2623                         const t_inputrec&    ir,
2624                         real                 dlb_scale,
2625                         const gmx_ddbox_t*   ddbox)
2626 {
2627     gmx::StringOutputStream stream;
2628     gmx::TextWriter         log(&stream);
2629     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2630     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2631     {
2632         {
2633             log.ensureEmptyLine();
2634             log.writeLine(
2635                     "When dynamic load balancing gets turned on, these settings will change to:");
2636         }
2637         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2638     }
2639     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2640 }
2641
2642 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2643                                 gmx_domdec_t*        dd,
2644                                 real                 dlb_scale,
2645                                 const t_inputrec&    inputrec,
2646                                 const gmx_ddbox_t*   ddbox)
2647 {
2648     int npulse       = 0;
2649     int npulse_d_max = 0;
2650     int npulse_d     = 0;
2651
2652     gmx_domdec_comm_t* comm = dd->comm;
2653
2654     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2655
2656     /* Determine the maximum number of comm. pulses in one dimension */
2657
2658     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2659
2660     /* Determine the maximum required number of grid pulses */
2661     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2662     {
2663         /* Only a single pulse is required */
2664         npulse = 1;
2665     }
2666     else if (!bNoCutOff && comm->cellsize_limit > 0)
2667     {
2668         /* We round down slightly here to avoid overhead due to the latency
2669          * of extra communication calls when the cut-off
2670          * would be only slightly longer than the cell size.
2671          * Later cellsize_limit is redetermined,
2672          * so we can not miss interactions due to this rounding.
2673          */
2674         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
2675     }
2676     else
2677     {
2678         /* There is no cell size limit */
2679         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2680     }
2681
2682     if (!bNoCutOff && npulse > 1)
2683     {
2684         /* See if we can do with less pulses, based on dlb_scale */
2685         npulse_d_max = 0;
2686         for (int d = 0; d < dd->ndim; d++)
2687         {
2688             int dim  = dd->dim[d];
2689             npulse_d = static_cast<int>(
2690                     1
2691                     + dd->numCells[dim] * comm->systemInfo.cutoff
2692                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2693             npulse_d_max = std::max(npulse_d_max, npulse_d);
2694         }
2695         npulse = std::min(npulse, npulse_d_max);
2696     }
2697
2698     /* This env var can override npulse */
2699     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2700     if (ddPulseEnv > 0)
2701     {
2702         npulse = ddPulseEnv;
2703     }
2704
2705     comm->maxpulse       = 1;
2706     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2707     for (int d = 0; d < dd->ndim; d++)
2708     {
2709         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2710         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2711         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2712         {
2713             comm->bVacDLBNoLimit = FALSE;
2714         }
2715     }
2716
2717     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2718     if (!comm->bVacDLBNoLimit)
2719     {
2720         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2721     }
2722     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2723     /* Set the minimum cell size for each DD dimension */
2724     for (int d = 0; d < dd->ndim; d++)
2725     {
2726         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2727         {
2728             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2729         }
2730         else
2731         {
2732             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2733         }
2734     }
2735     if (comm->cutoff_mbody <= 0)
2736     {
2737         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2738     }
2739     if (isDlbOn(comm))
2740     {
2741         set_dlb_limits(dd);
2742     }
2743 }
2744
2745 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2746 {
2747     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2748 }
2749
2750 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2751 {
2752     /* If each molecule is a single charge group
2753      * or we use domain decomposition for each periodic dimension,
2754      * we do not need to take pbc into account for the bonded interactions.
2755      */
2756     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2757             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2758                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2759 }
2760
2761 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2762 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2763                                   gmx_domdec_t*        dd,
2764                                   real                 dlb_scale,
2765                                   const gmx_mtop_t&    mtop,
2766                                   const t_inputrec&    inputrec,
2767                                   const gmx_ddbox_t*   ddbox)
2768 {
2769     gmx_domdec_comm_t* comm        = dd->comm;
2770     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2771
2772     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2773     {
2774         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2775         if (ddRankSetup.npmedecompdim >= 2)
2776         {
2777             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2778         }
2779     }
2780     else
2781     {
2782         ddRankSetup.numRanksDoingPme = 0;
2783         if (dd->pme_nodeid >= 0)
2784         {
2785             gmx_fatal_collective(FARGS,
2786                                  dd->mpi_comm_all,
2787                                  DDMASTER(dd),
2788                                  "Can not have separate PME ranks without PME electrostatics");
2789         }
2790     }
2791
2792     if (debug)
2793     {
2794         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2795     }
2796     if (!isDlbDisabled(comm))
2797     {
2798         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2799     }
2800
2801     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2802
2803     const real vol_frac = (inputrec.pbcType == PbcType::No)
2804                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2805                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2806                                      / static_cast<double>(dd->nnodes));
2807     if (debug)
2808     {
2809         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2810     }
2811     int natoms_tot = mtop.natoms;
2812
2813     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2814 }
2815
2816 /*! \brief Get some important DD parameters which can be modified by env.vars */
2817 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2818                                 const DomdecOptions&     options,
2819                                 const gmx::MdrunOptions& mdrunOptions,
2820                                 const t_inputrec&        ir)
2821 {
2822     DDSettings ddSettings;
2823
2824     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2825     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2826     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2827     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2828     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2829     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2830     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2831     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2832     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2833
2834     if (ddSettings.useSendRecv2)
2835     {
2836         GMX_LOG(mdlog.info)
2837                 .appendText(
2838                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2839                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2840                         "communication");
2841     }
2842
2843     if (ddSettings.eFlop)
2844     {
2845         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2846         ddSettings.recordLoad = true;
2847     }
2848     else
2849     {
2850         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2851     }
2852
2853     ddSettings.initialDlbState =
2854             determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
2855     GMX_LOG(mdlog.info)
2856             .appendTextFormatted("Dynamic load balancing: %s",
2857                                  enumValueToString(ddSettings.initialDlbState));
2858
2859     return ddSettings;
2860 }
2861
2862 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2863
2864 gmx_domdec_t::~gmx_domdec_t() = default;
2865
2866 namespace gmx
2867 {
2868
2869 // TODO once the functionality stablizes, move this class and
2870 // supporting functionality into builder.cpp
2871 /*! \brief Impl class for DD builder */
2872 class DomainDecompositionBuilder::Impl
2873 {
2874 public:
2875     //! Constructor
2876     Impl(const MDLogger&      mdlog,
2877          t_commrec*           cr,
2878          const DomdecOptions& options,
2879          const MdrunOptions&  mdrunOptions,
2880          const gmx_mtop_t&    mtop,
2881          const t_inputrec&    ir,
2882          const matrix         box,
2883          ArrayRef<const RVec> xGlobal);
2884
2885     //! Build the resulting DD manager
2886     gmx_domdec_t* build(LocalAtomSetManager* atomSets);
2887
2888     //! Objects used in constructing and configuring DD
2889     //! {
2890     //! Logging object
2891     const MDLogger& mdlog_;
2892     //! Communication object
2893     t_commrec* cr_;
2894     //! User-supplied options configuring DD behavior
2895     const DomdecOptions options_;
2896     //! Global system topology
2897     const gmx_mtop_t& mtop_;
2898     //! User input values from the tpr file
2899     const t_inputrec& ir_;
2900     //! }
2901
2902     //! Internal objects used in constructing DD
2903     //! {
2904     //! Settings combined from the user input
2905     DDSettings ddSettings_;
2906     //! Information derived from the simulation system
2907     DDSystemInfo systemInfo_;
2908     //! Box structure
2909     gmx_ddbox_t ddbox_ = { 0 };
2910     //! Organization of the DD grids
2911     DDGridSetup ddGridSetup_;
2912     //! Organzation of the DD ranks
2913     DDRankSetup ddRankSetup_;
2914     //! Number of DD cells in each dimension
2915     ivec ddCellIndex_ = { 0, 0, 0 };
2916     //! IDs of PME-only ranks
2917     std::vector<int> pmeRanks_;
2918     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2919     CartesianRankSetup cartSetup_;
2920     //! }
2921 };
2922
2923 DomainDecompositionBuilder::Impl::Impl(const MDLogger&      mdlog,
2924                                        t_commrec*           cr,
2925                                        const DomdecOptions& options,
2926                                        const MdrunOptions&  mdrunOptions,
2927                                        const gmx_mtop_t&    mtop,
2928                                        const t_inputrec&    ir,
2929                                        const matrix         box,
2930                                        ArrayRef<const RVec> xGlobal) :
2931     mdlog_(mdlog),
2932     cr_(cr),
2933     options_(options),
2934     mtop_(mtop),
2935     ir_(ir)
2936 {
2937     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2938
2939     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
2940
2941     if (ddSettings_.eFlop > 1)
2942     {
2943         /* Ensure that we have different random flop counts on different ranks */
2944         srand(1 + cr_->rankInDefaultCommunicator);
2945     }
2946
2947     systemInfo_ = getSystemInfo(mdlog_,
2948                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2949                                 cr->mpiDefaultCommunicator,
2950                                 options_,
2951                                 mtop_,
2952                                 ir_,
2953                                 box,
2954                                 xGlobal);
2955
2956     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2957     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2958     checkForValidRankCountRequests(
2959             numRanksRequested, EEL_PME(ir_.coulombtype), options_.numPmeRanks, checkForLargePrimeFactors);
2960
2961     // DD grid setup uses a more different cell size limit for
2962     // automated setup than the one in systemInfo_. The latter is used
2963     // in set_dd_limits() to configure DLB, for example.
2964     const real gridSetupCellsizeLimit =
2965             getDDGridSetupCellSizeLimit(mdlog_,
2966                                         !isDlbDisabled(ddSettings_.initialDlbState),
2967                                         options_.dlbScaling,
2968                                         ir_,
2969                                         systemInfo_.cellsizeLimit);
2970     ddGridSetup_ = getDDGridSetup(mdlog_,
2971                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2972                                   cr->mpiDefaultCommunicator,
2973                                   numRanksRequested,
2974                                   options_,
2975                                   ddSettings_,
2976                                   systemInfo_,
2977                                   gridSetupCellsizeLimit,
2978                                   mtop_,
2979                                   ir_,
2980                                   box,
2981                                   xGlobal,
2982                                   &ddbox_);
2983     checkDDGridSetup(ddGridSetup_,
2984                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2985                      cr->mpiDefaultCommunicator,
2986                      cr->sizeOfDefaultCommunicator,
2987                      options_,
2988                      ddSettings_,
2989                      systemInfo_,
2990                      gridSetupCellsizeLimit,
2991                      ddbox_);
2992
2993     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2994
2995     ddRankSetup_ = getDDRankSetup(mdlog_, cr_->sizeOfDefaultCommunicator, ddGridSetup_, ir_);
2996
2997     /* Generate the group communicator, also decides the duty of each rank */
2998     cartSetup_ = makeGroupCommunicators(
2999             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
3000 }
3001
3002 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets)
3003 {
3004     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
3005
3006     copy_ivec(ddCellIndex_, dd->ci);
3007
3008     dd->comm = init_dd_comm();
3009
3010     dd->comm->ddRankSetup        = ddRankSetup_;
3011     dd->comm->cartesianRankSetup = cartSetup_;
3012
3013     set_dd_limits(mdlog_,
3014                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
3015                   dd,
3016                   options_,
3017                   ddSettings_,
3018                   systemInfo_,
3019                   ddGridSetup_,
3020                   ddRankSetup_.numPPRanks,
3021                   mtop_,
3022                   ir_,
3023                   ddbox_);
3024
3025     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3026
3027     if (thisRankHasDuty(cr_, DUTY_PP))
3028     {
3029         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3030
3031         setup_neighbor_relations(dd);
3032     }
3033
3034     /* Set overallocation to avoid frequent reallocation of arrays */
3035     set_over_alloc_dd(true);
3036
3037     dd->atomSets = atomSets;
3038
3039     return dd;
3040 }
3041
3042 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&      mdlog,
3043                                                        t_commrec*           cr,
3044                                                        const DomdecOptions& options,
3045                                                        const MdrunOptions&  mdrunOptions,
3046                                                        const gmx_mtop_t&    mtop,
3047                                                        const t_inputrec&    ir,
3048                                                        const matrix         box,
3049                                                        ArrayRef<const RVec> xGlobal) :
3050     impl_(new Impl(mdlog, cr, options, mdrunOptions, mtop, ir, box, xGlobal))
3051 {
3052 }
3053
3054 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager* atomSets)
3055 {
3056     return impl_->build(atomSets);
3057 }
3058
3059 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3060
3061 } // namespace gmx
3062
3063 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3064 {
3065     gmx_ddbox_t ddbox;
3066     int         LocallyLimited = 0;
3067
3068     const auto* dd = cr->dd;
3069
3070     set_ddbox(*dd, false, box, true, x, &ddbox);
3071
3072     LocallyLimited = 0;
3073
3074     for (int d = 0; d < dd->ndim; d++)
3075     {
3076         const int dim = dd->dim[d];
3077
3078         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3079         if (dd->unitCellInfo.ddBoxIsDynamic)
3080         {
3081             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3082         }
3083
3084         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3085
3086         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3087         {
3088             if (np > dd->comm->cd[d].np_dlb)
3089             {
3090                 return FALSE;
3091             }
3092
3093             /* If a current local cell size is smaller than the requested
3094              * cut-off, we could still fix it, but this gets very complicated.
3095              * Without fixing here, we might actually need more checks.
3096              */
3097             real cellSizeAlongDim =
3098                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3099             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3100             {
3101                 LocallyLimited = 1;
3102             }
3103         }
3104     }
3105
3106     if (!isDlbDisabled(dd->comm))
3107     {
3108         /* If DLB is not active yet, we don't need to check the grid jumps.
3109          * Actually we shouldn't, because then the grid jump data is not set.
3110          */
3111         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3112         {
3113             LocallyLimited = 1;
3114         }
3115
3116         gmx_sumi(1, &LocallyLimited, cr);
3117
3118         if (LocallyLimited > 0)
3119         {
3120             return FALSE;
3121         }
3122     }
3123
3124     return TRUE;
3125 }
3126
3127 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3128 {
3129     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3130
3131     if (bCutoffAllowed)
3132     {
3133         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3134     }
3135
3136     return bCutoffAllowed;
3137 }
3138
3139 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3140                               const t_commrec&                cr,
3141                               const gmx::DeviceStreamManager& deviceStreamManager,
3142                               gmx_wallcycle*                  wcycle)
3143 {
3144     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3145                        "Local non-bonded stream should be valid when using"
3146                        "GPU halo exchange.");
3147     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3148                        "Non-local non-bonded stream should be valid when using "
3149                        "GPU halo exchange.");
3150
3151     if (cr.dd->gpuHaloExchange[0].empty())
3152     {
3153         GMX_LOG(mdlog.warning)
3154                 .asParagraph()
3155                 .appendTextFormatted(
3156                         "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3157                         "by the "
3158                         "GMX_GPU_DD_COMMS environment variable.");
3159     }
3160
3161     for (int d = 0; d < cr.dd->ndim; d++)
3162     {
3163         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3164         {
3165             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3166                     cr.dd,
3167                     d,
3168                     cr.mpi_comm_mygroup,
3169                     deviceStreamManager.context(),
3170                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3171                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3172                     pulse,
3173                     wcycle));
3174         }
3175     }
3176 }
3177
3178 void reinitGpuHaloExchange(const t_commrec&              cr,
3179                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3180                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3181 {
3182     for (int d = 0; d < cr.dd->ndim; d++)
3183     {
3184         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3185         {
3186             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3187         }
3188     }
3189 }
3190
3191 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3192                                    const matrix          box,
3193                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3194 {
3195     for (int d = 0; d < cr.dd->ndim; d++)
3196     {
3197         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3198         {
3199             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3200         }
3201     }
3202 }
3203
3204 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3205 {
3206     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3207     {
3208         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3209         {
3210             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3211         }
3212     }
3213 }