src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
   5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
   6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
   7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include "gmxpre.h"
  40
  41 #include "domdec.h"
  42
  43 #include "config.h"
  44
  45 #include <cassert>
  46 #include <cinttypes>
  47 #include <climits>
  48 #include <cmath>
  49 #include <cstdio>
  50 #include <cstdlib>
  51 #include <cstring>
  52
  53 #include <algorithm>
  54 #include <memory>
  55
  56 #include "gromacs/domdec/builder.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/dlb.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/ga2la.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/localtopologychecker.h"
  64 #include "gromacs/domdec/options.h"
  65 #include "gromacs/domdec/partition.h"
  66 #include "gromacs/ewald/pme.h"
  67 #include "gromacs/domdec/reversetopology.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/device_stream_manager.h"
  71 #include "gromacs/gpu_utils/gpu_utils.h"
  72 #include "gromacs/hardware/hw_info.h"
  73 #include "gromacs/math/vec.h"
  74 #include "gromacs/math/vectypes.h"
  75 #include "gromacs/mdlib/calc_verletbuf.h"
  76 #include "gromacs/mdlib/constr.h"
  77 #include "gromacs/mdlib/constraintrange.h"
  78 #include "gromacs/mdlib/updategroups.h"
  79 #include "gromacs/mdlib/vcm.h"
  80 #include "gromacs/mdlib/vsite.h"
  81 #include "gromacs/mdrun/mdmodules.h"
  82 #include "gromacs/mdtypes/commrec.h"
  83 #include "gromacs/mdtypes/forceoutput.h"
  84 #include "gromacs/mdtypes/inputrec.h"
  85 #include "gromacs/mdtypes/mdrunoptions.h"
  86 #include "gromacs/mdtypes/state.h"
  87 #include "gromacs/pbcutil/ishift.h"
  88 #include "gromacs/pbcutil/pbc.h"
  89 #include "gromacs/pulling/pull.h"
  90 #include "gromacs/timing/wallcycle.h"
  91 #include "gromacs/topology/block.h"
  92 #include "gromacs/topology/idef.h"
  93 #include "gromacs/topology/ifunc.h"
  94 #include "gromacs/topology/mtop_lookup.h"
  95 #include "gromacs/topology/mtop_util.h"
  96 #include "gromacs/topology/topology.h"
  97 #include "gromacs/utility/basedefinitions.h"
  98 #include "gromacs/utility/basenetwork.h"
  99 #include "gromacs/utility/cstringutil.h"
 100 #include "gromacs/utility/enumerationhelpers.h"
 101 #include "gromacs/utility/exceptions.h"
 102 #include "gromacs/utility/fatalerror.h"
 103 #include "gromacs/utility/gmxmpi.h"
 104 #include "gromacs/utility/logger.h"
 105 #include "gromacs/utility/mdmodulesnotifiers.h"
 106 #include "gromacs/utility/real.h"
 107 #include "gromacs/utility/smalloc.h"
 108 #include "gromacs/utility/strconvert.h"
 109 #include "gromacs/utility/stringstream.h"
 110 #include "gromacs/utility/stringutil.h"
 111 #include "gromacs/utility/textwriter.h"
 112
 113 #include "atomdistribution.h"
 114 #include "box.h"
 115 #include "cellsizes.h"
 116 #include "distribute.h"
 117 #include "domdec_constraints.h"
 118 #include "domdec_internal.h"
 119 #include "domdec_setup.h"
 120 #include "domdec_vsite.h"
 121 #include "redistribute.h"
 122 #include "utility.h"
 123
 124 // TODO remove this when moving domdec into gmx namespace
 125 using gmx::ArrayRef;
 126 using gmx::DdRankOrder;
 127 using gmx::DlbOption;
 128 using gmx::DomdecOptions;
 129 using gmx::RangePartitioning;
 130
 131 static const char* enumValueToString(DlbState enumValue)
 132 {
 133     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
 134         "off", "off", "auto", "locked", "on", "on"
 135     };
 136     return dlbStateNames[enumValue];
 137 }
 138
 139 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 140 #define DD_CGIBS 2
 141
 142 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 143 #define DD_FLAG_NRCG 65535
 144 #define DD_FLAG_FW(d) (1 << (16 + (d)*2))
 145 #define DD_FLAG_BW(d) (1 << (16 + (d)*2 + 1))
 146
 147 /* The DD zone order */
 148 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
 149                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
 150
 151 /* The non-bonded zone-pair setup for domain decomposition
 152  * The first number is the i-zone, the second number the first j-zone seen by
 153  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 154  * As is, this is for 3D decomposition, where there are 4 i-zones.
 155  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 156  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 157  */
 158 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
 159                                                                { 1, 3, 6 },
 160                                                                { 2, 5, 6 },
 161                                                                { 3, 5, 7 } };
 162
 163 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 164 {
 165     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
 166     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 167     xyz[ZZ] = ind % nc[ZZ];
 168 }
 169
 170 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
 171 {
 172     int ddnodeid = -1;
 173
 174     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 175     const int                 ddindex   = dd_index(dd->numCells, c);
 176     if (cartSetup.bCartesianPP_PME)
 177     {
 178         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
 179     }
 180     else if (cartSetup.bCartesianPP)
 181     {
 182 #if GMX_MPI
 183         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 184 #endif
 185     }
 186     else
 187     {
 188         ddnodeid = ddindex;
 189     }
 190
 191     return ddnodeid;
 192 }
 193
 194 int ddglatnr(const gmx_domdec_t* dd, int i)
 195 {
 196     int atnr = 0;
 197
 198     if (dd == nullptr)
 199     {
 200         atnr = i + 1;
 201     }
 202     else
 203     {
 204         if (i >= dd->comm->atomRanges.numAtomsTotal())
 205         {
 206             gmx_fatal(FARGS,
 207                       "glatnr called with %d, which is larger than the local number of atoms (%d)",
 208                       i,
 209                       dd->comm->atomRanges.numAtomsTotal());
 210         }
 211         atnr = dd->globalAtomIndices[i] + 1;
 212     }
 213
 214     return atnr;
 215 }
 216
 217 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
 218 {
 219     if (state->ddp_count != dd.ddp_count)
 220     {
 221         gmx_incons("The MD state does not match the domain decomposition state");
 222     }
 223
 224     state->cg_gl.resize(dd.ncg_home);
 225     for (int i = 0; i < dd.ncg_home; i++)
 226     {
 227         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
 228     }
 229
 230     state->ddp_count_cg_gl = dd.ddp_count;
 231 }
 232
 233 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
 234 {
 235     return &dd->comm->zones;
 236 }
 237
 238 int dd_numAtomsZones(const gmx_domdec_t& dd)
 239 {
 240     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
 241 }
 242
 243 int dd_numHomeAtoms(const gmx_domdec_t& dd)
 244 {
 245     return dd.comm->atomRanges.numHomeAtoms();
 246 }
 247
 248 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
 249 {
 250     /* We currently set mdatoms entries for all atoms:
 251      * local + non-local + communicated for vsite + constraints
 252      */
 253
 254     return dd.comm->atomRanges.numAtomsTotal();
 255 }
 256
 257 int dd_natoms_vsite(const gmx_domdec_t& dd)
 258 {
 259     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 260 }
 261
 262 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
 263 {
 264     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 265     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 266 }
 267
 268 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
 269 {
 270     wallcycle_start(wcycle, WallCycleCounter::MoveX);
 271
 272     rvec shift = { 0, 0, 0 };
 273
 274     gmx_domdec_comm_t* comm = dd->comm;
 275
 276     int nzone   = 1;
 277     int nat_tot = comm->atomRanges.numHomeAtoms();
 278     for (int d = 0; d < dd->ndim; d++)
 279     {
 280         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
 281         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 282         if (bPBC)
 283         {
 284             copy_rvec(box[dd->dim[d]], shift);
 285         }
 286         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 287         for (const gmx_domdec_ind_t& ind : cd->ind)
 288         {
 289             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 290             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
 291             int                       n          = 0;
 292             if (!bPBC)
 293             {
 294                 for (int j : ind.index)
 295                 {
 296                     sendBuffer[n] = x[j];
 297                     n++;
 298                 }
 299             }
 300             else if (!bScrew)
 301             {
 302                 for (int j : ind.index)
 303                 {
 304                     /* We need to shift the coordinates */
 305                     for (int d = 0; d < DIM; d++)
 306                     {
 307                         sendBuffer[n][d] = x[j][d] + shift[d];
 308                     }
 309                     n++;
 310                 }
 311             }
 312             else
 313             {
 314                 for (int j : ind.index)
 315                 {
 316                     /* Shift x */
 317                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 318                     /* Rotate y and z.
 319                      * This operation requires a special shift force
 320                      * treatment, which is performed in calc_vir.
 321                      */
 322                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 323                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 324                     n++;
 325                 }
 326             }
 327
 328             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 329                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 330
 331             gmx::ArrayRef<gmx::RVec> receiveBuffer;
 332             if (cd->receiveInPlace)
 333             {
 334                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 335             }
 336             else
 337             {
 338                 receiveBuffer = receiveBufferAccess.buffer;
 339             }
 340             /* Send and receive the coordinates */
 341             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 342
 343             if (!cd->receiveInPlace)
 344             {
 345                 int j = 0;
 346                 for (int zone = 0; zone < nzone; zone++)
 347                 {
 348                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 349                     {
 350                         x[i] = receiveBuffer[j++];
 351                     }
 352                 }
 353             }
 354             nat_tot += ind.nrecv[nzone + 1];
 355         }
 356         nzone += nzone;
 357     }
 358
 359     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
 360 }
 361
 362 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
 363 {
 364     wallcycle_start(wcycle, WallCycleCounter::MoveF);
 365
 366     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
 367     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
 368
 369     gmx_domdec_comm_t& comm    = *dd->comm;
 370     int                nzone   = comm.zones.n / 2;
 371     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
 372     for (int d = dd->ndim - 1; d >= 0; d--)
 373     {
 374         /* Only forces in domains near the PBC boundaries need to
 375            consider PBC in the treatment of fshift */
 376         const bool shiftForcesNeedPbc =
 377                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
 378         const bool applyScrewPbc =
 379                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 380         /* Determine which shift vector we need */
 381         ivec vis        = { 0, 0, 0 };
 382         vis[dd->dim[d]] = 1;
 383         const int is    = gmx::ivecToShiftIndex(vis);
 384
 385         /* Loop over the pulses */
 386         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
 387         for (int p = cd.numPulses() - 1; p >= 0; p--)
 388         {
 389             const gmx_domdec_ind_t&   ind = cd.ind[p];
 390             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
 391             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
 392
 393             nat_tot -= ind.nrecv[nzone + 1];
 394
 395             DDBufferAccess<gmx::RVec> sendBufferAccess(
 396                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 397
 398             gmx::ArrayRef<gmx::RVec> sendBuffer;
 399             if (cd.receiveInPlace)
 400             {
 401                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 402             }
 403             else
 404             {
 405                 sendBuffer = sendBufferAccess.buffer;
 406                 int j      = 0;
 407                 for (int zone = 0; zone < nzone; zone++)
 408                 {
 409                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 410                     {
 411                         sendBuffer[j++] = f[i];
 412                     }
 413                 }
 414             }
 415             /* Communicate the forces */
 416             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 417             /* Add the received forces */
 418             int n = 0;
 419             if (!shiftForcesNeedPbc)
 420             {
 421                 for (int j : ind.index)
 422                 {
 423                     for (int d = 0; d < DIM; d++)
 424                     {
 425                         f[j][d] += receiveBuffer[n][d];
 426                     }
 427                     n++;
 428                 }
 429             }
 430             else if (!applyScrewPbc)
 431             {
 432                 for (int j : ind.index)
 433                 {
 434                     for (int d = 0; d < DIM; d++)
 435                     {
 436                         f[j][d] += receiveBuffer[n][d];
 437                     }
 438                     /* Add this force to the shift force */
 439                     for (int d = 0; d < DIM; d++)
 440                     {
 441                         fshift[is][d] += receiveBuffer[n][d];
 442                     }
 443                     n++;
 444                 }
 445             }
 446             else
 447             {
 448                 for (int j : ind.index)
 449                 {
 450                     /* Rotate the force */
 451                     f[j][XX] += receiveBuffer[n][XX];
 452                     f[j][YY] -= receiveBuffer[n][YY];
 453                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 454                     if (shiftForcesNeedPbc)
 455                     {
 456                         /* Add this force to the shift force */
 457                         for (int d = 0; d < DIM; d++)
 458                         {
 459                             fshift[is][d] += receiveBuffer[n][d];
 460                         }
 461                     }
 462                     n++;
 463                 }
 464             }
 465         }
 466         nzone /= 2;
 467     }
 468     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
 469 }
 470
 471 /* Convenience function for extracting a real buffer from an rvec buffer
 472  *
 473  * To reduce the number of temporary communication buffers and avoid
 474  * cache polution, we reuse gmx::RVec buffers for storing reals.
 475  * This functions return a real buffer reference with the same number
 476  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 477  */
 478 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 479 {
 480     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
 481 }
 482
 483 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
 484 {
 485     gmx_domdec_comm_t* comm = dd->comm;
 486
 487     int nzone   = 1;
 488     int nat_tot = comm->atomRanges.numHomeAtoms();
 489     for (int d = 0; d < dd->ndim; d++)
 490     {
 491         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 492         for (const gmx_domdec_ind_t& ind : cd->ind)
 493         {
 494             /* Note: We provision for RVec instead of real, so a factor of 3
 495              * more than needed. The buffer actually already has this size
 496              * and we pass a plain pointer below, so this does not matter.
 497              */
 498             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 499             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 500             int                 n          = 0;
 501             for (int j : ind.index)
 502             {
 503                 sendBuffer[n++] = v[j];
 504             }
 505
 506             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 507                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 508
 509             gmx::ArrayRef<real> receiveBuffer;
 510             if (cd->receiveInPlace)
 511             {
 512                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 513             }
 514             else
 515             {
 516                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 517             }
 518             /* Send and receive the data */
 519             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 520             if (!cd->receiveInPlace)
 521             {
 522                 int j = 0;
 523                 for (int zone = 0; zone < nzone; zone++)
 524                 {
 525                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 526                     {
 527                         v[i] = receiveBuffer[j++];
 528                     }
 529                 }
 530             }
 531             nat_tot += ind.nrecv[nzone + 1];
 532         }
 533         nzone += nzone;
 534     }
 535 }
 536
 537 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
 538 {
 539     gmx_domdec_comm_t* comm = dd->comm;
 540
 541     int nzone   = comm->zones.n / 2;
 542     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 543     for (int d = dd->ndim - 1; d >= 0; d--)
 544     {
 545         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 546         for (int p = cd->numPulses() - 1; p >= 0; p--)
 547         {
 548             const gmx_domdec_ind_t& ind = cd->ind[p];
 549
 550             /* Note: We provision for RVec instead of real, so a factor of 3
 551              * more than needed. The buffer actually already has this size
 552              * and we typecast, so this works as intended.
 553              */
 554             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 555             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 556             nat_tot -= ind.nrecv[nzone + 1];
 557
 558             DDBufferAccess<gmx::RVec> sendBufferAccess(
 559                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 560
 561             gmx::ArrayRef<real> sendBuffer;
 562             if (cd->receiveInPlace)
 563             {
 564                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 565             }
 566             else
 567             {
 568                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 569                 int j      = 0;
 570                 for (int zone = 0; zone < nzone; zone++)
 571                 {
 572                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 573                     {
 574                         sendBuffer[j++] = v[i];
 575                     }
 576                 }
 577             }
 578             /* Communicate the forces */
 579             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 580             /* Add the received forces */
 581             int n = 0;
 582             for (int j : ind.index)
 583             {
 584                 v[j] += receiveBuffer[n];
 585                 n++;
 586             }
 587         }
 588         nzone /= 2;
 589     }
 590 }
 591
 592 real dd_cutoff_multibody(const gmx_domdec_t* dd)
 593 {
 594     const gmx_domdec_comm_t& comm       = *dd->comm;
 595     const DDSystemInfo&      systemInfo = comm.systemInfo;
 596
 597     real r = -1;
 598     if (systemInfo.haveInterDomainMultiBodyBondeds)
 599     {
 600         if (comm.cutoff_mbody > 0)
 601         {
 602             r = comm.cutoff_mbody;
 603         }
 604         else
 605         {
 606             /* cutoff_mbody=0 means we do not have DLB */
 607             r = comm.cellsize_min[dd->dim[0]];
 608             for (int di = 1; di < dd->ndim; di++)
 609             {
 610                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
 611             }
 612             if (comm.systemInfo.filterBondedCommunication)
 613             {
 614                 r = std::max(r, comm.cutoff_mbody);
 615             }
 616             else
 617             {
 618                 r = std::min(r, systemInfo.cutoff);
 619             }
 620         }
 621     }
 622
 623     return r;
 624 }
 625
 626 real dd_cutoff_twobody(const gmx_domdec_t* dd)
 627 {
 628     const real r_mb = dd_cutoff_multibody(dd);
 629
 630     return std::max(dd->comm->systemInfo.cutoff, r_mb);
 631 }
 632
 633
 634 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
 635                                    const CartesianRankSetup& cartSetup,
 636                                    const ivec                coord,
 637                                    ivec                      coord_pme)
 638 {
 639     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
 640     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
 641     copy_ivec(coord, coord_pme);
 642     coord_pme[cartSetup.cartpmedim] =
 643             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
 644 }
 645
 646 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
 647 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
 648 {
 649     const int npp  = ddRankSetup.numPPRanks;
 650     const int npme = ddRankSetup.numRanksDoingPme;
 651
 652     /* Here we assign a PME node to communicate with this DD node
 653      * by assuming that the major index of both is x.
 654      * We add npme/2 to obtain an even distribution.
 655      */
 656     return (ddCellIndex * npme + npme / 2) / npp;
 657 }
 658
 659 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
 660 {
 661     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
 662
 663     int n = 0;
 664     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
 665     {
 666         const int p0 = ddindex2pmeindex(ddRankSetup, i);
 667         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
 668         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
 669         {
 670             if (debug)
 671             {
 672                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
 673             }
 674             pmeRanks[n] = i + 1 + n;
 675             n++;
 676         }
 677     }
 678
 679     return pmeRanks;
 680 }
 681
 682 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
 683 {
 684     ivec coords;
 685
 686     coords[XX]     = x;
 687     coords[YY]     = y;
 688     coords[ZZ]     = z;
 689     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
 690
 691     return slab;
 692 }
 693
 694 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
 695 {
 696     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
 697     ivec                      coords    = { x, y, z };
 698     int                       nodeid    = -1;
 699
 700     if (cartSetup.bCartesianPP_PME)
 701     {
 702 #if GMX_MPI
 703         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 704 #endif
 705     }
 706     else
 707     {
 708         int ddindex = dd_index(cr->dd->numCells, coords);
 709         if (cartSetup.bCartesianPP)
 710         {
 711             nodeid = cartSetup.ddindex2simnodeid[ddindex];
 712         }
 713         else
 714         {
 715             if (cr->dd->comm->ddRankSetup.usePmeOnlyRanks)
 716             {
 717                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
 718             }
 719             else
 720             {
 721                 nodeid = ddindex;
 722             }
 723         }
 724     }
 725
 726     return nodeid;
 727 }
 728
 729 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
 730                               const CartesianRankSetup& cartSetup,
 731                               gmx::ArrayRef<const int>  pmeRanks,
 732                               const t_commrec gmx_unused* cr,
 733                               const int                   sim_nodeid)
 734 {
 735     int pmenode = -1;
 736
 737     /* This assumes a uniform x domain decomposition grid cell size */
 738     if (cartSetup.bCartesianPP_PME)
 739     {
 740 #if GMX_MPI
 741         ivec coord, coord_pme;
 742         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 743         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
 744         {
 745             /* This is a PP rank */
 746             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 747             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 748         }
 749 #endif
 750     }
 751     else if (cartSetup.bCartesianPP)
 752     {
 753         if (sim_nodeid < ddRankSetup.numPPRanks)
 754         {
 755             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 756         }
 757     }
 758     else
 759     {
 760         /* This assumes DD cells with identical x coordinates
 761          * are numbered sequentially.
 762          */
 763         if (pmeRanks.empty())
 764         {
 765             if (sim_nodeid < ddRankSetup.numPPRanks)
 766             {
 767                 /* The DD index equals the nodeid */
 768                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 769             }
 770         }
 771         else
 772         {
 773             int i = 0;
 774             while (sim_nodeid > pmeRanks[i])
 775             {
 776                 i++;
 777             }
 778             if (sim_nodeid < pmeRanks[i])
 779             {
 780                 pmenode = pmeRanks[i];
 781             }
 782         }
 783     }
 784
 785     return pmenode;
 786 }
 787
 788 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
 789 {
 790     if (dd != nullptr)
 791     {
 792         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
 793     }
 794     else
 795     {
 796         return { 1, 1 };
 797     }
 798 }
 799
 800 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
 801 {
 802     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
 803     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
 804     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
 805                        "This function should only be called when PME-only ranks are in use");
 806     std::vector<int> ddranks;
 807     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
 808
 809     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
 810     {
 811         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
 812         {
 813             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
 814             {
 815                 if (cartSetup.bCartesianPP_PME)
 816                 {
 817                     ivec coord = { x, y, z };
 818                     ivec coord_pme;
 819                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 820                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
 821                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
 822                     {
 823                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 824                     }
 825                 }
 826                 else
 827                 {
 828                     /* The slab corresponds to the nodeid in the PME group */
 829                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
 830                     {
 831                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 832                     }
 833                 }
 834             }
 835         }
 836     }
 837     return ddranks;
 838 }
 839
 840 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
 841 {
 842     bool bReceive = true;
 843
 844     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 845     if (ddRankSetup.usePmeOnlyRanks)
 846     {
 847         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 848         if (cartSetup.bCartesianPP_PME)
 849         {
 850 #if GMX_MPI
 851             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 852             ivec coords;
 853             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 854             coords[cartSetup.cartpmedim]++;
 855             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
 856             {
 857                 int rank = 0;
 858                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 859                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
 860                 {
 861                     /* This is not the last PP node for pmenode */
 862                     bReceive = FALSE;
 863                 }
 864             }
 865 #else
 866             GMX_RELEASE_ASSERT(
 867                     false,
 868                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 869 #endif
 870         }
 871         else
 872         {
 873             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 874             if (cr->sim_nodeid + 1 < cr->nnodes
 875                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
 876             {
 877                 /* This is not the last PP node for pmenode */
 878                 bReceive = FALSE;
 879             }
 880         }
 881     }
 882
 883     return bReceive;
 884 }
 885
 886 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
 887 {
 888     gmx_domdec_comm_t* comm = dd->comm;
 889
 890     snew(*dim_f, dd->numCells[dim] + 1);
 891     (*dim_f)[0] = 0;
 892     for (int i = 1; i < dd->numCells[dim]; i++)
 893     {
 894         if (comm->slb_frac[dim])
 895         {
 896             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
 897         }
 898         else
 899         {
 900             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
 901         }
 902     }
 903     (*dim_f)[dd->numCells[dim]] = 1;
 904 }
 905
 906 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
 907 {
 908     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 909
 910     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
 911     {
 912         ddpme->dim = YY;
 913     }
 914     else
 915     {
 916         ddpme->dim = dimind;
 917     }
 918     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 919
 920     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
 921
 922     if (ddpme->nslab <= 1)
 923     {
 924         return;
 925     }
 926
 927     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
 928     /* Determine for each PME slab the PP location range for dimension dim */
 929     snew(ddpme->pp_min, ddpme->nslab);
 930     snew(ddpme->pp_max, ddpme->nslab);
 931     for (int slab = 0; slab < ddpme->nslab; slab++)
 932     {
 933         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
 934         ddpme->pp_max[slab] = 0;
 935     }
 936     for (int i = 0; i < dd->nnodes; i++)
 937     {
 938         ivec xyz;
 939         ddindex2xyz(dd->numCells, i, xyz);
 940         /* For y only use our y/z slab.
 941          * This assumes that the PME x grid size matches the DD grid size.
 942          */
 943         if (dimind == 0 || xyz[XX] == dd->ci[XX])
 944         {
 945             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
 946             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
 947             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
 948             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
 949         }
 950     }
 951
 952     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 953 }
 954
 955 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
 956 {
 957     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 958
 959     if (ddRankSetup.ddpme[0].dim == XX)
 960     {
 961         return ddRankSetup.ddpme[0].maxshift;
 962     }
 963     else
 964     {
 965         return 0;
 966     }
 967 }
 968
 969 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
 970 {
 971     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 972
 973     if (ddRankSetup.ddpme[0].dim == YY)
 974     {
 975         return ddRankSetup.ddpme[0].maxshift;
 976     }
 977     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
 978     {
 979         return ddRankSetup.ddpme[1].maxshift;
 980     }
 981     else
 982     {
 983         return 0;
 984     }
 985 }
 986
 987 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
 988 {
 989     return dd.comm->systemInfo.useUpdateGroups;
 990 }
 991
 992 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
 993 {
 994     /* Note that the cycles value can be incorrect, either 0 or some
 995      * extremely large value, when our thread migrated to another core
 996      * with an unsynchronized cycle counter. If this happens less often
 997      * that once per nstlist steps, this will not cause issues, since
 998      * we later subtract the maximum value from the sum over nstlist steps.
 999      * A zero count will slightly lower the total, but that's a small effect.
1000      * Note that the main purpose of the subtraction of the maximum value
1001      * is to avoid throwing off the load balancing when stalls occur due
1002      * e.g. system activity or network congestion.
1003      */
1004     dd->comm->cycl[ddCycl] += cycles;
1005     dd->comm->cycl_n[ddCycl]++;
1006     if (cycles > dd->comm->cycl_max[ddCycl])
1007     {
1008         dd->comm->cycl_max[ddCycl] = cycles;
1009     }
1010 }
1011
1012 #if GMX_MPI
1013 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1014 {
1015     MPI_Comm c_row = MPI_COMM_NULL;
1016     ivec     loc_c;
1017     bool     bPartOfGroup = false;
1018
1019     const int dim = dd->dim[dim_ind];
1020     copy_ivec(loc, loc_c);
1021     for (int i = 0; i < dd->numCells[dim]; i++)
1022     {
1023         loc_c[dim]     = i;
1024         const int rank = dd_index(dd->numCells, loc_c);
1025         if (rank == dd->rank)
1026         {
1027             /* This process is part of the group */
1028             bPartOfGroup = TRUE;
1029         }
1030     }
1031     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1032     if (bPartOfGroup)
1033     {
1034         dd->comm->mpi_comm_load[dim_ind] = c_row;
1035         if (!isDlbDisabled(dd->comm))
1036         {
1037             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1038
1039             if (dd->ci[dim] == dd->master_ci[dim])
1040             {
1041                 /* This is the root process of this row */
1042                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1043
1044                 RowMaster& rowMaster = *cellsizes.rowMaster;
1045                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1046                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1047                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1048                 if (dim_ind > 0)
1049                 {
1050                     rowMaster.bounds.resize(dd->numCells[dim]);
1051                 }
1052                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1053             }
1054             else
1055             {
1056                 /* This is not a root process, we only need to receive cell_f */
1057                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1058             }
1059         }
1060         if (dd->ci[dim] == dd->master_ci[dim])
1061         {
1062             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1063         }
1064     }
1065 }
1066 #endif
1067
1068 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1069 {
1070 #if GMX_MPI
1071     MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
1072
1073     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1074     {
1075         /* Only ranks with short-ranged tasks (currently) use GPUs.
1076          * If we don't have GPUs assigned, there are no resources to share.
1077          */
1078         return;
1079     }
1080
1081     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1082
1083     gmx_domdec_t* dd = cr->dd;
1084
1085     if (debug)
1086     {
1087         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1088         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1089     }
1090     /* Split the PP communicator over the physical nodes */
1091     /* TODO: See if we should store this (before), as it's also used for
1092      * for the nodecomm summation.
1093      */
1094     // TODO PhysicalNodeCommunicator could be extended/used to handle
1095     // the need for per-node per-group communicators.
1096     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1097     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1098     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1099     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1100
1101     if (debug)
1102     {
1103         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1104     }
1105
1106     /* Note that some ranks could share a GPU, while others don't */
1107
1108     if (dd->comm->nrank_gpu_shared == 1)
1109     {
1110         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1111     }
1112 #else
1113     GMX_UNUSED_VALUE(cr);
1114     GMX_UNUSED_VALUE(gpu_id);
1115 #endif
1116 }
1117
1118 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1119 {
1120 #if GMX_MPI
1121     ivec loc;
1122
1123     if (debug)
1124     {
1125         fprintf(debug, "Making load communicators\n");
1126     }
1127
1128     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1129     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1130
1131     if (dd->ndim == 0)
1132     {
1133         return;
1134     }
1135
1136     clear_ivec(loc);
1137     make_load_communicator(dd, 0, loc);
1138     if (dd->ndim > 1)
1139     {
1140         const int dim0 = dd->dim[0];
1141         for (int i = 0; i < dd->numCells[dim0]; i++)
1142         {
1143             loc[dim0] = i;
1144             make_load_communicator(dd, 1, loc);
1145         }
1146     }
1147     if (dd->ndim > 2)
1148     {
1149         const int dim0 = dd->dim[0];
1150         for (int i = 0; i < dd->numCells[dim0]; i++)
1151         {
1152             loc[dim0]      = i;
1153             const int dim1 = dd->dim[1];
1154             for (int j = 0; j < dd->numCells[dim1]; j++)
1155             {
1156                 loc[dim1] = j;
1157                 make_load_communicator(dd, 2, loc);
1158             }
1159         }
1160     }
1161
1162     if (debug)
1163     {
1164         fprintf(debug, "Finished making load communicators\n");
1165     }
1166 #endif
1167 }
1168
1169 /*! \brief Sets up the relation between neighboring domains and zones */
1170 static void setup_neighbor_relations(gmx_domdec_t* dd)
1171 {
1172     ivec tmp, s;
1173     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1174
1175     for (int d = 0; d < dd->ndim; d++)
1176     {
1177         const int dim = dd->dim[d];
1178         copy_ivec(dd->ci, tmp);
1179         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1180         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1181         copy_ivec(dd->ci, tmp);
1182         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1183         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1184         if (debug)
1185         {
1186             fprintf(debug,
1187                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1188                     dd->rank,
1189                     dim,
1190                     dd->neighbor[d][0],
1191                     dd->neighbor[d][1]);
1192         }
1193     }
1194
1195     int nzone  = (1 << dd->ndim);
1196     int nizone = (1 << std::max(dd->ndim - 1, 0));
1197     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1198
1199     gmx_domdec_zones_t* zones = &dd->comm->zones;
1200
1201     for (int i = 0; i < nzone; i++)
1202     {
1203         int m = 0;
1204         clear_ivec(zones->shift[i]);
1205         for (int d = 0; d < dd->ndim; d++)
1206         {
1207             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1208         }
1209     }
1210
1211     zones->n = nzone;
1212     for (int i = 0; i < nzone; i++)
1213     {
1214         for (int d = 0; d < DIM; d++)
1215         {
1216             s[d] = dd->ci[d] - zones->shift[i][d];
1217             if (s[d] < 0)
1218             {
1219                 s[d] += dd->numCells[d];
1220             }
1221             else if (s[d] >= dd->numCells[d])
1222             {
1223                 s[d] -= dd->numCells[d];
1224             }
1225         }
1226     }
1227     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1228     {
1229         GMX_RELEASE_ASSERT(
1230                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1231                 "The first element for each ddNonbondedZonePairRanges should match its index");
1232
1233         DDPairInteractionRanges iZone;
1234         iZone.iZoneIndex = iZoneIndex;
1235         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1236          * j-zones up to nzone.
1237          */
1238         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1239                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1240         for (int dim = 0; dim < DIM; dim++)
1241         {
1242             if (dd->numCells[dim] == 1)
1243             {
1244                 /* All shifts should be allowed */
1245                 iZone.shift0[dim] = -1;
1246                 iZone.shift1[dim] = 1;
1247             }
1248             else
1249             {
1250                 /* Determine the min/max j-zone shift wrt the i-zone */
1251                 iZone.shift0[dim] = 1;
1252                 iZone.shift1[dim] = -1;
1253                 for (int jZone : iZone.jZoneRange)
1254                 {
1255                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1256                     if (shift_diff < iZone.shift0[dim])
1257                     {
1258                         iZone.shift0[dim] = shift_diff;
1259                     }
1260                     if (shift_diff > iZone.shift1[dim])
1261                     {
1262                         iZone.shift1[dim] = shift_diff;
1263                     }
1264                 }
1265             }
1266         }
1267
1268         zones->iZones.push_back(iZone);
1269     }
1270
1271     if (!isDlbDisabled(dd->comm))
1272     {
1273         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1274     }
1275
1276     if (dd->comm->ddSettings.recordLoad)
1277     {
1278         make_load_communicators(dd);
1279     }
1280 }
1281
1282 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1283                                  gmx_domdec_t*        dd,
1284                                  t_commrec gmx_unused* cr,
1285                                  bool gmx_unused       reorder)
1286 {
1287 #if GMX_MPI
1288     gmx_domdec_comm_t*  comm      = dd->comm;
1289     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1290
1291     if (cartSetup.bCartesianPP)
1292     {
1293         /* Set up cartesian communication for the particle-particle part */
1294         GMX_LOG(mdlog.info)
1295                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1296                                      dd->numCells[XX],
1297                                      dd->numCells[YY],
1298                                      dd->numCells[ZZ]);
1299
1300         ivec periods;
1301         for (int i = 0; i < DIM; i++)
1302         {
1303             periods[i] = TRUE;
1304         }
1305         MPI_Comm comm_cart = MPI_COMM_NULL;
1306         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1307         /* We overwrite the old communicator with the new cartesian one */
1308         cr->mpi_comm_mygroup = comm_cart;
1309     }
1310
1311     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1312     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1313
1314     if (cartSetup.bCartesianPP_PME)
1315     {
1316         /* Since we want to use the original cartesian setup for sim,
1317          * and not the one after split, we need to make an index.
1318          */
1319         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1320         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1321         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1322         /* Get the rank of the DD master,
1323          * above we made sure that the master node is a PP node.
1324          */
1325         int rank = MASTER(cr) ? dd->rank : 0;
1326         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1327     }
1328     else if (cartSetup.bCartesianPP)
1329     {
1330         if (!comm->ddRankSetup.usePmeOnlyRanks)
1331         {
1332             /* The PP communicator is also
1333              * the communicator for this simulation
1334              */
1335             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1336         }
1337         cr->nodeid = dd->rank;
1338
1339         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1340
1341         /* We need to make an index to go from the coordinates
1342          * to the nodeid of this simulation.
1343          */
1344         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1345         std::vector<int> buf(dd->nnodes);
1346         if (thisRankHasDuty(cr, DUTY_PP))
1347         {
1348             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1349         }
1350         /* Communicate the ddindex to simulation nodeid index */
1351         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1352
1353         /* Determine the master coordinates and rank.
1354          * The DD master should be the same node as the master of this sim.
1355          */
1356         for (int i = 0; i < dd->nnodes; i++)
1357         {
1358             if (cartSetup.ddindex2simnodeid[i] == 0)
1359             {
1360                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1361                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1362             }
1363         }
1364         if (debug)
1365         {
1366             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1367         }
1368     }
1369     else
1370     {
1371         /* No Cartesian communicators */
1372         /* We use the rank in dd->comm->all as DD index */
1373         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1374         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1375         dd->masterrank = 0;
1376         clear_ivec(dd->master_ci);
1377     }
1378 #endif
1379
1380     GMX_LOG(mdlog.info)
1381             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1382                                  dd->rank,
1383                                  dd->ci[XX],
1384                                  dd->ci[YY],
1385                                  dd->ci[ZZ]);
1386     if (debug)
1387     {
1388         fprintf(debug,
1389                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1390                 dd->rank,
1391                 dd->ci[XX],
1392                 dd->ci[YY],
1393                 dd->ci[ZZ]);
1394     }
1395 }
1396
1397 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1398 {
1399 #if GMX_MPI
1400     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1401
1402     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1403     {
1404         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1405         std::vector<int> buf(dd->nnodes);
1406         if (thisRankHasDuty(cr, DUTY_PP))
1407         {
1408             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1409         }
1410         /* Communicate the ddindex to simulation nodeid index */
1411         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1412     }
1413 #else
1414     GMX_UNUSED_VALUE(dd);
1415     GMX_UNUSED_VALUE(cr);
1416 #endif
1417 }
1418
1419 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1420                                              t_commrec*           cr,
1421                                              const DdRankOrder    ddRankOrder,
1422                                              bool gmx_unused      reorder,
1423                                              const DDRankSetup&   ddRankSetup,
1424                                              ivec                 ddCellIndex,
1425                                              std::vector<int>*    pmeRanks)
1426 {
1427     CartesianRankSetup cartSetup;
1428
1429     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1430     cartSetup.bCartesianPP_PME = false;
1431
1432     const ivec& numDDCells = ddRankSetup.numPPCells;
1433     /* Initially we set ntot to the number of PP cells */
1434     copy_ivec(numDDCells, cartSetup.ntot);
1435
1436     if (cartSetup.bCartesianPP)
1437     {
1438         const int numDDCellsTot = ddRankSetup.numPPRanks;
1439         bool      bDiv[DIM];
1440         for (int i = 1; i < DIM; i++)
1441         {
1442             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1443         }
1444         if (bDiv[YY] || bDiv[ZZ])
1445         {
1446             cartSetup.bCartesianPP_PME = TRUE;
1447             /* If we have 2D PME decomposition, which is always in x+y,
1448              * we stack the PME only nodes in z.
1449              * Otherwise we choose the direction that provides the thinnest slab
1450              * of PME only nodes as this will have the least effect
1451              * on the PP communication.
1452              * But for the PME communication the opposite might be better.
1453              */
1454             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1455             {
1456                 cartSetup.cartpmedim = ZZ;
1457             }
1458             else
1459             {
1460                 cartSetup.cartpmedim = YY;
1461             }
1462             cartSetup.ntot[cartSetup.cartpmedim] +=
1463                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1464         }
1465         else
1466         {
1467             GMX_LOG(mdlog.info)
1468                     .appendTextFormatted(
1469                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1470                             "nx*nz (%d*%d)",
1471                             ddRankSetup.numRanksDoingPme,
1472                             numDDCells[XX],
1473                             numDDCells[YY],
1474                             numDDCells[XX],
1475                             numDDCells[ZZ]);
1476             GMX_LOG(mdlog.info)
1477                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1478         }
1479     }
1480
1481     if (cartSetup.bCartesianPP_PME)
1482     {
1483 #if GMX_MPI
1484         int  rank = 0;
1485         ivec periods;
1486
1487         GMX_LOG(mdlog.info)
1488                 .appendTextFormatted(
1489                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1490                         cartSetup.ntot[XX],
1491                         cartSetup.ntot[YY],
1492                         cartSetup.ntot[ZZ]);
1493
1494         for (int i = 0; i < DIM; i++)
1495         {
1496             periods[i] = TRUE;
1497         }
1498         MPI_Comm comm_cart = MPI_COMM_NULL;
1499         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1500         MPI_Comm_rank(comm_cart, &rank);
1501         if (MASTER(cr) && rank != 0)
1502         {
1503             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1504         }
1505
1506         /* With this assigment we loose the link to the original communicator
1507          * which will usually be MPI_COMM_WORLD, unless have multisim.
1508          */
1509         cr->mpi_comm_mysim = comm_cart;
1510         cr->sim_nodeid     = rank;
1511
1512         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1513
1514         GMX_LOG(mdlog.info)
1515                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1516                                      cr->sim_nodeid,
1517                                      ddCellIndex[XX],
1518                                      ddCellIndex[YY],
1519                                      ddCellIndex[ZZ]);
1520
1521         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1522         {
1523             cr->duty = DUTY_PP;
1524         }
1525         if (!ddRankSetup.usePmeOnlyRanks
1526             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1527         {
1528             cr->duty = DUTY_PME;
1529         }
1530
1531         /* Split the sim communicator into PP and PME only nodes */
1532         MPI_Comm_split(cr->mpi_comm_mysim,
1533                        getThisRankDuties(cr),
1534                        dd_index(cartSetup.ntot, ddCellIndex),
1535                        &cr->mpi_comm_mygroup);
1536 #else
1537         GMX_UNUSED_VALUE(ddCellIndex);
1538 #endif
1539     }
1540     else
1541     {
1542         switch (ddRankOrder)
1543         {
1544             case DdRankOrder::pp_pme:
1545                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1546                 break;
1547             case DdRankOrder::interleave:
1548                 /* Interleave the PP-only and PME-only ranks */
1549                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1550                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1551                 break;
1552             case DdRankOrder::cartesian: break;
1553             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1554         }
1555
1556         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1557         {
1558             cr->duty = DUTY_PME;
1559         }
1560         else
1561         {
1562             cr->duty = DUTY_PP;
1563         }
1564 #if GMX_MPI
1565         /* Split the sim communicator into PP and PME only nodes */
1566         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1567         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1568 #endif
1569     }
1570
1571     GMX_LOG(mdlog.info)
1572             .appendTextFormatted("This rank does only %s work.\n",
1573                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1574
1575     return cartSetup;
1576 }
1577
1578 /*! \brief Makes the PP communicator and the PME communicator, when needed
1579  *
1580  * Returns the Cartesian rank setup.
1581  * Sets \p cr->mpi_comm_mygroup
1582  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1583  * With separate PME ranks in interleaved order, set the PME ranks in \p pmeRanks.
1584  */
1585 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1586                                                  const DDSettings&    ddSettings,
1587                                                  const DdRankOrder    ddRankOrder,
1588                                                  const DDRankSetup&   ddRankSetup,
1589                                                  t_commrec*           cr,
1590                                                  ivec                 ddCellIndex,
1591                                                  std::vector<int>*    pmeRanks)
1592 {
1593     CartesianRankSetup cartSetup;
1594
1595     // As a default, both group and sim communicators are equal to the default communicator
1596     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1597     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1598     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1599     cr->nodeid           = cr->rankInDefaultCommunicator;
1600     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1601
1602     if (ddRankSetup.usePmeOnlyRanks)
1603     {
1604         /* Split the communicator into a PP and PME part */
1605         cartSetup = split_communicator(
1606                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1607     }
1608     else
1609     {
1610         /* All nodes do PP and PME */
1611         /* We do not require separate communicators */
1612         cartSetup.bCartesianPP     = false;
1613         cartSetup.bCartesianPP_PME = false;
1614     }
1615
1616     return cartSetup;
1617 }
1618
1619 /*! \brief For PP ranks, sets or makes the communicator
1620  *
1621  * For PME ranks get the rank id.
1622  * For PP only ranks, sets the PME-only rank.
1623  */
1624 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1625                                     const DDSettings&        ddSettings,
1626                                     gmx::ArrayRef<const int> pmeRanks,
1627                                     t_commrec*               cr,
1628                                     const int                numAtomsInSystem,
1629                                     gmx_domdec_t*            dd)
1630 {
1631     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1632     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1633
1634     if (thisRankHasDuty(cr, DUTY_PP))
1635     {
1636         /* Copy or make a new PP communicator */
1637
1638         /* We (possibly) reordered the nodes in split_communicator,
1639          * so it is no longer required in make_pp_communicator.
1640          */
1641         const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1642
1643         make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1644     }
1645     else
1646     {
1647         receive_ddindex2simnodeid(dd, cr);
1648     }
1649
1650     if (!thisRankHasDuty(cr, DUTY_PME))
1651     {
1652         /* Set up the commnuication to our PME node */
1653         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1654         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1655         if (debug)
1656         {
1657             fprintf(debug,
1658                     "My pme_nodeid %d receive ener %s\n",
1659                     dd->pme_nodeid,
1660                     gmx::boolToString(dd->pme_receive_vir_ener));
1661         }
1662     }
1663     else
1664     {
1665         dd->pme_nodeid = -1;
1666     }
1667
1668     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1669     if (MASTER(cr))
1670     {
1671         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1672     }
1673 }
1674
1675 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1676 {
1677     real* slb_frac = nullptr;
1678     if (nc > 1 && size_string != nullptr)
1679     {
1680         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1681         snew(slb_frac, nc);
1682         real tot = 0;
1683         for (int i = 0; i < nc; i++)
1684         {
1685             double dbl = 0;
1686             int    n   = 0;
1687             sscanf(size_string, "%20lf%n", &dbl, &n);
1688             if (dbl == 0)
1689             {
1690                 gmx_fatal(FARGS,
1691                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1692                           dir,
1693                           size_string);
1694             }
1695             slb_frac[i] = dbl;
1696             size_string += n;
1697             tot += slb_frac[i];
1698         }
1699         /* Normalize */
1700         std::string relativeCellSizes = "Relative cell sizes:";
1701         for (int i = 0; i < nc; i++)
1702         {
1703             slb_frac[i] /= tot;
1704             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1705         }
1706         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1707     }
1708
1709     return slb_frac;
1710 }
1711
1712 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1713 {
1714     int n = 0;
1715     for (const auto ilists : IListRange(mtop))
1716     {
1717         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1718         {
1719             if (NRAL(ilist.functionType) > 2)
1720             {
1721                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1722             }
1723         }
1724     }
1725
1726     return n;
1727 }
1728
1729 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1730 {
1731     int   nst = def;
1732     char* val = getenv(env_var);
1733     if (val)
1734     {
1735         if (sscanf(val, "%20d", &nst) <= 0)
1736         {
1737             nst = 1;
1738         }
1739         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1740     }
1741
1742     return nst;
1743 }
1744
1745 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1746 {
1747     if (inputrec.pbcType == PbcType::Screw
1748         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1749     {
1750         gmx_fatal(FARGS,
1751                   "With pbc=%s can only do domain decomposition in the x-direction",
1752                   c_pbcTypeNames[inputrec.pbcType].c_str());
1753     }
1754
1755     if (inputrec.nstlist == 0)
1756     {
1757         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1758     }
1759
1760     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1761     {
1762         GMX_LOG(mdlog.warning)
1763                 .appendText(
1764                         "comm-mode angular will give incorrect results when the comm group "
1765                         "partially crosses a periodic boundary");
1766     }
1767 }
1768
1769 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1770 {
1771     real r = ddbox.box_size[XX];
1772     for (int d = 0; d < DIM; d++)
1773     {
1774         if (numDomains[d] > 1)
1775         {
1776             /* Check using the initial average cell size */
1777             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1778         }
1779     }
1780
1781     return r;
1782 }
1783
1784 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1785  */
1786 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1787                                   const std::string&   reasonStr,
1788                                   const gmx::MDLogger& mdlog)
1789 {
1790     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1791     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1792
1793     if (cmdlineDlbState == DlbState::onUser)
1794     {
1795         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1796     }
1797     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1798     {
1799         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1800     }
1801     return DlbState::offForever;
1802 }
1803
1804 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1805  *
1806  * This function parses the parameters of "-dlb" command line option setting
1807  * corresponding state values. Then it checks the consistency of the determined
1808  * state with other run parameters and settings. As a result, the initial state
1809  * may be altered or an error may be thrown if incompatibility of options is detected.
1810  *
1811  * \param [in] mdlog       Logger.
1812  * \param [in] dlbOption   Enum value for the DLB option.
1813  * \param [in] bRecordLoad True if the load balancer is recording load information.
1814  * \param [in] mdrunOptions  Options for mdrun.
1815  * \param [in] inputrec    Pointer mdrun to input parameters.
1816  * \returns                DLB initial/startup state.
1817  */
1818 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1819                                          DlbOption                dlbOption,
1820                                          gmx_bool                 bRecordLoad,
1821                                          const gmx::MdrunOptions& mdrunOptions,
1822                                          const t_inputrec&        inputrec)
1823 {
1824     DlbState dlbState = DlbState::offCanTurnOn;
1825
1826     switch (dlbOption)
1827     {
1828         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1829         case DlbOption::no: dlbState = DlbState::offUser; break;
1830         case DlbOption::yes: dlbState = DlbState::onUser; break;
1831         default: gmx_incons("Invalid dlbOption enum value");
1832     }
1833
1834     /* Reruns don't support DLB: bail or override auto mode */
1835     if (mdrunOptions.rerun)
1836     {
1837         std::string reasonStr = "it is not supported in reruns.";
1838         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1839     }
1840
1841     /* Unsupported integrators */
1842     if (!EI_DYNAMICS(inputrec.eI))
1843     {
1844         auto reasonStr =
1845                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1846                                   enumValueToString(inputrec.eI));
1847         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1848     }
1849
1850     /* Without cycle counters we can't time work to balance on */
1851     if (!bRecordLoad)
1852     {
1853         std::string reasonStr =
1854                 "cycle counters unsupported or not enabled in the operating system kernel.";
1855         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1856     }
1857
1858     if (mdrunOptions.reproducible)
1859     {
1860         std::string reasonStr = "you started a reproducible run.";
1861         switch (dlbState)
1862         {
1863             case DlbState::offUser: break;
1864             case DlbState::offForever:
1865                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1866                 break;
1867             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1868             case DlbState::onCanTurnOff:
1869                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1870                 break;
1871             case DlbState::onUser:
1872                 return forceDlbOffOrBail(
1873                         dlbState,
1874                         reasonStr
1875                                 + " In load balanced runs binary reproducibility cannot be "
1876                                   "ensured.",
1877                         mdlog);
1878             default:
1879                 gmx_fatal(FARGS,
1880                           "Death horror: undefined case (%d) for load balancing choice",
1881                           static_cast<int>(dlbState));
1882         }
1883     }
1884
1885     return dlbState;
1886 }
1887
1888 static gmx_domdec_comm_t* init_dd_comm()
1889 {
1890     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1891
1892     comm->n_load_have    = 0;
1893     comm->n_load_collect = 0;
1894
1895     comm->haveTurnedOffDlb = false;
1896
1897     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1898     {
1899         comm->sum_nat[i] = 0;
1900     }
1901     comm->ndecomp   = 0;
1902     comm->nload     = 0;
1903     comm->load_step = 0;
1904     comm->load_sum  = 0;
1905     comm->load_max  = 0;
1906     clear_ivec(comm->load_lim);
1907     comm->load_mdf = 0;
1908     comm->load_pme = 0;
1909
1910     /* This should be replaced by a unique pointer */
1911     comm->balanceRegion = ddBalanceRegionAllocate();
1912
1913     return comm;
1914 }
1915
1916 static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
1917                               const gmx_mtop_t&                 mtop,
1918                               ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1919                               const bool                        useUpdateGroups,
1920                               const real                        maxUpdateGroupRadius,
1921                               DDSystemInfo*                     systemInfo)
1922 {
1923     systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1924     systemInfo->useUpdateGroups                = useUpdateGroups;
1925     systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
1926
1927     if (systemInfo->useUpdateGroups)
1928     {
1929         int numUpdateGroups = 0;
1930         for (const auto& molblock : mtop.molblock)
1931         {
1932             numUpdateGroups += molblock.nmol
1933                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1934         }
1935
1936         GMX_LOG(mdlog.info)
1937                 .appendTextFormatted(
1938                         "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1939                         "nm\n",
1940                         numUpdateGroups,
1941                         mtop.natoms / static_cast<double>(numUpdateGroups),
1942                         systemInfo->maxUpdateGroupRadius);
1943     }
1944 }
1945
1946 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1947     npbcdim(numPbcDimensions(ir.pbcType)),
1948     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1949     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1950     haveScrewPBC(ir.pbcType == PbcType::Screw)
1951 {
1952 }
1953
1954 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1955 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1956                                     const bool                                  useUpdateGroups,
1957                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1958 {
1959     if (useUpdateGroups)
1960     {
1961         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1962                            "Need one grouping per moltype");
1963         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
1964         {
1965             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1966             {
1967                 return false;
1968             }
1969         }
1970     }
1971     else
1972     {
1973         for (const auto& moltype : mtop.moltype)
1974         {
1975             if (moltype.atoms.nr > 1)
1976             {
1977                 return false;
1978             }
1979         }
1980     }
1981
1982     return true;
1983 }
1984
1985 /*! \brief Generate the simulation system information */
1986 static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
1987                                   DDRole                            ddRole,
1988                                   MPI_Comm                          communicator,
1989                                   const DomdecOptions&              options,
1990                                   const gmx_mtop_t&                 mtop,
1991                                   const t_inputrec&                 ir,
1992                                   const matrix                      box,
1993                                   ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
1994                                   const bool                        useUpdateGroups,
1995                                   const real                        maxUpdateGroupRadius,
1996                                   gmx::ArrayRef<const gmx::RVec>    xGlobal)
1997 {
1998     const real tenPercentMargin = 1.1;
1999
2000     DDSystemInfo systemInfo;
2001
2002     setupUpdateGroups(
2003             mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
2004
2005     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2006             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2007     systemInfo.haveInterDomainBondeds =
2008             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2009     systemInfo.haveInterDomainMultiBodyBondeds =
2010             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2011
2012     if (systemInfo.useUpdateGroups)
2013     {
2014         systemInfo.mayHaveSplitConstraints = false;
2015         systemInfo.mayHaveSplitSettles     = false;
2016     }
2017     else
2018     {
2019         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2020                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2021         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2022     }
2023
2024     if (ir.rlist == 0)
2025     {
2026         /* Set the cut-off to some very large value,
2027          * so we don't need if statements everywhere in the code.
2028          * We use sqrt, since the cut-off is squared in some places.
2029          */
2030         systemInfo.cutoff = GMX_CUTOFF_INF;
2031     }
2032     else
2033     {
2034         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2035     }
2036     systemInfo.minCutoffForMultiBody = 0;
2037
2038     /* Determine the minimum cell size limit, affected by many factors */
2039     systemInfo.cellsizeLimit             = 0;
2040     systemInfo.filterBondedCommunication = false;
2041
2042     /* We do not allow home atoms to move beyond the neighboring domain
2043      * between domain decomposition steps, which limits the cell size.
2044      * Get an estimate of cell size limit due to atom displacement.
2045      * In most cases this is a large overestimate, because it assumes
2046      * non-interaction atoms.
2047      * We set the chance to 1 in a trillion steps.
2048      */
2049     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2050     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2051             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2052     GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2053
2054     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2055
2056     /* TODO: PME decomposition currently requires atoms not to be more than
2057      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2058      *       In nearly all cases, limitForAtomDisplacement will be smaller
2059      *       than 2/3*rlist, so the PME requirement is satisfied.
2060      *       But it would be better for both correctness and performance
2061      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2062      *       Note that we would need to improve the pairlist buffer case.
2063      */
2064
2065     if (systemInfo.haveInterDomainBondeds)
2066     {
2067         if (options.minimumCommunicationRange > 0)
2068         {
2069             systemInfo.minCutoffForMultiBody =
2070                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2071             if (options.useBondedCommunication)
2072             {
2073                 systemInfo.filterBondedCommunication =
2074                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2075             }
2076             else
2077             {
2078                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2079             }
2080         }
2081         else if (ir.bPeriodicMols)
2082         {
2083             /* Can not easily determine the required cut-off */
2084             GMX_LOG(mdlog.warning)
2085                     .appendText(
2086                             "NOTE: Periodic molecules are present in this system. Because of this, "
2087                             "the domain decomposition algorithm cannot easily determine the "
2088                             "minimum cell size that it requires for treating bonded interactions. "
2089                             "Instead, domain decomposition will assume that half the non-bonded "
2090                             "cut-off will be a suitable lower bound.");
2091             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2092         }
2093         else
2094         {
2095             real r_2b = 0;
2096             real r_mb = 0;
2097
2098             if (ddRole == DDRole::Master)
2099             {
2100                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2101             }
2102             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2103             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2104
2105             /* We use an initial margin of 10% for the minimum cell size,
2106              * except when we are just below the non-bonded cut-off.
2107              */
2108             if (options.useBondedCommunication)
2109             {
2110                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2111                 {
2112                     const real r_bonded              = std::max(r_2b, r_mb);
2113                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2114                     /* This is the (only) place where we turn on the filtering */
2115                     systemInfo.filterBondedCommunication = true;
2116                 }
2117                 else
2118                 {
2119                     const real r_bonded = r_mb;
2120                     systemInfo.minCutoffForMultiBody =
2121                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2122                 }
2123                 /* We determine cutoff_mbody later */
2124                 systemInfo.increaseMultiBodyCutoff = true;
2125             }
2126             else
2127             {
2128                 /* No special bonded communication,
2129                  * simply increase the DD cut-off.
2130                  */
2131                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2132                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2133             }
2134         }
2135         GMX_LOG(mdlog.info)
2136                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2137                                      systemInfo.minCutoffForMultiBody);
2138
2139         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2140     }
2141
2142     systemInfo.constraintCommunicationRange = 0;
2143     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2144     {
2145         /* There is a cell size limit due to the constraints (P-LINCS) */
2146         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2147         GMX_LOG(mdlog.info)
2148                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2149                                      systemInfo.constraintCommunicationRange);
2150         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2151         {
2152             GMX_LOG(mdlog.info)
2153                     .appendText(
2154                             "This distance will limit the DD cell size, you can override this with "
2155                             "-rcon");
2156         }
2157     }
2158     else if (options.constraintCommunicationRange > 0)
2159     {
2160         /* Here we do not check for dd->splitConstraints.
2161          * because one can also set a cell size limit for virtual sites only
2162          * and at this point we don't know yet if there are intercg v-sites.
2163          */
2164         GMX_LOG(mdlog.info)
2165                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2166                                      options.constraintCommunicationRange);
2167         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2168     }
2169     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2170
2171     return systemInfo;
2172 }
2173
2174 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2175  * implemented. */
2176 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2177                              DDRole               ddRole,
2178                              MPI_Comm             communicator,
2179                              int                  numNodes,
2180                              const DomdecOptions& options,
2181                              const DDSettings&    ddSettings,
2182                              const DDSystemInfo&  systemInfo,
2183                              const real           cellsizeLimit,
2184                              const gmx_ddbox_t&   ddbox)
2185 {
2186     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2187     {
2188         const bool  bC = (systemInfo.mayHaveSplitConstraints
2189                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2190         std::string message =
2191                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2192                                   !bC ? "-rdd" : "-rcon",
2193                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2194                                   bC ? " or your LINCS settings" : "");
2195
2196         gmx_fatal_collective(FARGS,
2197                              communicator,
2198                              ddRole == DDRole::Master,
2199                              "There is no domain decomposition for %d ranks that is compatible "
2200                              "with the given box and a minimum cell size of %g nm\n"
2201                              "%s\n"
2202                              "Look in the log file for details on the domain decomposition",
2203                              numNodes - ddGridSetup.numPmeOnlyRanks,
2204                              cellsizeLimit,
2205                              message.c_str());
2206     }
2207
2208     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2209     if (acs < cellsizeLimit)
2210     {
2211         if (options.numCells[XX] <= 0)
2212         {
2213             GMX_RELEASE_ASSERT(
2214                     false,
2215                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2216         }
2217         else
2218         {
2219             gmx_fatal_collective(
2220                     FARGS,
2221                     communicator,
2222                     ddRole == DDRole::Master,
2223                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2224                     "options -dd, -rdd or -rcon, see the log file for details",
2225                     acs,
2226                     cellsizeLimit);
2227         }
2228     }
2229
2230     const int numPPRanks =
2231             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2232     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2233     {
2234         gmx_fatal_collective(FARGS,
2235                              communicator,
2236                              ddRole == DDRole::Master,
2237                              "The size of the domain decomposition grid (%d) does not match the "
2238                              "number of PP ranks (%d). The total number of ranks is %d",
2239                              numPPRanks,
2240                              numNodes - ddGridSetup.numPmeOnlyRanks,
2241                              numNodes);
2242     }
2243     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2244     {
2245         gmx_fatal_collective(FARGS,
2246                              communicator,
2247                              ddRole == DDRole::Master,
2248                              "The number of separate PME ranks (%d) is larger than the number of "
2249                              "PP ranks (%d), this is not supported.",
2250                              ddGridSetup.numPmeOnlyRanks,
2251                              numPPRanks);
2252     }
2253 }
2254
2255 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2256 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2257                                   int                  numNodes,
2258                                   const DDGridSetup&   ddGridSetup,
2259                                   const t_inputrec&    ir)
2260 {
2261     GMX_LOG(mdlog.info)
2262             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2263                                  ddGridSetup.numDomains[XX],
2264                                  ddGridSetup.numDomains[YY],
2265                                  ddGridSetup.numDomains[ZZ],
2266                                  ddGridSetup.numPmeOnlyRanks);
2267
2268     DDRankSetup ddRankSetup;
2269
2270     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2271     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2272
2273     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2274     if (ddRankSetup.usePmeOnlyRanks)
2275     {
2276         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2277     }
2278     else
2279     {
2280         ddRankSetup.numRanksDoingPme =
2281                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2282     }
2283
2284     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2285     {
2286         /* The following choices should match those
2287          * in comm_cost_est in domdec_setup.c.
2288          * Note that here the checks have to take into account
2289          * that the decomposition might occur in a different order than xyz
2290          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2291          * in which case they will not match those in comm_cost_est,
2292          * but since that is mainly for testing purposes that's fine.
2293          */
2294         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2295             && ddGridSetup.ddDimensions[1] == YY
2296             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2297             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2298             && getenv("GMX_PMEONEDD") == nullptr)
2299         {
2300             ddRankSetup.npmedecompdim = 2;
2301             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2302             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2303         }
2304         else
2305         {
2306             /* In case nc is 1 in both x and y we could still choose to
2307              * decompose pme in y instead of x, but we use x for simplicity.
2308              */
2309             ddRankSetup.npmedecompdim = 1;
2310             if (ddGridSetup.ddDimensions[0] == YY)
2311             {
2312                 ddRankSetup.npmenodes_x = 1;
2313                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2314             }
2315             else
2316             {
2317                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2318                 ddRankSetup.npmenodes_y = 1;
2319             }
2320         }
2321         GMX_LOG(mdlog.info)
2322                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2323                                      ddRankSetup.npmenodes_x,
2324                                      ddRankSetup.npmenodes_y,
2325                                      1);
2326     }
2327     else
2328     {
2329         ddRankSetup.npmedecompdim = 0;
2330         ddRankSetup.npmenodes_x   = 0;
2331         ddRankSetup.npmenodes_y   = 0;
2332     }
2333
2334     return ddRankSetup;
2335 }
2336
2337 /*! \brief Set the cell size and interaction limits */
2338 static void set_dd_limits(const gmx::MDLogger& mdlog,
2339                           DDRole               ddRole,
2340                           gmx_domdec_t*        dd,
2341                           const DomdecOptions& options,
2342                           const DDSettings&    ddSettings,
2343                           const DDSystemInfo&  systemInfo,
2344                           const DDGridSetup&   ddGridSetup,
2345                           const int            numPPRanks,
2346                           const gmx_mtop_t&    mtop,
2347                           const t_inputrec&    ir,
2348                           const gmx_ddbox_t&   ddbox)
2349 {
2350     gmx_domdec_comm_t* comm = dd->comm;
2351     comm->ddSettings        = ddSettings;
2352
2353     /* Initialize to GPU share count to 0, might change later */
2354     comm->nrank_gpu_shared = 0;
2355
2356     comm->dlbState = comm->ddSettings.initialDlbState;
2357     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2358     /* To consider turning DLB on after 2*nstlist steps we need to check
2359      * at partitioning count 3. Thus we need to increase the first count by 2.
2360      */
2361     comm->ddPartioningCountFirstDlbOff += 2;
2362
2363     comm->bPMELoadBalDLBLimits = FALSE;
2364
2365     /* Allocate the charge group/atom sorting struct */
2366     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2367
2368     comm->systemInfo = systemInfo;
2369
2370     if (systemInfo.useUpdateGroups)
2371     {
2372         /* Note: We would like to use dd->nnodes for the atom count estimate,
2373          *       but that is not yet available here. But this anyhow only
2374          *       affect performance up to the second dd_partition_system call.
2375          */
2376         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2377         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2378                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2379     }
2380
2381     /* Set the DD setup given by ddGridSetup */
2382     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2383     dd->ndim = ddGridSetup.numDDDimensions;
2384     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2385
2386     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2387
2388     snew(comm->slb_frac, DIM);
2389     if (isDlbDisabled(comm))
2390     {
2391         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2392         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2393         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2394     }
2395
2396     /* Set the multi-body cut-off and cellsize limit for DLB */
2397     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2398     comm->cellsize_limit = systemInfo.cellsizeLimit;
2399     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2400     {
2401         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2402         {
2403             /* Set the bonded communication distance to halfway
2404              * the minimum and the maximum,
2405              * since the extra communication cost is nearly zero.
2406              */
2407             real acs           = average_cellsize_min(ddbox, dd->numCells);
2408             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2409             if (!isDlbDisabled(comm))
2410             {
2411                 /* Check if this does not limit the scaling */
2412                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2413             }
2414             if (!systemInfo.filterBondedCommunication)
2415             {
2416                 /* Without bBondComm do not go beyond the n.b. cut-off */
2417                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2418                 if (comm->cellsize_limit >= systemInfo.cutoff)
2419                 {
2420                     /* We don't loose a lot of efficieny
2421                      * when increasing it to the n.b. cut-off.
2422                      * It can even be slightly faster, because we need
2423                      * less checks for the communication setup.
2424                      */
2425                     comm->cutoff_mbody = systemInfo.cutoff;
2426                 }
2427             }
2428             /* Check if we did not end up below our original limit */
2429             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2430
2431             if (comm->cutoff_mbody > comm->cellsize_limit)
2432             {
2433                 comm->cellsize_limit = comm->cutoff_mbody;
2434             }
2435         }
2436         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2437     }
2438
2439     if (debug)
2440     {
2441         fprintf(debug,
2442                 "Bonded atom communication beyond the cut-off: %s\n"
2443                 "cellsize limit %f\n",
2444                 gmx::boolToString(systemInfo.filterBondedCommunication),
2445                 comm->cellsize_limit);
2446     }
2447
2448     if (ddRole == DDRole::Master)
2449     {
2450         check_dd_restrictions(dd, ir, mdlog);
2451     }
2452 }
2453
2454 static void writeSettings(gmx::TextWriter*   log,
2455                           gmx_domdec_t*      dd,
2456                           const gmx_mtop_t&  mtop,
2457                           const t_inputrec&  ir,
2458                           gmx_bool           bDynLoadBal,
2459                           real               dlb_scale,
2460                           const gmx_ddbox_t* ddbox)
2461 {
2462     gmx_domdec_comm_t* comm = dd->comm;
2463
2464     if (bDynLoadBal)
2465     {
2466         log->writeString("The maximum number of communication pulses is:");
2467         for (int d = 0; d < dd->ndim; d++)
2468         {
2469             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2470         }
2471         log->ensureLineBreak();
2472         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2473                                 comm->cellsize_limit);
2474         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2475         log->writeString("The allowed shrink of domain decomposition cells is:");
2476         for (int d = 0; d < DIM; d++)
2477         {
2478             if (dd->numCells[d] > 1)
2479             {
2480                 const real shrink =
2481                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2482                                 ? 0
2483                                 : comm->cellsize_min_dlb[d]
2484                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2485                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2486             }
2487         }
2488         log->ensureLineBreak();
2489     }
2490     else
2491     {
2492         ivec np;
2493         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2494         log->writeString("The initial number of communication pulses is:");
2495         for (int d = 0; d < dd->ndim; d++)
2496         {
2497             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2498         }
2499         log->ensureLineBreak();
2500         log->writeString("The initial domain decomposition cell size is:");
2501         for (int d = 0; d < DIM; d++)
2502         {
2503             if (dd->numCells[d] > 1)
2504             {
2505                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2506             }
2507         }
2508         log->ensureLineBreak();
2509         log->writeLine();
2510     }
2511
2512     const bool haveInterDomainVsites =
2513             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2514
2515     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2516         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2517     {
2518         std::string decompUnits;
2519         if (comm->systemInfo.useUpdateGroups)
2520         {
2521             decompUnits = "atom groups";
2522         }
2523         else
2524         {
2525             decompUnits = "atoms";
2526         }
2527
2528         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2529                                 decompUnits.c_str());
2530         log->writeLineFormatted(
2531                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2532
2533         real limit = 0;
2534         if (bDynLoadBal)
2535         {
2536             limit = dd->comm->cellsize_limit;
2537         }
2538         else
2539         {
2540             if (dd->unitCellInfo.ddBoxIsDynamic)
2541             {
2542                 log->writeLine(
2543                         "(the following are initial values, they could change due to box "
2544                         "deformation)");
2545             }
2546             limit = dd->comm->cellsize_min[XX];
2547             for (int d = 1; d < DIM; d++)
2548             {
2549                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2550             }
2551         }
2552
2553         if (comm->systemInfo.haveInterDomainBondeds)
2554         {
2555             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2556                                     "two-body bonded interactions",
2557                                     "(-rdd)",
2558                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2559             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2560                                     "multi-body bonded interactions",
2561                                     "(-rdd)",
2562                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2563                                             ? comm->cutoff_mbody
2564                                             : std::min(comm->systemInfo.cutoff, limit));
2565         }
2566         if (haveInterDomainVsites)
2567         {
2568             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2569         }
2570         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2571         {
2572             std::string separation =
2573                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2574             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2575         }
2576         log->ensureLineBreak();
2577     }
2578 }
2579
2580 static void logSettings(const gmx::MDLogger& mdlog,
2581                         gmx_domdec_t*        dd,
2582                         const gmx_mtop_t&    mtop,
2583                         const t_inputrec&    ir,
2584                         real                 dlb_scale,
2585                         const gmx_ddbox_t*   ddbox)
2586 {
2587     gmx::StringOutputStream stream;
2588     gmx::TextWriter         log(&stream);
2589     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2590     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2591     {
2592         {
2593             log.ensureEmptyLine();
2594             log.writeLine(
2595                     "When dynamic load balancing gets turned on, these settings will change to:");
2596         }
2597         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2598     }
2599     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2600 }
2601
2602 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2603                                 gmx_domdec_t*        dd,
2604                                 real                 dlb_scale,
2605                                 const t_inputrec&    inputrec,
2606                                 const gmx_ddbox_t*   ddbox)
2607 {
2608     int npulse       = 0;
2609     int npulse_d_max = 0;
2610     int npulse_d     = 0;
2611
2612     gmx_domdec_comm_t* comm = dd->comm;
2613
2614     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2615
2616     /* Determine the maximum number of comm. pulses in one dimension */
2617
2618     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2619
2620     /* Determine the maximum required number of grid pulses */
2621     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2622     {
2623         /* Only a single pulse is required */
2624         npulse = 1;
2625     }
2626     else if (!bNoCutOff && comm->cellsize_limit > 0)
2627     {
2628         /* We round down slightly here to avoid overhead due to the latency
2629          * of extra communication calls when the cut-off
2630          * would be only slightly longer than the cell size.
2631          * Later cellsize_limit is redetermined,
2632          * so we can not miss interactions due to this rounding.
2633          */
2634         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
2635     }
2636     else
2637     {
2638         /* There is no cell size limit */
2639         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2640     }
2641
2642     if (!bNoCutOff && npulse > 1)
2643     {
2644         /* See if we can do with less pulses, based on dlb_scale */
2645         npulse_d_max = 0;
2646         for (int d = 0; d < dd->ndim; d++)
2647         {
2648             int dim  = dd->dim[d];
2649             npulse_d = static_cast<int>(
2650                     1
2651                     + dd->numCells[dim] * comm->systemInfo.cutoff
2652                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2653             npulse_d_max = std::max(npulse_d_max, npulse_d);
2654         }
2655         npulse = std::min(npulse, npulse_d_max);
2656     }
2657
2658     /* This env var can override npulse */
2659     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2660     if (ddPulseEnv > 0)
2661     {
2662         npulse = ddPulseEnv;
2663     }
2664
2665     comm->maxpulse       = 1;
2666     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2667     for (int d = 0; d < dd->ndim; d++)
2668     {
2669         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2670         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2671         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2672         {
2673             comm->bVacDLBNoLimit = FALSE;
2674         }
2675     }
2676
2677     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2678     if (!comm->bVacDLBNoLimit)
2679     {
2680         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2681     }
2682     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2683     /* Set the minimum cell size for each DD dimension */
2684     for (int d = 0; d < dd->ndim; d++)
2685     {
2686         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2687         {
2688             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2689         }
2690         else
2691         {
2692             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2693         }
2694     }
2695     if (comm->cutoff_mbody <= 0)
2696     {
2697         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2698     }
2699     if (isDlbOn(comm))
2700     {
2701         set_dlb_limits(dd);
2702     }
2703 }
2704
2705 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2706 {
2707     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2708 }
2709
2710 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2711 {
2712     /* If each molecule is a single charge group
2713      * or we use domain decomposition for each periodic dimension,
2714      * we do not need to take pbc into account for the bonded interactions.
2715      */
2716     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2717             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2718                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2719 }
2720
2721 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2722 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2723                                   gmx_domdec_t*        dd,
2724                                   real                 dlb_scale,
2725                                   const gmx_mtop_t&    mtop,
2726                                   const t_inputrec&    inputrec,
2727                                   const gmx_ddbox_t*   ddbox)
2728 {
2729     gmx_domdec_comm_t* comm        = dd->comm;
2730     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2731
2732     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2733     {
2734         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2735         if (ddRankSetup.npmedecompdim >= 2)
2736         {
2737             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2738         }
2739     }
2740     else
2741     {
2742         ddRankSetup.numRanksDoingPme = 0;
2743         if (dd->pme_nodeid >= 0)
2744         {
2745             gmx_fatal_collective(FARGS,
2746                                  dd->mpi_comm_all,
2747                                  DDMASTER(dd),
2748                                  "Can not have separate PME ranks without PME electrostatics");
2749         }
2750     }
2751
2752     if (debug)
2753     {
2754         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2755     }
2756     if (!isDlbDisabled(comm))
2757     {
2758         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2759     }
2760
2761     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2762
2763     const real vol_frac = (inputrec.pbcType == PbcType::No)
2764                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2765                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2766                                      / static_cast<double>(dd->nnodes));
2767     if (debug)
2768     {
2769         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2770     }
2771     int natoms_tot = mtop.natoms;
2772
2773     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2774 }
2775
2776 /*! \brief Get some important DD parameters which can be modified by env.vars */
2777 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2778                                 const DomdecOptions&     options,
2779                                 const gmx::MdrunOptions& mdrunOptions,
2780                                 const t_inputrec&        ir)
2781 {
2782     DDSettings ddSettings;
2783
2784     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2785     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2786     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2787     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2788     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2789     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2790     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2791     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2792     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2793
2794     if (ddSettings.useSendRecv2)
2795     {
2796         GMX_LOG(mdlog.info)
2797                 .appendText(
2798                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2799                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2800                         "communication");
2801     }
2802
2803     if (ddSettings.eFlop)
2804     {
2805         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2806         ddSettings.recordLoad = true;
2807     }
2808     else
2809     {
2810         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2811     }
2812
2813     ddSettings.initialDlbState =
2814             determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
2815     GMX_LOG(mdlog.info)
2816             .appendTextFormatted("Dynamic load balancing: %s",
2817                                  enumValueToString(ddSettings.initialDlbState));
2818
2819     return ddSettings;
2820 }
2821
2822 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2823
2824 gmx_domdec_t::~gmx_domdec_t() = default;
2825
2826 namespace gmx
2827 {
2828
2829 // TODO once the functionality stablizes, move this class and
2830 // supporting functionality into builder.cpp
2831 /*! \brief Impl class for DD builder */
2832 class DomainDecompositionBuilder::Impl
2833 {
2834 public:
2835     //! Constructor
2836     Impl(const MDLogger&                   mdlog,
2837          t_commrec*                        cr,
2838          const DomdecOptions&              options,
2839          const MdrunOptions&               mdrunOptions,
2840          const gmx_mtop_t&                 mtop,
2841          const t_inputrec&                 ir,
2842          const MDModulesNotifiers&         notifiers,
2843          const matrix                      box,
2844          ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2845          bool                              useUpdateGroups,
2846          real                              maxUpdateGroupRadius,
2847          ArrayRef<const RVec>              xGlobal,
2848          bool                              useGpuForNonbonded,
2849          bool                              useGpuForPme);
2850
2851     //! Build the resulting DD manager
2852     gmx_domdec_t* build(LocalAtomSetManager* atomSets);
2853
2854     //! Objects used in constructing and configuring DD
2855     //! {
2856     //! Logging object
2857     const MDLogger& mdlog_;
2858     //! Communication object
2859     t_commrec* cr_;
2860     //! User-supplied options configuring DD behavior
2861     const DomdecOptions options_;
2862     //! Global system topology
2863     const gmx_mtop_t& mtop_;
2864     //! User input values from the tpr file
2865     const t_inputrec& ir_;
2866     //! MdModules object
2867     const MDModulesNotifiers& notifiers_;
2868     //! }
2869
2870     //! Internal objects used in constructing DD
2871     //! {
2872     //! Settings combined from the user input
2873     DDSettings ddSettings_;
2874     //! Information derived from the simulation system
2875     DDSystemInfo systemInfo_;
2876     //! Box structure
2877     gmx_ddbox_t ddbox_ = { 0 };
2878     //! Organization of the DD grids
2879     DDGridSetup ddGridSetup_;
2880     //! Organzation of the DD ranks
2881     DDRankSetup ddRankSetup_;
2882     //! Number of DD cells in each dimension
2883     ivec ddCellIndex_ = { 0, 0, 0 };
2884     //! IDs of PME-only ranks
2885     std::vector<int> pmeRanks_;
2886     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2887     CartesianRankSetup cartSetup_;
2888     //! }
2889 };
2890
2891 DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
2892                                        t_commrec*                        cr,
2893                                        const DomdecOptions&              options,
2894                                        const MdrunOptions&               mdrunOptions,
2895                                        const gmx_mtop_t&                 mtop,
2896                                        const t_inputrec&                 ir,
2897                                        const MDModulesNotifiers&         notifiers,
2898                                        const matrix                      box,
2899                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2900                                        const bool                        useUpdateGroups,
2901                                        const real                        maxUpdateGroupRadius,
2902                                        ArrayRef<const RVec>              xGlobal,
2903                                        bool                              useGpuForNonbonded,
2904                                        bool                              useGpuForPme) :
2905     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2906 {
2907     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2908
2909     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
2910
2911     if (ddSettings_.eFlop > 1)
2912     {
2913         /* Ensure that we have different random flop counts on different ranks */
2914         srand(1 + cr_->rankInDefaultCommunicator);
2915     }
2916
2917     systemInfo_ = getSystemInfo(mdlog_,
2918                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2919                                 cr->mpiDefaultCommunicator,
2920                                 options_,
2921                                 mtop_,
2922                                 ir_,
2923                                 box,
2924                                 updateGroupingPerMoleculeType,
2925                                 useUpdateGroups,
2926                                 maxUpdateGroupRadius,
2927                                 xGlobal);
2928
2929     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2930     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2931
2932
2933     /* Checks for ability to use PME-only ranks */
2934     auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2935             notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2936
2937     /* Checks for validity of requested Ranks setup */
2938     checkForValidRankCountRequests(numRanksRequested,
2939                                    EEL_PME(ir_.coulombtype) | EVDW_PME(ir_.vdwtype),
2940                                    options_.numPmeRanks,
2941                                    separatePmeRanksPermitted,
2942                                    checkForLargePrimeFactors);
2943
2944     // DD grid setup uses a more different cell size limit for
2945     // automated setup than the one in systemInfo_. The latter is used
2946     // in set_dd_limits() to configure DLB, for example.
2947     const real gridSetupCellsizeLimit =
2948             getDDGridSetupCellSizeLimit(mdlog_,
2949                                         !isDlbDisabled(ddSettings_.initialDlbState),
2950                                         options_.dlbScaling,
2951                                         ir_,
2952                                         systemInfo_.cellsizeLimit);
2953     ddGridSetup_ = getDDGridSetup(mdlog_,
2954                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2955                                   cr->mpiDefaultCommunicator,
2956                                   numRanksRequested,
2957                                   options_,
2958                                   ddSettings_,
2959                                   systemInfo_,
2960                                   gridSetupCellsizeLimit,
2961                                   mtop_,
2962                                   ir_,
2963                                   separatePmeRanksPermitted,
2964                                   box,
2965                                   xGlobal,
2966                                   &ddbox_);
2967     checkDDGridSetup(ddGridSetup_,
2968                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2969                      cr->mpiDefaultCommunicator,
2970                      cr->sizeOfDefaultCommunicator,
2971                      options_,
2972                      ddSettings_,
2973                      systemInfo_,
2974                      gridSetupCellsizeLimit,
2975                      ddbox_);
2976
2977     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2978
2979     ddRankSetup_ = getDDRankSetup(mdlog_, cr_->sizeOfDefaultCommunicator, ddGridSetup_, ir_);
2980
2981     /* Generate the group communicator, also decides the duty of each rank */
2982     cartSetup_ = makeGroupCommunicators(
2983             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
2984 }
2985
2986 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets)
2987 {
2988     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
2989
2990     copy_ivec(ddCellIndex_, dd->ci);
2991
2992     dd->comm = init_dd_comm();
2993
2994     dd->comm->ddRankSetup        = ddRankSetup_;
2995     dd->comm->cartesianRankSetup = cartSetup_;
2996
2997     set_dd_limits(mdlog_,
2998                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2999                   dd,
3000                   options_,
3001                   ddSettings_,
3002                   systemInfo_,
3003                   ddGridSetup_,
3004                   ddRankSetup_.numPPRanks,
3005                   mtop_,
3006                   ir_,
3007                   ddbox_);
3008
3009     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3010
3011     if (thisRankHasDuty(cr_, DUTY_PP))
3012     {
3013         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3014
3015         setup_neighbor_relations(dd);
3016     }
3017
3018     /* Set overallocation to avoid frequent reallocation of arrays */
3019     set_over_alloc_dd(true);
3020
3021     dd->atomSets = atomSets;
3022
3023     dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>();
3024
3025     return dd;
3026 }
3027
3028 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
3029                                                        t_commrec*                cr,
3030                                                        const DomdecOptions&      options,
3031                                                        const MdrunOptions&       mdrunOptions,
3032                                                        const gmx_mtop_t&         mtop,
3033                                                        const t_inputrec&         ir,
3034                                                        const MDModulesNotifiers& notifiers,
3035                                                        const matrix              box,
3036                                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3037                                                        const bool           useUpdateGroups,
3038                                                        const real           maxUpdateGroupRadius,
3039                                                        ArrayRef<const RVec> xGlobal,
3040                                                        const bool           useGpuForNonbonded,
3041                                                        const bool           useGpuForPme) :
3042     impl_(new Impl(mdlog,
3043                    cr,
3044                    options,
3045                    mdrunOptions,
3046                    mtop,
3047                    ir,
3048                    notifiers,
3049                    box,
3050                    updateGroupingPerMoleculeType,
3051                    useUpdateGroups,
3052                    maxUpdateGroupRadius,
3053                    xGlobal,
3054                    useGpuForNonbonded,
3055                    useGpuForPme))
3056 {
3057 }
3058
3059 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager* atomSets)
3060 {
3061     return impl_->build(atomSets);
3062 }
3063
3064 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3065
3066 } // namespace gmx
3067
3068 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3069 {
3070     gmx_ddbox_t ddbox;
3071     int         LocallyLimited = 0;
3072
3073     const auto* dd = cr->dd;
3074
3075     set_ddbox(*dd, false, box, true, x, &ddbox);
3076
3077     LocallyLimited = 0;
3078
3079     for (int d = 0; d < dd->ndim; d++)
3080     {
3081         const int dim = dd->dim[d];
3082
3083         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3084         if (dd->unitCellInfo.ddBoxIsDynamic)
3085         {
3086             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3087         }
3088
3089         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3090
3091         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3092         {
3093             if (np > dd->comm->cd[d].np_dlb)
3094             {
3095                 return FALSE;
3096             }
3097
3098             /* If a current local cell size is smaller than the requested
3099              * cut-off, we could still fix it, but this gets very complicated.
3100              * Without fixing here, we might actually need more checks.
3101              */
3102             real cellSizeAlongDim =
3103                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3104             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3105             {
3106                 LocallyLimited = 1;
3107             }
3108         }
3109     }
3110
3111     if (!isDlbDisabled(dd->comm))
3112     {
3113         /* If DLB is not active yet, we don't need to check the grid jumps.
3114          * Actually we shouldn't, because then the grid jump data is not set.
3115          */
3116         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3117         {
3118             LocallyLimited = 1;
3119         }
3120
3121         gmx_sumi(1, &LocallyLimited, cr);
3122
3123         if (LocallyLimited > 0)
3124         {
3125             return FALSE;
3126         }
3127     }
3128
3129     return TRUE;
3130 }
3131
3132 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3133 {
3134     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3135
3136     if (bCutoffAllowed)
3137     {
3138         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3139     }
3140
3141     return bCutoffAllowed;
3142 }
3143
3144 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3145                               const t_commrec&                cr,
3146                               const gmx::DeviceStreamManager& deviceStreamManager,
3147                               gmx_wallcycle*                  wcycle)
3148 {
3149     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3150                        "Local non-bonded stream should be valid when using"
3151                        "GPU halo exchange.");
3152     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3153                        "Non-local non-bonded stream should be valid when using "
3154                        "GPU halo exchange.");
3155
3156     if (cr.dd->gpuHaloExchange[0].empty())
3157     {
3158         GMX_LOG(mdlog.warning)
3159                 .asParagraph()
3160                 .appendTextFormatted(
3161                         "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3162                         "by the "
3163                         "GMX_GPU_DD_COMMS environment variable.");
3164     }
3165
3166     for (int d = 0; d < cr.dd->ndim; d++)
3167     {
3168         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3169         {
3170             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3171                     cr.dd,
3172                     d,
3173                     cr.mpi_comm_mygroup,
3174                     deviceStreamManager.context(),
3175                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3176                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3177                     pulse,
3178                     wcycle));
3179         }
3180     }
3181 }
3182
3183 void reinitGpuHaloExchange(const t_commrec&              cr,
3184                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3185                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3186 {
3187     for (int d = 0; d < cr.dd->ndim; d++)
3188     {
3189         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3190         {
3191             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3192         }
3193     }
3194 }
3195
3196 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3197                                    const matrix          box,
3198                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3199 {
3200     for (int d = 0; d < cr.dd->ndim; d++)
3201     {
3202         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3203         {
3204             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3205         }
3206     }
3207 }
3208
3209 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3210 {
3211     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3212     {
3213         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3214         {
3215             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3216         }
3217     }
3218 }