[alexxy/gromacs.git] / src / gromacs / domdec / domdec.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38
39 #include "gmxpre.h"
40
41 #include "domdec.h"
42
43 #include "config.h"
44
45 #include <cassert>
46 #include <cinttypes>
47 #include <climits>
48 #include <cmath>
49 #include <cstdio>
50 #include <cstdlib>
51 #include <cstring>
52
53 #include <algorithm>
54 #include <memory>
55
56 #include "gromacs/domdec/builder.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/computemultibodycutoffs.h"
59 #include "gromacs/domdec/dlb.h"
60 #include "gromacs/domdec/dlbtiming.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/ga2la.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/options.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/domdec/reversetopology.h"
68 #include "gromacs/ewald/pme.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/hardware/hw_info.h"
74 #include "gromacs/math/vec.h"
75 #include "gromacs/math/vectypes.h"
76 #include "gromacs/mdlib/calc_verletbuf.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/constraintrange.h"
79 #include "gromacs/mdlib/updategroups.h"
80 #include "gromacs/mdlib/vcm.h"
81 #include "gromacs/mdlib/vsite.h"
82 #include "gromacs/mdrun/mdmodules.h"
83 #include "gromacs/mdtypes/commrec.h"
84 #include "gromacs/mdtypes/forceoutput.h"
85 #include "gromacs/mdtypes/inputrec.h"
86 #include "gromacs/mdtypes/mdrunoptions.h"
87 #include "gromacs/mdtypes/state.h"
88 #include "gromacs/pbcutil/ishift.h"
89 #include "gromacs/pbcutil/pbc.h"
90 #include "gromacs/pulling/pull.h"
91 #include "gromacs/timing/wallcycle.h"
92 #include "gromacs/topology/block.h"
93 #include "gromacs/topology/idef.h"
94 #include "gromacs/topology/ifunc.h"
95 #include "gromacs/topology/mtop_lookup.h"
96 #include "gromacs/topology/mtop_util.h"
97 #include "gromacs/topology/topology.h"
98 #include "gromacs/utility/basedefinitions.h"
99 #include "gromacs/utility/basenetwork.h"
100 #include "gromacs/utility/cstringutil.h"
101 #include "gromacs/utility/enumerationhelpers.h"
102 #include "gromacs/utility/exceptions.h"
103 #include "gromacs/utility/fatalerror.h"
104 #include "gromacs/utility/gmxmpi.h"
105 #include "gromacs/utility/logger.h"
106 #include "gromacs/utility/mdmodulesnotifiers.h"
107 #include "gromacs/utility/real.h"
108 #include "gromacs/utility/smalloc.h"
109 #include "gromacs/utility/strconvert.h"
110 #include "gromacs/utility/stringstream.h"
111 #include "gromacs/utility/stringutil.h"
112 #include "gromacs/utility/textwriter.h"
113
114 #include "atomdistribution.h"
115 #include "box.h"
116 #include "cellsizes.h"
117 #include "distribute.h"
118 #include "domdec_constraints.h"
119 #include "domdec_internal.h"
120 #include "domdec_setup.h"
121 #include "domdec_vsite.h"
122 #include "redistribute.h"
123 #include "utility.h"
124
125 // TODO remove this when moving domdec into gmx namespace
126 using gmx::ArrayRef;
127 using gmx::DdRankOrder;
128 using gmx::DlbOption;
129 using gmx::DomdecOptions;
130 using gmx::RangePartitioning;
131
132 static const char* enumValueToString(DlbState enumValue)
133 {
134     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
135         "off", "off", "auto", "locked", "on", "on"
136     };
137     return dlbStateNames[enumValue];
138 }
139
140 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
141 #define DD_CGIBS 2
142
143 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
144 #define DD_FLAG_NRCG 65535
145 #define DD_FLAG_FW(d) (1 << (16 + (d)*2))
146 #define DD_FLAG_BW(d) (1 << (16 + (d)*2 + 1))
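/* Bit layout of a cggl_flag entry (as used by the redistribution code): the
 * lower 16 bits hold the size of the atom group (masked with DD_FLAG_NRCG),
 * while bits 16 + 2*d and 16 + 2*d + 1 mark a move forward or backward along
 * decomposition dimension d. For example, DD_FLAG_FW(1) expands to 1 << 18
 * and DD_FLAG_BW(1) to 1 << 19.
 */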
147
148 /* The DD zone order */
149 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
150                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
151
152 /* The non-bonded zone-pair setup for domain decomposition
153  * The first number is the i-zone, the second number the first j-zone seen by
154  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
155  * As is, this is for 3D decomposition, where there are 4 i-zones.
156  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
157  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
158  */
159 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
160                                                                { 1, 3, 6 },
161                                                                { 2, 5, 6 },
162                                                                { 3, 5, 7 } };
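/* Worked example: with a 2D decomposition there are nzone = 4 zones and
 * 2 i-zones. Clamping the ranges above to nzone (as done in
 * setup_neighbor_relations below) gives i-zone 0 pairing with j-zones 0-3
 * and i-zone 1 pairing with j-zone 3 only.
 */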
163
164 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
165 {
166     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
167     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
168     xyz[ZZ] = ind % nc[ZZ];
169 }
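/* ddindex2xyz inverts the row-major (lexicographic) rank index computed by
 * dd_index(). Example: with nc = {2, 3, 4}, index 13 maps to xyz = {1, 0, 1},
 * since 13 = (1*3 + 0)*4 + 1.
 */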
170
171 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
172 {
173     int ddnodeid = -1;
174
175     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
176     const int                 ddindex   = dd_index(dd->numCells, c);
177     if (cartSetup.bCartesianPP_PME)
178     {
179         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
180     }
181     else if (cartSetup.bCartesianPP)
182     {
183 #if GMX_MPI
184         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
185 #endif
186     }
187     else
188     {
189         ddnodeid = ddindex;
190     }
191
192     return ddnodeid;
193 }
194
195 int ddglatnr(const gmx_domdec_t* dd, int i)
196 {
197     int atnr = 0;
198
199     if (dd == nullptr)
200     {
201         atnr = i + 1;
202     }
203     else
204     {
205         if (i >= dd->comm->atomRanges.numAtomsTotal())
206         {
207             gmx_fatal(FARGS,
208                       "glatnr called with %d, which is larger than the local number of atoms (%d)",
209                       i,
210                       dd->comm->atomRanges.numAtomsTotal());
211         }
212         atnr = dd->globalAtomIndices[i] + 1;
213     }
214
215     return atnr;
216 }
217
218 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
219 {
220     if (state->ddp_count != dd.ddp_count)
221     {
222         gmx_incons("The MD state does not match the domain decomposition state");
223     }
224
225     state->cg_gl.resize(dd.numHomeAtoms);
226     for (int i = 0; i < dd.numHomeAtoms; i++)
227     {
228         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
229     }
230
231     state->ddp_count_cg_gl = dd.ddp_count;
232 }
233
234 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
235 {
236     return &dd->comm->zones;
237 }
238
239 int dd_numAtomsZones(const gmx_domdec_t& dd)
240 {
241     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
242 }
243
244 int dd_numHomeAtoms(const gmx_domdec_t& dd)
245 {
246     return dd.comm->atomRanges.numHomeAtoms();
247 }
248
249 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
250 {
251     /* We currently set mdatoms entries for all atoms:
252      * local + non-local + communicated for vsite + constraints
253      */
254
255     return dd.comm->atomRanges.numAtomsTotal();
256 }
257
258 int dd_natoms_vsite(const gmx_domdec_t& dd)
259 {
260     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
261 }
262
263 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
264 {
265     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
266     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
267 }
268
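/* Communicates coordinates of home atoms to the neighboring domains that need
 * them as halo atoms. For each decomposition dimension the send buffer is
 * packed per pulse (applying the box shift, or the screw-PBC rotation, for
 * domains at the periodic boundary), communicated with ddSendrecv() in the
 * backward direction, and the received coordinates are appended after the
 * atoms already present, or scattered per zone when they cannot be received
 * in place.
 *
 * A rough sketch of how this pairs with dd_move_f() in the MD loop (the
 * actual call sites are in the force/update code, not in this file):
 *
 *   dd_move_x(dd, box, x, wcycle);                  // fill halo coordinates
 *   // ... compute forces on home + halo atoms ...
 *   dd_move_f(dd, &forceWithShiftForces, wcycle);   // reduce halo forces back
 */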
269 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
270 {
271     wallcycle_start(wcycle, WallCycleCounter::MoveX);
272
273     rvec shift = { 0, 0, 0 };
274
275     gmx_domdec_comm_t* comm = dd->comm;
276
277     int nzone   = 1;
278     int nat_tot = comm->atomRanges.numHomeAtoms();
279     for (int d = 0; d < dd->ndim; d++)
280     {
281         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
282         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
283         if (bPBC)
284         {
285             copy_rvec(box[dd->dim[d]], shift);
286         }
287         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
288         for (const gmx_domdec_ind_t& ind : cd->ind)
289         {
290             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
291             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
292             int                       n          = 0;
293             if (!bPBC)
294             {
295                 for (int j : ind.index)
296                 {
297                     sendBuffer[n] = x[j];
298                     n++;
299                 }
300             }
301             else if (!bScrew)
302             {
303                 for (int j : ind.index)
304                 {
305                     /* We need to shift the coordinates */
306                     for (int d = 0; d < DIM; d++)
307                     {
308                         sendBuffer[n][d] = x[j][d] + shift[d];
309                     }
310                     n++;
311                 }
312             }
313             else
314             {
315                 for (int j : ind.index)
316                 {
317                     /* Shift x */
318                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
319                     /* Rotate y and z.
320                      * This operation requires a special shift force
321                      * treatment, which is performed in calc_vir.
322                      */
323                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
324                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
325                     n++;
326                 }
327             }
328
329             DDBufferAccess<gmx::RVec> receiveBufferAccess(
330                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
331
332             gmx::ArrayRef<gmx::RVec> receiveBuffer;
333             if (cd->receiveInPlace)
334             {
335                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
336             }
337             else
338             {
339                 receiveBuffer = receiveBufferAccess.buffer;
340             }
341             /* Send and receive the coordinates */
342             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
343
344             if (!cd->receiveInPlace)
345             {
346                 int j = 0;
347                 for (int zone = 0; zone < nzone; zone++)
348                 {
349                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
350                     {
351                         x[i] = receiveBuffer[j++];
352                     }
353                 }
354             }
355             nat_tot += ind.nrecv[nzone + 1];
356         }
357         nzone += nzone;
358     }
359
360     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
361 }
362
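/* The reverse of dd_move_x(): forces computed on halo atoms are communicated
 * in the forward direction, traversing the dimensions and pulses in reverse
 * order, and added to the forces of the home atoms they belong to. When the
 * virial is needed, the same contributions are also accumulated into the
 * shift forces; with screw PBC the y and z components are sign-flipped to
 * undo the rotation applied in dd_move_x().
 */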
363 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
364 {
365     wallcycle_start(wcycle, WallCycleCounter::MoveF);
366
367     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
368     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
369
370     gmx_domdec_comm_t& comm    = *dd->comm;
371     int                nzone   = comm.zones.n / 2;
372     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
373     for (int d = dd->ndim - 1; d >= 0; d--)
374     {
375         /* Only forces in domains near the PBC boundaries need to
376            consider PBC in the treatment of fshift */
377         const bool shiftForcesNeedPbc =
378                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
379         const bool applyScrewPbc =
380                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
381         /* Determine which shift vector we need */
382         ivec vis        = { 0, 0, 0 };
383         vis[dd->dim[d]] = 1;
384         const int is    = gmx::ivecToShiftIndex(vis);
385
386         /* Loop over the pulses */
387         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
388         for (int p = cd.numPulses() - 1; p >= 0; p--)
389         {
390             const gmx_domdec_ind_t&   ind = cd.ind[p];
391             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
392             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
393
394             nat_tot -= ind.nrecv[nzone + 1];
395
396             DDBufferAccess<gmx::RVec> sendBufferAccess(
397                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
398
399             gmx::ArrayRef<gmx::RVec> sendBuffer;
400             if (cd.receiveInPlace)
401             {
402                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
403             }
404             else
405             {
406                 sendBuffer = sendBufferAccess.buffer;
407                 int j      = 0;
408                 for (int zone = 0; zone < nzone; zone++)
409                 {
410                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
411                     {
412                         sendBuffer[j++] = f[i];
413                     }
414                 }
415             }
416             /* Communicate the forces */
417             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
418             /* Add the received forces */
419             int n = 0;
420             if (!shiftForcesNeedPbc)
421             {
422                 for (int j : ind.index)
423                 {
424                     for (int d = 0; d < DIM; d++)
425                     {
426                         f[j][d] += receiveBuffer[n][d];
427                     }
428                     n++;
429                 }
430             }
431             else if (!applyScrewPbc)
432             {
433                 for (int j : ind.index)
434                 {
435                     for (int d = 0; d < DIM; d++)
436                     {
437                         f[j][d] += receiveBuffer[n][d];
438                     }
439                     /* Add this force to the shift force */
440                     for (int d = 0; d < DIM; d++)
441                     {
442                         fshift[is][d] += receiveBuffer[n][d];
443                     }
444                     n++;
445                 }
446             }
447             else
448             {
449                 for (int j : ind.index)
450                 {
451                     /* Rotate the force */
452                     f[j][XX] += receiveBuffer[n][XX];
453                     f[j][YY] -= receiveBuffer[n][YY];
454                     f[j][ZZ] -= receiveBuffer[n][ZZ];
455                     if (shiftForcesNeedPbc)
456                     {
457                         /* Add this force to the shift force */
458                         for (int d = 0; d < DIM; d++)
459                         {
460                             fshift[is][d] += receiveBuffer[n][d];
461                         }
462                     }
463                     n++;
464                 }
465             }
466         }
467         nzone /= 2;
468     }
469     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
470 }
471
472 /* Convenience function for extracting a real buffer from an rvec buffer
473  *
474  * To reduce the number of temporary communication buffers and avoid
475  * cache pollution, we reuse gmx::RVec buffers for storing reals.
476  * This function returns a real buffer reference with the same number
477  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
478  */
479 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
480 {
481     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
482 }
483
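/* Scalar analogues of dd_move_x()/dd_move_f(): dd_atom_spread_real()
 * communicates one real value per home atom to the halo atoms, and
 * dd_atom_sum_real() sums the halo contributions back onto the home atoms.
 * Both reuse the RVec communication buffers via
 * realArrayRefFromRvecArrayRef() above.
 */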
484 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
485 {
486     gmx_domdec_comm_t* comm = dd->comm;
487
488     int nzone   = 1;
489     int nat_tot = comm->atomRanges.numHomeAtoms();
490     for (int d = 0; d < dd->ndim; d++)
491     {
492         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
493         for (const gmx_domdec_ind_t& ind : cd->ind)
494         {
495             /* Note: We provision for RVec instead of real, so a factor of 3
496              * more than needed. The buffer actually already has this size
497              * and we pass a plain pointer below, so this does not matter.
498              */
499             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
500             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
501             int                 n          = 0;
502             for (int j : ind.index)
503             {
504                 sendBuffer[n++] = v[j];
505             }
506
507             DDBufferAccess<gmx::RVec> receiveBufferAccess(
508                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
509
510             gmx::ArrayRef<real> receiveBuffer;
511             if (cd->receiveInPlace)
512             {
513                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
514             }
515             else
516             {
517                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
518             }
519             /* Send and receive the data */
520             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
521             if (!cd->receiveInPlace)
522             {
523                 int j = 0;
524                 for (int zone = 0; zone < nzone; zone++)
525                 {
526                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
527                     {
528                         v[i] = receiveBuffer[j++];
529                     }
530                 }
531             }
532             nat_tot += ind.nrecv[nzone + 1];
533         }
534         nzone += nzone;
535     }
536 }
537
538 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
539 {
540     gmx_domdec_comm_t* comm = dd->comm;
541
542     int nzone   = comm->zones.n / 2;
543     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
544     for (int d = dd->ndim - 1; d >= 0; d--)
545     {
546         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
547         for (int p = cd->numPulses() - 1; p >= 0; p--)
548         {
549             const gmx_domdec_ind_t& ind = cd->ind[p];
550
551             /* Note: We provision for RVec instead of real, so a factor of 3
552              * more than needed. The buffer actually already has this size
553              * and we typecast, so this works as intended.
554              */
555             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
556             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
557             nat_tot -= ind.nrecv[nzone + 1];
558
559             DDBufferAccess<gmx::RVec> sendBufferAccess(
560                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
561
562             gmx::ArrayRef<real> sendBuffer;
563             if (cd->receiveInPlace)
564             {
565                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
566             }
567             else
568             {
569                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
570                 int j      = 0;
571                 for (int zone = 0; zone < nzone; zone++)
572                 {
573                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
574                     {
575                         sendBuffer[j++] = v[i];
576                     }
577                 }
578             }
579             /* Communicate the forces */
580             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
581             /* Add the received forces */
582             int n = 0;
583             for (int j : ind.index)
584             {
585                 v[j] += receiveBuffer[n];
586                 n++;
587             }
588         }
589         nzone /= 2;
590     }
591 }
592
593 real dd_cutoff_multibody(const gmx_domdec_t* dd)
594 {
595     const gmx_domdec_comm_t& comm       = *dd->comm;
596     const DDSystemInfo&      systemInfo = comm.systemInfo;
597
598     real r = -1;
599     if (systemInfo.haveInterDomainMultiBodyBondeds)
600     {
601         if (comm.cutoff_mbody > 0)
602         {
603             r = comm.cutoff_mbody;
604         }
605         else
606         {
607             /* cutoff_mbody=0 means we do not have DLB */
608             r = comm.cellsize_min[dd->dim[0]];
609             for (int di = 1; di < dd->ndim; di++)
610             {
611                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
612             }
613             if (comm.systemInfo.filterBondedCommunication)
614             {
615                 r = std::max(r, comm.cutoff_mbody);
616             }
617             else
618             {
619                 r = std::min(r, systemInfo.cutoff);
620             }
621         }
622     }
623
624     return r;
625 }
626
627 real dd_cutoff_twobody(const gmx_domdec_t* dd)
628 {
629     const real r_mb = dd_cutoff_multibody(dd);
630
631     return std::max(dd->comm->systemInfo.cutoff, r_mb);
632 }
633
634
635 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
636                                    const CartesianRankSetup& cartSetup,
637                                    const ivec                coord,
638                                    ivec                      coord_pme)
639 {
640     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
641     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
642     copy_ivec(coord, coord_pme);
643     coord_pme[cartSetup.cartpmedim] =
644             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
645 }
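/* Example: with 4 PP cells and 2 PME-only ranks stacked along cartpmedim
 * (nc = 4, ntot = 6), PP coordinates 0 and 1 map to PME coordinate 4 and
 * PP coordinates 2 and 3 map to PME coordinate 5, i.e. 4 + (c*2 + 1)/4.
 */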
646
647 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
648 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
649 {
650     const int npp  = ddRankSetup.numPPRanks;
651     const int npme = ddRankSetup.numRanksDoingPme;
652
653     /* Here we assign a PME node to communicate with this DD node
654      * by assuming that the major index of both is x.
655      * We add npme/2 to obtain an even distribution.
656      */
657     return (ddCellIndex * npme + npme / 2) / npp;
658 }
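/* Example: with npp = 6 and npme = 2, PP cells 0-2 map to PME index 0 and
 * PP cells 3-5 map to PME index 1, since (i*2 + 1)/6 is 0 for i < 3 and
 * 1 for i >= 3.
 */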
659
660 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
661 {
662     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
663
664     int n = 0;
665     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
666     {
667         const int p0 = ddindex2pmeindex(ddRankSetup, i);
668         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
669         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
670         {
671             if (debug)
672             {
673                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
674             }
675             pmeRanks[n] = i + 1 + n;
676             n++;
677         }
678     }
679
680     return pmeRanks;
681 }
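/* Example: for the npp = 6, npme = 2 case above, interleaving yields PME
 * ranks at global ranks 3 and 7, i.e. the rank order PP PP PP PME PP PP PP
 * PME, because a PME rank is inserted after the last PP rank of each group
 * of PP cells that shares the same PME index.
 */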
682
683 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
684 {
685     ivec coords;
686
687     coords[XX]     = x;
688     coords[YY]     = y;
689     coords[ZZ]     = z;
690     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
691
692     return slab;
693 }
694
695 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
696 {
697     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
698     ivec                      coords    = { x, y, z };
699     int                       nodeid    = -1;
700
701     if (cartSetup.bCartesianPP_PME)
702     {
703 #if GMX_MPI
704         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
705 #endif
706     }
707     else
708     {
709         int ddindex = dd_index(cr->dd->numCells, coords);
710         if (cartSetup.bCartesianPP)
711         {
712             nodeid = cartSetup.ddindex2simnodeid[ddindex];
713         }
714         else
715         {
716             if (cr->dd->comm->ddRankSetup.usePmeOnlyRanks)
717             {
718                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
719             }
720             else
721             {
722                 nodeid = ddindex;
723             }
724         }
725     }
726
727     return nodeid;
728 }
729
730 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
731                               const CartesianRankSetup& cartSetup,
732                               gmx::ArrayRef<const int>  pmeRanks,
733                               const t_commrec gmx_unused* cr,
734                               const int                   sim_nodeid)
735 {
736     int pmenode = -1;
737
738     /* This assumes a uniform x domain decomposition grid cell size */
739     if (cartSetup.bCartesianPP_PME)
740     {
741 #if GMX_MPI
742         ivec coord, coord_pme;
743         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
744         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
745         {
746             /* This is a PP rank */
747             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
748             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
749         }
750 #endif
751     }
752     else if (cartSetup.bCartesianPP)
753     {
754         if (sim_nodeid < ddRankSetup.numPPRanks)
755         {
756             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
757         }
758     }
759     else
760     {
761         /* This assumes DD cells with identical x coordinates
762          * are numbered sequentially.
763          */
764         if (pmeRanks.empty())
765         {
766             if (sim_nodeid < ddRankSetup.numPPRanks)
767             {
768                 /* The DD index equals the nodeid */
769                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
770             }
771         }
772         else
773         {
774             int i = 0;
775             while (sim_nodeid > pmeRanks[i])
776             {
777                 i++;
778             }
779             if (sim_nodeid < pmeRanks[i])
780             {
781                 pmenode = pmeRanks[i];
782             }
783         }
784     }
785
786     return pmenode;
787 }
788
789 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
790 {
791     if (dd != nullptr)
792     {
793         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
794     }
795     else
796     {
797         return { 1, 1 };
798     }
799 }
800
801 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
802 {
803     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
804     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
805     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
806                        "This function should only be called when PME-only ranks are in use");
807     std::vector<int> ddranks;
808     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
809
810     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
811     {
812         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
813         {
814             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
815             {
816                 if (cartSetup.bCartesianPP_PME)
817                 {
818                     ivec coord = { x, y, z };
819                     ivec coord_pme;
820                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
821                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
822                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
823                     {
824                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
825                     }
826                 }
827                 else
828                 {
829                     /* The slab corresponds to the nodeid in the PME group */
830                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
831                     {
832                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
833                     }
834                 }
835             }
836         }
837     }
838     return ddranks;
839 }
840
841 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
842 {
843     bool bReceive = true;
844
845     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
846     if (ddRankSetup.usePmeOnlyRanks)
847     {
848         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
849         if (cartSetup.bCartesianPP_PME)
850         {
851 #if GMX_MPI
852             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
853             ivec coords;
854             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
855             coords[cartSetup.cartpmedim]++;
856             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
857             {
858                 int rank = 0;
859                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
860                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
861                 {
862                     /* This is not the last PP node for pmenode */
863                     bReceive = FALSE;
864                 }
865             }
866 #else
867             GMX_RELEASE_ASSERT(
868                     false,
869                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
870 #endif
871         }
872         else
873         {
874             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
875             if (cr->sim_nodeid + 1 < cr->nnodes
876                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
877             {
878                 /* This is not the last PP node for pmenode */
879                 bReceive = FALSE;
880             }
881         }
882     }
883
884     return bReceive;
885 }
886
887 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
888 {
889     gmx_domdec_comm_t* comm = dd->comm;
890
891     snew(*dim_f, dd->numCells[dim] + 1);
892     (*dim_f)[0] = 0;
893     for (int i = 1; i < dd->numCells[dim]; i++)
894     {
895         if (comm->slb_frac[dim])
896         {
897             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
898         }
899         else
900         {
901             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
902         }
903     }
904     (*dim_f)[dd->numCells[dim]] = 1;
905 }
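/* Builds the cumulative cell-boundary fractions along dim that are used for
 * the PME slab setup: with user-supplied static load balancing fractions
 * (slb_frac) the boundaries follow those fractions, otherwise the cells are
 * uniform, e.g. 0, 0.25, 0.5, 0.75, 1 for four cells.
 */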
906
907 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
908 {
909     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
910
911     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
912     {
913         ddpme->dim = YY;
914     }
915     else
916     {
917         ddpme->dim = dimind;
918     }
919     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
920
921     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
922
923     if (ddpme->nslab <= 1)
924     {
925         return;
926     }
927
928     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
929     /* Determine for each PME slab the PP location range for dimension dim */
930     snew(ddpme->pp_min, ddpme->nslab);
931     snew(ddpme->pp_max, ddpme->nslab);
932     for (int slab = 0; slab < ddpme->nslab; slab++)
933     {
934         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
935         ddpme->pp_max[slab] = 0;
936     }
937     for (int i = 0; i < dd->nnodes; i++)
938     {
939         ivec xyz;
940         ddindex2xyz(dd->numCells, i, xyz);
941         /* For y only use our y/z slab.
942          * This assumes that the PME x grid size matches the DD grid size.
943          */
944         if (dimind == 0 || xyz[XX] == dd->ci[XX])
945         {
946             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
947             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
948             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
949             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
950         }
951     }
952
953     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
954 }
955
956 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
957 {
958     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
959
960     if (ddRankSetup.ddpme[0].dim == XX)
961     {
962         return ddRankSetup.ddpme[0].maxshift;
963     }
964     else
965     {
966         return 0;
967     }
968 }
969
970 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
971 {
972     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
973
974     if (ddRankSetup.ddpme[0].dim == YY)
975     {
976         return ddRankSetup.ddpme[0].maxshift;
977     }
978     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
979     {
980         return ddRankSetup.ddpme[1].maxshift;
981     }
982     else
983     {
984         return 0;
985     }
986 }
987
988 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
989 {
990     return dd.comm->systemInfo.useUpdateGroups;
991 }
992
993 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
994 {
995     /* Note that the cycles value can be incorrect, either 0 or some
996      * extremely large value, when our thread migrated to another core
997      * with an unsynchronized cycle counter. If this happens less often
998      * than once per nstlist steps, this will not cause issues, since
999      * we later subtract the maximum value from the sum over nstlist steps.
1000      * A zero count will slightly lower the total, but that's a small effect.
1001      * Note that the main purpose of the subtraction of the maximum value
1002      * is to avoid throwing off the load balancing when stalls occur due
1003      * to e.g. system activity or network congestion.
1004      */
1005     dd->comm->cycl[ddCycl] += cycles;
1006     dd->comm->cycl_n[ddCycl]++;
1007     if (cycles > dd->comm->cycl_max[ddCycl])
1008     {
1009         dd->comm->cycl_max[ddCycl] = cycles;
1010     }
1011 }
1012
1013 #if GMX_MPI
1014 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1015 {
1016     MPI_Comm c_row = MPI_COMM_NULL;
1017     ivec     loc_c;
1018     bool     bPartOfGroup = false;
1019
1020     const int dim = dd->dim[dim_ind];
1021     copy_ivec(loc, loc_c);
1022     for (int i = 0; i < dd->numCells[dim]; i++)
1023     {
1024         loc_c[dim]     = i;
1025         const int rank = dd_index(dd->numCells, loc_c);
1026         if (rank == dd->rank)
1027         {
1028             /* This process is part of the group */
1029             bPartOfGroup = TRUE;
1030         }
1031     }
1032     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1033     if (bPartOfGroup)
1034     {
1035         dd->comm->mpi_comm_load[dim_ind] = c_row;
1036         if (!isDlbDisabled(dd->comm))
1037         {
1038             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1039
1040             if (dd->ci[dim] == dd->master_ci[dim])
1041             {
1042                 /* This is the root process of this row */
1043                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1044
1045                 RowMaster& rowMaster = *cellsizes.rowMaster;
1046                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1047                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1048                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1049                 if (dim_ind > 0)
1050                 {
1051                     rowMaster.bounds.resize(dd->numCells[dim]);
1052                 }
1053                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1054             }
1055             else
1056             {
1057                 /* This is not a root process, we only need to receive cell_f */
1058                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1059             }
1060         }
1061         if (dd->ci[dim] == dd->master_ci[dim])
1062         {
1063             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1064         }
1065     }
1066 }
1067 #endif
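/* Note on the load communicators created above: for each decomposition
 * dimension, the ranks that differ only in their cell index along that
 * dimension form one "row". MPI_Comm_split() gives each such row its own
 * communicator, and the row root (the rank whose cell index matches
 * master_ci along that dimension) additionally allocates the buffers used to
 * collect cell fractions and load measurements for dynamic load balancing.
 */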
1068
1069 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1070 {
1071 #if GMX_MPI
1072     MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
1073
1074     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1075     {
1076         /* Only ranks with short-ranged tasks (currently) use GPUs.
1077          * If we don't have GPUs assigned, there are no resources to share.
1078          */
1079         return;
1080     }
1081
1082     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1083
1084     gmx_domdec_t* dd = cr->dd;
1085
1086     if (debug)
1087     {
1088         fprintf(debug, "dd_setup_dlb_resource_sharing:\n");
1089         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1090     }
1091     /* Split the PP communicator over the physical nodes */
1092     /* TODO: See if we should store this (before), as it's also used for
1093      * the nodecomm summation.
1094      */
1095     // TODO PhysicalNodeCommunicator could be extended/used to handle
1096     // the need for per-node per-group communicators.
1097     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1098     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1099     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1100     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1101
1102     if (debug)
1103     {
1104         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1105     }
1106
1107     /* Note that some ranks could share a GPU, while others don't */
1108
1109     if (dd->comm->nrank_gpu_shared == 1)
1110     {
1111         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1112     }
1113 #else
1114     GMX_UNUSED_VALUE(cr);
1115     GMX_UNUSED_VALUE(gpu_id);
1116 #endif
1117 }
1118
1119 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1120 {
1121 #if GMX_MPI
1122     ivec loc;
1123
1124     if (debug)
1125     {
1126         fprintf(debug, "Making load communicators\n");
1127     }
1128
1129     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1130     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1131
1132     if (dd->ndim == 0)
1133     {
1134         return;
1135     }
1136
1137     clear_ivec(loc);
1138     make_load_communicator(dd, 0, loc);
1139     if (dd->ndim > 1)
1140     {
1141         const int dim0 = dd->dim[0];
1142         for (int i = 0; i < dd->numCells[dim0]; i++)
1143         {
1144             loc[dim0] = i;
1145             make_load_communicator(dd, 1, loc);
1146         }
1147     }
1148     if (dd->ndim > 2)
1149     {
1150         const int dim0 = dd->dim[0];
1151         for (int i = 0; i < dd->numCells[dim0]; i++)
1152         {
1153             loc[dim0]      = i;
1154             const int dim1 = dd->dim[1];
1155             for (int j = 0; j < dd->numCells[dim1]; j++)
1156             {
1157                 loc[dim1] = j;
1158                 make_load_communicator(dd, 2, loc);
1159             }
1160         }
1161     }
1162
1163     if (debug)
1164     {
1165         fprintf(debug, "Finished making load communicators\n");
1166     }
1167 #endif
1168 }
1169
1170 /*! \brief Sets up the relation between neighboring domains and zones */
1171 static void setup_neighbor_relations(gmx_domdec_t* dd)
1172 {
1173     ivec tmp, s;
1174     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1175
1176     for (int d = 0; d < dd->ndim; d++)
1177     {
1178         const int dim = dd->dim[d];
1179         copy_ivec(dd->ci, tmp);
1180         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1181         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1182         copy_ivec(dd->ci, tmp);
1183         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1184         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1185         if (debug)
1186         {
1187             fprintf(debug,
1188                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1189                     dd->rank,
1190                     dim,
1191                     dd->neighbor[d][0],
1192                     dd->neighbor[d][1]);
1193         }
1194     }
1195
1196     int nzone  = (1 << dd->ndim);
1197     int nizone = (1 << std::max(dd->ndim - 1, 0));
1198     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
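    /* Example: a 2D decomposition gives nzone = 4 communication zones and
     * nizone = 2 i-zones, matching the clipped ranges of
     * ddNonbondedZonePairRanges described at the top of this file.
     */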
1199
1200     gmx_domdec_zones_t* zones = &dd->comm->zones;
1201
1202     for (int i = 0; i < nzone; i++)
1203     {
1204         int m = 0;
1205         clear_ivec(zones->shift[i]);
1206         for (int d = 0; d < dd->ndim; d++)
1207         {
1208             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1209         }
1210     }
1211
1212     zones->n = nzone;
1213     for (int i = 0; i < nzone; i++)
1214     {
1215         for (int d = 0; d < DIM; d++)
1216         {
1217             s[d] = dd->ci[d] - zones->shift[i][d];
1218             if (s[d] < 0)
1219             {
1220                 s[d] += dd->numCells[d];
1221             }
1222             else if (s[d] >= dd->numCells[d])
1223             {
1224                 s[d] -= dd->numCells[d];
1225             }
1226         }
1227     }
1228     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1229     {
1230         GMX_RELEASE_ASSERT(
1231                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1232                 "The first element for each ddNonbondedZonePairRanges should match its index");
1233
1234         DDPairInteractionRanges iZone;
1235         iZone.iZoneIndex = iZoneIndex;
1236         /* ddNonbondedZonePairRanges is set up for 3D decomposition; with fewer
1237          * dimensions we use only j-zones up to nzone.
1238          */
1239         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1240                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1241         for (int dim = 0; dim < DIM; dim++)
1242         {
1243             if (dd->numCells[dim] == 1)
1244             {
1245                 /* All shifts should be allowed */
1246                 iZone.shift0[dim] = -1;
1247                 iZone.shift1[dim] = 1;
1248             }
1249             else
1250             {
1251                 /* Determine the min/max j-zone shift wrt the i-zone */
1252                 iZone.shift0[dim] = 1;
1253                 iZone.shift1[dim] = -1;
1254                 for (int jZone : iZone.jZoneRange)
1255                 {
1256                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1257                     if (shift_diff < iZone.shift0[dim])
1258                     {
1259                         iZone.shift0[dim] = shift_diff;
1260                     }
1261                     if (shift_diff > iZone.shift1[dim])
1262                     {
1263                         iZone.shift1[dim] = shift_diff;
1264                     }
1265                 }
1266             }
1267         }
1268
1269         zones->iZones.push_back(iZone);
1270     }
1271
1272     if (!isDlbDisabled(dd->comm))
1273     {
1274         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1275     }
1276
1277     if (dd->comm->ddSettings.recordLoad)
1278     {
1279         make_load_communicators(dd);
1280     }
1281 }
1282
1283 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1284                                  gmx_domdec_t*        dd,
1285                                  t_commrec gmx_unused* cr,
1286                                  bool gmx_unused       reorder)
1287 {
1288 #if GMX_MPI
1289     gmx_domdec_comm_t*  comm      = dd->comm;
1290     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1291
1292     if (cartSetup.bCartesianPP)
1293     {
1294         /* Set up cartesian communication for the particle-particle part */
1295         GMX_LOG(mdlog.info)
1296                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1297                                      dd->numCells[XX],
1298                                      dd->numCells[YY],
1299                                      dd->numCells[ZZ]);
1300
1301         ivec periods;
1302         for (int i = 0; i < DIM; i++)
1303         {
1304             periods[i] = TRUE;
1305         }
1306         MPI_Comm comm_cart = MPI_COMM_NULL;
1307         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1308         /* We overwrite the old communicator with the new cartesian one */
1309         cr->mpi_comm_mygroup = comm_cart;
1310     }
1311
1312     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1313     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1314
1315     if (cartSetup.bCartesianPP_PME)
1316     {
1317         /* Since we want to use the original cartesian setup for sim,
1318          * and not the one after split, we need to make an index.
1319          */
1320         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1321         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1322         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1323         /* Get the rank of the DD master,
1324          * above we made sure that the master node is a PP node.
1325          */
1326         int rank = MASTER(cr) ? dd->rank : 0;
1327         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1328     }
1329     else if (cartSetup.bCartesianPP)
1330     {
1331         if (!comm->ddRankSetup.usePmeOnlyRanks)
1332         {
1333             /* The PP communicator is also
1334              * the communicator for this simulation
1335              */
1336             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1337         }
1338         cr->nodeid = dd->rank;
1339
1340         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1341
1342         /* We need to make an index to go from the coordinates
1343          * to the nodeid of this simulation.
1344          */
1345         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1346         std::vector<int> buf(dd->nnodes);
1347         if (thisRankHasDuty(cr, DUTY_PP))
1348         {
1349             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1350         }
1351         /* Communicate the ddindex to simulation nodeid index */
1352         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1353
1354         /* Determine the master coordinates and rank.
1355          * The DD master should be the same node as the master of this sim.
1356          */
1357         for (int i = 0; i < dd->nnodes; i++)
1358         {
1359             if (cartSetup.ddindex2simnodeid[i] == 0)
1360             {
1361                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1362                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1363             }
1364         }
1365         if (debug)
1366         {
1367             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1368         }
1369     }
1370     else
1371     {
1372         /* No Cartesian communicators */
1373         /* We use the rank in dd->comm->all as DD index */
1374         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1375         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1376         dd->masterrank = 0;
1377         clear_ivec(dd->master_ci);
1378     }
1379 #endif
1380
1381     GMX_LOG(mdlog.info)
1382             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1383                                  dd->rank,
1384                                  dd->ci[XX],
1385                                  dd->ci[YY],
1386                                  dd->ci[ZZ]);
1387     if (debug)
1388     {
1389         fprintf(debug,
1390                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1391                 dd->rank,
1392                 dd->ci[XX],
1393                 dd->ci[YY],
1394                 dd->ci[ZZ]);
1395     }
1396 }
1397
1398 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1399 {
1400 #if GMX_MPI
1401     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1402
1403     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1404     {
1405         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1406         std::vector<int> buf(dd->nnodes);
1407         if (thisRankHasDuty(cr, DUTY_PP))
1408         {
1409             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1410         }
1411         /* Communicate the ddindex to simulation nodeid index */
1412         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1413     }
1414 #else
1415     GMX_UNUSED_VALUE(dd);
1416     GMX_UNUSED_VALUE(cr);
1417 #endif
1418 }
1419
1420 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1421                                              t_commrec*           cr,
1422                                              const DdRankOrder    ddRankOrder,
1423                                              bool gmx_unused      reorder,
1424                                              const DDRankSetup&   ddRankSetup,
1425                                              ivec                 ddCellIndex,
1426                                              std::vector<int>*    pmeRanks)
1427 {
1428     CartesianRankSetup cartSetup;
1429
1430     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1431     cartSetup.bCartesianPP_PME = false;
1432
1433     const ivec& numDDCells = ddRankSetup.numPPCells;
1434     /* Initially we set ntot to the number of PP cells */
1435     copy_ivec(numDDCells, cartSetup.ntot);
1436
1437     if (cartSetup.bCartesianPP)
1438     {
1439         const int numDDCellsTot = ddRankSetup.numPPRanks;
1440         bool      bDiv[DIM];
1441         for (int i = 1; i < DIM; i++)
1442         {
1443             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1444         }
1445         if (bDiv[YY] || bDiv[ZZ])
1446         {
1447             cartSetup.bCartesianPP_PME = TRUE;
1448             /* If we have 2D PME decomposition, which is always in x+y,
1449              * we stack the PME only nodes in z.
1450              * Otherwise we choose the direction that provides the thinnest slab
1451              * of PME only nodes as this will have the least effect
1452              * on the PP communication.
1453              * But for the PME communication the opposite might be better.
1454              */
1455             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1456             {
1457                 cartSetup.cartpmedim = ZZ;
1458             }
1459             else
1460             {
1461                 cartSetup.cartpmedim = YY;
1462             }
1463             cartSetup.ntot[cartSetup.cartpmedim] +=
1464                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1465         }
1466         else
1467         {
1468             GMX_LOG(mdlog.info)
1469                     .appendTextFormatted(
1470                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1471                             "nx*nz (%d*%d)",
1472                             ddRankSetup.numRanksDoingPme,
1473                             numDDCells[XX],
1474                             numDDCells[YY],
1475                             numDDCells[XX],
1476                             numDDCells[ZZ]);
1477             GMX_LOG(mdlog.info)
1478                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1479         }
1480     }
1481
1482     if (cartSetup.bCartesianPP_PME)
1483     {
1484 #if GMX_MPI
1485         int  rank = 0;
1486         ivec periods;
1487
1488         GMX_LOG(mdlog.info)
1489                 .appendTextFormatted(
1490                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1491                         cartSetup.ntot[XX],
1492                         cartSetup.ntot[YY],
1493                         cartSetup.ntot[ZZ]);
1494
1495         for (int i = 0; i < DIM; i++)
1496         {
1497             periods[i] = TRUE;
1498         }
1499         MPI_Comm comm_cart = MPI_COMM_NULL;
1500         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1501         MPI_Comm_rank(comm_cart, &rank);
1502         if (MASTER(cr) && rank != 0)
1503         {
1504             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1505         }
1506
1507         /* With this assignment we lose the link to the original communicator,
1508          * which will usually be MPI_COMM_WORLD, unless we have multisim.
1509          */
1510         cr->mpi_comm_mysim = comm_cart;
1511         cr->sim_nodeid     = rank;
1512
1513         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1514
1515         GMX_LOG(mdlog.info)
1516                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1517                                      cr->sim_nodeid,
1518                                      ddCellIndex[XX],
1519                                      ddCellIndex[YY],
1520                                      ddCellIndex[ZZ]);
1521
1522         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1523         {
1524             cr->duty = DUTY_PP;
1525         }
1526         if (!ddRankSetup.usePmeOnlyRanks
1527             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1528         {
1529             cr->duty = DUTY_PME;
1530         }
1531
1532         /* Split the sim communicator into PP and PME only nodes */
1533         MPI_Comm_split(cr->mpi_comm_mysim,
1534                        getThisRankDuties(cr),
1535                        dd_index(cartSetup.ntot, ddCellIndex),
1536                        &cr->mpi_comm_mygroup);
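             /* Note (added explanation): the rank's duty acts as the split color,
              * so all PP ranks end up in one group communicator and all PME-only
              * ranks in another, while dd_index over the Cartesian coordinates is
              * the key, keeping the group ranks ordered by their DD grid position.
              */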
1537 #else
1538         GMX_UNUSED_VALUE(ddCellIndex);
1539 #endif
1540     }
1541     else
1542     {
1543         switch (ddRankOrder)
1544         {
1545             case DdRankOrder::pp_pme:
1546                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1547                 break;
1548             case DdRankOrder::interleave:
1549                 /* Interleave the PP-only and PME-only ranks */
1550                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1551                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1552                 break;
1553             case DdRankOrder::cartesian: break;
1554             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1555         }
1556
1557         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1558         {
1559             cr->duty = DUTY_PME;
1560         }
1561         else
1562         {
1563             cr->duty = DUTY_PP;
1564         }
1565 #if GMX_MPI
1566         /* Split the sim communicator into PP and PME only nodes */
1567         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1568         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1569 #endif
1570     }
1571
1572     GMX_LOG(mdlog.info)
1573             .appendTextFormatted("This rank does only %s work.\n",
1574                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1575
1576     return cartSetup;
1577 }
1578
1579 /*! \brief Makes the PP communicator and the PME communicator, when needed
1580  *
1581  * Returns the Cartesian rank setup.
1582  * Sets \p cr->mpi_comm_mygroup
1583  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1584  * With separate PME ranks in interleaved order, sets the PME ranks in \p pmeRanks.
1585  */
1586 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1587                                                  const DDSettings&    ddSettings,
1588                                                  const DdRankOrder    ddRankOrder,
1589                                                  const DDRankSetup&   ddRankSetup,
1590                                                  t_commrec*           cr,
1591                                                  ivec                 ddCellIndex,
1592                                                  std::vector<int>*    pmeRanks)
1593 {
1594     CartesianRankSetup cartSetup;
1595
1596     // As a default, both group and sim communicators are equal to the default communicator
1597     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1598     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1599     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1600     cr->nodeid           = cr->rankInDefaultCommunicator;
1601     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1602
1603     if (ddRankSetup.usePmeOnlyRanks)
1604     {
1605         /* Split the communicator into a PP and PME part */
1606         cartSetup = split_communicator(
1607                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1608     }
1609     else
1610     {
1611         /* All nodes do PP and PME */
1612         /* We do not require separate communicators */
1613         cartSetup.bCartesianPP     = false;
1614         cartSetup.bCartesianPP_PME = false;
1615     }
1616
1617     return cartSetup;
1618 }
1619
1620 /*! \brief For PP ranks, sets or makes the communicator
1621  *
1622  * For PME-only ranks, receives the DD index to simulation rank id mapping.
1623  * For PP-only ranks, sets the PME-only rank to communicate with.
1624  */
1625 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1626                                     const DDSettings&        ddSettings,
1627                                     gmx::ArrayRef<const int> pmeRanks,
1628                                     t_commrec*               cr,
1629                                     const int                numAtomsInSystem,
1630                                     gmx_domdec_t*            dd)
1631 {
1632     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1633     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1634
1635     if (thisRankHasDuty(cr, DUTY_PP))
1636     {
1637         /* Copy or make a new PP communicator */
1638
1639         /* We (possibly) reordered the nodes in split_communicator,
1640          * so it is no longer required in make_pp_communicator.
1641          */
1642         const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1643
1644         make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1645     }
1646     else
1647     {
1648         receive_ddindex2simnodeid(dd, cr);
1649     }
1650
1651     if (!thisRankHasDuty(cr, DUTY_PME))
1652     {
1653         /* Set up the communication to our PME node */
1654         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1655         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1656         if (debug)
1657         {
1658             fprintf(debug,
1659                     "My pme_nodeid %d receive ener %s\n",
1660                     dd->pme_nodeid,
1661                     gmx::boolToString(dd->pme_receive_vir_ener));
1662         }
1663     }
1664     else
1665     {
1666         dd->pme_nodeid = -1;
1667     }
1668
1669     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1670     if (MASTER(cr))
1671     {
1672         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1673     }
1674 }
1675
1676 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1677 {
1678     real* slb_frac = nullptr;
1679     if (nc > 1 && size_string != nullptr)
1680     {
1681         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
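             /* Illustrative example (not from the original source): with nc = 3
              * and size_string = "1 2 1", the values 1, 2 and 1 are read below
              * and then normalized to relative cell sizes 0.25, 0.50 and 0.25.
              */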
1682         snew(slb_frac, nc);
1683         real tot = 0;
1684         for (int i = 0; i < nc; i++)
1685         {
1686             double dbl = 0;
1687             int    n   = 0;
1688             sscanf(size_string, "%20lf%n", &dbl, &n);
1689             if (dbl == 0)
1690             {
1691                 gmx_fatal(FARGS,
1692                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1693                           dir,
1694                           size_string);
1695             }
1696             slb_frac[i] = dbl;
1697             size_string += n;
1698             tot += slb_frac[i];
1699         }
1700         /* Normalize */
1701         std::string relativeCellSizes = "Relative cell sizes:";
1702         for (int i = 0; i < nc; i++)
1703         {
1704             slb_frac[i] /= tot;
1705             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1706         }
1707         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1708     }
1709
1710     return slb_frac;
1711 }
1712
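     /*! \brief Returns the number of bonded interactions that involve more than
      * two atoms (NRAL > 2), such as angles and dihedrals, summed over all molecules.
      */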
1713 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1714 {
1715     int n = 0;
1716     for (const auto ilists : IListRange(mtop))
1717     {
1718         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1719         {
1720             if (NRAL(ilist.functionType) > 2)
1721             {
1722                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1723             }
1724         }
1725     }
1726
1727     return n;
1728 }
1729
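     /*! \brief Returns the integer value of environment variable \p env_var, or \p def
      * when it is not set; a set value that cannot be parsed as an integer yields 1.
      */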
1730 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1731 {
1732     int   nst = def;
1733     char* val = getenv(env_var);
1734     if (val)
1735     {
1736         if (sscanf(val, "%20d", &nst) <= 0)
1737         {
1738             nst = 1;
1739         }
1740         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1741     }
1742
1743     return nst;
1744 }
1745
1746 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1747 {
1748     if (inputrec.pbcType == PbcType::Screw
1749         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1750     {
1751         gmx_fatal(FARGS,
1752                   "With pbc=%s can only do domain decomposition in the x-direction",
1753                   c_pbcTypeNames[inputrec.pbcType].c_str());
1754     }
1755
1756     if (inputrec.nstlist == 0)
1757     {
1758         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1759     }
1760
1761     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1762     {
1763         GMX_LOG(mdlog.warning)
1764                 .appendText(
1765                         "comm-mode angular will give incorrect results when the comm group "
1766                         "partially crosses a periodic boundary");
1767     }
1768 }
1769
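     /*! \brief Returns the smallest average cell size over the decomposed dimensions,
      * based on the initial box size and corrected for box skewing.
      */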
1770 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1771 {
1772     real r = ddbox.box_size[XX];
1773     for (int d = 0; d < DIM; d++)
1774     {
1775         if (numDomains[d] > 1)
1776         {
1777             /* Check using the initial average cell size */
1778             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1779         }
1780     }
1781
1782     return r;
1783 }
1784
1785 /*! \brief Depending on the initial DLB state, returns the DLB switched-off state or issues a fatal error.
1786  */
1787 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1788                                   const std::string&   reasonStr,
1789                                   const gmx::MDLogger& mdlog)
1790 {
1791     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1792     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1793
1794     if (cmdlineDlbState == DlbState::onUser)
1795     {
1796         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1797     }
1798     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1799     {
1800         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1801     }
1802     return DlbState::offForever;
1803 }
1804
1805 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1806  *
1807  * This function parses the parameters of the "-dlb" command line option, setting
1808  * the corresponding state values. Then it checks the consistency of the determined
1809  * state with other run parameters and settings. As a result, the initial state
1810  * may be altered or an error may be thrown if incompatibility of options is detected.
1811  *
1812  * \param [in] mdlog       Logger.
1813  * \param [in] dlbOption   Enum value for the DLB option.
1814  * \param [in] bRecordLoad True if the load balancer is recording load information.
1815  * \param [in] mdrunOptions  Options for mdrun.
1816  * \param [in] inputrec    The mdrun input parameters.
1817  * \returns                DLB initial/startup state.
1818  */
1819 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1820                                          DlbOption                dlbOption,
1821                                          gmx_bool                 bRecordLoad,
1822                                          const gmx::MdrunOptions& mdrunOptions,
1823                                          const t_inputrec&        inputrec)
1824 {
1825     DlbState dlbState = DlbState::offCanTurnOn;
1826
1827     switch (dlbOption)
1828     {
1829         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1830         case DlbOption::no: dlbState = DlbState::offUser; break;
1831         case DlbOption::yes: dlbState = DlbState::onUser; break;
1832         default: gmx_incons("Invalid dlbOption enum value");
1833     }
1834
1835     /* Reruns don't support DLB: bail or override auto mode */
1836     if (mdrunOptions.rerun)
1837     {
1838         std::string reasonStr = "it is not supported in reruns.";
1839         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1840     }
1841
1842     /* Unsupported integrators */
1843     if (!EI_DYNAMICS(inputrec.eI))
1844     {
1845         auto reasonStr =
1846                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1847                                   enumValueToString(inputrec.eI));
1848         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1849     }
1850
1851     /* Without cycle counters we can't time work to balance on */
1852     if (!bRecordLoad)
1853     {
1854         std::string reasonStr =
1855                 "cycle counters unsupported or not enabled in the operating system kernel.";
1856         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1857     }
1858
1859     if (mdrunOptions.reproducible)
1860     {
1861         std::string reasonStr = "you started a reproducible run.";
1862         switch (dlbState)
1863         {
1864             case DlbState::offUser: break;
1865             case DlbState::offForever:
1866                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1867                 break;
1868             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1869             case DlbState::onCanTurnOff:
1870                 GMX_RELEASE_ASSERT(false, "DlbState::onCanTurnOff is not a valid initial state");
1871                 break;
1872             case DlbState::onUser:
1873                 return forceDlbOffOrBail(
1874                         dlbState,
1875                         reasonStr
1876                                 + " In load balanced runs binary reproducibility cannot be "
1877                                   "ensured.",
1878                         mdlog);
1879             default:
1880                 gmx_fatal(FARGS,
1881                           "Death horror: undefined case (%d) for load balancing choice",
1882                           static_cast<int>(dlbState));
1883         }
1884     }
1885
1886     return dlbState;
1887 }
1888
1889 static gmx_domdec_comm_t* init_dd_comm()
1890 {
1891     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1892
1893     comm->n_load_have    = 0;
1894     comm->n_load_collect = 0;
1895
1896     comm->haveTurnedOffDlb = false;
1897
1898     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1899     {
1900         comm->sum_nat[i] = 0;
1901     }
1902     comm->ndecomp   = 0;
1903     comm->nload     = 0;
1904     comm->load_step = 0;
1905     comm->load_sum  = 0;
1906     comm->load_max  = 0;
1907     clear_ivec(comm->load_lim);
1908     comm->load_mdf = 0;
1909     comm->load_pme = 0;
1910
1911     /* This should be replaced by a unique pointer */
1912     comm->balanceRegion = ddBalanceRegionAllocate();
1913
1914     return comm;
1915 }
1916
1917 static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
1918                               const gmx_mtop_t&                 mtop,
1919                               ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1920                               const bool                        useUpdateGroups,
1921                               const real                        maxUpdateGroupRadius,
1922                               DDSystemInfo*                     systemInfo)
1923 {
1924     systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1925     systemInfo->useUpdateGroups                = useUpdateGroups;
1926     systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
1927
1928     if (systemInfo->useUpdateGroups)
1929     {
1930         int numUpdateGroups = 0;
1931         for (const auto& molblock : mtop.molblock)
1932         {
1933             numUpdateGroups += molblock.nmol
1934                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1935         }
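             /* Illustrative example (values chosen for illustration): a system of
              * 1000 rigid water molecules, each forming one update group, gives
              * numUpdateGroups = 1000 and an average group size of 3.0 atoms below.
              */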
1936
1937         GMX_LOG(mdlog.info)
1938                 .appendTextFormatted(
1939                         "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1940                         "nm\n",
1941                         numUpdateGroups,
1942                         mtop.natoms / static_cast<double>(numUpdateGroups),
1943                         systemInfo->maxUpdateGroupRadius);
1944     }
1945 }
1946
1947 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1948     npbcdim(numPbcDimensions(ir.pbcType)),
1949     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1950     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1951     haveScrewPBC(ir.pbcType == PbcType::Screw)
1952 {
1953 }
1954
1955 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1956 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1957                                     const bool                                  useUpdateGroups,
1958                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1959 {
1960     if (useUpdateGroups)
1961     {
1962         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1963                            "Need one grouping per moltype");
1964         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
1965         {
1966             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1967             {
1968                 return false;
1969             }
1970         }
1971     }
1972     else
1973     {
1974         for (const auto& moltype : mtop.moltype)
1975         {
1976             if (moltype.atoms.nr > 1)
1977             {
1978                 return false;
1979             }
1980         }
1981     }
1982
1983     return true;
1984 }
1985
1986 /*! \brief Generate the simulation system information */
1987 static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
1988                                   DDRole                            ddRole,
1989                                   MPI_Comm                          communicator,
1990                                   const DomdecOptions&              options,
1991                                   const gmx_mtop_t&                 mtop,
1992                                   const t_inputrec&                 ir,
1993                                   const matrix                      box,
1994                                   ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
1995                                   const bool                        useUpdateGroups,
1996                                   const real                        maxUpdateGroupRadius,
1997                                   gmx::ArrayRef<const gmx::RVec>    xGlobal)
1998 {
1999     const real tenPercentMargin = 1.1;
2000
2001     DDSystemInfo systemInfo;
2002
2003     setupUpdateGroups(
2004             mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
2005
2006     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2007             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2008     systemInfo.haveInterDomainBondeds =
2009             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2010     systemInfo.haveInterDomainMultiBodyBondeds =
2011             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2012
2013     if (systemInfo.useUpdateGroups)
2014     {
2015         systemInfo.mayHaveSplitConstraints = false;
2016         systemInfo.mayHaveSplitSettles     = false;
2017     }
2018     else
2019     {
2020         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2021                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2022         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2023     }
2024
2025     if (ir.rlist == 0)
2026     {
2027         /* Set the cut-off to some very large value,
2028          * so we don't need if statements everywhere in the code.
2029          * We use sqrt, since the cut-off is squared in some places.
2030          */
2031         systemInfo.cutoff = GMX_CUTOFF_INF;
2032     }
2033     else
2034     {
2035         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2036     }
2037     systemInfo.minCutoffForMultiBody = 0;
2038
2039     /* Determine the minimum cell size limit, affected by many factors */
2040     systemInfo.cellsizeLimit             = 0;
2041     systemInfo.filterBondedCommunication = false;
2042
2043     /* We do not allow home atoms to move beyond the neighboring domain
2044      * between domain decomposition steps, which limits the cell size.
2045      * Get an estimate of cell size limit due to atom displacement.
2046      * In most cases this is a large overestimate, because it assumes
2047      * non-interacting atoms.
2048      * We set the chance to 1 in a trillion steps.
2049      */
2050     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2051     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2052             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2053     GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2054
2055     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2056
2057     /* TODO: PME decomposition currently requires atoms not to be more than
2058      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2059      *       In nearly all cases, limitForAtomDisplacement will be smaller
2060      *       than 2/3*rlist, so the PME requirement is satisfied.
2061      *       But it would be better for both correctness and performance
2062      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2063      *       Note that we would need to improve the pairlist buffer case.
2064      */
2065
2066     if (systemInfo.haveInterDomainBondeds)
2067     {
2068         if (options.minimumCommunicationRange > 0)
2069         {
2070             systemInfo.minCutoffForMultiBody =
2071                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2072             if (options.useBondedCommunication)
2073             {
2074                 systemInfo.filterBondedCommunication =
2075                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2076             }
2077             else
2078             {
2079                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2080             }
2081         }
2082         else if (ir.bPeriodicMols)
2083         {
2084             /* Can not easily determine the required cut-off */
2085             GMX_LOG(mdlog.warning)
2086                     .appendText(
2087                             "NOTE: Periodic molecules are present in this system. Because of this, "
2088                             "the domain decomposition algorithm cannot easily determine the "
2089                             "minimum cell size that it requires for treating bonded interactions. "
2090                             "Instead, domain decomposition will assume that half the non-bonded "
2091                             "cut-off will be a suitable lower bound.");
2092             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2093         }
2094         else
2095         {
2096             real r_2b = 0;
2097             real r_mb = 0;
2098
2099             if (ddRole == DDRole::Master)
2100             {
2101                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2102             }
2103             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2104             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2105
2106             /* We use an initial margin of 10% for the minimum cell size,
2107              * except when we are just below the non-bonded cut-off.
2108              */
2109             if (options.useBondedCommunication)
2110             {
2111                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2112                 {
2113                     const real r_bonded              = std::max(r_2b, r_mb);
2114                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2115                     /* This is the (only) place where we turn on the filtering */
2116                     systemInfo.filterBondedCommunication = true;
2117                 }
2118                 else
2119                 {
2120                     const real r_bonded = r_mb;
2121                     systemInfo.minCutoffForMultiBody =
2122                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2123                 }
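                     /* Illustrative example (values chosen for illustration): with
                      * r_2b = 0.6 nm, r_mb = 0.5 nm and a 1.0 nm cut-off, the largest
                      * bonded distance fits within the cut-off, so the else branch
                      * above gives minCutoffForMultiBody = min(1.1*0.5, 1.0) = 0.55 nm
                      * and bonded filtering stays off.
                      */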
2124                 /* We determine cutoff_mbody later */
2125                 systemInfo.increaseMultiBodyCutoff = true;
2126             }
2127             else
2128             {
2129                 /* No special bonded communication,
2130                  * simply increase the DD cut-off.
2131                  */
2132                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2133                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2134             }
2135         }
2136         GMX_LOG(mdlog.info)
2137                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2138                                      systemInfo.minCutoffForMultiBody);
2139
2140         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2141     }
2142
2143     systemInfo.constraintCommunicationRange = 0;
2144     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2145     {
2146         /* There is a cell size limit due to the constraints (P-LINCS) */
2147         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2148         GMX_LOG(mdlog.info)
2149                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2150                                      systemInfo.constraintCommunicationRange);
2151         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2152         {
2153             GMX_LOG(mdlog.info)
2154                     .appendText(
2155                             "This distance will limit the DD cell size, you can override this with "
2156                             "-rcon");
2157         }
2158     }
2159     else if (options.constraintCommunicationRange > 0)
2160     {
2161         /* Here we do not check for dd->splitConstraints,
2162          * because one can also set a cell size limit for virtual sites only
2163          * and at this point we don't know yet if there are intercg v-sites.
2164          */
2165         GMX_LOG(mdlog.info)
2166                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2167                                      options.constraintCommunicationRange);
2168         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2169     }
2170     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2171
2172     return systemInfo;
2173 }
2174
2175 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2176  * implemented. */
2177 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2178                              DDRole               ddRole,
2179                              MPI_Comm             communicator,
2180                              int                  numNodes,
2181                              const DomdecOptions& options,
2182                              const DDSettings&    ddSettings,
2183                              const DDSystemInfo&  systemInfo,
2184                              const real           cellsizeLimit,
2185                              const gmx_ddbox_t&   ddbox)
2186 {
2187     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2188     {
2189         const bool  bC = (systemInfo.mayHaveSplitConstraints
2190                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2191         std::string message =
2192                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2193                                   !bC ? "-rdd" : "-rcon",
2194                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2195                                   bC ? " or your LINCS settings" : "");
2196
2197         gmx_fatal_collective(FARGS,
2198                              communicator,
2199                              ddRole == DDRole::Master,
2200                              "There is no domain decomposition for %d ranks that is compatible "
2201                              "with the given box and a minimum cell size of %g nm\n"
2202                              "%s\n"
2203                              "Look in the log file for details on the domain decomposition",
2204                              numNodes - ddGridSetup.numPmeOnlyRanks,
2205                              cellsizeLimit,
2206                              message.c_str());
2207     }
2208
2209     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2210     if (acs < cellsizeLimit)
2211     {
2212         if (options.numCells[XX] <= 0)
2213         {
2214             GMX_RELEASE_ASSERT(
2215                     false,
2216                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2217         }
2218         else
2219         {
2220             gmx_fatal_collective(
2221                     FARGS,
2222                     communicator,
2223                     ddRole == DDRole::Master,
2224                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2225                     "options -dd, -rdd or -rcon, see the log file for details",
2226                     acs,
2227                     cellsizeLimit);
2228         }
2229     }
2230
2231     const int numPPRanks =
2232             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2233     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2234     {
2235         gmx_fatal_collective(FARGS,
2236                              communicator,
2237                              ddRole == DDRole::Master,
2238                              "The size of the domain decomposition grid (%d) does not match the "
2239                              "number of PP ranks (%d). The total number of ranks is %d",
2240                              numPPRanks,
2241                              numNodes - ddGridSetup.numPmeOnlyRanks,
2242                              numNodes);
2243     }
2244     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2245     {
2246         gmx_fatal_collective(FARGS,
2247                              communicator,
2248                              ddRole == DDRole::Master,
2249                              "The number of separate PME ranks (%d) is larger than the number of "
2250                              "PP ranks (%d), this is not supported.",
2251                              ddGridSetup.numPmeOnlyRanks,
2252                              numPPRanks);
2253     }
2254 }
2255
2256 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2257 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2258                                   int                  numNodes,
2259                                   const DDGridSetup&   ddGridSetup,
2260                                   const t_inputrec&    ir)
2261 {
2262     GMX_LOG(mdlog.info)
2263             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2264                                  ddGridSetup.numDomains[XX],
2265                                  ddGridSetup.numDomains[YY],
2266                                  ddGridSetup.numDomains[ZZ],
2267                                  ddGridSetup.numPmeOnlyRanks);
2268
2269     DDRankSetup ddRankSetup;
2270
2271     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2272     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2273
2274     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2275     if (ddRankSetup.usePmeOnlyRanks)
2276     {
2277         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2278     }
2279     else
2280     {
2281         ddRankSetup.numRanksDoingPme =
2282                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2283     }
2284
2285     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2286     {
2287         /* The following choices should match those
2288          * in comm_cost_est in domdec_setup.c.
2289          * Note that here the checks have to take into account
2290          * that the decomposition might occur in a different order than xyz
2291          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2292          * in which case they will not match those in comm_cost_est,
2293          * but since that is mainly for testing purposes that's fine.
2294          */
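             /* Illustrative example (values chosen for illustration): a PP grid of
              * 8x4x2 decomposed first in x and then in y, with 16 PME-only ranks,
              * satisfies 16 > 8 and 16 % 8 == 0, so a 2D PME decomposition of
              * 8x2 ranks (npmenodes_x = 8, npmenodes_y = 2) is chosen below.
              */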
2295         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2296             && ddGridSetup.ddDimensions[1] == YY
2297             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2298             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2299             && getenv("GMX_PMEONEDD") == nullptr)
2300         {
2301             ddRankSetup.npmedecompdim = 2;
2302             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2303             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2304         }
2305         else
2306         {
2307             /* In case nc is 1 in both x and y we could still choose to
2308              * decompose pme in y instead of x, but we use x for simplicity.
2309              */
2310             ddRankSetup.npmedecompdim = 1;
2311             if (ddGridSetup.ddDimensions[0] == YY)
2312             {
2313                 ddRankSetup.npmenodes_x = 1;
2314                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2315             }
2316             else
2317             {
2318                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2319                 ddRankSetup.npmenodes_y = 1;
2320             }
2321         }
2322         GMX_LOG(mdlog.info)
2323                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2324                                      ddRankSetup.npmenodes_x,
2325                                      ddRankSetup.npmenodes_y,
2326                                      1);
2327     }
2328     else
2329     {
2330         ddRankSetup.npmedecompdim = 0;
2331         ddRankSetup.npmenodes_x   = 0;
2332         ddRankSetup.npmenodes_y   = 0;
2333     }
2334
2335     return ddRankSetup;
2336 }
2337
2338 /*! \brief Set the cell size and interaction limits */
2339 static void set_dd_limits(const gmx::MDLogger& mdlog,
2340                           DDRole               ddRole,
2341                           gmx_domdec_t*        dd,
2342                           const DomdecOptions& options,
2343                           const DDSettings&    ddSettings,
2344                           const DDSystemInfo&  systemInfo,
2345                           const DDGridSetup&   ddGridSetup,
2346                           const int            numPPRanks,
2347                           const gmx_mtop_t&    mtop,
2348                           const t_inputrec&    ir,
2349                           const gmx_ddbox_t&   ddbox)
2350 {
2351     gmx_domdec_comm_t* comm = dd->comm;
2352     comm->ddSettings        = ddSettings;
2353
2354     /* Initialize the GPU share count to 0; it might change later */
2355     comm->nrank_gpu_shared = 0;
2356
2357     comm->dlbState = comm->ddSettings.initialDlbState;
2358     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2359     /* To consider turning DLB on after 2*nstlist steps we need to check
2360      * at partitioning count 3. Thus we need to increase the first count by 2.
2361      */
2362     comm->ddPartioningCountFirstDlbOff += 2;
2363
2364     comm->bPMELoadBalDLBLimits = FALSE;
2365
2366     /* Allocate the charge group/atom sorting struct */
2367     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2368
2369     comm->systemInfo = systemInfo;
2370
2371     if (systemInfo.useUpdateGroups)
2372     {
2373         /* Note: We would like to use dd->nnodes for the atom count estimate,
2374          *       but that is not yet available here. Anyhow, this only affects
2375          *       performance up to the second dd_partition_system call.
2376          */
2377         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2378         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2379                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2380     }
2381
2382     /* Set the DD setup given by ddGridSetup */
2383     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2384     dd->ndim = ddGridSetup.numDDDimensions;
2385     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2386
2387     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2388
2389     snew(comm->slb_frac, DIM);
2390     if (isDlbDisabled(comm))
2391     {
2392         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2393         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2394         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2395     }
2396
2397     /* Set the multi-body cut-off and cellsize limit for DLB */
2398     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2399     comm->cellsize_limit = systemInfo.cellsizeLimit;
2400     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2401     {
2402         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2403         {
2404             /* Set the bonded communication distance to halfway between
2405              * the minimum and the maximum,
2406              * since the extra communication cost is nearly zero.
2407              */
2408             real acs           = average_cellsize_min(ddbox, dd->numCells);
2409             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
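                 /* Illustrative example (values chosen for illustration): with
                  * minCutoffForMultiBody = 0.9 nm and an average minimum cell size
                  * of 1.5 nm, the line above gives cutoff_mbody = 1.2 nm.
                  */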
2410             if (!isDlbDisabled(comm))
2411             {
2412                 /* Check if this does not limit the scaling */
2413                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2414             }
2415             if (!systemInfo.filterBondedCommunication)
2416             {
2417                 /* Without bBondComm do not go beyond the n.b. cut-off */
2418                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2419                 if (comm->cellsize_limit >= systemInfo.cutoff)
2420                 {
2421                     /* We don't lose a lot of efficiency
2422                      * when increasing it to the n.b. cut-off.
2423                      * It can even be slightly faster, because we need
2424                      * fewer checks for the communication setup.
2425                      */
2426                     comm->cutoff_mbody = systemInfo.cutoff;
2427                 }
2428             }
2429             /* Check if we did not end up below our original limit */
2430             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2431
2432             if (comm->cutoff_mbody > comm->cellsize_limit)
2433             {
2434                 comm->cellsize_limit = comm->cutoff_mbody;
2435             }
2436         }
2437         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2438     }
2439
2440     if (debug)
2441     {
2442         fprintf(debug,
2443                 "Bonded atom communication beyond the cut-off: %s\n"
2444                 "cellsize limit %f\n",
2445                 gmx::boolToString(systemInfo.filterBondedCommunication),
2446                 comm->cellsize_limit);
2447     }
2448
2449     if (ddRole == DDRole::Master)
2450     {
2451         check_dd_restrictions(dd, ir, mdlog);
2452     }
2453 }
2454
2455 static void writeSettings(gmx::TextWriter*   log,
2456                           gmx_domdec_t*      dd,
2457                           const gmx_mtop_t&  mtop,
2458                           const t_inputrec&  ir,
2459                           gmx_bool           bDynLoadBal,
2460                           real               dlb_scale,
2461                           const gmx_ddbox_t* ddbox)
2462 {
2463     gmx_domdec_comm_t* comm = dd->comm;
2464
2465     if (bDynLoadBal)
2466     {
2467         log->writeString("The maximum number of communication pulses is:");
2468         for (int d = 0; d < dd->ndim; d++)
2469         {
2470             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2471         }
2472         log->ensureLineBreak();
2473         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2474                                 comm->cellsize_limit);
2475         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2476         log->writeString("The allowed shrink of domain decomposition cells is:");
2477         for (int d = 0; d < DIM; d++)
2478         {
2479             if (dd->numCells[d] > 1)
2480             {
2481                 const real shrink =
2482                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2483                                 ? 0
2484                                 : comm->cellsize_min_dlb[d]
2485                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2486                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2487             }
2488         }
2489         log->ensureLineBreak();
2490     }
2491     else
2492     {
2493         ivec np;
2494         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2495         log->writeString("The initial number of communication pulses is:");
2496         for (int d = 0; d < dd->ndim; d++)
2497         {
2498             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2499         }
2500         log->ensureLineBreak();
2501         log->writeString("The initial domain decomposition cell size is:");
2502         for (int d = 0; d < DIM; d++)
2503         {
2504             if (dd->numCells[d] > 1)
2505             {
2506                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2507             }
2508         }
2509         log->ensureLineBreak();
2510         log->writeLine();
2511     }
2512
2513     const bool haveInterDomainVsites =
2514             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2515
2516     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2517         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2518     {
2519         std::string decompUnits;
2520         if (comm->systemInfo.useUpdateGroups)
2521         {
2522             decompUnits = "atom groups";
2523         }
2524         else
2525         {
2526             decompUnits = "atoms";
2527         }
2528
2529         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2530                                 decompUnits.c_str());
2531         log->writeLineFormatted(
2532                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2533
2534         real limit = 0;
2535         if (bDynLoadBal)
2536         {
2537             limit = dd->comm->cellsize_limit;
2538         }
2539         else
2540         {
2541             if (dd->unitCellInfo.ddBoxIsDynamic)
2542             {
2543                 log->writeLine(
2544                         "(the following are initial values, they could change due to box "
2545                         "deformation)");
2546             }
2547             limit = dd->comm->cellsize_min[XX];
2548             for (int d = 1; d < DIM; d++)
2549             {
2550                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2551             }
2552         }
2553
2554         if (comm->systemInfo.haveInterDomainBondeds)
2555         {
2556             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2557                                     "two-body bonded interactions",
2558                                     "(-rdd)",
2559                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2560             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2561                                     "multi-body bonded interactions",
2562                                     "(-rdd)",
2563                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2564                                             ? comm->cutoff_mbody
2565                                             : std::min(comm->systemInfo.cutoff, limit));
2566         }
2567         if (haveInterDomainVsites)
2568         {
2569             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2570         }
2571         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2572         {
2573             std::string separation =
2574                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2575             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2576         }
2577         log->ensureLineBreak();
2578     }
2579 }
2580
2581 static void logSettings(const gmx::MDLogger& mdlog,
2582                         gmx_domdec_t*        dd,
2583                         const gmx_mtop_t&    mtop,
2584                         const t_inputrec&    ir,
2585                         real                 dlb_scale,
2586                         const gmx_ddbox_t*   ddbox)
2587 {
2588     gmx::StringOutputStream stream;
2589     gmx::TextWriter         log(&stream);
2590     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2591     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2592     {
2593         {
2594             log.ensureEmptyLine();
2595             log.writeLine(
2596                     "When dynamic load balancing gets turned on, these settings will change to:");
2597         }
2598         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2599     }
2600     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2601 }
2602
2603 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2604                                 gmx_domdec_t*        dd,
2605                                 real                 dlb_scale,
2606                                 const t_inputrec&    inputrec,
2607                                 const gmx_ddbox_t*   ddbox)
2608 {
2609     int npulse       = 0;
2610     int npulse_d_max = 0;
2611     int npulse_d     = 0;
2612
2613     gmx_domdec_comm_t* comm = dd->comm;
2614
2615     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2616
2617     /* Determine the maximum number of comm. pulses in one dimension */
2618
2619     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2620
2621     /* Determine the maximum required number of grid pulses */
2622     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2623     {
2624         /* Only a single pulse is required */
2625         npulse = 1;
2626     }
2627     else if (!bNoCutOff && comm->cellsize_limit > 0)
2628     {
2629         /* We round down slightly here to avoid overhead due to the latency
2630          * of extra communication calls when the cut-off
2631          * would be only slightly longer than the cell size.
2632          * Later cellsize_limit is redetermined,
2633          * so we can not miss interactions due to this rounding.
2634          */
2635         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
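             /* Illustrative example (values chosen for illustration): a 1.2 nm
              * cut-off with a 0.5 nm cell size limit gives
              * npulse = int(0.96 + 1.2/0.5) = 3 communication pulses.
              */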
2636     }
2637     else
2638     {
2639         /* There is no cell size limit */
2640         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2641     }
2642
2643     if (!bNoCutOff && npulse > 1)
2644     {
2645         /* See if we can do with less pulses, based on dlb_scale */
2646         npulse_d_max = 0;
2647         for (int d = 0; d < dd->ndim; d++)
2648         {
2649             int dim  = dd->dim[d];
2650             npulse_d = static_cast<int>(
2651                     1
2652                     + dd->numCells[dim] * comm->systemInfo.cutoff
2653                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2654             npulse_d_max = std::max(npulse_d_max, npulse_d);
2655         }
2656         npulse = std::min(npulse, npulse_d_max);
2657     }
2658
2659     /* This env var can override npulse */
2660     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2661     if (ddPulseEnv > 0)
2662     {
2663         npulse = ddPulseEnv;
2664     }
2665
2666     comm->maxpulse       = 1;
2667     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2668     for (int d = 0; d < dd->ndim; d++)
2669     {
2670         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2671         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2672         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2673         {
2674             comm->bVacDLBNoLimit = FALSE;
2675         }
2676     }
2677
2678     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2679     if (!comm->bVacDLBNoLimit)
2680     {
2681         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2682     }
2683     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2684     /* Set the minimum cell size for each DD dimension */
2685     for (int d = 0; d < dd->ndim; d++)
2686     {
2687         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2688         {
2689             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2690         }
2691         else
2692         {
2693             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2694         }
2695     }
2696     if (comm->cutoff_mbody <= 0)
2697     {
2698         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2699     }
2700     if (isDlbOn(comm))
2701     {
2702         set_dlb_limits(dd);
2703     }
2704 }
2705
2706 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2707 {
2708     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2709 }
2710
2711 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2712 {
2713     /* If each molecule is a single charge group
2714      * or we use domain decomposition for each periodic dimension,
2715      * we do not need to take pbc into account for the bonded interactions.
2716      */
2717     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2718             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2719                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2720 }
2721
2722 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2723 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2724                                   gmx_domdec_t*        dd,
2725                                   real                 dlb_scale,
2726                                   const gmx_mtop_t&    mtop,
2727                                   const t_inputrec&    inputrec,
2728                                   const gmx_ddbox_t*   ddbox)
2729 {
2730     gmx_domdec_comm_t* comm        = dd->comm;
2731     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2732
2733     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2734     {
2735         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2736         if (ddRankSetup.npmedecompdim >= 2)
2737         {
2738             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2739         }
2740     }
2741     else
2742     {
2743         ddRankSetup.numRanksDoingPme = 0;
2744         if (dd->pme_nodeid >= 0)
2745         {
2746             gmx_fatal_collective(FARGS,
2747                                  dd->mpi_comm_all,
2748                                  DDMASTER(dd),
2749                                  "Can not have separate PME ranks without PME electrostatics");
2750         }
2751     }
2752
2753     if (debug)
2754     {
2755         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2756     }
2757     if (!isDlbDisabled(comm))
2758     {
2759         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2760     }
2761
2762     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2763
2764     const real vol_frac = (inputrec.pbcType == PbcType::No)
2765                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2766                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2767                                      / static_cast<double>(dd->nnodes));
2768     if (debug)
2769     {
2770         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2771     }
2772     int natoms_tot = mtop.natoms;
2773
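         /* Note (added explanation, details are an assumption): vol_frac * natoms_tot
          * estimates how many atoms this rank will know about (home zone plus
          * communicated zones) and is used to size the global-to-local atom lookup.
          */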
2774     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2775 }
2776
2777 /*! \brief Get some important DD parameters which can be modified by env.vars */
2778 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2779                                 const DomdecOptions&     options,
2780                                 const gmx::MdrunOptions& mdrunOptions,
2781                                 const t_inputrec&        ir)
2782 {
2783     DDSettings ddSettings;
2784
2785     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2786     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2787     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2788     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2789     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2790     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2791     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2792     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2793     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2794
2795     if (ddSettings.useSendRecv2)
2796     {
2797         GMX_LOG(mdlog.info)
2798                 .appendText(
2799                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2800                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2801                         "communication");
2802     }
2803
2804     if (ddSettings.eFlop)
2805     {
2806         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2807         ddSettings.recordLoad = true;
2808     }
2809     else
2810     {
2811         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2812     }
2813
2814     ddSettings.initialDlbState =
2815             determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
2816     GMX_LOG(mdlog.info)
2817             .appendTextFormatted("Dynamic load balancing: %s",
2818                                  enumValueToString(ddSettings.initialDlbState));
2819
2820     return ddSettings;
2821 }
2822
2823 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2824
2825 gmx_domdec_t::~gmx_domdec_t() = default;
2826
2827 namespace gmx
2828 {
2829
2830 // TODO once the functionality stabilizes, move this class and
2831 // supporting functionality into builder.cpp
2832 /*! \brief Impl class for DD builder */
2833 class DomainDecompositionBuilder::Impl
2834 {
2835 public:
2836     //! Constructor
2837     Impl(const MDLogger&                   mdlog,
2838          t_commrec*                        cr,
2839          const DomdecOptions&              options,
2840          const MdrunOptions&               mdrunOptions,
2841          const gmx_mtop_t&                 mtop,
2842          const t_inputrec&                 ir,
2843          const MDModulesNotifiers&         notifiers,
2844          const matrix                      box,
2845          ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2846          bool                              useUpdateGroups,
2847          real                              maxUpdateGroupRadius,
2848          ArrayRef<const RVec>              xGlobal,
2849          bool                              useGpuForNonbonded,
2850          bool                              useGpuForPme);
2851
2852     //! Build the resulting DD manager
2853     gmx_domdec_t* build(LocalAtomSetManager* atomSets);
2854
2855     //! Objects used in constructing and configuring DD
2856     //! {
2857     //! Logging object
2858     const MDLogger& mdlog_;
2859     //! Communication object
2860     t_commrec* cr_;
2861     //! User-supplied options configuring DD behavior
2862     const DomdecOptions options_;
2863     //! Global system topology
2864     const gmx_mtop_t& mtop_;
2865     //! User input values from the tpr file
2866     const t_inputrec& ir_;
2867     //! MdModules object
2868     const MDModulesNotifiers& notifiers_;
2869     //! }
2870
2871     //! Internal objects used in constructing DD
2872     //! {
2873     //! Settings combined from the user input
2874     DDSettings ddSettings_;
2875     //! Information derived from the simulation system
2876     DDSystemInfo systemInfo_;
2877     //! Box structure
2878     gmx_ddbox_t ddbox_ = { 0 };
2879     //! Organization of the DD grids
2880     DDGridSetup ddGridSetup_;
2881     //! Organization of the DD ranks
2882     DDRankSetup ddRankSetup_;
2883     //! Number of DD cells in each dimension
2884     ivec ddCellIndex_ = { 0, 0, 0 };
2885     //! IDs of PME-only ranks
2886     std::vector<int> pmeRanks_;
2887     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2888     CartesianRankSetup cartSetup_;
2889     //! }
2890 };
2891
2892 DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
2893                                        t_commrec*                        cr,
2894                                        const DomdecOptions&              options,
2895                                        const MdrunOptions&               mdrunOptions,
2896                                        const gmx_mtop_t&                 mtop,
2897                                        const t_inputrec&                 ir,
2898                                        const MDModulesNotifiers&         notifiers,
2899                                        const matrix                      box,
2900                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2901                                        const bool                        useUpdateGroups,
2902                                        const real                        maxUpdateGroupRadius,
2903                                        ArrayRef<const RVec>              xGlobal,
2904                                        bool                              useGpuForNonbonded,
2905                                        bool                              useGpuForPme) :
2906     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2907 {
2908     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2909
2910     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
2911
2912     if (ddSettings_.eFlop > 1)
2913     {
2914         /* Ensure that we have different random flop counts on different ranks */
2915         srand(1 + cr_->rankInDefaultCommunicator);
2916     }
2917
2918     systemInfo_ = getSystemInfo(mdlog_,
2919                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2920                                 cr->mpiDefaultCommunicator,
2921                                 options_,
2922                                 mtop_,
2923                                 ir_,
2924                                 box,
2925                                 updateGroupingPerMoleculeType,
2926                                 useUpdateGroups,
2927                                 maxUpdateGroupRadius,
2928                                 xGlobal);
2929
2930     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2931     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2932
2934     /* Checks for ability to use PME-only ranks */
2935     auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2936             notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2937
2938     /* Checks for validity of requested Ranks setup */
2939     checkForValidRankCountRequests(numRanksRequested,
2940                                    EEL_PME(ir_.coulombtype) || EVDW_PME(ir_.vdwtype),
2941                                    options_.numPmeRanks,
2942                                    separatePmeRanksPermitted,
2943                                    checkForLargePrimeFactors);
2944
2945     // The automated DD grid setup uses a different cell size limit
2946     // than the one stored in systemInfo_. The latter is used
2947     // in set_dd_limits() to configure DLB, for example.
2948     const real gridSetupCellsizeLimit =
2949             getDDGridSetupCellSizeLimit(mdlog_,
2950                                         !isDlbDisabled(ddSettings_.initialDlbState),
2951                                         options_.dlbScaling,
2952                                         ir_,
2953                                         systemInfo_.cellsizeLimit);
2954     ddGridSetup_ = getDDGridSetup(mdlog_,
2955                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2956                                   cr->mpiDefaultCommunicator,
2957                                   numRanksRequested,
2958                                   options_,
2959                                   ddSettings_,
2960                                   systemInfo_,
2961                                   gridSetupCellsizeLimit,
2962                                   mtop_,
2963                                   ir_,
2964                                   separatePmeRanksPermitted,
2965                                   box,
2966                                   xGlobal,
2967                                   &ddbox_);
2968     checkDDGridSetup(ddGridSetup_,
2969                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2970                      cr->mpiDefaultCommunicator,
2971                      cr->sizeOfDefaultCommunicator,
2972                      options_,
2973                      ddSettings_,
2974                      systemInfo_,
2975                      gridSetupCellsizeLimit,
2976                      ddbox_);
2977
2978     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2979
2980     ddRankSetup_ = getDDRankSetup(mdlog_, cr_->sizeOfDefaultCommunicator, ddGridSetup_, ir_);
2981
2982     /* Generate the group communicator, also decides the duty of each rank */
2983     cartSetup_ = makeGroupCommunicators(
2984             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
2985 }
2986
2987 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets)
2988 {
2989     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
2990
2991     copy_ivec(ddCellIndex_, dd->ci);
2992
2993     dd->comm = init_dd_comm();
2994
2995     dd->comm->ddRankSetup        = ddRankSetup_;
2996     dd->comm->cartesianRankSetup = cartSetup_;
2997
2998     set_dd_limits(mdlog_,
2999                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
3000                   dd,
3001                   options_,
3002                   ddSettings_,
3003                   systemInfo_,
3004                   ddGridSetup_,
3005                   ddRankSetup_.numPPRanks,
3006                   mtop_,
3007                   ir_,
3008                   ddbox_);
3009
3010     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3011
3012     if (thisRankHasDuty(cr_, DUTY_PP))
3013     {
3014         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3015
3016         setup_neighbor_relations(dd);
3017     }
3018
3019     /* Set overallocation to avoid frequent reallocation of arrays */
3020     set_over_alloc_dd(true);
3021
3022     dd->atomSets = atomSets;
3023
3024     dd->localTopologyChecker =
3025             std::make_unique<LocalTopologyChecker>(mtop_, dd->comm->systemInfo.useUpdateGroups);
3026
3027     return dd;
3028 }
3029
3030 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
3031                                                        t_commrec*                cr,
3032                                                        const DomdecOptions&      options,
3033                                                        const MdrunOptions&       mdrunOptions,
3034                                                        const gmx_mtop_t&         mtop,
3035                                                        const t_inputrec&         ir,
3036                                                        const MDModulesNotifiers& notifiers,
3037                                                        const matrix              box,
3038                                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3039                                                        const bool           useUpdateGroups,
3040                                                        const real           maxUpdateGroupRadius,
3041                                                        ArrayRef<const RVec> xGlobal,
3042                                                        const bool           useGpuForNonbonded,
3043                                                        const bool           useGpuForPme) :
3044     impl_(new Impl(mdlog,
3045                    cr,
3046                    options,
3047                    mdrunOptions,
3048                    mtop,
3049                    ir,
3050                    notifiers,
3051                    box,
3052                    updateGroupingPerMoleculeType,
3053                    useUpdateGroups,
3054                    maxUpdateGroupRadius,
3055                    xGlobal,
3056                    useGpuForNonbonded,
3057                    useGpuForPme))
3058 {
3059 }
3060
3061 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager* atomSets)
3062 {
3063     return impl_->build(atomSets);
3064 }
3065
3066 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3067
3068 } // namespace gmx
3069
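/*! \brief Returns whether the requested cut-off fits within the current DD cell sizes
 * and, with DLB, the allowed number of communication pulses, checked across all ranks
 */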
3070 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3071 {
3072     gmx_ddbox_t ddbox;
3073     int         LocallyLimited = 0;
3074
3075     const auto* dd = cr->dd;
3076
3077     set_ddbox(*dd, false, box, true, x, &ddbox);
3078
3081     for (int d = 0; d < dd->ndim; d++)
3082     {
3083         const int dim = dd->dim[d];
3084
3085         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3086         if (dd->unitCellInfo.ddBoxIsDynamic)
3087         {
3088             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3089         }
3090
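        /* Estimate how many communication pulses the requested cut-off needs along
         * this dimension, given the margin-scaled cell size and the box skew factor.
         */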
3091         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3092
3093         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3094         {
3095             if (np > dd->comm->cd[d].np_dlb)
3096             {
3097                 return FALSE;
3098             }
3099
3100             /* If the current local cell size along this dimension is smaller than
3101              * the requested cut-off, we could still fix it, but this gets very complicated.
3102              * Without fixing here, we might actually need more checks.
3103              */
3104             real cellSizeAlongDim =
3105                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3106             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3107             {
3108                 LocallyLimited = 1;
3109             }
3110         }
3111     }
3112
3113     if (!isDlbDisabled(dd->comm))
3114     {
3115         /* If DLB is not active yet, we don't need to check the grid jumps.
3116          * Actually we shouldn't, because then the grid jump data is not set.
3117          */
3118         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3119         {
3120             LocallyLimited = 1;
3121         }
3122
3123         gmx_sumi(1, &LocallyLimited, cr);
3124
3125         if (LocallyLimited > 0)
3126         {
3127             return FALSE;
3128         }
3129     }
3130
3131     return TRUE;
3132 }
3133
3134 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3135 {
3136     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3137
3138     if (bCutoffAllowed)
3139     {
3140         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3141     }
3142
3143     return bCutoffAllowed;
3144 }
3145
3146 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3147                               const t_commrec&                cr,
3148                               const gmx::DeviceStreamManager& deviceStreamManager,
3149                               gmx_wallcycle*                  wcycle)
3150 {
3151     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3152                        "Local non-bonded stream should be valid when using "
3153                        "GPU halo exchange.");
3154     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3155                        "Non-local non-bonded stream should be valid when using "
3156                        "GPU halo exchange.");
3157
3158     if (cr.dd->gpuHaloExchange[0].empty())
3159     {
3160         GMX_LOG(mdlog.warning)
3161                 .asParagraph()
3162                 .appendTextFormatted(
3163                         "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3164                         "by the GMX_GPU_DD_COMMS environment variable.");
3166     }
3167
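    /* Create one GpuHaloExchange instance per decomposition dimension and pulse;
     * instances that already exist are kept and only missing pulses are added.
     */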
3168     for (int d = 0; d < cr.dd->ndim; d++)
3169     {
3170         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3171         {
3172             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3173                     cr.dd,
3174                     d,
3175                     cr.mpi_comm_mygroup,
3176                     deviceStreamManager.context(),
3177                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3178                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3179                     pulse,
3180                     wcycle));
3181         }
3182     }
3183 }
3184
3185 void reinitGpuHaloExchange(const t_commrec&              cr,
3186                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3187                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3188 {
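    /* Re-register the current device coordinate and force buffers with each
     * halo-exchange instance, typically after a new domain partitioning.
     */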
3189     for (int d = 0; d < cr.dd->ndim; d++)
3190     {
3191         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3192         {
3193             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3194         }
3195     }
3196 }
3197
3198 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3199                                    const matrix          box,
3200                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3201 {
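    /* Coordinates are communicated dimension by dimension in forward order,
     * pulse by pulse within each dimension.
     */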
3202     for (int d = 0; d < cr.dd->ndim; d++)
3203     {
3204         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3205         {
3206             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3207         }
3208     }
3209 }
3210
3211 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3212 {
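    /* Forces are reduced in the reverse order of the coordinate halo exchange:
     * last dimension and last pulse first.
     */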
3213     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3214     {
3215         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3216         {
3217             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3218         }
3219     }
3220 }
3221
3222 void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
3223 {
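    /* Collect the global state dimensions on the DD master rank and broadcast them,
     * so that all ranks set up a consistently sized local state.
     */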
3224     std::array<int, 5> buf;
3225
3226     if (DDMASTER(dd))
3227     {
3228         buf[0] = state_global->flags;
3229         buf[1] = state_global->ngtc;
3230         buf[2] = state_global->nnhpres;
3231         buf[3] = state_global->nhchainlength;
3232         buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
3233     }
3234     dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
3235
3236     init_gtc_state(state_local, buf[1], buf[2], buf[3]);
3237     init_dfhist_state(state_local, buf[4]);
3238     state_local->flags = buf[0];
3239 }