src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
   5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
   6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
   7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include "gmxpre.h"
  40
  41 #include "domdec.h"
  42
  43 #include "config.h"
  44
  45 #include <cassert>
  46 #include <cinttypes>
  47 #include <climits>
  48 #include <cmath>
  49 #include <cstdio>
  50 #include <cstdlib>
  51 #include <cstring>
  52
  53 #include <algorithm>
  54 #include <memory>
  55
  56 #include "gromacs/domdec/builder.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/computemultibodycutoffs.h"
  59 #include "gromacs/domdec/dlb.h"
  60 #include "gromacs/domdec/dlbtiming.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/ga2la.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/options.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/ewald/pme.h"
  68 #include "gromacs/domdec/reversetopology.h"
  69 #include "gromacs/gmxlib/network.h"
  70 #include "gromacs/gmxlib/nrnb.h"
  71 #include "gromacs/gpu_utils/device_stream_manager.h"
  72 #include "gromacs/gpu_utils/gpu_utils.h"
  73 #include "gromacs/hardware/hw_info.h"
  74 #include "gromacs/math/vec.h"
  75 #include "gromacs/math/vectypes.h"
  76 #include "gromacs/mdlib/calc_verletbuf.h"
  77 #include "gromacs/mdlib/constr.h"
  78 #include "gromacs/mdlib/constraintrange.h"
  79 #include "gromacs/mdlib/updategroups.h"
  80 #include "gromacs/mdlib/vcm.h"
  81 #include "gromacs/mdlib/vsite.h"
  82 #include "gromacs/mdrun/mdmodules.h"
  83 #include "gromacs/mdtypes/commrec.h"
  84 #include "gromacs/mdtypes/forceoutput.h"
  85 #include "gromacs/mdtypes/forcerec.h"
  86 #include "gromacs/mdtypes/inputrec.h"
  87 #include "gromacs/mdtypes/mdrunoptions.h"
  88 #include "gromacs/mdtypes/state.h"
  89 #include "gromacs/pbcutil/ishift.h"
  90 #include "gromacs/pbcutil/pbc.h"
  91 #include "gromacs/pulling/pull.h"
  92 #include "gromacs/timing/wallcycle.h"
  93 #include "gromacs/topology/block.h"
  94 #include "gromacs/topology/idef.h"
  95 #include "gromacs/topology/ifunc.h"
  96 #include "gromacs/topology/mtop_lookup.h"
  97 #include "gromacs/topology/mtop_util.h"
  98 #include "gromacs/topology/topology.h"
  99 #include "gromacs/utility/basedefinitions.h"
 100 #include "gromacs/utility/basenetwork.h"
 101 #include "gromacs/utility/cstringutil.h"
 102 #include "gromacs/utility/enumerationhelpers.h"
 103 #include "gromacs/utility/exceptions.h"
 104 #include "gromacs/utility/fatalerror.h"
 105 #include "gromacs/utility/gmxmpi.h"
 106 #include "gromacs/utility/logger.h"
 107 #include "gromacs/utility/mdmodulesnotifiers.h"
 108 #include "gromacs/utility/real.h"
 109 #include "gromacs/utility/smalloc.h"
 110 #include "gromacs/utility/strconvert.h"
 111 #include "gromacs/utility/stringstream.h"
 112 #include "gromacs/utility/stringutil.h"
 113 #include "gromacs/utility/textwriter.h"
 114
 115 #include "atomdistribution.h"
 116 #include "box.h"
 117 #include "cellsizes.h"
 118 #include "distribute.h"
 119 #include "domdec_constraints.h"
 120 #include "domdec_internal.h"
 121 #include "domdec_setup.h"
 122 #include "domdec_vsite.h"
 123 #include "redistribute.h"
 124 #include "utility.h"
 125
 126 // TODO remove this when moving domdec into gmx namespace
 127 using gmx::ArrayRef;
 128 using gmx::DdRankOrder;
 129 using gmx::DlbOption;
 130 using gmx::DomdecOptions;
 131 using gmx::RangePartitioning;
 132
 133 static const char* enumValueToString(DlbState enumValue)
 134 {
 135     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
 136         "off", "off", "auto", "locked", "on", "on"
 137     };
 138     return dlbStateNames[enumValue];
 139 }
 140
 141 /* The DD zone order */
 142 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
 143                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
 144
 145 /* The non-bonded zone-pair setup for domain decomposition
 146  * The first number is the i-zone, the second number the first j-zone seen by
 147  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 148  * As is, this is for 3D decomposition, where there are 4 i-zones.
 149  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 150  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 151  */
 152 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
 153                                                                { 1, 3, 6 },
 154                                                                { 2, 5, 6 },
 155                                                                { 3, 5, 7 } };
 156
 157 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 158 {
 159     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
 160     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 161     xyz[ZZ] = ind % nc[ZZ];
 162 }
 163
 164 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
 165 {
 166     int ddnodeid = -1;
 167
 168     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 169     const int                 ddindex   = dd_index(dd->numCells, c);
 170     if (cartSetup.bCartesianPP_PME)
 171     {
 172         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
 173     }
 174     else if (cartSetup.bCartesianPP)
 175     {
 176 #if GMX_MPI
 177         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 178 #endif
 179     }
 180     else
 181     {
 182         ddnodeid = ddindex;
 183     }
 184
 185     return ddnodeid;
 186 }
 187
 188 int ddglatnr(const gmx_domdec_t* dd, int i)
 189 {
 190     int atnr = 0;
 191
 192     if (dd == nullptr)
 193     {
 194         atnr = i + 1;
 195     }
 196     else
 197     {
 198         if (i >= dd->comm->atomRanges.numAtomsTotal())
 199         {
 200             gmx_fatal(FARGS,
 201                       "glatnr called with %d, which is larger than the local number of atoms (%d)",
 202                       i,
 203                       dd->comm->atomRanges.numAtomsTotal());
 204         }
 205         atnr = dd->globalAtomIndices[i] + 1;
 206     }
 207
 208     return atnr;
 209 }
 210
 211 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
 212 {
 213     if (state->ddp_count != dd.ddp_count)
 214     {
 215         gmx_incons("The MD state does not match the domain decomposition state");
 216     }
 217
 218     state->cg_gl.resize(dd.numHomeAtoms);
 219     for (int i = 0; i < dd.numHomeAtoms; i++)
 220     {
 221         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
 222     }
 223
 224     state->ddp_count_cg_gl = dd.ddp_count;
 225 }
 226
 227 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
 228 {
 229     return &dd->comm->zones;
 230 }
 231
 232 int dd_numAtomsZones(const gmx_domdec_t& dd)
 233 {
 234     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
 235 }
 236
 237 int dd_numHomeAtoms(const gmx_domdec_t& dd)
 238 {
 239     return dd.comm->atomRanges.numHomeAtoms();
 240 }
 241
 242 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
 243 {
 244     /* We currently set mdatoms entries for all atoms:
 245      * local + non-local + communicated for vsite + constraints
 246      */
 247
 248     return dd.comm->atomRanges.numAtomsTotal();
 249 }
 250
 251 int dd_natoms_vsite(const gmx_domdec_t& dd)
 252 {
 253     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 254 }
 255
 256 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
 257 {
 258     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 259     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 260 }
 261
 262 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
 263 {
 264     wallcycle_start(wcycle, WallCycleCounter::MoveX);
 265
 266     rvec shift = { 0, 0, 0 };
 267
 268     gmx_domdec_comm_t* comm = dd->comm;
 269
 270     int nzone   = 1;
 271     int nat_tot = comm->atomRanges.numHomeAtoms();
 272     for (int d = 0; d < dd->ndim; d++)
 273     {
 274         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
 275         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 276         if (bPBC)
 277         {
 278             copy_rvec(box[dd->dim[d]], shift);
 279         }
 280         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 281         for (const gmx_domdec_ind_t& ind : cd->ind)
 282         {
 283             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 284             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
 285             int                       n          = 0;
 286             if (!bPBC)
 287             {
 288                 for (int j : ind.index)
 289                 {
 290                     sendBuffer[n] = x[j];
 291                     n++;
 292                 }
 293             }
 294             else if (!bScrew)
 295             {
 296                 for (int j : ind.index)
 297                 {
 298                     /* We need to shift the coordinates */
 299                     for (int d = 0; d < DIM; d++)
 300                     {
 301                         sendBuffer[n][d] = x[j][d] + shift[d];
 302                     }
 303                     n++;
 304                 }
 305             }
 306             else
 307             {
 308                 for (int j : ind.index)
 309                 {
 310                     /* Shift x */
 311                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
 312                     /* Rotate y and z.
 313                      * This operation requires a special shift force
 314                      * treatment, which is performed in calc_vir.
 315                      */
 316                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
 317                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 318                     n++;
 319                 }
 320             }
 321
 322             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 323                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 324
 325             gmx::ArrayRef<gmx::RVec> receiveBuffer;
 326             if (cd->receiveInPlace)
 327             {
 328                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
 329             }
 330             else
 331             {
 332                 receiveBuffer = receiveBufferAccess.buffer;
 333             }
 334             /* Send and receive the coordinates */
 335             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 336
 337             if (!cd->receiveInPlace)
 338             {
 339                 int j = 0;
 340                 for (int zone = 0; zone < nzone; zone++)
 341                 {
 342                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 343                     {
 344                         x[i] = receiveBuffer[j++];
 345                     }
 346                 }
 347             }
 348             nat_tot += ind.nrecv[nzone + 1];
 349         }
 350         nzone += nzone;
 351     }
 352
 353     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
 354 }
 355
 356 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
 357 {
 358     wallcycle_start(wcycle, WallCycleCounter::MoveF);
 359
 360     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
 361     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
 362
 363     gmx_domdec_comm_t& comm    = *dd->comm;
 364     int                nzone   = comm.zones.n / 2;
 365     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
 366     for (int d = dd->ndim - 1; d >= 0; d--)
 367     {
 368         /* Only forces in domains near the PBC boundaries need to
 369            consider PBC in the treatment of fshift */
 370         const bool shiftForcesNeedPbc =
 371                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
 372         const bool applyScrewPbc =
 373                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 374         /* Determine which shift vector we need */
 375         ivec vis        = { 0, 0, 0 };
 376         vis[dd->dim[d]] = 1;
 377         const int is    = gmx::ivecToShiftIndex(vis);
 378
 379         /* Loop over the pulses */
 380         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
 381         for (int p = cd.numPulses() - 1; p >= 0; p--)
 382         {
 383             const gmx_domdec_ind_t&   ind = cd.ind[p];
 384             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
 385             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
 386
 387             nat_tot -= ind.nrecv[nzone + 1];
 388
 389             DDBufferAccess<gmx::RVec> sendBufferAccess(
 390                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 391
 392             gmx::ArrayRef<gmx::RVec> sendBuffer;
 393             if (cd.receiveInPlace)
 394             {
 395                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
 396             }
 397             else
 398             {
 399                 sendBuffer = sendBufferAccess.buffer;
 400                 int j      = 0;
 401                 for (int zone = 0; zone < nzone; zone++)
 402                 {
 403                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 404                     {
 405                         sendBuffer[j++] = f[i];
 406                     }
 407                 }
 408             }
 409             /* Communicate the forces */
 410             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 411             /* Add the received forces */
 412             int n = 0;
 413             if (!shiftForcesNeedPbc)
 414             {
 415                 for (int j : ind.index)
 416                 {
 417                     for (int d = 0; d < DIM; d++)
 418                     {
 419                         f[j][d] += receiveBuffer[n][d];
 420                     }
 421                     n++;
 422                 }
 423             }
 424             else if (!applyScrewPbc)
 425             {
 426                 for (int j : ind.index)
 427                 {
 428                     for (int d = 0; d < DIM; d++)
 429                     {
 430                         f[j][d] += receiveBuffer[n][d];
 431                     }
 432                     /* Add this force to the shift force */
 433                     for (int d = 0; d < DIM; d++)
 434                     {
 435                         fshift[is][d] += receiveBuffer[n][d];
 436                     }
 437                     n++;
 438                 }
 439             }
 440             else
 441             {
 442                 for (int j : ind.index)
 443                 {
 444                     /* Rotate the force */
 445                     f[j][XX] += receiveBuffer[n][XX];
 446                     f[j][YY] -= receiveBuffer[n][YY];
 447                     f[j][ZZ] -= receiveBuffer[n][ZZ];
 448                     if (shiftForcesNeedPbc)
 449                     {
 450                         /* Add this force to the shift force */
 451                         for (int d = 0; d < DIM; d++)
 452                         {
 453                             fshift[is][d] += receiveBuffer[n][d];
 454                         }
 455                     }
 456                     n++;
 457                 }
 458             }
 459         }
 460         nzone /= 2;
 461     }
 462     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
 463 }
 464
 465 /* Convenience function for extracting a real buffer from an rvec buffer
 466  *
 467  * To reduce the number of temporary communication buffers and avoid
 468  * cache polution, we reuse gmx::RVec buffers for storing reals.
 469  * This functions return a real buffer reference with the same number
 470  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
 471  */
 472 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
 473 {
 474     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
 475 }
 476
 477 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
 478 {
 479     gmx_domdec_comm_t* comm = dd->comm;
 480
 481     int nzone   = 1;
 482     int nat_tot = comm->atomRanges.numHomeAtoms();
 483     for (int d = 0; d < dd->ndim; d++)
 484     {
 485         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 486         for (const gmx_domdec_ind_t& ind : cd->ind)
 487         {
 488             /* Note: We provision for RVec instead of real, so a factor of 3
 489              * more than needed. The buffer actually already has this size
 490              * and we pass a plain pointer below, so this does not matter.
 491              */
 492             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 493             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 494             int                 n          = 0;
 495             for (int j : ind.index)
 496             {
 497                 sendBuffer[n++] = v[j];
 498             }
 499
 500             DDBufferAccess<gmx::RVec> receiveBufferAccess(
 501                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 502
 503             gmx::ArrayRef<real> receiveBuffer;
 504             if (cd->receiveInPlace)
 505             {
 506                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 507             }
 508             else
 509             {
 510                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 511             }
 512             /* Send and receive the data */
 513             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
 514             if (!cd->receiveInPlace)
 515             {
 516                 int j = 0;
 517                 for (int zone = 0; zone < nzone; zone++)
 518                 {
 519                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 520                     {
 521                         v[i] = receiveBuffer[j++];
 522                     }
 523                 }
 524             }
 525             nat_tot += ind.nrecv[nzone + 1];
 526         }
 527         nzone += nzone;
 528     }
 529 }
 530
 531 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
 532 {
 533     gmx_domdec_comm_t* comm = dd->comm;
 534
 535     int nzone   = comm->zones.n / 2;
 536     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 537     for (int d = dd->ndim - 1; d >= 0; d--)
 538     {
 539         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
 540         for (int p = cd->numPulses() - 1; p >= 0; p--)
 541         {
 542             const gmx_domdec_ind_t& ind = cd->ind[p];
 543
 544             /* Note: We provision for RVec instead of real, so a factor of 3
 545              * more than needed. The buffer actually already has this size
 546              * and we typecast, so this works as intended.
 547              */
 548             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
 549             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
 550             nat_tot -= ind.nrecv[nzone + 1];
 551
 552             DDBufferAccess<gmx::RVec> sendBufferAccess(
 553                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
 554
 555             gmx::ArrayRef<real> sendBuffer;
 556             if (cd->receiveInPlace)
 557             {
 558                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
 559             }
 560             else
 561             {
 562                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
 563                 int j      = 0;
 564                 for (int zone = 0; zone < nzone; zone++)
 565                 {
 566                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
 567                     {
 568                         sendBuffer[j++] = v[i];
 569                     }
 570                 }
 571             }
 572             /* Communicate the forces */
 573             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
 574             /* Add the received forces */
 575             int n = 0;
 576             for (int j : ind.index)
 577             {
 578                 v[j] += receiveBuffer[n];
 579                 n++;
 580             }
 581         }
 582         nzone /= 2;
 583     }
 584 }
 585
 586 real dd_cutoff_multibody(const gmx_domdec_t* dd)
 587 {
 588     const gmx_domdec_comm_t& comm       = *dd->comm;
 589     const DDSystemInfo&      systemInfo = comm.systemInfo;
 590
 591     real r = -1;
 592     if (systemInfo.haveInterDomainMultiBodyBondeds)
 593     {
 594         if (comm.cutoff_mbody > 0)
 595         {
 596             r = comm.cutoff_mbody;
 597         }
 598         else
 599         {
 600             /* cutoff_mbody=0 means we do not have DLB */
 601             r = comm.cellsize_min[dd->dim[0]];
 602             for (int di = 1; di < dd->ndim; di++)
 603             {
 604                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
 605             }
 606             if (comm.systemInfo.filterBondedCommunication)
 607             {
 608                 r = std::max(r, comm.cutoff_mbody);
 609             }
 610             else
 611             {
 612                 r = std::min(r, systemInfo.cutoff);
 613             }
 614         }
 615     }
 616
 617     return r;
 618 }
 619
 620 real dd_cutoff_twobody(const gmx_domdec_t* dd)
 621 {
 622     const real r_mb = dd_cutoff_multibody(dd);
 623
 624     return std::max(dd->comm->systemInfo.cutoff, r_mb);
 625 }
 626
 627
 628 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
 629                                    const CartesianRankSetup& cartSetup,
 630                                    const ivec                coord,
 631                                    ivec                      coord_pme)
 632 {
 633     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
 634     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
 635     copy_ivec(coord, coord_pme);
 636     coord_pme[cartSetup.cartpmedim] =
 637             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
 638 }
 639
 640 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
 641 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
 642 {
 643     const int npp  = ddRankSetup.numPPRanks;
 644     const int npme = ddRankSetup.numRanksDoingPme;
 645
 646     /* Here we assign a PME node to communicate with this DD node
 647      * by assuming that the major index of both is x.
 648      * We add npme/2 to obtain an even distribution.
 649      */
 650     return (ddCellIndex * npme + npme / 2) / npp;
 651 }
 652
 653 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
 654 {
 655     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
 656
 657     int n = 0;
 658     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
 659     {
 660         const int p0 = ddindex2pmeindex(ddRankSetup, i);
 661         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
 662         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
 663         {
 664             if (debug)
 665             {
 666                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
 667             }
 668             pmeRanks[n] = i + 1 + n;
 669             n++;
 670         }
 671     }
 672
 673     return pmeRanks;
 674 }
 675
 676 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
 677 {
 678     ivec coords;
 679
 680     coords[XX]     = x;
 681     coords[YY]     = y;
 682     coords[ZZ]     = z;
 683     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
 684
 685     return slab;
 686 }
 687
 688 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
 689 {
 690     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
 691     ivec                      coords    = { x, y, z };
 692     int                       nodeid    = -1;
 693
 694     if (cartSetup.bCartesianPP_PME)
 695     {
 696 #if GMX_MPI
 697         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 698 #endif
 699     }
 700     else
 701     {
 702         int ddindex = dd_index(cr->dd->numCells, coords);
 703         if (cartSetup.bCartesianPP)
 704         {
 705             nodeid = cartSetup.ddindex2simnodeid[ddindex];
 706         }
 707         else
 708         {
 709             const DDRankSetup& rankSetup = cr->dd->comm->ddRankSetup;
 710             if (rankSetup.rankOrder != DdRankOrder::pp_pme && rankSetup.usePmeOnlyRanks)
 711             {
 712                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
 713             }
 714             else
 715             {
 716                 nodeid = ddindex;
 717             }
 718         }
 719     }
 720
 721     return nodeid;
 722 }
 723
 724 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
 725                               const CartesianRankSetup& cartSetup,
 726                               gmx::ArrayRef<const int>  pmeRanks,
 727                               const t_commrec gmx_unused* cr,
 728                               const int                   sim_nodeid)
 729 {
 730     int pmenode = -1;
 731
 732     /* This assumes a uniform x domain decomposition grid cell size */
 733     if (cartSetup.bCartesianPP_PME)
 734     {
 735 #if GMX_MPI
 736         ivec coord, coord_pme;
 737         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 738         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
 739         {
 740             /* This is a PP rank */
 741             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 742             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 743         }
 744 #endif
 745     }
 746     else if (cartSetup.bCartesianPP)
 747     {
 748         if (sim_nodeid < ddRankSetup.numPPRanks)
 749         {
 750             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 751         }
 752     }
 753     else
 754     {
 755         /* This assumes DD cells with identical x coordinates
 756          * are numbered sequentially.
 757          */
 758         if (pmeRanks.empty())
 759         {
 760             if (sim_nodeid < ddRankSetup.numPPRanks)
 761             {
 762                 /* The DD index equals the nodeid */
 763                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
 764             }
 765         }
 766         else
 767         {
 768             int i = 0;
 769             while (sim_nodeid > pmeRanks[i])
 770             {
 771                 i++;
 772             }
 773             if (sim_nodeid < pmeRanks[i])
 774             {
 775                 pmenode = pmeRanks[i];
 776             }
 777         }
 778     }
 779
 780     return pmenode;
 781 }
 782
 783 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
 784 {
 785     if (dd != nullptr)
 786     {
 787         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
 788     }
 789     else
 790     {
 791         return { 1, 1 };
 792     }
 793 }
 794
 795 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
 796 {
 797     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
 798     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
 799     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
 800                        "This function should only be called when PME-only ranks are in use");
 801     std::vector<int> ddranks;
 802     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
 803
 804     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
 805     {
 806         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
 807         {
 808             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
 809             {
 810                 if (cartSetup.bCartesianPP_PME)
 811                 {
 812                     ivec coord = { x, y, z };
 813                     ivec coord_pme;
 814                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
 815                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
 816                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
 817                     {
 818                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 819                     }
 820                 }
 821                 else
 822                 {
 823                     /* The slab corresponds to the nodeid in the PME group */
 824                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
 825                     {
 826                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
 827                     }
 828                 }
 829             }
 830         }
 831     }
 832     return ddranks;
 833 }
 834
 835 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
 836 {
 837     bool bReceive = true;
 838
 839     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 840     if (ddRankSetup.usePmeOnlyRanks)
 841     {
 842         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
 843         if (cartSetup.bCartesianPP_PME)
 844         {
 845 #if GMX_MPI
 846             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 847             ivec coords;
 848             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 849             coords[cartSetup.cartpmedim]++;
 850             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
 851             {
 852                 int rank = 0;
 853                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 854                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
 855                 {
 856                     /* This is not the last PP node for pmenode */
 857                     bReceive = FALSE;
 858                 }
 859             }
 860 #else
 861             GMX_RELEASE_ASSERT(
 862                     false,
 863                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
 864 #endif
 865         }
 866         else
 867         {
 868             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
 869             if (cr->sim_nodeid + 1 < cr->nnodes
 870                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
 871             {
 872                 /* This is not the last PP node for pmenode */
 873                 bReceive = FALSE;
 874             }
 875         }
 876     }
 877
 878     return bReceive;
 879 }
 880
 881 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
 882 {
 883     gmx_domdec_comm_t* comm = dd->comm;
 884
 885     snew(*dim_f, dd->numCells[dim] + 1);
 886     (*dim_f)[0] = 0;
 887     for (int i = 1; i < dd->numCells[dim]; i++)
 888     {
 889         if (comm->slb_frac[dim])
 890         {
 891             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
 892         }
 893         else
 894         {
 895             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
 896         }
 897     }
 898     (*dim_f)[dd->numCells[dim]] = 1;
 899 }
 900
 901 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
 902 {
 903     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 904
 905     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
 906     {
 907         ddpme->dim = YY;
 908     }
 909     else
 910     {
 911         ddpme->dim = dimind;
 912     }
 913     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 914
 915     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
 916
 917     if (ddpme->nslab <= 1)
 918     {
 919         return;
 920     }
 921
 922     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
 923     /* Determine for each PME slab the PP location range for dimension dim */
 924     snew(ddpme->pp_min, ddpme->nslab);
 925     snew(ddpme->pp_max, ddpme->nslab);
 926     for (int slab = 0; slab < ddpme->nslab; slab++)
 927     {
 928         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
 929         ddpme->pp_max[slab] = 0;
 930     }
 931     for (int i = 0; i < dd->nnodes; i++)
 932     {
 933         ivec xyz;
 934         ddindex2xyz(dd->numCells, i, xyz);
 935         /* For y only use our y/z slab.
 936          * This assumes that the PME x grid size matches the DD grid size.
 937          */
 938         if (dimind == 0 || xyz[XX] == dd->ci[XX])
 939         {
 940             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
 941             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
 942             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
 943             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
 944         }
 945     }
 946
 947     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 948 }
 949
 950 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
 951 {
 952     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 953
 954     if (ddRankSetup.ddpme[0].dim == XX)
 955     {
 956         return ddRankSetup.ddpme[0].maxshift;
 957     }
 958     else
 959     {
 960         return 0;
 961     }
 962 }
 963
 964 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
 965 {
 966     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
 967
 968     if (ddRankSetup.ddpme[0].dim == YY)
 969     {
 970         return ddRankSetup.ddpme[0].maxshift;
 971     }
 972     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
 973     {
 974         return ddRankSetup.ddpme[1].maxshift;
 975     }
 976     else
 977     {
 978         return 0;
 979     }
 980 }
 981
 982 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
 983 {
 984     return dd.comm->systemInfo.useUpdateGroups;
 985 }
 986
 987 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
 988 {
 989     /* Note that the cycles value can be incorrect, either 0 or some
 990      * extremely large value, when our thread migrated to another core
 991      * with an unsynchronized cycle counter. If this happens less often
 992      * that once per nstlist steps, this will not cause issues, since
 993      * we later subtract the maximum value from the sum over nstlist steps.
 994      * A zero count will slightly lower the total, but that's a small effect.
 995      * Note that the main purpose of the subtraction of the maximum value
 996      * is to avoid throwing off the load balancing when stalls occur due
 997      * e.g. system activity or network congestion.
 998      */
 999     dd->comm->cycl[ddCycl] += cycles;
1000     dd->comm->cycl_n[ddCycl]++;
1001     if (cycles > dd->comm->cycl_max[ddCycl])
1002     {
1003         dd->comm->cycl_max[ddCycl] = cycles;
1004     }
1005 }
1006
1007 #if GMX_MPI
1008 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1009 {
1010     MPI_Comm c_row = MPI_COMM_NULL;
1011     ivec     loc_c;
1012     bool     bPartOfGroup = false;
1013
1014     const int dim = dd->dim[dim_ind];
1015     copy_ivec(loc, loc_c);
1016     for (int i = 0; i < dd->numCells[dim]; i++)
1017     {
1018         loc_c[dim]     = i;
1019         const int rank = dd_index(dd->numCells, loc_c);
1020         if (rank == dd->rank)
1021         {
1022             /* This process is part of the group */
1023             bPartOfGroup = TRUE;
1024         }
1025     }
1026     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1027     if (bPartOfGroup)
1028     {
1029         dd->comm->mpi_comm_load[dim_ind] = c_row;
1030         if (!isDlbDisabled(dd->comm))
1031         {
1032             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1033
1034             if (dd->ci[dim] == dd->master_ci[dim])
1035             {
1036                 /* This is the root process of this row */
1037                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1038
1039                 RowMaster& rowMaster = *cellsizes.rowMaster;
1040                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1041                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1042                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1043                 if (dim_ind > 0)
1044                 {
1045                     rowMaster.bounds.resize(dd->numCells[dim]);
1046                 }
1047                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1048             }
1049             else
1050             {
1051                 /* This is not a root process, we only need to receive cell_f */
1052                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1053             }
1054         }
1055         if (dd->ci[dim] == dd->master_ci[dim])
1056         {
1057             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1058         }
1059     }
1060 }
1061 #endif
1062
1063 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1064 {
1065 #if GMX_MPI
1066     gmx_domdec_t* dd = cr->dd;
1067
1068     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1069     {
1070         /* Only ranks with short-ranged tasks (currently) use GPUs.
1071          * If we don't have GPUs assigned, there are no resources to share.
1072          */
1073         return;
1074     }
1075
1076     if (cr->nnodes == 1)
1077     {
1078         dd->comm->nrank_gpu_shared = 1;
1079
1080         return;
1081     }
1082
1083     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1084
1085     if (debug)
1086     {
1087         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1088         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1089     }
1090     /* Split the PP communicator over the physical nodes */
1091     /* TODO: See if we should store this (before), as it's also used for
1092      * for the nodecomm summation.
1093      */
1094     // TODO PhysicalNodeCommunicator could be extended/used to handle
1095     // the need for per-node per-group communicators.
1096     MPI_Comm mpi_comm_pp_physicalnode;
1097     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1098     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1099     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1100     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1101
1102     if (debug)
1103     {
1104         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1105     }
1106
1107     /* Note that some ranks could share a GPU, while others don't */
1108
1109     if (dd->comm->nrank_gpu_shared == 1)
1110     {
1111         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1112     }
1113 #else
1114     GMX_UNUSED_VALUE(cr);
1115     GMX_UNUSED_VALUE(gpu_id);
1116 #endif
1117 }
1118
1119 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1120 {
1121 #if GMX_MPI
1122     ivec loc;
1123
1124     if (debug)
1125     {
1126         fprintf(debug, "Making load communicators\n");
1127     }
1128
1129     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1130     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1131
1132     if (dd->ndim == 0)
1133     {
1134         return;
1135     }
1136
1137     clear_ivec(loc);
1138     make_load_communicator(dd, 0, loc);
1139     if (dd->ndim > 1)
1140     {
1141         const int dim0 = dd->dim[0];
1142         for (int i = 0; i < dd->numCells[dim0]; i++)
1143         {
1144             loc[dim0] = i;
1145             make_load_communicator(dd, 1, loc);
1146         }
1147     }
1148     if (dd->ndim > 2)
1149     {
1150         const int dim0 = dd->dim[0];
1151         for (int i = 0; i < dd->numCells[dim0]; i++)
1152         {
1153             loc[dim0]      = i;
1154             const int dim1 = dd->dim[1];
1155             for (int j = 0; j < dd->numCells[dim1]; j++)
1156             {
1157                 loc[dim1] = j;
1158                 make_load_communicator(dd, 2, loc);
1159             }
1160         }
1161     }
1162
1163     if (debug)
1164     {
1165         fprintf(debug, "Finished making load communicators\n");
1166     }
1167 #endif
1168 }
1169
1170 /*! \brief Sets up the relation between neighboring domains and zones */
1171 static void setup_neighbor_relations(gmx_domdec_t* dd)
1172 {
1173     ivec tmp, s;
1174     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1175
1176     for (int d = 0; d < dd->ndim; d++)
1177     {
1178         const int dim = dd->dim[d];
1179         copy_ivec(dd->ci, tmp);
1180         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1181         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1182         copy_ivec(dd->ci, tmp);
1183         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1184         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1185         if (debug)
1186         {
1187             fprintf(debug,
1188                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1189                     dd->rank,
1190                     dim,
1191                     dd->neighbor[d][0],
1192                     dd->neighbor[d][1]);
1193         }
1194     }
1195
1196     int nzone  = (1 << dd->ndim);
1197     int nizone = (1 << std::max(dd->ndim - 1, 0));
1198     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1199
1200     gmx_domdec_zones_t* zones = &dd->comm->zones;
1201
1202     for (int i = 0; i < nzone; i++)
1203     {
1204         int m = 0;
1205         clear_ivec(zones->shift[i]);
1206         for (int d = 0; d < dd->ndim; d++)
1207         {
1208             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1209         }
1210     }
1211
1212     zones->n = nzone;
1213     for (int i = 0; i < nzone; i++)
1214     {
1215         for (int d = 0; d < DIM; d++)
1216         {
1217             s[d] = dd->ci[d] - zones->shift[i][d];
1218             if (s[d] < 0)
1219             {
1220                 s[d] += dd->numCells[d];
1221             }
1222             else if (s[d] >= dd->numCells[d])
1223             {
1224                 s[d] -= dd->numCells[d];
1225             }
1226         }
1227     }
1228     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1229     {
1230         GMX_RELEASE_ASSERT(
1231                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1232                 "The first element for each ddNonbondedZonePairRanges should match its index");
1233
1234         DDPairInteractionRanges iZone;
1235         iZone.iZoneIndex = iZoneIndex;
1236         /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1237          * j-zones up to nzone.
1238          */
1239         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1240                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1241         for (int dim = 0; dim < DIM; dim++)
1242         {
1243             if (dd->numCells[dim] == 1)
1244             {
1245                 /* All shifts should be allowed */
1246                 iZone.shift0[dim] = -1;
1247                 iZone.shift1[dim] = 1;
1248             }
1249             else
1250             {
1251                 /* Determine the min/max j-zone shift wrt the i-zone */
1252                 iZone.shift0[dim] = 1;
1253                 iZone.shift1[dim] = -1;
1254                 for (int jZone : iZone.jZoneRange)
1255                 {
1256                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1257                     if (shift_diff < iZone.shift0[dim])
1258                     {
1259                         iZone.shift0[dim] = shift_diff;
1260                     }
1261                     if (shift_diff > iZone.shift1[dim])
1262                     {
1263                         iZone.shift1[dim] = shift_diff;
1264                     }
1265                 }
1266             }
1267         }
1268
1269         zones->iZones.push_back(iZone);
1270     }
1271
1272     if (!isDlbDisabled(dd->comm))
1273     {
1274         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1275     }
1276
1277     if (dd->comm->ddSettings.recordLoad)
1278     {
1279         make_load_communicators(dd);
1280     }
1281 }
1282
1283 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1284                                  gmx_domdec_t*        dd,
1285                                  t_commrec gmx_unused* cr,
1286                                  bool gmx_unused       reorder)
1287 {
1288 #if GMX_MPI
1289     gmx_domdec_comm_t*  comm      = dd->comm;
1290     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1291
1292     if (cartSetup.bCartesianPP)
1293     {
1294         /* Set up cartesian communication for the particle-particle part */
1295         GMX_LOG(mdlog.info)
1296                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1297                                      dd->numCells[XX],
1298                                      dd->numCells[YY],
1299                                      dd->numCells[ZZ]);
1300
1301         ivec periods;
1302         for (int i = 0; i < DIM; i++)
1303         {
1304             periods[i] = TRUE;
1305         }
1306         MPI_Comm comm_cart = MPI_COMM_NULL;
1307         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1308         /* We overwrite the old communicator with the new cartesian one */
1309         cr->mpi_comm_mygroup = comm_cart;
1310     }
1311
1312     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1313     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1314
1315     if (cartSetup.bCartesianPP_PME)
1316     {
1317         /* Since we want to use the original cartesian setup for sim,
1318          * and not the one after split, we need to make an index.
1319          */
1320         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1321         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1322         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1323         /* Get the rank of the DD master,
1324          * above we made sure that the master node is a PP node.
1325          */
1326         int rank = MASTER(cr) ? dd->rank : 0;
1327         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1328     }
1329     else if (cartSetup.bCartesianPP)
1330     {
1331         if (!comm->ddRankSetup.usePmeOnlyRanks)
1332         {
1333             /* The PP communicator is also
1334              * the communicator for this simulation
1335              */
1336             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1337         }
1338         cr->nodeid = dd->rank;
1339
1340         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1341
1342         /* We need to make an index to go from the coordinates
1343          * to the nodeid of this simulation.
1344          */
1345         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1346         std::vector<int> buf(dd->nnodes);
1347         if (thisRankHasDuty(cr, DUTY_PP))
1348         {
1349             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1350         }
1351         /* Communicate the ddindex to simulation nodeid index */
1352         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1353
1354         /* Determine the master coordinates and rank.
1355          * The DD master should be the same node as the master of this sim.
1356          */
1357         for (int i = 0; i < dd->nnodes; i++)
1358         {
1359             if (cartSetup.ddindex2simnodeid[i] == 0)
1360             {
1361                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1362                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1363             }
1364         }
1365         if (debug)
1366         {
1367             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1368         }
1369     }
1370     else
1371     {
1372         /* No Cartesian communicators */
1373         /* We use the rank in dd->comm->all as DD index */
1374         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1375         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1376         dd->masterrank = 0;
1377         clear_ivec(dd->master_ci);
1378     }
1379 #endif
1380
1381     GMX_LOG(mdlog.info)
1382             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1383                                  dd->rank,
1384                                  dd->ci[XX],
1385                                  dd->ci[YY],
1386                                  dd->ci[ZZ]);
1387     if (debug)
1388     {
1389         fprintf(debug,
1390                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1391                 dd->rank,
1392                 dd->ci[XX],
1393                 dd->ci[YY],
1394                 dd->ci[ZZ]);
1395     }
1396 }
1397
1398 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1399 {
1400 #if GMX_MPI
1401     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1402
1403     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1404     {
1405         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1406         std::vector<int> buf(dd->nnodes);
1407         if (thisRankHasDuty(cr, DUTY_PP))
1408         {
1409             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1410         }
1411         /* Communicate the ddindex to simulation nodeid index */
1412         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1413     }
1414 #else
1415     GMX_UNUSED_VALUE(dd);
1416     GMX_UNUSED_VALUE(cr);
1417 #endif
1418 }
1419
1420 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1421                                              t_commrec*           cr,
1422                                              const DdRankOrder    ddRankOrder,
1423                                              bool gmx_unused      reorder,
1424                                              const DDRankSetup&   ddRankSetup,
1425                                              ivec                 ddCellIndex,
1426                                              std::vector<int>*    pmeRanks)
1427 {
1428     CartesianRankSetup cartSetup;
1429
1430     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1431     cartSetup.bCartesianPP_PME = false;
1432
1433     const ivec& numDDCells = ddRankSetup.numPPCells;
1434     /* Initially we set ntot to the number of PP cells */
1435     copy_ivec(numDDCells, cartSetup.ntot);
1436
1437     if (cartSetup.bCartesianPP)
1438     {
1439         const int numDDCellsTot = ddRankSetup.numPPRanks;
1440         bool      bDiv[DIM];
1441         for (int i = 1; i < DIM; i++)
1442         {
1443             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1444         }
1445         if (bDiv[YY] || bDiv[ZZ])
1446         {
1447             cartSetup.bCartesianPP_PME = TRUE;
1448             /* If we have 2D PME decomposition, which is always in x+y,
1449              * we stack the PME only nodes in z.
1450              * Otherwise we choose the direction that provides the thinnest slab
1451              * of PME only nodes as this will have the least effect
1452              * on the PP communication.
1453              * But for the PME communication the opposite might be better.
1454              */
1455             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1456             {
1457                 cartSetup.cartpmedim = ZZ;
1458             }
1459             else
1460             {
1461                 cartSetup.cartpmedim = YY;
1462             }
1463             cartSetup.ntot[cartSetup.cartpmedim] +=
1464                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1465         }
1466         else
1467         {
1468             GMX_LOG(mdlog.info)
1469                     .appendTextFormatted(
1470                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1471                             "nx*nz (%d*%d)",
1472                             ddRankSetup.numRanksDoingPme,
1473                             numDDCells[XX],
1474                             numDDCells[YY],
1475                             numDDCells[XX],
1476                             numDDCells[ZZ]);
1477             GMX_LOG(mdlog.info)
1478                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1479         }
1480     }
1481
1482     if (cartSetup.bCartesianPP_PME)
1483     {
1484 #if GMX_MPI
1485         int  rank = 0;
1486         ivec periods;
1487
1488         GMX_LOG(mdlog.info)
1489                 .appendTextFormatted(
1490                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1491                         cartSetup.ntot[XX],
1492                         cartSetup.ntot[YY],
1493                         cartSetup.ntot[ZZ]);
1494
1495         for (int i = 0; i < DIM; i++)
1496         {
1497             periods[i] = TRUE;
1498         }
1499         MPI_Comm comm_cart = MPI_COMM_NULL;
1500         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1501         MPI_Comm_rank(comm_cart, &rank);
1502         if (MASTER(cr) && rank != 0)
1503         {
1504             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1505         }
1506
1507         /* With this assigment we loose the link to the original communicator
1508          * which will usually be MPI_COMM_WORLD, unless have multisim.
1509          */
1510         cr->mpi_comm_mysim = comm_cart;
1511         cr->sim_nodeid     = rank;
1512
1513         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1514
1515         GMX_LOG(mdlog.info)
1516                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1517                                      cr->sim_nodeid,
1518                                      ddCellIndex[XX],
1519                                      ddCellIndex[YY],
1520                                      ddCellIndex[ZZ]);
1521
1522         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1523         {
1524             cr->duty = DUTY_PP;
1525         }
1526         if (!ddRankSetup.usePmeOnlyRanks
1527             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1528         {
1529             cr->duty = DUTY_PME;
1530         }
1531
1532         /* Split the sim communicator into PP and PME only nodes */
1533         MPI_Comm_split(cr->mpi_comm_mysim,
1534                        getThisRankDuties(cr),
1535                        dd_index(cartSetup.ntot, ddCellIndex),
1536                        &cr->mpi_comm_mygroup);
1537         MPI_Comm_size(cr->mpi_comm_mygroup, &cr->sizeOfMyGroupCommunicator);
1538 #else
1539         GMX_UNUSED_VALUE(ddCellIndex);
1540 #endif
1541     }
1542     else
1543     {
1544         switch (ddRankOrder)
1545         {
1546             case DdRankOrder::pp_pme:
1547                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1548                 break;
1549             case DdRankOrder::interleave:
1550                 /* Interleave the PP-only and PME-only ranks */
1551                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1552                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1553                 break;
1554             case DdRankOrder::cartesian: break;
1555             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1556         }
1557
1558         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1559         {
1560             cr->duty = DUTY_PME;
1561         }
1562         else
1563         {
1564             cr->duty = DUTY_PP;
1565         }
1566 #if GMX_MPI
1567         /* Split the sim communicator into PP and PME only nodes */
1568         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1569         MPI_Comm_size(cr->mpi_comm_mygroup, &cr->sizeOfMyGroupCommunicator);
1570         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1571 #endif
1572     }
1573
1574     GMX_LOG(mdlog.info)
1575             .appendTextFormatted("This rank does only %s work.\n",
1576                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1577
1578     return cartSetup;
1579 }
1580
1581 /*! \brief Makes the PP communicator and the PME communicator, when needed
1582  *
1583  * Returns the Cartesian rank setup.
1584  * Sets \p cr->mpi_comm_mygroup
1585  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1586  * With separate PME ranks in interleaved order, set the PME ranks in \p pmeRanks.
1587  */
1588 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1589                                                  const DDSettings&    ddSettings,
1590                                                  const DdRankOrder    ddRankOrder,
1591                                                  const DDRankSetup&   ddRankSetup,
1592                                                  t_commrec*           cr,
1593                                                  ivec                 ddCellIndex,
1594                                                  std::vector<int>*    pmeRanks)
1595 {
1596     CartesianRankSetup cartSetup;
1597
1598     // As a default, both group and sim communicators are equal to the default communicator
1599     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1600     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1601     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1602     cr->nodeid           = cr->rankInDefaultCommunicator;
1603     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1604
1605     if (ddRankSetup.usePmeOnlyRanks)
1606     {
1607         /* Split the communicator into a PP and PME part */
1608         cartSetup = split_communicator(
1609                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1610     }
1611     else
1612     {
1613         /* All nodes do PP and PME */
1614         /* We do not require separate communicators */
1615         cartSetup.bCartesianPP     = false;
1616         cartSetup.bCartesianPP_PME = false;
1617     }
1618
1619     return cartSetup;
1620 }
1621
1622 /*! \brief For PP ranks, sets or makes the communicator
1623  *
1624  * For PME ranks get the rank id.
1625  * For PP only ranks, sets the PME-only rank.
1626  */
1627 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1628                                     const DDSettings&        ddSettings,
1629                                     gmx::ArrayRef<const int> pmeRanks,
1630                                     t_commrec*               cr,
1631                                     const int                numAtomsInSystem,
1632                                     gmx_domdec_t*            dd)
1633 {
1634     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1635     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1636
1637     if (thisRankHasDuty(cr, DUTY_PP))
1638     {
1639         if (dd->nnodes > 1)
1640         {
1641             /* Copy or make a new PP communicator */
1642
1643             /* We (possibly) reordered the nodes in split_communicator,
1644              * so it is no longer required in make_pp_communicator.
1645              */
1646             const bool useCartesianReorder =
1647                     (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1648
1649             make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1650         }
1651     }
1652     else
1653     {
1654         receive_ddindex2simnodeid(dd, cr);
1655     }
1656
1657     if (!thisRankHasDuty(cr, DUTY_PME))
1658     {
1659         /* Set up the commnuication to our PME node */
1660         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1661         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1662         if (debug)
1663         {
1664             fprintf(debug,
1665                     "My pme_nodeid %d receive ener %s\n",
1666                     dd->pme_nodeid,
1667                     gmx::boolToString(dd->pme_receive_vir_ener));
1668         }
1669     }
1670     else
1671     {
1672         dd->pme_nodeid = -1;
1673     }
1674
1675     /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1676     if (MASTER(cr))
1677     {
1678         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1679     }
1680 }
1681
1682 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1683 {
1684     real* slb_frac = nullptr;
1685     if (nc > 1 && size_string != nullptr)
1686     {
1687         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1688         snew(slb_frac, nc);
1689         real tot = 0;
1690         for (int i = 0; i < nc; i++)
1691         {
1692             double dbl = 0;
1693             int    n   = 0;
1694             sscanf(size_string, "%20lf%n", &dbl, &n);
1695             if (dbl == 0)
1696             {
1697                 gmx_fatal(FARGS,
1698                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1699                           dir,
1700                           size_string);
1701             }
1702             slb_frac[i] = dbl;
1703             size_string += n;
1704             tot += slb_frac[i];
1705         }
1706         /* Normalize */
1707         std::string relativeCellSizes = "Relative cell sizes:";
1708         for (int i = 0; i < nc; i++)
1709         {
1710             slb_frac[i] /= tot;
1711             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1712         }
1713         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1714     }
1715
1716     return slb_frac;
1717 }
1718
1719 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1720 {
1721     int n = 0;
1722     for (const auto ilists : IListRange(mtop))
1723     {
1724         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1725         {
1726             if (NRAL(ilist.functionType) > 2)
1727             {
1728                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1729             }
1730         }
1731     }
1732
1733     return n;
1734 }
1735
1736 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1737 {
1738     int   nst = def;
1739     char* val = getenv(env_var);
1740     if (val)
1741     {
1742         if (sscanf(val, "%20d", &nst) <= 0)
1743         {
1744             nst = 1;
1745         }
1746         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1747     }
1748
1749     return nst;
1750 }
1751
1752 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1753 {
1754     if (inputrec.pbcType == PbcType::Screw
1755         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1756     {
1757         gmx_fatal(FARGS,
1758                   "With pbc=%s can only do domain decomposition in the x-direction",
1759                   c_pbcTypeNames[inputrec.pbcType].c_str());
1760     }
1761
1762     if (inputrec.nstlist == 0)
1763     {
1764         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1765     }
1766
1767     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1768     {
1769         GMX_LOG(mdlog.warning)
1770                 .appendText(
1771                         "comm-mode angular will give incorrect results when the comm group "
1772                         "partially crosses a periodic boundary");
1773     }
1774 }
1775
1776 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1777 {
1778     real r = ddbox.box_size[XX];
1779     for (int d = 0; d < DIM; d++)
1780     {
1781         if (numDomains[d] > 1)
1782         {
1783             /* Check using the initial average cell size */
1784             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1785         }
1786     }
1787
1788     return r;
1789 }
1790
1791 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1792  */
1793 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1794                                   const std::string&   reasonStr,
1795                                   const gmx::MDLogger& mdlog)
1796 {
1797     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1798     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1799
1800     if (cmdlineDlbState == DlbState::onUser)
1801     {
1802         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1803     }
1804     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1805     {
1806         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1807     }
1808     return DlbState::offForever;
1809 }
1810
1811 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1812  *
1813  * This function parses the parameters of "-dlb" command line option setting
1814  * corresponding state values. Then it checks the consistency of the determined
1815  * state with other run parameters and settings. As a result, the initial state
1816  * may be altered or an error may be thrown if incompatibility of options is detected.
1817  *
1818  * \param [in] mdlog                Logger.
1819  * \param [in] dlbOption            Enum value for the DLB option.
1820  * \param [in] bRecordLoad          True if the load balancer is recording load information.
1821  * \param [in] mdrunOptions         Options for mdrun.
1822  * \param [in] inputrec             Pointer mdrun to input parameters.
1823  * \param [in] directGpuCommUsedWithGpuUpdate     Direct GPU halo exchange and GPU update enabled
1824  * \returns                         DLB initial/startup state.
1825  */
1826 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1827                                          DlbOption                dlbOption,
1828                                          gmx_bool                 bRecordLoad,
1829                                          const gmx::MdrunOptions& mdrunOptions,
1830                                          const t_inputrec&        inputrec,
1831                                          const bool               directGpuCommUsedWithGpuUpdate)
1832 {
1833     DlbState dlbState = DlbState::offCanTurnOn;
1834
1835     switch (dlbOption)
1836     {
1837         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1838         case DlbOption::no: dlbState = DlbState::offUser; break;
1839         case DlbOption::yes: dlbState = DlbState::onUser; break;
1840         default: gmx_incons("Invalid dlbOption enum value");
1841     }
1842
1843     // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple timesteps
1844     // DLB needs to be disabled in that case
1845     if (directGpuCommUsedWithGpuUpdate)
1846     {
1847         std::string reasonStr =
1848                 "it is not supported with GPU direct communication + GPU update enabled.";
1849         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1850     }
1851
1852     /* Reruns don't support DLB: bail or override auto mode */
1853     if (mdrunOptions.rerun)
1854     {
1855         std::string reasonStr = "it is not supported in reruns.";
1856         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1857     }
1858
1859     /* Unsupported integrators */
1860     if (!EI_DYNAMICS(inputrec.eI))
1861     {
1862         auto reasonStr =
1863                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1864                                   enumValueToString(inputrec.eI));
1865         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1866     }
1867
1868     /* Without cycle counters we can't time work to balance on */
1869     if (!bRecordLoad)
1870     {
1871         std::string reasonStr =
1872                 "cycle counters unsupported or not enabled in the operating system kernel.";
1873         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1874     }
1875
1876     if (mdrunOptions.reproducible)
1877     {
1878         std::string reasonStr = "you started a reproducible run.";
1879         switch (dlbState)
1880         {
1881             case DlbState::offUser: break;
1882             case DlbState::offForever:
1883                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1884                 break;
1885             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1886             case DlbState::onCanTurnOff:
1887                 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1888                 break;
1889             case DlbState::onUser:
1890                 return forceDlbOffOrBail(
1891                         dlbState,
1892                         reasonStr
1893                                 + " In load balanced runs binary reproducibility cannot be "
1894                                   "ensured.",
1895                         mdlog);
1896             default:
1897                 gmx_fatal(FARGS,
1898                           "Death horror: undefined case (%d) for load balancing choice",
1899                           static_cast<int>(dlbState));
1900         }
1901     }
1902
1903     return dlbState;
1904 }
1905
1906 static gmx_domdec_comm_t* init_dd_comm()
1907 {
1908     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1909
1910     comm->n_load_have    = 0;
1911     comm->n_load_collect = 0;
1912
1913     comm->haveTurnedOffDlb = false;
1914
1915     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1916     {
1917         comm->sum_nat[i] = 0;
1918     }
1919     comm->ndecomp   = 0;
1920     comm->nload     = 0;
1921     comm->load_step = 0;
1922     comm->load_sum  = 0;
1923     comm->load_max  = 0;
1924     clear_ivec(comm->load_lim);
1925     comm->load_mdf = 0;
1926     comm->load_pme = 0;
1927
1928     /* This should be replaced by a unique pointer */
1929     comm->balanceRegion = ddBalanceRegionAllocate();
1930
1931     return comm;
1932 }
1933
1934 static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
1935                               const gmx_mtop_t&                 mtop,
1936                               ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1937                               const bool                        useUpdateGroups,
1938                               const real                        maxUpdateGroupRadius,
1939                               DDSystemInfo*                     systemInfo)
1940 {
1941     systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1942     systemInfo->useUpdateGroups                = useUpdateGroups;
1943     systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
1944
1945     if (systemInfo->useUpdateGroups)
1946     {
1947         int numUpdateGroups = 0;
1948         for (const auto& molblock : mtop.molblock)
1949         {
1950             numUpdateGroups += molblock.nmol
1951                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1952         }
1953
1954         GMX_LOG(mdlog.info)
1955                 .appendTextFormatted(
1956                         "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1957                         "nm\n",
1958                         numUpdateGroups,
1959                         mtop.natoms / static_cast<double>(numUpdateGroups),
1960                         systemInfo->maxUpdateGroupRadius);
1961     }
1962 }
1963
1964 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1965     npbcdim(numPbcDimensions(ir.pbcType)),
1966     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1967     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1968     haveScrewPBC(ir.pbcType == PbcType::Screw)
1969 {
1970 }
1971
1972 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1973 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1974                                     const bool                                  useUpdateGroups,
1975                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1976 {
1977     if (useUpdateGroups)
1978     {
1979         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1980                            "Need one grouping per moltype");
1981         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
1982         {
1983             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1984             {
1985                 return false;
1986             }
1987         }
1988     }
1989     else
1990     {
1991         for (const auto& moltype : mtop.moltype)
1992         {
1993             if (moltype.atoms.nr > 1)
1994             {
1995                 return false;
1996             }
1997         }
1998     }
1999
2000     return true;
2001 }
2002
2003 /*! \brief Generate the simulation system information */
2004 static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
2005                                   DDRole                            ddRole,
2006                                   MPI_Comm                          communicator,
2007                                   const DomdecOptions&              options,
2008                                   const gmx_mtop_t&                 mtop,
2009                                   const t_inputrec&                 ir,
2010                                   const matrix                      box,
2011                                   ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2012                                   const bool                        useUpdateGroups,
2013                                   const real                        maxUpdateGroupRadius,
2014                                   gmx::ArrayRef<const gmx::RVec>    xGlobal)
2015 {
2016     const real tenPercentMargin = 1.1;
2017
2018     DDSystemInfo systemInfo;
2019
2020     setupUpdateGroups(
2021             mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
2022
2023     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2024             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2025     systemInfo.haveInterDomainBondeds =
2026             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2027     systemInfo.haveInterDomainMultiBodyBondeds =
2028             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2029
2030     if (systemInfo.useUpdateGroups)
2031     {
2032         systemInfo.mayHaveSplitConstraints = false;
2033         systemInfo.mayHaveSplitSettles     = false;
2034     }
2035     else
2036     {
2037         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2038                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2039         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2040     }
2041
2042     if (ir.rlist == 0)
2043     {
2044         /* Set the cut-off to some very large value,
2045          * so we don't need if statements everywhere in the code.
2046          * We use sqrt, since the cut-off is squared in some places.
2047          */
2048         systemInfo.cutoff = GMX_CUTOFF_INF;
2049     }
2050     else
2051     {
2052         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2053     }
2054     systemInfo.minCutoffForMultiBody = 0;
2055
2056     /* Determine the minimum cell size limit, affected by many factors */
2057     systemInfo.cellsizeLimit             = 0;
2058     systemInfo.filterBondedCommunication = false;
2059
2060     /* We do not allow home atoms to move beyond the neighboring domain
2061      * between domain decomposition steps, which limits the cell size.
2062      * Get an estimate of cell size limit due to atom displacement.
2063      * In most cases this is a large overestimate, because it assumes
2064      * non-interaction atoms.
2065      * We set the chance to 1 in a trillion steps.
2066      */
2067     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2068     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2069             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2070     GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2071
2072     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2073
2074     /* TODO: PME decomposition currently requires atoms not to be more than
2075      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2076      *       In nearly all cases, limitForAtomDisplacement will be smaller
2077      *       than 2/3*rlist, so the PME requirement is satisfied.
2078      *       But it would be better for both correctness and performance
2079      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2080      *       Note that we would need to improve the pairlist buffer case.
2081      */
2082
2083     if (systemInfo.haveInterDomainBondeds)
2084     {
2085         if (options.minimumCommunicationRange > 0)
2086         {
2087             systemInfo.minCutoffForMultiBody =
2088                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2089             if (options.useBondedCommunication)
2090             {
2091                 systemInfo.filterBondedCommunication =
2092                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2093             }
2094             else
2095             {
2096                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2097             }
2098         }
2099         else if (ir.bPeriodicMols)
2100         {
2101             /* Can not easily determine the required cut-off */
2102             GMX_LOG(mdlog.warning)
2103                     .appendText(
2104                             "NOTE: Periodic molecules are present in this system. Because of this, "
2105                             "the domain decomposition algorithm cannot easily determine the "
2106                             "minimum cell size that it requires for treating bonded interactions. "
2107                             "Instead, domain decomposition will assume that half the non-bonded "
2108                             "cut-off will be a suitable lower bound.");
2109             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2110         }
2111         else
2112         {
2113             real r_2b = 0;
2114             real r_mb = 0;
2115
2116             if (ddRole == DDRole::Master)
2117             {
2118                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2119             }
2120             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2121             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2122
2123             /* We use an initial margin of 10% for the minimum cell size,
2124              * except when we are just below the non-bonded cut-off.
2125              */
2126             if (options.useBondedCommunication)
2127             {
2128                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2129                 {
2130                     const real r_bonded              = std::max(r_2b, r_mb);
2131                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2132                     /* This is the (only) place where we turn on the filtering */
2133                     systemInfo.filterBondedCommunication = true;
2134                 }
2135                 else
2136                 {
2137                     const real r_bonded = r_mb;
2138                     systemInfo.minCutoffForMultiBody =
2139                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2140                 }
2141                 /* We determine cutoff_mbody later */
2142                 systemInfo.increaseMultiBodyCutoff = true;
2143             }
2144             else
2145             {
2146                 /* No special bonded communication,
2147                  * simply increase the DD cut-off.
2148                  */
2149                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2150                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2151             }
2152         }
2153         GMX_LOG(mdlog.info)
2154                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2155                                      systemInfo.minCutoffForMultiBody);
2156
2157         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2158     }
2159
2160     systemInfo.constraintCommunicationRange = 0;
2161     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2162     {
2163         /* There is a cell size limit due to the constraints (P-LINCS) */
2164         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2165         GMX_LOG(mdlog.info)
2166                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2167                                      systemInfo.constraintCommunicationRange);
2168         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2169         {
2170             GMX_LOG(mdlog.info)
2171                     .appendText(
2172                             "This distance will limit the DD cell size, you can override this with "
2173                             "-rcon");
2174         }
2175     }
2176     else if (options.constraintCommunicationRange > 0)
2177     {
2178         /* Here we do not check for dd->splitConstraints.
2179          * because one can also set a cell size limit for virtual sites only
2180          * and at this point we don't know yet if there are intercg v-sites.
2181          */
2182         GMX_LOG(mdlog.info)
2183                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2184                                      options.constraintCommunicationRange);
2185         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2186     }
2187     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2188
2189     return systemInfo;
2190 }
2191
2192 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2193  * implemented. */
2194 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2195                              DDRole               ddRole,
2196                              MPI_Comm             communicator,
2197                              int                  numNodes,
2198                              const DomdecOptions& options,
2199                              const DDSettings&    ddSettings,
2200                              const DDSystemInfo&  systemInfo,
2201                              const real           cellsizeLimit,
2202                              const gmx_ddbox_t&   ddbox)
2203 {
2204     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2205     {
2206         const bool  bC = (systemInfo.mayHaveSplitConstraints
2207                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2208         std::string message =
2209                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2210                                   !bC ? "-rdd" : "-rcon",
2211                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2212                                   bC ? " or your LINCS settings" : "");
2213
2214         gmx_fatal_collective(FARGS,
2215                              communicator,
2216                              ddRole == DDRole::Master,
2217                              "There is no domain decomposition for %d ranks that is compatible "
2218                              "with the given box and a minimum cell size of %g nm\n"
2219                              "%s\n"
2220                              "Look in the log file for details on the domain decomposition",
2221                              numNodes - ddGridSetup.numPmeOnlyRanks,
2222                              cellsizeLimit,
2223                              message.c_str());
2224     }
2225
2226     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2227     if (acs < cellsizeLimit)
2228     {
2229         if (options.numCells[XX] <= 0)
2230         {
2231             GMX_RELEASE_ASSERT(
2232                     false,
2233                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2234         }
2235         else
2236         {
2237             gmx_fatal_collective(
2238                     FARGS,
2239                     communicator,
2240                     ddRole == DDRole::Master,
2241                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2242                     "options -dd, -rdd or -rcon, see the log file for details",
2243                     acs,
2244                     cellsizeLimit);
2245         }
2246     }
2247
2248     const int numPPRanks =
2249             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2250     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2251     {
2252         gmx_fatal_collective(FARGS,
2253                              communicator,
2254                              ddRole == DDRole::Master,
2255                              "The size of the domain decomposition grid (%d) does not match the "
2256                              "number of PP ranks (%d). The total number of ranks is %d",
2257                              numPPRanks,
2258                              numNodes - ddGridSetup.numPmeOnlyRanks,
2259                              numNodes);
2260     }
2261     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2262     {
2263         gmx_fatal_collective(FARGS,
2264                              communicator,
2265                              ddRole == DDRole::Master,
2266                              "The number of separate PME ranks (%d) is larger than the number of "
2267                              "PP ranks (%d), this is not supported.",
2268                              ddGridSetup.numPmeOnlyRanks,
2269                              numPPRanks);
2270     }
2271 }
2272
2273 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2274 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2275                                   int                  numNodes,
2276                                   const DdRankOrder    rankOrder,
2277                                   const DDGridSetup&   ddGridSetup,
2278                                   const t_inputrec&    ir)
2279 {
2280     GMX_LOG(mdlog.info)
2281             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2282                                  ddGridSetup.numDomains[XX],
2283                                  ddGridSetup.numDomains[YY],
2284                                  ddGridSetup.numDomains[ZZ],
2285                                  ddGridSetup.numPmeOnlyRanks);
2286
2287     DDRankSetup ddRankSetup;
2288
2289     ddRankSetup.rankOrder = rankOrder;
2290
2291     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2292     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2293
2294     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2295     if (ddRankSetup.usePmeOnlyRanks)
2296     {
2297         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2298     }
2299     else
2300     {
2301         ddRankSetup.numRanksDoingPme =
2302                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2303     }
2304
2305     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2306     {
2307         /* The following choices should match those
2308          * in comm_cost_est in domdec_setup.c.
2309          * Note that here the checks have to take into account
2310          * that the decomposition might occur in a different order than xyz
2311          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2312          * in which case they will not match those in comm_cost_est,
2313          * but since that is mainly for testing purposes that's fine.
2314          */
2315         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2316             && ddGridSetup.ddDimensions[1] == YY
2317             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2318             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2319             && getenv("GMX_PMEONEDD") == nullptr)
2320         {
2321             ddRankSetup.npmedecompdim = 2;
2322             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2323             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2324         }
2325         else
2326         {
2327             /* In case nc is 1 in both x and y we could still choose to
2328              * decompose pme in y instead of x, but we use x for simplicity.
2329              */
2330             ddRankSetup.npmedecompdim = 1;
2331             if (ddGridSetup.ddDimensions[0] == YY)
2332             {
2333                 ddRankSetup.npmenodes_x = 1;
2334                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2335             }
2336             else
2337             {
2338                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2339                 ddRankSetup.npmenodes_y = 1;
2340             }
2341         }
2342         GMX_LOG(mdlog.info)
2343                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2344                                      ddRankSetup.npmenodes_x,
2345                                      ddRankSetup.npmenodes_y,
2346                                      1);
2347     }
2348     else
2349     {
2350         ddRankSetup.npmedecompdim = 0;
2351         ddRankSetup.npmenodes_x   = 0;
2352         ddRankSetup.npmenodes_y   = 0;
2353     }
2354
2355     return ddRankSetup;
2356 }
2357
2358 /*! \brief Set the cell size and interaction limits */
2359 static void set_dd_limits(const gmx::MDLogger& mdlog,
2360                           DDRole               ddRole,
2361                           gmx_domdec_t*        dd,
2362                           const DomdecOptions& options,
2363                           const DDSettings&    ddSettings,
2364                           const DDSystemInfo&  systemInfo,
2365                           const DDGridSetup&   ddGridSetup,
2366                           const int            numPPRanks,
2367                           const gmx_mtop_t&    mtop,
2368                           const t_inputrec&    ir,
2369                           const gmx_ddbox_t&   ddbox)
2370 {
2371     gmx_domdec_comm_t* comm = dd->comm;
2372     comm->ddSettings        = ddSettings;
2373
2374     /* Initialize to GPU share count to 0, might change later */
2375     comm->nrank_gpu_shared = 0;
2376
2377     comm->dlbState = comm->ddSettings.initialDlbState;
2378     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2379     /* To consider turning DLB on after 2*nstlist steps we need to check
2380      * at partitioning count 3. Thus we need to increase the first count by 2.
2381      */
2382     comm->ddPartioningCountFirstDlbOff += 2;
2383
2384     comm->bPMELoadBalDLBLimits = FALSE;
2385
2386     /* Allocate the charge group/atom sorting struct */
2387     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2388
2389     comm->systemInfo = systemInfo;
2390
2391     if (systemInfo.useUpdateGroups)
2392     {
2393         /* Note: We would like to use dd->nnodes for the atom count estimate,
2394          *       but that is not yet available here. But this anyhow only
2395          *       affect performance up to the second dd_partition_system call.
2396          */
2397         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2398         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2399                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2400     }
2401
2402     /* Set the DD setup given by ddGridSetup */
2403     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2404     dd->ndim = ddGridSetup.numDDDimensions;
2405     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2406
2407     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2408
2409     snew(comm->slb_frac, DIM);
2410     if (isDlbDisabled(comm))
2411     {
2412         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2413         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2414         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2415     }
2416
2417     /* Set the multi-body cut-off and cellsize limit for DLB */
2418     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2419     comm->cellsize_limit = systemInfo.cellsizeLimit;
2420     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2421     {
2422         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2423         {
2424             /* Set the bonded communication distance to halfway
2425              * the minimum and the maximum,
2426              * since the extra communication cost is nearly zero.
2427              */
2428             real acs           = average_cellsize_min(ddbox, dd->numCells);
2429             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2430             if (!isDlbDisabled(comm))
2431             {
2432                 /* Check if this does not limit the scaling */
2433                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2434             }
2435             if (!systemInfo.filterBondedCommunication)
2436             {
2437                 /* Without bBondComm do not go beyond the n.b. cut-off */
2438                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2439                 if (comm->cellsize_limit >= systemInfo.cutoff)
2440                 {
2441                     /* We don't loose a lot of efficieny
2442                      * when increasing it to the n.b. cut-off.
2443                      * It can even be slightly faster, because we need
2444                      * less checks for the communication setup.
2445                      */
2446                     comm->cutoff_mbody = systemInfo.cutoff;
2447                 }
2448             }
2449             /* Check if we did not end up below our original limit */
2450             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2451
2452             if (comm->cutoff_mbody > comm->cellsize_limit)
2453             {
2454                 comm->cellsize_limit = comm->cutoff_mbody;
2455             }
2456         }
2457         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2458     }
2459
2460     if (debug)
2461     {
2462         fprintf(debug,
2463                 "Bonded atom communication beyond the cut-off: %s\n"
2464                 "cellsize limit %f\n",
2465                 gmx::boolToString(systemInfo.filterBondedCommunication),
2466                 comm->cellsize_limit);
2467     }
2468
2469     if (ddRole == DDRole::Master)
2470     {
2471         check_dd_restrictions(dd, ir, mdlog);
2472     }
2473 }
2474
2475 static void writeSettings(gmx::TextWriter*   log,
2476                           gmx_domdec_t*      dd,
2477                           const gmx_mtop_t&  mtop,
2478                           const t_inputrec&  ir,
2479                           gmx_bool           bDynLoadBal,
2480                           real               dlb_scale,
2481                           const gmx_ddbox_t* ddbox)
2482 {
2483     gmx_domdec_comm_t* comm = dd->comm;
2484
2485     if (bDynLoadBal)
2486     {
2487         log->writeString("The maximum number of communication pulses is:");
2488         for (int d = 0; d < dd->ndim; d++)
2489         {
2490             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2491         }
2492         log->ensureLineBreak();
2493         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2494                                 comm->cellsize_limit);
2495         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2496         log->writeString("The allowed shrink of domain decomposition cells is:");
2497         for (int d = 0; d < DIM; d++)
2498         {
2499             if (dd->numCells[d] > 1)
2500             {
2501                 const real shrink =
2502                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2503                                 ? 0
2504                                 : comm->cellsize_min_dlb[d]
2505                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2506                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2507             }
2508         }
2509         log->ensureLineBreak();
2510     }
2511     else
2512     {
2513         ivec np;
2514         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2515         log->writeString("The initial number of communication pulses is:");
2516         for (int d = 0; d < dd->ndim; d++)
2517         {
2518             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2519         }
2520         log->ensureLineBreak();
2521         log->writeString("The initial domain decomposition cell size is:");
2522         for (int d = 0; d < DIM; d++)
2523         {
2524             if (dd->numCells[d] > 1)
2525             {
2526                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2527             }
2528         }
2529         log->ensureLineBreak();
2530         log->writeLine();
2531     }
2532
2533     const bool haveInterDomainVsites =
2534             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2535
2536     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2537         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2538     {
2539         std::string decompUnits;
2540         if (comm->systemInfo.useUpdateGroups)
2541         {
2542             decompUnits = "atom groups";
2543         }
2544         else
2545         {
2546             decompUnits = "atoms";
2547         }
2548
2549         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2550                                 decompUnits.c_str());
2551         log->writeLineFormatted(
2552                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2553
2554         real limit = 0;
2555         if (bDynLoadBal)
2556         {
2557             limit = dd->comm->cellsize_limit;
2558         }
2559         else
2560         {
2561             if (dd->unitCellInfo.ddBoxIsDynamic)
2562             {
2563                 log->writeLine(
2564                         "(the following are initial values, they could change due to box "
2565                         "deformation)");
2566             }
2567             limit = dd->comm->cellsize_min[XX];
2568             for (int d = 1; d < DIM; d++)
2569             {
2570                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2571             }
2572         }
2573
2574         if (comm->systemInfo.haveInterDomainBondeds)
2575         {
2576             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2577                                     "two-body bonded interactions",
2578                                     "(-rdd)",
2579                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2580             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2581                                     "multi-body bonded interactions",
2582                                     "(-rdd)",
2583                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2584                                             ? comm->cutoff_mbody
2585                                             : std::min(comm->systemInfo.cutoff, limit));
2586         }
2587         if (haveInterDomainVsites)
2588         {
2589             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2590         }
2591         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2592         {
2593             std::string separation =
2594                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2595             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2596         }
2597         log->ensureLineBreak();
2598     }
2599 }
2600
2601 static void logSettings(const gmx::MDLogger& mdlog,
2602                         gmx_domdec_t*        dd,
2603                         const gmx_mtop_t&    mtop,
2604                         const t_inputrec&    ir,
2605                         real                 dlb_scale,
2606                         const gmx_ddbox_t*   ddbox)
2607 {
2608     gmx::StringOutputStream stream;
2609     gmx::TextWriter         log(&stream);
2610     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2611     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2612     {
2613         {
2614             log.ensureEmptyLine();
2615             log.writeLine(
2616                     "When dynamic load balancing gets turned on, these settings will change to:");
2617         }
2618         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2619     }
2620     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2621 }
2622
2623 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2624                                 gmx_domdec_t*        dd,
2625                                 real                 dlb_scale,
2626                                 const t_inputrec&    inputrec,
2627                                 const gmx_ddbox_t*   ddbox)
2628 {
2629     int npulse       = 0;
2630     int npulse_d_max = 0;
2631     int npulse_d     = 0;
2632
2633     gmx_domdec_comm_t* comm = dd->comm;
2634
2635     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2636
2637     /* Determine the maximum number of comm. pulses in one dimension */
2638
2639     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2640
2641     /* Determine the maximum required number of grid pulses */
2642     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2643     {
2644         /* Only a single pulse is required */
2645         npulse = 1;
2646     }
2647     else if (!bNoCutOff && comm->cellsize_limit > 0)
2648     {
2649         /* We round down slightly here to avoid overhead due to the latency
2650          * of extra communication calls when the cut-off
2651          * would be only slightly longer than the cell size.
2652          * Later cellsize_limit is redetermined,
2653          * so we can not miss interactions due to this rounding.
2654          */
2655         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
2656     }
2657     else
2658     {
2659         /* There is no cell size limit */
2660         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2661     }
2662
2663     if (!bNoCutOff && npulse > 1)
2664     {
2665         /* See if we can do with less pulses, based on dlb_scale */
2666         npulse_d_max = 0;
2667         for (int d = 0; d < dd->ndim; d++)
2668         {
2669             int dim  = dd->dim[d];
2670             npulse_d = static_cast<int>(
2671                     1
2672                     + dd->numCells[dim] * comm->systemInfo.cutoff
2673                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2674             npulse_d_max = std::max(npulse_d_max, npulse_d);
2675         }
2676         npulse = std::min(npulse, npulse_d_max);
2677     }
2678
2679     /* This env var can override npulse */
2680     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2681     if (ddPulseEnv > 0)
2682     {
2683         npulse = ddPulseEnv;
2684     }
2685
2686     comm->maxpulse       = 1;
2687     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2688     for (int d = 0; d < dd->ndim; d++)
2689     {
2690         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2691         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2692         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2693         {
2694             comm->bVacDLBNoLimit = FALSE;
2695         }
2696     }
2697
2698     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2699     if (!comm->bVacDLBNoLimit)
2700     {
2701         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2702     }
2703     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2704     /* Set the minimum cell size for each DD dimension */
2705     for (int d = 0; d < dd->ndim; d++)
2706     {
2707         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2708         {
2709             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2710         }
2711         else
2712         {
2713             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2714         }
2715     }
2716     if (comm->cutoff_mbody <= 0)
2717     {
2718         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2719     }
2720     if (isDlbOn(comm))
2721     {
2722         set_dlb_limits(dd);
2723     }
2724 }
2725
2726 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2727 {
2728     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2729 }
2730
2731 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2732 {
2733     /* If each molecule is a single charge group
2734      * or we use domain decomposition for each periodic dimension,
2735      * we do not need to take pbc into account for the bonded interactions.
2736      */
2737     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2738             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2739                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2740 }
2741
2742 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2743 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2744                                   gmx_domdec_t*        dd,
2745                                   real                 dlb_scale,
2746                                   const gmx_mtop_t&    mtop,
2747                                   const t_inputrec&    inputrec,
2748                                   const gmx_ddbox_t*   ddbox)
2749 {
2750     gmx_domdec_comm_t* comm        = dd->comm;
2751     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2752
2753     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2754     {
2755         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2756         if (ddRankSetup.npmedecompdim >= 2)
2757         {
2758             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2759         }
2760     }
2761     else
2762     {
2763         ddRankSetup.numRanksDoingPme = 0;
2764         if (dd->pme_nodeid >= 0)
2765         {
2766             gmx_fatal_collective(FARGS,
2767                                  dd->mpi_comm_all,
2768                                  DDMASTER(dd),
2769                                  "Can not have separate PME ranks without PME electrostatics");
2770         }
2771     }
2772
2773     if (debug)
2774     {
2775         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2776     }
2777     if (!isDlbDisabled(comm))
2778     {
2779         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2780     }
2781
2782     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2783
2784     const real vol_frac = (inputrec.pbcType == PbcType::No)
2785                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2786                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2787                                      / static_cast<double>(dd->nnodes));
2788     if (debug)
2789     {
2790         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2791     }
2792     int natoms_tot = mtop.natoms;
2793
2794     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2795 }
2796
2797 /*! \brief Get some important DD parameters which can be modified by env.vars */
2798 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2799                                 const DomdecOptions&     options,
2800                                 const gmx::MdrunOptions& mdrunOptions,
2801                                 const t_inputrec&        ir,
2802                                 const bool               directGpuCommUsedWithGpuUpdate)
2803 {
2804     DDSettings ddSettings;
2805
2806     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2807     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2808     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2809     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2810     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2811     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2812     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2813     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2814     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2815
2816     if (ddSettings.useSendRecv2)
2817     {
2818         GMX_LOG(mdlog.info)
2819                 .appendText(
2820                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2821                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2822                         "communication");
2823     }
2824
2825     if (ddSettings.eFlop)
2826     {
2827         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2828         ddSettings.recordLoad = true;
2829     }
2830     else
2831     {
2832         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2833     }
2834
2835     ddSettings.initialDlbState = determineInitialDlbState(
2836             mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir, directGpuCommUsedWithGpuUpdate);
2837     GMX_LOG(mdlog.info)
2838             .appendTextFormatted("Dynamic load balancing: %s",
2839                                  enumValueToString(ddSettings.initialDlbState));
2840
2841     return ddSettings;
2842 }
2843
2844 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2845
2846 gmx_domdec_t::~gmx_domdec_t() = default;
2847
2848 namespace gmx
2849 {
2850
2851 // TODO once the functionality stablizes, move this class and
2852 // supporting functionality into builder.cpp
2853 /*! \brief Impl class for DD builder */
2854 class DomainDecompositionBuilder::Impl
2855 {
2856 public:
2857     //! Constructor
2858     Impl(const MDLogger&                   mdlog,
2859          t_commrec*                        cr,
2860          const DomdecOptions&              options,
2861          const MdrunOptions&               mdrunOptions,
2862          const gmx_mtop_t&                 mtop,
2863          const t_inputrec&                 ir,
2864          const MDModulesNotifiers&         notifiers,
2865          const matrix                      box,
2866          ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2867          bool                              useUpdateGroups,
2868          real                              maxUpdateGroupRadius,
2869          ArrayRef<const RVec>              xGlobal,
2870          bool                              useGpuForNonbonded,
2871          bool                              useGpuForPme,
2872          bool                              directGpuCommUsedWithGpuUpdate);
2873
2874     //! Build the resulting DD manager
2875     gmx_domdec_t* build(LocalAtomSetManager*       atomSets,
2876                         const gmx_localtop_t&      localTopology,
2877                         const t_state&             localState,
2878                         ObservablesReducerBuilder* observablesReducerBuilder);
2879
2880     //! Objects used in constructing and configuring DD
2881     //! {
2882     //! Logging object
2883     const MDLogger& mdlog_;
2884     //! Communication object
2885     t_commrec* cr_;
2886     //! User-supplied options configuring DD behavior
2887     const DomdecOptions options_;
2888     //! Global system topology
2889     const gmx_mtop_t& mtop_;
2890     //! User input values from the tpr file
2891     const t_inputrec& ir_;
2892     //! MdModules object
2893     const MDModulesNotifiers& notifiers_;
2894     //! }
2895
2896     //! Internal objects used in constructing DD
2897     //! {
2898     //! Settings combined from the user input
2899     DDSettings ddSettings_;
2900     //! Information derived from the simulation system
2901     DDSystemInfo systemInfo_;
2902     //! Box structure
2903     gmx_ddbox_t ddbox_ = { 0 };
2904     //! Organization of the DD grids
2905     DDGridSetup ddGridSetup_;
2906     //! Organzation of the DD ranks
2907     DDRankSetup ddRankSetup_;
2908     //! Number of DD cells in each dimension
2909     ivec ddCellIndex_ = { 0, 0, 0 };
2910     //! IDs of PME-only ranks
2911     std::vector<int> pmeRanks_;
2912     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2913     CartesianRankSetup cartSetup_;
2914     //! }
2915 };
2916
2917 DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
2918                                        t_commrec*                        cr,
2919                                        const DomdecOptions&              options,
2920                                        const MdrunOptions&               mdrunOptions,
2921                                        const gmx_mtop_t&                 mtop,
2922                                        const t_inputrec&                 ir,
2923                                        const MDModulesNotifiers&         notifiers,
2924                                        const matrix                      box,
2925                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2926                                        const bool                        useUpdateGroups,
2927                                        const real                        maxUpdateGroupRadius,
2928                                        ArrayRef<const RVec>              xGlobal,
2929                                        bool                              useGpuForNonbonded,
2930                                        bool                              useGpuForPme,
2931                                        bool directGpuCommUsedWithGpuUpdate) :
2932     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2933 {
2934     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2935
2936     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_, directGpuCommUsedWithGpuUpdate);
2937
2938     if (ddSettings_.eFlop > 1)
2939     {
2940         /* Ensure that we have different random flop counts on different ranks */
2941         srand(1 + cr_->rankInDefaultCommunicator);
2942     }
2943
2944     systemInfo_ = getSystemInfo(mdlog_,
2945                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2946                                 cr->mpiDefaultCommunicator,
2947                                 options_,
2948                                 mtop_,
2949                                 ir_,
2950                                 box,
2951                                 updateGroupingPerMoleculeType,
2952                                 useUpdateGroups,
2953                                 maxUpdateGroupRadius,
2954                                 xGlobal);
2955
2956     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2957     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2958
2959
2960     /* Checks for ability to use PME-only ranks */
2961     auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2962             notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2963
2964     /* Checks for validity of requested Ranks setup */
2965     checkForValidRankCountRequests(numRanksRequested,
2966                                    EEL_PME(ir_.coulombtype) | EVDW_PME(ir_.vdwtype),
2967                                    options_.numPmeRanks,
2968                                    separatePmeRanksPermitted,
2969                                    checkForLargePrimeFactors);
2970
2971     // DD grid setup uses a more different cell size limit for
2972     // automated setup than the one in systemInfo_. The latter is used
2973     // in set_dd_limits() to configure DLB, for example.
2974     const real gridSetupCellsizeLimit =
2975             getDDGridSetupCellSizeLimit(mdlog_,
2976                                         !isDlbDisabled(ddSettings_.initialDlbState),
2977                                         options_.dlbScaling,
2978                                         ir_,
2979                                         systemInfo_.cellsizeLimit);
2980     ddGridSetup_ = getDDGridSetup(mdlog_,
2981                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2982                                   cr->mpiDefaultCommunicator,
2983                                   numRanksRequested,
2984                                   options_,
2985                                   ddSettings_,
2986                                   systemInfo_,
2987                                   gridSetupCellsizeLimit,
2988                                   mtop_,
2989                                   ir_,
2990                                   separatePmeRanksPermitted,
2991                                   box,
2992                                   xGlobal,
2993                                   &ddbox_);
2994     checkDDGridSetup(ddGridSetup_,
2995                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2996                      cr->mpiDefaultCommunicator,
2997                      cr->sizeOfDefaultCommunicator,
2998                      options_,
2999                      ddSettings_,
3000                      systemInfo_,
3001                      gridSetupCellsizeLimit,
3002                      ddbox_);
3003
3004     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
3005
3006     ddRankSetup_ = getDDRankSetup(
3007             mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder, ddGridSetup_, ir_);
3008
3009     /* Generate the group communicator, also decides the duty of each rank */
3010     cartSetup_ = makeGroupCommunicators(
3011             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
3012 }
3013
3014 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager*  atomSets,
3015                                                       const gmx_localtop_t& localTopology,
3016                                                       const t_state&        localState,
3017                                                       ObservablesReducerBuilder* observablesReducerBuilder)
3018 {
3019     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
3020
3021     copy_ivec(ddCellIndex_, dd->ci);
3022
3023     dd->comm = init_dd_comm();
3024
3025     dd->comm->ddRankSetup        = ddRankSetup_;
3026     dd->comm->cartesianRankSetup = cartSetup_;
3027
3028     set_dd_limits(mdlog_,
3029                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
3030                   dd,
3031                   options_,
3032                   ddSettings_,
3033                   systemInfo_,
3034                   ddGridSetup_,
3035                   ddRankSetup_.numPPRanks,
3036                   mtop_,
3037                   ir_,
3038                   ddbox_);
3039
3040     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3041
3042     if (thisRankHasDuty(cr_, DUTY_PP))
3043     {
3044         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3045
3046         setup_neighbor_relations(dd);
3047     }
3048
3049     /* Set overallocation to avoid frequent reallocation of arrays */
3050     set_over_alloc_dd(true);
3051
3052     dd->atomSets = atomSets;
3053
3054     dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>(
3055             mdlog_, cr_, mtop_, localTopology, localState, dd->comm->systemInfo.useUpdateGroups, observablesReducerBuilder);
3056
3057     return dd;
3058 }
3059
3060 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
3061                                                        t_commrec*                cr,
3062                                                        const DomdecOptions&      options,
3063                                                        const MdrunOptions&       mdrunOptions,
3064                                                        const gmx_mtop_t&         mtop,
3065                                                        const t_inputrec&         ir,
3066                                                        const MDModulesNotifiers& notifiers,
3067                                                        const matrix              box,
3068                                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3069                                                        const bool           useUpdateGroups,
3070                                                        const real           maxUpdateGroupRadius,
3071                                                        ArrayRef<const RVec> xGlobal,
3072                                                        const bool           useGpuForNonbonded,
3073                                                        const bool           useGpuForPme,
3074                                                        const bool directGpuCommUsedWithGpuUpdate) :
3075     impl_(new Impl(mdlog,
3076                    cr,
3077                    options,
3078                    mdrunOptions,
3079                    mtop,
3080                    ir,
3081                    notifiers,
3082                    box,
3083                    updateGroupingPerMoleculeType,
3084                    useUpdateGroups,
3085                    maxUpdateGroupRadius,
3086                    xGlobal,
3087                    useGpuForNonbonded,
3088                    useGpuForPme,
3089                    directGpuCommUsedWithGpuUpdate))
3090 {
3091 }
3092
3093 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager*       atomSets,
3094                                                 const gmx_localtop_t&      localTopology,
3095                                                 const t_state&             localState,
3096                                                 ObservablesReducerBuilder* observablesReducerBuilder)
3097 {
3098     return impl_->build(atomSets, localTopology, localState, observablesReducerBuilder);
3099 }
3100
3101 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3102
3103 } // namespace gmx
3104
3105 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3106 {
3107     gmx_ddbox_t ddbox;
3108     int         LocallyLimited = 0;
3109
3110     const auto* dd = cr->dd;
3111
3112     set_ddbox(*dd, false, box, true, x, &ddbox);
3113
3114     LocallyLimited = 0;
3115
3116     for (int d = 0; d < dd->ndim; d++)
3117     {
3118         const int dim = dd->dim[d];
3119
3120         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3121         if (dd->unitCellInfo.ddBoxIsDynamic)
3122         {
3123             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3124         }
3125
3126         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3127
3128         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3129         {
3130             if (np > dd->comm->cd[d].np_dlb)
3131             {
3132                 return FALSE;
3133             }
3134
3135             /* If a current local cell size is smaller than the requested
3136              * cut-off, we could still fix it, but this gets very complicated.
3137              * Without fixing here, we might actually need more checks.
3138              */
3139             real cellSizeAlongDim =
3140                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3141             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3142             {
3143                 LocallyLimited = 1;
3144             }
3145         }
3146     }
3147
3148     if (!isDlbDisabled(dd->comm))
3149     {
3150         /* If DLB is not active yet, we don't need to check the grid jumps.
3151          * Actually we shouldn't, because then the grid jump data is not set.
3152          */
3153         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3154         {
3155             LocallyLimited = 1;
3156         }
3157
3158         gmx_sumi(1, &LocallyLimited, cr);
3159
3160         if (LocallyLimited > 0)
3161         {
3162             return FALSE;
3163         }
3164     }
3165
3166     return TRUE;
3167 }
3168
3169 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3170 {
3171     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3172
3173     if (bCutoffAllowed)
3174     {
3175         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3176     }
3177
3178     return bCutoffAllowed;
3179 }
3180
3181 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3182                               const t_commrec&                cr,
3183                               const gmx::DeviceStreamManager& deviceStreamManager,
3184                               gmx_wallcycle*                  wcycle)
3185 {
3186     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3187                        "Local non-bonded stream should be valid when using"
3188                        "GPU halo exchange.");
3189     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3190                        "Non-local non-bonded stream should be valid when using "
3191                        "GPU halo exchange.");
3192
3193     if (cr.dd->gpuHaloExchange[0].empty())
3194     {
3195         GMX_LOG(mdlog.warning)
3196                 .asParagraph()
3197                 .appendTextFormatted(
3198                         "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3199                         "by the "
3200                         "GMX_GPU_DD_COMMS environment variable.");
3201     }
3202
3203     for (int d = 0; d < cr.dd->ndim; d++)
3204     {
3205         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3206         {
3207             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3208                     cr.dd,
3209                     d,
3210                     cr.mpi_comm_mygroup,
3211                     deviceStreamManager.context(),
3212                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3213                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3214                     pulse,
3215                     wcycle));
3216         }
3217     }
3218 }
3219
3220 void reinitGpuHaloExchange(const t_commrec&              cr,
3221                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3222                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3223 {
3224     for (int d = 0; d < cr.dd->ndim; d++)
3225     {
3226         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3227         {
3228             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3229         }
3230     }
3231 }
3232
3233 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3234                                    const matrix          box,
3235                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3236 {
3237     for (int d = 0; d < cr.dd->ndim; d++)
3238     {
3239         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3240         {
3241             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3242         }
3243     }
3244 }
3245
3246 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3247 {
3248     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3249     {
3250         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3251         {
3252             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3253         }
3254     }
3255 }
3256
3257 void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
3258 {
3259     std::array<int, 5> buf;
3260
3261     if (DDMASTER(dd))
3262     {
3263         buf[0] = state_global->flags;
3264         buf[1] = state_global->ngtc;
3265         buf[2] = state_global->nnhpres;
3266         buf[3] = state_global->nhchainlength;
3267         buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
3268     }
3269     dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
3270
3271     init_gtc_state(state_local, buf[1], buf[2], buf[3]);
3272     init_dfhist_state(state_local, buf[4]);
3273     state_local->flags = buf[0];
3274 }
3275
3276 void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t&      dd,
3277                                             const gmx_mtop_t&        mtop,
3278                                             const matrix             box,
3279                                             gmx::ArrayRef<gmx::RVec> positions)
3280 {
3281     int atomOffset = 0;
3282     for (const gmx_molblock_t& molblock : mtop.molblock)
3283     {
3284         const auto& updateGrouping = dd.comm->systemInfo.updateGroupingsPerMoleculeType[molblock.type];
3285
3286         for (int mol = 0; mol < molblock.nmol; mol++)
3287         {
3288             for (int g = 0; g < updateGrouping.numBlocks(); g++)
3289             {
3290                 const auto& block     = updateGrouping.block(g);
3291                 const int   atomBegin = atomOffset + block.begin();
3292                 const int   atomEnd   = atomOffset + block.end();
3293                 for (int a = atomBegin + 1; a < atomEnd; a++)
3294                 {
3295                     // Make sure that atoms in the same update group
3296                     // are in the same periodic image after restarts.
3297                     for (int d = DIM - 1; d >= 0; d--)
3298                     {
3299                         while (positions[a][d] - positions[atomBegin][d] > 0.5_real * box[d][d])
3300                         {
3301                             positions[a] -= box[d];
3302                         }
3303                         while (positions[a][d] - positions[atomBegin][d] < -0.5_real * box[d][d])
3304                         {
3305                             positions[a] += box[d];
3306                         }
3307                     }
3308                 }
3309             }
3310             atomOffset += updateGrouping.fullRange().end();
3311         }
3312     }
3313 }