src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "partition.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/domdec/collect.h"
  56 #include "gromacs/domdec/dlb.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/ga2la.h"
  61 #include "gromacs/domdec/localatomsetmanager.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/ewald/pme_pp.h"
  64 #include "gromacs/gmxlib/network.h"
  65 #include "gromacs/gmxlib/nrnb.h"
  66 #include "gromacs/imd/imd.h"
  67 #include "gromacs/math/functions.h"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/mdlib/forcerec.h"
  70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  71 #include "gromacs/mdlib/mdatoms.h"
  72 #include "gromacs/mdlib/nsgrid.h"
  73 #include "gromacs/mdlib/vsite.h"
  74 #include "gromacs/mdtypes/commrec.h"
  75 #include "gromacs/mdtypes/forcerec.h"
  76 #include "gromacs/mdtypes/inputrec.h"
  77 #include "gromacs/mdtypes/md_enums.h"
  78 #include "gromacs/mdtypes/mdatom.h"
  79 #include "gromacs/mdtypes/nblist.h"
  80 #include "gromacs/mdtypes/state.h"
  81 #include "gromacs/nbnxm/nbnxm.h"
  82 #include "gromacs/pulling/pull.h"
  83 #include "gromacs/timing/wallcycle.h"
  84 #include "gromacs/topology/mtop_util.h"
  85 #include "gromacs/topology/topology.h"
  86 #include "gromacs/utility/cstringutil.h"
  87 #include "gromacs/utility/fatalerror.h"
  88 #include "gromacs/utility/logger.h"
  89 #include "gromacs/utility/real.h"
  90 #include "gromacs/utility/smalloc.h"
  91 #include "gromacs/utility/strconvert.h"
  92 #include "gromacs/utility/stringstream.h"
  93 #include "gromacs/utility/stringutil.h"
  94 #include "gromacs/utility/textwriter.h"
  95
  96 #include "box.h"
  97 #include "cellsizes.h"
  98 #include "distribute.h"
  99 #include "domdec_constraints.h"
 100 #include "domdec_internal.h"
 101 #include "domdec_vsite.h"
 102 #include "dump.h"
 103 #include "redistribute.h"
 104 #include "utility.h"
 105
 106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 107  *
 108  * There is a bit of overhead with DLB and it's difficult to achieve
 109  * a load imbalance of less than 2% with DLB.
 110  */
 111 #define DD_PERF_LOSS_DLB_ON 0.02
 112
 113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 114 #define DD_PERF_LOSS_WARN 0.05
 115
 116
 117 //! Debug helper printing a DD zone
 118 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
 119 {
 120     fprintf(fp,
 121             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
 122             "%6.3f\n",
 123             d,
 124             i,
 125             j,
 126             zone->min0,
 127             zone->max1,
 128             zone->mch0,
 129             zone->mch0,
 130             zone->p1_0,
 131             zone->p1_1);
 132 }
 133
 134 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 135  * takes the extremes over all home and remote zones in the halo
 136  * and returns the results in cell_ns_x0 and cell_ns_x1.
 137  * Note: only used with the group cut-off scheme.
 138  */
 139 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
 140 {
 141     constexpr int      c_ddZoneCommMaxNumZones = 5;
 142     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 143     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 144     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 145     gmx_domdec_comm_t* comm = dd->comm;
 146
 147     rvec extr_s[2];
 148     rvec extr_r[2];
 149     for (int d = 1; d < dd->ndim; d++)
 150     {
 151         int           dim = dd->dim[d];
 152         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 153
 154         /* Copy the base sizes of the home zone */
 155         zp.min0    = cell_ns_x0[dim];
 156         zp.max1    = cell_ns_x1[dim];
 157         zp.min1    = cell_ns_x1[dim];
 158         zp.mch0    = cell_ns_x0[dim];
 159         zp.mch1    = cell_ns_x1[dim];
 160         zp.p1_0    = cell_ns_x0[dim];
 161         zp.p1_1    = cell_ns_x1[dim];
 162         zp.dataSet = 1;
 163     }
 164
 165     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 166
 167     /* Loop backward over the dimensions and aggregate the extremes
 168      * of the cell sizes.
 169      */
 170     for (int d = dd->ndim - 2; d >= 0; d--)
 171     {
 172         const int  dim      = dd->dim[d];
 173         const bool applyPbc = (dim < ddbox->npbcdim);
 174
 175         /* Use an rvec to store two reals */
 176         extr_s[d][0] = cellsizes[d + 1].fracLower;
 177         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 178         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 179
 180         int pos = 0;
 181         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 182         /* Store the extremes in the backward sending buffer,
 183          * so they get updated separately from the forward communication.
 184          */
 185         for (int d1 = d; d1 < dd->ndim - 1; d1++)
 186         {
 187             gmx_ddzone_t& buf = buf_s[pos];
 188
 189             /* We invert the order to be able to use the same loop for buf_e */
 190             buf.min0 = extr_s[d1][1];
 191             buf.max1 = extr_s[d1][0];
 192             buf.min1 = extr_s[d1][2];
 193             buf.mch0 = 0;
 194             buf.mch1 = 0;
 195             /* Store the cell corner of the dimension we communicate along */
 196             buf.p1_0    = comm->cell_x0[dim];
 197             buf.p1_1    = 0;
 198             buf.dataSet = 1;
 199             pos++;
 200         }
 201
 202         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 203         pos++;
 204
 205         if (dd->ndim == 3 && d == 0)
 206         {
 207             buf_s[pos] = comm->zone_d2[0][1];
 208             pos++;
 209             buf_s[pos] = comm->zone_d1[0];
 210             pos++;
 211         }
 212
 213         /* We only need to communicate the extremes
 214          * in the forward direction
 215          */
 216         int numPulses = comm->cd[d].numPulses();
 217         int numPulsesMin;
 218         if (applyPbc)
 219         {
 220             /* Take the minimum to avoid double communication */
 221             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
 222         }
 223         else
 224         {
 225             /* Without PBC we should really not communicate over
 226              * the boundaries, but implementing that complicates
 227              * the communication setup and therefore we simply
 228              * do all communication, but ignore some data.
 229              */
 230             numPulsesMin = numPulses;
 231         }
 232         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 233         {
 234             /* Communicate the extremes forward */
 235             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 236
 237             int numElements = dd->ndim - d - 1;
 238             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
 239
 240             if (receiveValidData)
 241             {
 242                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 243                 {
 244                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 245                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 246                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 247                 }
 248             }
 249         }
 250
 251         const int numElementsInBuffer = pos;
 252         for (int pulse = 0; pulse < numPulses; pulse++)
 253         {
 254             /* Communicate all the zone information backward */
 255             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
 256
 257             static_assert(
 258                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
 259                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 260
 261             int numReals = numElementsInBuffer * c_ddzoneNumReals;
 262             ddSendrecv(dd,
 263                        d,
 264                        dddirBackward,
 265                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 266                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 267
 268             rvec dh = { 0 };
 269             if (pulse > 0)
 270             {
 271                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 272                 {
 273                     /* Determine the decrease of maximum required
 274                      * communication height along d1 due to the distance along d,
 275                      * this avoids a lot of useless atom communication.
 276                      */
 277                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 278
 279                     real c;
 280                     if (ddbox->tric_dir[dim])
 281                     {
 282                         /* c is the off-diagonal coupling between the cell planes
 283                          * along directions d and d1.
 284                          */
 285                         c = ddbox->v[dim][dd->dim[d1]][dim];
 286                     }
 287                     else
 288                     {
 289                         c = 0;
 290                     }
 291                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
 292                     if (det > 0)
 293                     {
 294                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
 295                     }
 296                     else
 297                     {
 298                         /* A negative value signals out of range */
 299                         dh[d1] = -1;
 300                     }
 301                 }
 302             }
 303
 304             /* Accumulate the extremes over all pulses */
 305             for (int i = 0; i < numElementsInBuffer; i++)
 306             {
 307                 if (pulse == 0)
 308                 {
 309                     buf_e[i] = buf_r[i];
 310                 }
 311                 else
 312                 {
 313                     if (receiveValidData)
 314                     {
 315                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 316                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 317                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 318                     }
 319
 320                     int d1;
 321                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 322                     {
 323                         d1 = 1;
 324                     }
 325                     else
 326                     {
 327                         d1 = d + 1;
 328                     }
 329                     if (receiveValidData && dh[d1] >= 0)
 330                     {
 331                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
 332                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
 333                     }
 334                 }
 335                 /* Copy the received buffer to the send buffer,
 336                  * to pass the data through with the next pulse.
 337                  */
 338                 buf_s[i] = buf_r[i];
 339             }
 340             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
 341                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
 342             {
 343                 /* Store the extremes */
 344                 int pos = 0;
 345
 346                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 347                 {
 348                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 349                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 350                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 351                     pos++;
 352                 }
 353
 354                 if (d == 1 || (d == 0 && dd->ndim == 3))
 355                 {
 356                     for (int i = d; i < 2; i++)
 357                     {
 358                         comm->zone_d2[1 - d][i] = buf_e[pos];
 359                         pos++;
 360                     }
 361                 }
 362                 if (d == 0)
 363                 {
 364                     comm->zone_d1[1] = buf_e[pos];
 365                     pos++;
 366                 }
 367             }
 368             else
 369             {
 370                 if (d == 1 || (d == 0 && dd->ndim == 3))
 371                 {
 372                     for (int i = d; i < 2; i++)
 373                     {
 374                         comm->zone_d2[1 - d][i].dataSet = 0;
 375                     }
 376                 }
 377                 if (d == 0)
 378                 {
 379                     comm->zone_d1[1].dataSet = 0;
 380                 }
 381             }
 382         }
 383     }
 384
 385     if (dd->ndim >= 2)
 386     {
 387         int dim = dd->dim[1];
 388         for (int i = 0; i < 2; i++)
 389         {
 390             if (comm->zone_d1[i].dataSet != 0)
 391             {
 392                 if (debug)
 393                 {
 394                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 395                 }
 396                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 397                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 398             }
 399         }
 400     }
 401     if (dd->ndim >= 3)
 402     {
 403         int dim = dd->dim[2];
 404         for (int i = 0; i < 2; i++)
 405         {
 406             for (int j = 0; j < 2; j++)
 407             {
 408                 if (comm->zone_d2[i][j].dataSet != 0)
 409                 {
 410                     if (debug)
 411                     {
 412                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 413                     }
 414                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 415                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 416                 }
 417             }
 418         }
 419     }
 420     for (int d = 1; d < dd->ndim; d++)
 421     {
 422         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
 423         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
 424         if (debug)
 425         {
 426             fprintf(debug,
 427                     "Cell fraction d %d, max0 %f, min1 %f\n",
 428                     d,
 429                     cellsizes[d].fracLowerMax,
 430                     cellsizes[d].fracUpperMin);
 431         }
 432     }
 433 }
 434
 435 //! Sets the charge-group zones to be equal to the home zone.
 436 static void set_zones_ncg_home(gmx_domdec_t* dd)
 437 {
 438     gmx_domdec_zones_t* zones;
 439     int                 i;
 440
 441     zones = &dd->comm->zones;
 442
 443     zones->cg_range[0] = 0;
 444     for (i = 1; i < zones->n + 1; i++)
 445     {
 446         zones->cg_range[i] = dd->ncg_home;
 447     }
 448     /* zone_ncg1[0] should always be equal to ncg_home */
 449     dd->comm->zone_ncg1[0] = dd->ncg_home;
 450 }
 451
 452 //! Restore atom groups for the charge groups.
 453 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
 454 {
 455     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
 456
 457     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
 458
 459     globalAtomGroupIndices.resize(atomGroupsState.size());
 460
 461     /* Copy back the global charge group indices from state
 462      * and rebuild the local charge group to atom index.
 463      */
 464     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 465     {
 466         globalAtomGroupIndices[i] = atomGroupsState[i];
 467     }
 468
 469     dd->ncg_home = atomGroupsState.size();
 470     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 471
 472     set_zones_ncg_home(dd);
 473 }
 474
 475 //! Sets the cginfo structures.
 476 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
 477 {
 478     if (fr != nullptr)
 479     {
 480         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
 481         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
 482
 483         for (int cg = cg0; cg < cg1; cg++)
 484         {
 485             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 486         }
 487     }
 488 }
 489
 490 //! Makes the mappings between global and local atom indices during DD repartioning.
 491 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
 492 {
 493     const int                numZones               = dd->comm->zones.n;
 494     gmx::ArrayRef<const int> zone2cg                = dd->comm->zones.cg_range;
 495     gmx::ArrayRef<const int> zone_ncg1              = dd->comm->zone_ncg1;
 496     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
 497
 498     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
 499     gmx_ga2la_t&      ga2la             = *dd->ga2la;
 500
 501     if (zone2cg[1] != dd->ncg_home)
 502     {
 503         gmx_incons("dd->ncg_zone is not up to date");
 504     }
 505
 506     /* Make the local to global and global to local atom index */
 507     int a = atomStart;
 508     globalAtomIndices.resize(a);
 509     for (int zone = 0; zone < numZones; zone++)
 510     {
 511         int cg0;
 512         if (zone == 0)
 513         {
 514             cg0 = atomStart;
 515         }
 516         else
 517         {
 518             cg0 = zone2cg[zone];
 519         }
 520         int cg1    = zone2cg[zone + 1];
 521         int cg1_p1 = cg0 + zone_ncg1[zone];
 522
 523         for (int cg = cg0; cg < cg1; cg++)
 524         {
 525             int zone1 = zone;
 526             if (cg >= cg1_p1)
 527             {
 528                 /* Signal that this cg is from more than one pulse away */
 529                 zone1 += numZones;
 530             }
 531             int cg_gl = globalAtomGroupIndices[cg];
 532             globalAtomIndices.push_back(cg_gl);
 533             ga2la.insert(cg_gl, { a, zone1 });
 534             a++;
 535         }
 536     }
 537 }
 538
 539 //! Checks whether global and local atom indices are consistent.
 540 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
 541 {
 542     int nerr = 0;
 543
 544     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 545
 546     if (dd->comm->ddSettings.DD_debug > 1)
 547     {
 548         std::vector<int> have(natoms_sys);
 549         for (int a = 0; a < numAtomsInZones; a++)
 550         {
 551             int globalAtomIndex = dd->globalAtomIndices[a];
 552             if (have[globalAtomIndex] > 0)
 553             {
 554                 fprintf(stderr,
 555                         "DD rank %d: global atom %d occurs twice: index %d and %d\n",
 556                         dd->rank,
 557                         globalAtomIndex + 1,
 558                         have[globalAtomIndex],
 559                         a + 1);
 560             }
 561             else
 562             {
 563                 have[globalAtomIndex] = a + 1;
 564             }
 565         }
 566     }
 567
 568     std::vector<int> have(numAtomsInZones);
 569
 570     int ngl = 0;
 571     for (int i = 0; i < natoms_sys; i++)
 572     {
 573         if (const auto entry = dd->ga2la->find(i))
 574         {
 575             const int a = entry->la;
 576             if (a >= numAtomsInZones)
 577             {
 578                 fprintf(stderr,
 579                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
 580                         "nat_tot (%d)\n",
 581                         dd->rank,
 582                         i + 1,
 583                         a + 1,
 584                         numAtomsInZones);
 585                 nerr++;
 586             }
 587             else
 588             {
 589                 have[a] = 1;
 590                 if (dd->globalAtomIndices[a] != i)
 591                 {
 592                     fprintf(stderr,
 593                             "DD rank %d: global atom %d marked as local atom %d, which has global "
 594                             "atom index %d\n",
 595                             dd->rank,
 596                             i + 1,
 597                             a + 1,
 598                             dd->globalAtomIndices[a] + 1);
 599                     nerr++;
 600                 }
 601             }
 602             ngl++;
 603         }
 604     }
 605     if (ngl != numAtomsInZones)
 606     {
 607         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
 608     }
 609     for (int a = 0; a < numAtomsInZones; a++)
 610     {
 611         if (have[a] == 0)
 612         {
 613             fprintf(stderr,
 614                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
 615                     dd->rank,
 616                     where,
 617                     a + 1,
 618                     dd->globalAtomIndices[a] + 1);
 619         }
 620     }
 621
 622     if (nerr > 0)
 623     {
 624         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
 625     }
 626 }
 627
 628 //! Clear all DD global state indices
 629 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
 630 {
 631     gmx_ga2la_t& ga2la = *dd->ga2la;
 632
 633     if (!keepLocalAtomIndices)
 634     {
 635         /* Clear the whole list without the overhead of searching */
 636         ga2la.clear();
 637     }
 638     else
 639     {
 640         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 641         for (int i = 0; i < numAtomsInZones; i++)
 642         {
 643             ga2la.erase(dd->globalAtomIndices[i]);
 644         }
 645     }
 646
 647     dd_clear_local_vsite_indices(dd);
 648
 649     if (dd->constraints)
 650     {
 651         dd_clear_local_constraint_indices(dd);
 652     }
 653 }
 654
 655 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, gmx_bool bFatal)
 656 {
 657     gmx_domdec_comm_t* comm    = dd->comm;
 658     bool               invalid = false;
 659
 660     for (int d = 1; d < dd->ndim; d++)
 661     {
 662         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
 663         const int                 dim       = dd->dim[d];
 664         const real                limit     = grid_jump_limit(comm, cutoff, d);
 665         real                      bfac      = ddbox->box_size[dim];
 666         if (ddbox->tric_dir[dim])
 667         {
 668             bfac *= ddbox->skew_fac[dim];
 669         }
 670         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
 671             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
 672         {
 673             invalid = true;
 674
 675             if (bFatal)
 676             {
 677                 char buf[22];
 678
 679                 /* This error should never be triggered under normal
 680                  * circumstances, but you never know ...
 681                  */
 682                 gmx_fatal(FARGS,
 683                           "step %s: The domain decomposition grid has shifted too much in the "
 684                           "%c-direction around cell %d %d %d. This should not have happened. "
 685                           "Running with fewer ranks might avoid this issue.",
 686                           gmx_step_str(step, buf),
 687                           dim2char(dim),
 688                           dd->ci[XX],
 689                           dd->ci[YY],
 690                           dd->ci[ZZ]);
 691             }
 692         }
 693     }
 694
 695     return invalid;
 696 }
 697 //! Return the duration of force calculations on this rank.
 698 static float dd_force_load(gmx_domdec_comm_t* comm)
 699 {
 700     float load;
 701
 702     if (comm->ddSettings.eFlop)
 703     {
 704         load = comm->flop;
 705         if (comm->ddSettings.eFlop > 1)
 706         {
 707             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
 708         }
 709     }
 710     else
 711     {
 712         load = comm->cycl[ddCyclF];
 713         if (comm->cycl_n[ddCyclF] > 1)
 714         {
 715             /* Subtract the maximum of the last n cycle counts
 716              * to get rid of possible high counts due to other sources,
 717              * for instance system activity, that would otherwise
 718              * affect the dynamic load balancing.
 719              */
 720             load -= comm->cycl_max[ddCyclF];
 721         }
 722
 723 #if GMX_MPI
 724         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 725         {
 726             float gpu_wait, gpu_wait_sum;
 727
 728             gpu_wait = comm->cycl[ddCyclWaitGPU];
 729             if (comm->cycl_n[ddCyclF] > 1)
 730             {
 731                 /* We should remove the WaitGPU time of the same MD step
 732                  * as the one with the maximum F time, since the F time
 733                  * and the wait time are not independent.
 734                  * Furthermore, the step for the max F time should be chosen
 735                  * the same on all ranks that share the same GPU.
 736                  * But to keep the code simple, we remove the average instead.
 737                  * The main reason for artificially long times at some steps
 738                  * is spurious CPU activity or MPI time, so we don't expect
 739                  * that changes in the GPU wait time matter a lot here.
 740                  */
 741                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
 742             }
 743             /* Sum the wait times over the ranks that share the same GPU */
 744             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
 745             /* Replace the wait time by the average over the ranks */
 746             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
 747         }
 748 #endif
 749     }
 750
 751     return load;
 752 }
 753
 754 //! Runs cell size checks and communicates the boundaries.
 755 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
 756 {
 757     gmx_domdec_comm_t* comm;
 758     int                dim_ind, dim;
 759
 760     comm = dd->comm;
 761
 762     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 763     {
 764         dim = dd->dim[dim_ind];
 765
 766         /* Without PBC we don't have restrictions on the outer cells */
 767         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
 768             && isDlbOn(comm)
 769             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
 770         {
 771             char buf[22];
 772             gmx_fatal(FARGS,
 773                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
 774                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
 775                       "%d %d %d",
 776                       gmx_step_str(step, buf),
 777                       dim2char(dim),
 778                       comm->cell_x1[dim] - comm->cell_x0[dim],
 779                       ddbox->skew_fac[dim],
 780                       dd->comm->cellsize_min[dim],
 781                       dd->ci[XX],
 782                       dd->ci[YY],
 783                       dd->ci[ZZ]);
 784         }
 785     }
 786
 787     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 788     {
 789         /* Communicate the boundaries and update cell_ns_x0/1 */
 790         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 791         if (isDlbOn(dd->comm) && dd->ndim > 1)
 792         {
 793             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
 794         }
 795     }
 796 }
 797
 798 //! Compute and communicate to determine the load distribution across PP ranks.
 799 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
 800 {
 801     gmx_domdec_comm_t* comm;
 802     domdec_load_t*     load;
 803     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 804     gmx_bool           bSepPME;
 805
 806     if (debug)
 807     {
 808         fprintf(debug, "get_load_distribution start\n");
 809     }
 810
 811     wallcycle_start(wcycle, ewcDDCOMMLOAD);
 812
 813     comm = dd->comm;
 814
 815     bSepPME = (dd->pme_nodeid >= 0);
 816
 817     if (dd->ndim == 0 && bSepPME)
 818     {
 819         /* Without decomposition, but with PME nodes, we need the load */
 820         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 821         comm->load[0].pme = comm->cycl[ddCyclPME];
 822     }
 823
 824     // Either we have DLB off, or we have it on and the array is large enough
 825     GMX_ASSERT(!isDlbOn(dd->comm) || static_cast<int>(dd->comm->cellsizesWithDlb.size()) == dd->ndim,
 826                "DLB cell sizes data not set up properly ");
 827     for (int d = dd->ndim - 1; d >= 0; d--)
 828     {
 829         const int dim = dd->dim[d];
 830         /* Check if we participate in the communication in this dimension */
 831         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
 832         {
 833             load = &comm->load[d];
 834             if (isDlbOn(dd->comm))
 835             {
 836                 cell_frac = comm->cellsizesWithDlb[d].fracUpper - comm->cellsizesWithDlb[d].fracLower;
 837             }
 838             int pos = 0;
 839             if (d == dd->ndim - 1)
 840             {
 841                 sbuf[pos++] = dd_force_load(comm);
 842                 sbuf[pos++] = sbuf[0];
 843                 if (isDlbOn(dd->comm))
 844                 {
 845                     sbuf[pos++] = sbuf[0];
 846                     sbuf[pos++] = cell_frac;
 847                     if (d > 0)
 848                     {
 849                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 850                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 851                     }
 852                 }
 853                 if (bSepPME)
 854                 {
 855                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 856                     sbuf[pos++] = comm->cycl[ddCyclPME];
 857                 }
 858             }
 859             else
 860             {
 861                 sbuf[pos++] = comm->load[d + 1].sum;
 862                 sbuf[pos++] = comm->load[d + 1].max;
 863                 if (isDlbOn(dd->comm))
 864                 {
 865                     sbuf[pos++] = comm->load[d + 1].sum_m;
 866                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
 867                     sbuf[pos++] = comm->load[d + 1].flags;
 868                     if (d > 0)
 869                     {
 870                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 871                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 872                     }
 873                 }
 874                 if (bSepPME)
 875                 {
 876                     sbuf[pos++] = comm->load[d + 1].mdf;
 877                     sbuf[pos++] = comm->load[d + 1].pme;
 878                 }
 879             }
 880             load->nload = pos;
 881             /* Communicate a row in DD direction d.
 882              * The communicators are setup such that the root always has rank 0.
 883              */
 884 #if GMX_MPI
 885             MPI_Gather(sbuf,
 886                        load->nload * sizeof(float),
 887                        MPI_BYTE,
 888                        load->load,
 889                        load->nload * sizeof(float),
 890                        MPI_BYTE,
 891                        0,
 892                        comm->mpi_comm_load[d]);
 893 #endif
 894             if (dd->ci[dim] == dd->master_ci[dim])
 895             {
 896                 /* We are the master along this row, process this row */
 897                 RowMaster* rowMaster = nullptr;
 898
 899                 if (isDlbOn(comm))
 900                 {
 901                     rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
 902                 }
 903                 load->sum      = 0;
 904                 load->max      = 0;
 905                 load->sum_m    = 0;
 906                 load->cvol_min = 1;
 907                 load->flags    = 0;
 908                 load->mdf      = 0;
 909                 load->pme      = 0;
 910                 int pos        = 0;
 911                 for (int i = 0; i < dd->numCells[dim]; i++)
 912                 {
 913                     load->sum += load->load[pos++];
 914                     load->max = std::max(load->max, load->load[pos]);
 915                     pos++;
 916                     if (isDlbOn(dd->comm))
 917                     {
 918                         if (rowMaster->dlbIsLimited)
 919                         {
 920                             /* This direction could not be load balanced properly,
 921                              * therefore we need to use the maximum iso the average load.
 922                              */
 923                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 924                         }
 925                         else
 926                         {
 927                             load->sum_m += load->load[pos];
 928                         }
 929                         pos++;
 930                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 931                         pos++;
 932                         if (d < dd->ndim - 1)
 933                         {
 934                             load->flags = gmx::roundToInt(load->load[pos++]);
 935                         }
 936                         if (d > 0)
 937                         {
 938                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 939                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 940                         }
 941                     }
 942                     if (bSepPME)
 943                     {
 944                         load->mdf = std::max(load->mdf, load->load[pos]);
 945                         pos++;
 946                         load->pme = std::max(load->pme, load->load[pos]);
 947                         pos++;
 948                     }
 949                 }
 950                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 951                 {
 952                     load->sum_m *= dd->numCells[dim];
 953                     load->flags |= (1 << d);
 954                 }
 955             }
 956         }
 957     }
 958
 959     if (DDMASTER(dd))
 960     {
 961         comm->nload += dd_load_count(comm);
 962         comm->load_step += comm->cycl[ddCyclStep];
 963         comm->load_sum += comm->load[0].sum;
 964         comm->load_max += comm->load[0].max;
 965         if (isDlbOn(comm))
 966         {
 967             for (int d = 0; d < dd->ndim; d++)
 968             {
 969                 if (comm->load[0].flags & (1 << d))
 970                 {
 971                     comm->load_lim[d]++;
 972                 }
 973             }
 974         }
 975         if (bSepPME)
 976         {
 977             comm->load_mdf += comm->load[0].mdf;
 978             comm->load_pme += comm->load[0].pme;
 979         }
 980     }
 981
 982     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 983
 984     if (debug)
 985     {
 986         fprintf(debug, "get_load_distribution finished\n");
 987     }
 988 }
 989
 990 /*! \brief Return the relative performance loss on the total run time
 991  * due to the force calculation load imbalance. */
 992 static float dd_force_load_fraction(gmx_domdec_t* dd)
 993 {
 994     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 995     {
 996         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
 997     }
 998     else
 999     {
1000         return 0;
1001     }
1002 }
1003
1004 /*! \brief Return the relative performance loss on the total run time
1005  * due to the force calculation load imbalance. */
1006 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
1007 {
1008     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1009     {
1010         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
1011     }
1012     else
1013     {
1014         return 0;
1015     }
1016 }
1017
1018 //! Print load-balance report e.g. at the end of a run.
1019 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
1020 {
1021     gmx_domdec_comm_t* comm = dd->comm;
1022
1023     /* Only the master rank prints loads and only if we measured loads */
1024     if (!DDMASTER(dd) || comm->nload == 0)
1025     {
1026         return;
1027     }
1028
1029     char buf[STRLEN];
1030     int  numPpRanks  = dd->nnodes;
1031     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
1032     int  numRanks    = numPpRanks + numPmeRanks;
1033     float lossFraction = 0;
1034
1035     /* Print the average load imbalance and performance loss */
1036     if (dd->nnodes > 1 && comm->load_sum > 0)
1037     {
1038         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
1039         lossFraction    = dd_force_imb_perf_loss(dd);
1040
1041         std::string msg = "\nDynamic load balancing report:\n";
1042         std::string dlbStateStr;
1043
1044         switch (dd->comm->dlbState)
1045         {
1046             case DlbState::offUser:
1047                 dlbStateStr = "DLB was off during the run per user request.";
1048                 break;
1049             case DlbState::offForever:
1050                 /* Currectly this can happen due to performance loss observed, cell size
1051                  * limitations or incompatibility with other settings observed during
1052                  * determineInitialDlbState(). */
1053                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1054                 break;
1055             case DlbState::offCanTurnOn:
1056                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1057                 break;
1058             case DlbState::offTemporarilyLocked:
1059                 dlbStateStr =
1060                         "DLB was locked at the end of the run due to unfinished PP-PME "
1061                         "balancing.";
1062                 break;
1063             case DlbState::onCanTurnOff:
1064                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1065                 break;
1066             case DlbState::onUser:
1067                 dlbStateStr = "DLB was permanently on during the run per user request.";
1068                 break;
1069             default: GMX_ASSERT(false, "Undocumented DLB state");
1070         }
1071
1072         msg += " " + dlbStateStr + "\n";
1073         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1074         msg += gmx::formatString(
1075                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1076                 "this.\n",
1077                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1078         msg += gmx::formatString(
1079                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1080                 lossFraction * 100);
1081         fprintf(fplog, "%s", msg.c_str());
1082         fprintf(stderr, "\n%s", msg.c_str());
1083     }
1084
1085     /* Print during what percentage of steps the  load balancing was limited */
1086     bool dlbWasLimited = false;
1087     if (isDlbOn(comm))
1088     {
1089         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1090         for (int d = 0; d < dd->ndim; d++)
1091         {
1092             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1093             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1094             if (limitPercentage >= 50)
1095             {
1096                 dlbWasLimited = true;
1097             }
1098         }
1099         sprintf(buf + strlen(buf), "\n");
1100         fprintf(fplog, "%s", buf);
1101         fprintf(stderr, "%s", buf);
1102     }
1103
1104     /* Print the performance loss due to separate PME - PP rank imbalance */
1105     float lossFractionPme = 0;
1106     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1107     {
1108         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1109         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1110         if (lossFractionPme <= 0)
1111         {
1112             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1113         }
1114         else
1115         {
1116             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1117         }
1118         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1119         fprintf(fplog, "%s", buf);
1120         fprintf(stderr, "%s", buf);
1121         sprintf(buf,
1122                 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1123                 std::fabs(lossFractionPme) * 100);
1124         fprintf(fplog, "%s", buf);
1125         fprintf(stderr, "%s", buf);
1126     }
1127     fprintf(fplog, "\n");
1128     fprintf(stderr, "\n");
1129
1130     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1131     {
1132         std::string message = gmx::formatString(
1133                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1134                 "      in the domain decomposition.\n",
1135                 lossFraction * 100);
1136
1137         bool hadSuggestion = false;
1138         if (dd->comm->dlbState == DlbState::offUser)
1139         {
1140             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1141             hadSuggestion = true;
1142         }
1143         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1144         {
1145             message +=
1146                     "      Dynamic load balancing was automatically disabled, but it might be "
1147                     "beneficial to manually tuning it on (option -dlb on.)\n";
1148             hadSuggestion = true;
1149         }
1150         else if (dlbWasLimited)
1151         {
1152             message +=
1153                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1154                     "and/or -dds).\n";
1155             hadSuggestion = true;
1156         }
1157         message += gmx::formatString(
1158                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1159                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1160                 "      considerable inhomogeneity in the simulated system.",
1161                 hadSuggestion ? "also " : "");
1162
1163         fprintf(fplog, "%s\n", message.c_str());
1164         fprintf(stderr, "%s\n", message.c_str());
1165     }
1166     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1167     {
1168         sprintf(buf,
1169                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1170                 "      had %s work to do than the PP ranks.\n"
1171                 "      You might want to %s the number of PME ranks\n"
1172                 "      or %s the cut-off and the grid spacing.\n",
1173                 std::fabs(lossFractionPme * 100),
1174                 (lossFractionPme < 0) ? "less" : "more",
1175                 (lossFractionPme < 0) ? "decrease" : "increase",
1176                 (lossFractionPme < 0) ? "decrease" : "increase");
1177         fprintf(fplog, "%s\n", buf);
1178         fprintf(stderr, "%s\n", buf);
1179     }
1180 }
1181
1182 //! Return the minimum communication volume.
1183 static float dd_vol_min(gmx_domdec_t* dd)
1184 {
1185     return dd->comm->load[0].cvol_min * dd->nnodes;
1186 }
1187
1188 //! Return the DD load flags.
1189 static int dd_load_flags(gmx_domdec_t* dd)
1190 {
1191     return dd->comm->load[0].flags;
1192 }
1193
1194 //! Return the reported load imbalance in force calculations.
1195 static float dd_f_imbal(gmx_domdec_t* dd)
1196 {
1197     if (dd->comm->load[0].sum > 0)
1198     {
1199         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1200     }
1201     else
1202     {
1203         /* Something is wrong in the cycle counting, report no load imbalance */
1204         return 0.0F;
1205     }
1206 }
1207
1208 //! Returns DD load balance report.
1209 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1210 {
1211     gmx::StringOutputStream stream;
1212     gmx::TextWriter         log(&stream);
1213
1214     int flags = dd_load_flags(dd);
1215     if (flags)
1216     {
1217         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1218         for (int d = 0; d < dd->ndim; d++)
1219         {
1220             if (flags & (1 << d))
1221             {
1222                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1223             }
1224         }
1225         log.ensureLineBreak();
1226     }
1227     log.writeString("DD  step " + gmx::toString(step));
1228     if (isDlbOn(dd->comm))
1229     {
1230         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1231     }
1232     if (dd->nnodes > 1)
1233     {
1234         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1235     }
1236     if (dd->comm->cycl_n[ddCyclPME])
1237     {
1238         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1239     }
1240     log.ensureLineBreak();
1241     return stream.toString();
1242 }
1243
1244 //! Prints DD load balance report in mdrun verbose mode.
1245 static void dd_print_load_verbose(gmx_domdec_t* dd)
1246 {
1247     if (isDlbOn(dd->comm))
1248     {
1249         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1250     }
1251     if (dd->nnodes > 1)
1252     {
1253         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1254     }
1255     if (dd->comm->cycl_n[ddCyclPME])
1256     {
1257         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1258     }
1259 }
1260
1261 //! Turns on dynamic load balancing if possible and needed.
1262 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1263 {
1264     gmx_domdec_comm_t* comm = dd->comm;
1265
1266     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1267     for (int d = 1; d < dd->ndim; d++)
1268     {
1269         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1270     }
1271
1272     /* Turn off DLB if we're too close to the cell size limit. */
1273     if (cellsize_min < comm->cellsize_limit * 1.05)
1274     {
1275         GMX_LOG(mdlog.info)
1276                 .appendTextFormatted(
1277                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1278                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1279                         "Will no longer try dynamic load balancing.",
1280                         gmx::toString(step).c_str(),
1281                         dd_force_imb_perf_loss(dd) * 100);
1282
1283         comm->dlbState = DlbState::offForever;
1284         return;
1285     }
1286
1287     GMX_LOG(mdlog.info)
1288             .appendTextFormatted(
1289                     "step %s Turning on dynamic load balancing, because the performance loss due "
1290                     "to load imbalance is %.1f %%.",
1291                     gmx::toString(step).c_str(),
1292                     dd_force_imb_perf_loss(dd) * 100);
1293     comm->dlbState = DlbState::onCanTurnOff;
1294
1295     /* Store the non-DLB performance, so we can check if DLB actually
1296      * improves performance.
1297      */
1298     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1299                        "When we turned on DLB, we should have measured cycles");
1300     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1301
1302     set_dlb_limits(dd);
1303
1304     /* We can set the required cell size info here,
1305      * so we do not need to communicate this.
1306      * The grid is completely uniform.
1307      */
1308     for (int d = 0; d < dd->ndim; d++)
1309     {
1310         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1311
1312         if (rowMaster)
1313         {
1314             comm->load[d].sum_m = comm->load[d].sum;
1315
1316             int nc = dd->numCells[dd->dim[d]];
1317             for (int i = 0; i < nc; i++)
1318             {
1319                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1320                 if (d > 0)
1321                 {
1322                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1323                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1324                 }
1325             }
1326             rowMaster->cellFrac[nc] = 1.0;
1327         }
1328     }
1329 }
1330
1331 //! Turns off dynamic load balancing (but leave it able to turn back on).
1332 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1333 {
1334     GMX_LOG(mdlog.info)
1335             .appendText(
1336                     "step " + gmx::toString(step)
1337                     + " Turning off dynamic load balancing, because it is degrading performance.");
1338     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1339     dd->comm->haveTurnedOffDlb             = true;
1340     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1341 }
1342
1343 //! Turns off dynamic load balancing permanently.
1344 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1345 {
1346     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1347                        "Can only turn off DLB forever when it was in the can-turn-on state");
1348     GMX_LOG(mdlog.info)
1349             .appendText(
1350                     "step " + gmx::toString(step)
1351                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1352     dd->comm->dlbState = DlbState::offForever;
1353 }
1354
1355 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1356 {
1357     gmx_domdec_comm_t* comm;
1358
1359     comm = cr->dd->comm;
1360
1361     /* Turn on the DLB limiting (might have been on already) */
1362     comm->bPMELoadBalDLBLimits = TRUE;
1363
1364     /* Change the cut-off limit */
1365     comm->PMELoadBal_max_cutoff = cutoff;
1366
1367     if (debug)
1368     {
1369         fprintf(debug,
1370                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1371                 "continue to fit\n",
1372                 comm->PMELoadBal_max_cutoff);
1373     }
1374 }
1375
1376 //! Merge atom buffers.
1377 static void merge_cg_buffers(int                            ncell,
1378                              gmx_domdec_comm_dim_t*         cd,
1379                              int                            pulse,
1380                              int*                           ncg_cell,
1381                              gmx::ArrayRef<int>             index_gl,
1382                              const int*                     recv_i,
1383                              gmx::ArrayRef<gmx::RVec>       x,
1384                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1385                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1386                              gmx::ArrayRef<int>             cginfo)
1387 {
1388     gmx_domdec_ind_t *ind, *ind_p;
1389     int               p, cell, c, cg, cg0, cg1, cg_gl;
1390     int               shift;
1391
1392     ind = &cd->ind[pulse];
1393
1394     /* First correct the already stored data */
1395     shift = ind->nrecv[ncell];
1396     for (cell = ncell - 1; cell >= 0; cell--)
1397     {
1398         shift -= ind->nrecv[cell];
1399         if (shift > 0)
1400         {
1401             /* Move the cg's present from previous grid pulses */
1402             cg0 = ncg_cell[ncell + cell];
1403             cg1 = ncg_cell[ncell + cell + 1];
1404             for (cg = cg1 - 1; cg >= cg0; cg--)
1405             {
1406                 index_gl[cg + shift] = index_gl[cg];
1407                 x[cg + shift]        = x[cg];
1408                 cginfo[cg + shift]   = cginfo[cg];
1409             }
1410             /* Correct the already stored send indices for the shift */
1411             for (p = 1; p <= pulse; p++)
1412             {
1413                 ind_p = &cd->ind[p];
1414                 cg0   = 0;
1415                 for (c = 0; c < cell; c++)
1416                 {
1417                     cg0 += ind_p->nsend[c];
1418                 }
1419                 cg1 = cg0 + ind_p->nsend[cell];
1420                 for (cg = cg0; cg < cg1; cg++)
1421                 {
1422                     ind_p->index[cg] += shift;
1423                 }
1424             }
1425         }
1426     }
1427
1428     /* Merge in the communicated buffers */
1429     shift = 0;
1430     cg0   = 0;
1431     for (cell = 0; cell < ncell; cell++)
1432     {
1433         cg1 = ncg_cell[ncell + cell + 1] + shift;
1434         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1435         {
1436             /* Copy this atom from the buffer */
1437             index_gl[cg1] = recv_i[cg0];
1438             x[cg1]        = recv_vr[cg0];
1439             /* Copy information */
1440             cg_gl       = index_gl[cg1];
1441             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1442             cg0++;
1443             cg1++;
1444         }
1445         shift += ind->nrecv[cell];
1446         ncg_cell[ncell + cell + 1] = cg1;
1447     }
1448 }
1449
1450 //! Makes a range partitioning for the atom groups wthin a cell
1451 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1452 {
1453     /* Store the atom block boundaries for easy copying of communication buffers
1454      */
1455     int g = atomGroupStart;
1456     for (int zone = 0; zone < nzone; zone++)
1457     {
1458         for (gmx_domdec_ind_t& ind : cd->ind)
1459         {
1460             ind.cell2at0[zone] = g;
1461             g += ind.nrecv[zone];
1462             ind.cell2at1[zone] = g;
1463         }
1464     }
1465 }
1466
1467 //! Returns whether a link is missing.
1468 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1469 {
1470     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1471     {
1472         if (!ga2la.findHome(link.a[i]))
1473         {
1474             return true;
1475         }
1476     }
1477
1478     return false;
1479 }
1480
1481 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1482 typedef struct
1483 {
1484     //! The corners for the non-bonded communication.
1485     real c[DIM][4];
1486     //! Corner for rounding.
1487     real cr0;
1488     //! Corners for rounding.
1489     real cr1[4];
1490     //! Corners for bounded communication.
1491     real bc[DIM];
1492     //! Corner for rounding for bonded communication.
1493     real bcr1;
1494 } dd_corners_t;
1495
1496 //! Determine the corners of the domain(s) we are communicating with.
1497 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1498 {
1499     const gmx_domdec_comm_t*  comm;
1500     const gmx_domdec_zones_t* zones;
1501
1502     comm = dd->comm;
1503
1504     zones = &comm->zones;
1505
1506     /* Keep the compiler happy */
1507     c->cr0  = 0;
1508     c->bcr1 = 0;
1509
1510     /* The first dimension is equal for all cells */
1511     c->c[0][0] = comm->cell_x0[dim0];
1512     if (bDistMB)
1513     {
1514         c->bc[0] = c->c[0][0];
1515     }
1516     if (dd->ndim >= 2)
1517     {
1518         dim1 = dd->dim[1];
1519         /* This cell row is only seen from the first row */
1520         c->c[1][0] = comm->cell_x0[dim1];
1521         /* All rows can see this row */
1522         c->c[1][1] = comm->cell_x0[dim1];
1523         if (isDlbOn(dd->comm))
1524         {
1525             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1526             if (bDistMB)
1527             {
1528                 /* For the multi-body distance we need the maximum */
1529                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1530             }
1531         }
1532         /* Set the upper-right corner for rounding */
1533         c->cr0 = comm->cell_x1[dim0];
1534
1535         if (dd->ndim >= 3)
1536         {
1537             dim2 = dd->dim[2];
1538             for (int j = 0; j < 4; j++)
1539             {
1540                 c->c[2][j] = comm->cell_x0[dim2];
1541             }
1542             if (isDlbOn(dd->comm))
1543             {
1544                 /* Use the maximum of the i-cells that see a j-cell */
1545                 for (const auto& iZone : zones->iZones)
1546                 {
1547                     const int iZoneIndex = iZone.iZoneIndex;
1548                     for (int jZone : iZone.jZoneRange)
1549                     {
1550                         if (jZone >= 4)
1551                         {
1552                             c->c[2][jZone - 4] = std::max(
1553                                     c->c[2][jZone - 4],
1554                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1555                                             .mch0);
1556                         }
1557                     }
1558                 }
1559                 if (bDistMB)
1560                 {
1561                     /* For the multi-body distance we need the maximum */
1562                     c->bc[2] = comm->cell_x0[dim2];
1563                     for (int i = 0; i < 2; i++)
1564                     {
1565                         for (int j = 0; j < 2; j++)
1566                         {
1567                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1568                         }
1569                     }
1570                 }
1571             }
1572
1573             /* Set the upper-right corner for rounding */
1574             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1575              * Only cell (0,0,0) can see cell 7 (1,1,1)
1576              */
1577             c->cr1[0] = comm->cell_x1[dim1];
1578             c->cr1[3] = comm->cell_x1[dim1];
1579             if (isDlbOn(dd->comm))
1580             {
1581                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1582                 if (bDistMB)
1583                 {
1584                     /* For the multi-body distance we need the maximum */
1585                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1586                 }
1587             }
1588         }
1589     }
1590 }
1591
1592 /*! \brief Add the atom groups we need to send in this pulse from this
1593  * zone to \p localAtomGroups and \p work. */
1594 static void get_zone_pulse_cgs(gmx_domdec_t*            dd,
1595                                int                      zonei,
1596                                int                      zone,
1597                                int                      cg0,
1598                                int                      cg1,
1599                                gmx::ArrayRef<const int> globalAtomGroupIndices,
1600                                int                      dim,
1601                                int                      dim_ind,
1602                                int                      dim0,
1603                                int                      dim1,
1604                                int                      dim2,
1605                                real                     r_comm2,
1606                                real                     r_bcomm2,
1607                                matrix                   box,
1608                                bool                     distanceIsTriclinic,
1609                                rvec*                    normal,
1610                                real                     skew_fac2_d,
1611                                real                     skew_fac_01,
1612                                rvec*                    v_d,
1613                                rvec*                    v_0,
1614                                rvec*                    v_1,
1615                                const dd_corners_t*      c,
1616                                const rvec               sf2_round,
1617                                gmx_bool                 bDistBonded,
1618                                gmx_bool                 bBondComm,
1619                                gmx_bool                 bDist2B,
1620                                gmx_bool                 bDistMB,
1621                                rvec*                    cg_cm,
1622                                gmx::ArrayRef<const int> cginfo,
1623                                std::vector<int>*        localAtomGroups,
1624                                dd_comm_setup_work_t*    work)
1625 {
1626     gmx_domdec_comm_t* comm;
1627     gmx_bool           bScrew;
1628     gmx_bool           bDistMB_pulse;
1629     int                cg, i;
1630     real               r2, rb2, r, tric_sh;
1631     rvec               rn, rb;
1632     int                dimd;
1633     int                nsend_z, nat;
1634
1635     comm = dd->comm;
1636
1637     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1638
1639     bDistMB_pulse = (bDistMB && bDistBonded);
1640
1641     /* Unpack the work data */
1642     std::vector<int>&       ibuf = work->atomGroupBuffer;
1643     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1644     nsend_z                      = 0;
1645     nat                          = work->nat;
1646
1647     for (cg = cg0; cg < cg1; cg++)
1648     {
1649         r2  = 0;
1650         rb2 = 0;
1651         if (!distanceIsTriclinic)
1652         {
1653             /* Rectangular direction, easy */
1654             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1655             if (r > 0)
1656             {
1657                 r2 += r * r;
1658             }
1659             if (bDistMB_pulse)
1660             {
1661                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1662                 if (r > 0)
1663                 {
1664                     rb2 += r * r;
1665                 }
1666             }
1667             /* Rounding gives at most a 16% reduction
1668              * in communicated atoms
1669              */
1670             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1671             {
1672                 r = cg_cm[cg][dim0] - c->cr0;
1673                 /* This is the first dimension, so always r >= 0 */
1674                 r2 += r * r;
1675                 if (bDistMB_pulse)
1676                 {
1677                     rb2 += r * r;
1678                 }
1679             }
1680             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1681             {
1682                 r = cg_cm[cg][dim1] - c->cr1[zone];
1683                 if (r > 0)
1684                 {
1685                     r2 += r * r;
1686                 }
1687                 if (bDistMB_pulse)
1688                 {
1689                     r = cg_cm[cg][dim1] - c->bcr1;
1690                     if (r > 0)
1691                     {
1692                         rb2 += r * r;
1693                     }
1694                 }
1695             }
1696         }
1697         else
1698         {
1699             /* Triclinic direction, more complicated */
1700             clear_rvec(rn);
1701             clear_rvec(rb);
1702             /* Rounding, conservative as the skew_fac multiplication
1703              * will slightly underestimate the distance.
1704              */
1705             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1706             {
1707                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1708                 for (i = dim0 + 1; i < DIM; i++)
1709                 {
1710                     rn[dim0] -= cg_cm[cg][i] * v_0[i][dim0];
1711                 }
1712                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1713                 if (bDistMB_pulse)
1714                 {
1715                     rb[dim0] = rn[dim0];
1716                     rb2      = r2;
1717                 }
1718                 /* Take care that the cell planes along dim0 might not
1719                  * be orthogonal to those along dim1 and dim2.
1720                  */
1721                 for (i = 1; i <= dim_ind; i++)
1722                 {
1723                     dimd = dd->dim[i];
1724                     if (normal[dim0][dimd] > 0)
1725                     {
1726                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1727                         if (bDistMB_pulse)
1728                         {
1729                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1730                         }
1731                     }
1732                 }
1733             }
1734             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1735             {
1736                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1737                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1738                 tric_sh = 0;
1739                 for (i = dim1 + 1; i < DIM; i++)
1740                 {
1741                     tric_sh -= cg_cm[cg][i] * v_1[i][dim1];
1742                 }
1743                 rn[dim1] += tric_sh;
1744                 if (rn[dim1] > 0)
1745                 {
1746                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1747                     /* Take care of coupling of the distances
1748                      * to the planes along dim0 and dim1 through dim2.
1749                      */
1750                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1751                     /* Take care that the cell planes along dim1
1752                      * might not be orthogonal to that along dim2.
1753                      */
1754                     if (normal[dim1][dim2] > 0)
1755                     {
1756                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1757                     }
1758                 }
1759                 if (bDistMB_pulse)
1760                 {
1761                     rb[dim1] += cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1762                     if (rb[dim1] > 0)
1763                     {
1764                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1765                         /* Take care of coupling of the distances
1766                          * to the planes along dim0 and dim1 through dim2.
1767                          */
1768                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1769                         /* Take care that the cell planes along dim1
1770                          * might not be orthogonal to that along dim2.
1771                          */
1772                         if (normal[dim1][dim2] > 0)
1773                         {
1774                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1775                         }
1776                     }
1777                 }
1778             }
1779             /* The distance along the communication direction */
1780             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1781             tric_sh = 0;
1782             for (i = dim + 1; i < DIM; i++)
1783             {
1784                 tric_sh -= cg_cm[cg][i] * v_d[i][dim];
1785             }
1786             rn[dim] += tric_sh;
1787             if (rn[dim] > 0)
1788             {
1789                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1790                 /* Take care of coupling of the distances
1791                  * to the planes along dim0 and dim1 through dim2.
1792                  */
1793                 if (dim_ind == 1 && zonei == 1)
1794                 {
1795                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1796                 }
1797             }
1798             if (bDistMB_pulse)
1799             {
1800                 clear_rvec(rb);
1801                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1802                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1803                 if (rb[dim] > 0)
1804                 {
1805                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1806                     /* Take care of coupling of the distances
1807                      * to the planes along dim0 and dim1 through dim2.
1808                      */
1809                     if (dim_ind == 1 && zonei == 1)
1810                     {
1811                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1812                     }
1813                 }
1814             }
1815         }
1816
1817         if (r2 < r_comm2
1818             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1819                 && (!bBondComm
1820                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1821                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1822         {
1823             /* Store the local and global atom group indices and position */
1824             localAtomGroups->push_back(cg);
1825             ibuf.push_back(globalAtomGroupIndices[cg]);
1826             nsend_z++;
1827
1828             rvec posPbc;
1829             if (dd->ci[dim] == 0)
1830             {
1831                 /* Correct cg_cm for pbc */
1832                 rvec_add(cg_cm[cg], box[dim], posPbc);
1833                 if (bScrew)
1834                 {
1835                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1836                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1837                 }
1838             }
1839             else
1840             {
1841                 copy_rvec(cg_cm[cg], posPbc);
1842             }
1843             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1844
1845             nat += 1;
1846         }
1847     }
1848
1849     work->nat        = nat;
1850     work->nsend_zone = nsend_z;
1851 }
1852
1853 //! Clear data.
1854 static void clearCommSetupData(dd_comm_setup_work_t* work)
1855 {
1856     work->localAtomGroupBuffer.clear();
1857     work->atomGroupBuffer.clear();
1858     work->positionBuffer.clear();
1859     work->nat        = 0;
1860     work->nsend_zone = 0;
1861 }
1862
1863 //! Prepare DD communication.
1864 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1865 {
1866     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1867     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1868     int                    c;
1869     int *                  zone_cg_range, pos_cg;
1870     gmx_domdec_comm_t*     comm;
1871     gmx_domdec_zones_t*    zones;
1872     gmx_domdec_comm_dim_t* cd;
1873     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1874     dd_corners_t           corners;
1875     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1876     real                   skew_fac2_d, skew_fac_01;
1877     rvec                   sf2_round;
1878
1879     if (debug)
1880     {
1881         fprintf(debug, "Setting up DD communication\n");
1882     }
1883
1884     comm = dd->comm;
1885
1886     if (comm->dth.empty())
1887     {
1888         /* Initialize the thread data.
1889          * This can not be done in init_domain_decomposition,
1890          * as the numbers of threads is determined later.
1891          */
1892         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1893         comm->dth.resize(numThreads);
1894     }
1895
1896     bBondComm = comm->systemInfo.filterBondedCommunication;
1897
1898     /* Do we need to determine extra distances for multi-body bondeds? */
1899     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1900
1901     /* Do we need to determine extra distances for only two-body bondeds? */
1902     bDist2B = (bBondComm && !bDistMB);
1903
1904     const real r_comm2 =
1905             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1906     const real r_bcomm2 =
1907             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1908
1909     if (debug)
1910     {
1911         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1912     }
1913
1914     zones = &comm->zones;
1915
1916     dim0 = dd->dim[0];
1917     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1918     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1919
1920     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1921
1922     /* Triclinic stuff */
1923     normal      = ddbox->normal;
1924     skew_fac_01 = 0;
1925     if (dd->ndim >= 2)
1926     {
1927         v_0 = ddbox->v[dim0];
1928         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1929         {
1930             /* Determine the coupling coefficient for the distances
1931              * to the cell planes along dim0 and dim1 through dim2.
1932              * This is required for correct rounding.
1933              */
1934             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1935             if (debug)
1936             {
1937                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1938             }
1939         }
1940     }
1941     if (dd->ndim >= 3)
1942     {
1943         v_1 = ddbox->v[dim1];
1944     }
1945
1946     zone_cg_range                        = zones->cg_range.data();
1947     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1948
1949     zone_cg_range[0]   = 0;
1950     zone_cg_range[1]   = dd->ncg_home;
1951     comm->zone_ncg1[0] = dd->ncg_home;
1952     pos_cg             = dd->ncg_home;
1953
1954     nat_tot = comm->atomRanges.numHomeAtoms();
1955     nzone   = 1;
1956     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1957     {
1958         dim = dd->dim[dim_ind];
1959         cd  = &comm->cd[dim_ind];
1960
1961         /* Check if we need to compute triclinic distances along this dim */
1962         bool distanceIsTriclinic = false;
1963         for (int i = 0; i <= dim_ind; i++)
1964         {
1965             if (ddbox->tric_dir[dd->dim[i]])
1966             {
1967                 distanceIsTriclinic = true;
1968             }
1969         }
1970
1971         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1972         {
1973             /* No pbc in this dimension, the first node should not comm. */
1974             nzone_send = 0;
1975         }
1976         else
1977         {
1978             nzone_send = nzone;
1979         }
1980
1981         v_d         = ddbox->v[dim];
1982         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1983
1984         cd->receiveInPlace = true;
1985         for (int p = 0; p < cd->numPulses(); p++)
1986         {
1987             /* Only atoms communicated in the first pulse are used
1988              * for multi-body bonded interactions or for bBondComm.
1989              */
1990             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1991
1992             gmx_domdec_ind_t* ind = &cd->ind[p];
1993
1994             /* Thread 0 writes in the global index array */
1995             ind->index.clear();
1996             clearCommSetupData(&comm->dth[0]);
1997
1998             for (zone = 0; zone < nzone_send; zone++)
1999             {
2000                 if (dim_ind > 0 && distanceIsTriclinic)
2001                 {
2002                     /* Determine slightly more optimized skew_fac's
2003                      * for rounding.
2004                      * This reduces the number of communicated atoms
2005                      * by about 10% for 3D DD of rhombic dodecahedra.
2006                      */
2007                     for (dimd = 0; dimd < dim; dimd++)
2008                     {
2009                         sf2_round[dimd] = 1;
2010                         if (ddbox->tric_dir[dimd])
2011                         {
2012                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
2013                             {
2014                                 /* If we are shifted in dimension i
2015                                  * and the cell plane is tilted forward
2016                                  * in dimension i, skip this coupling.
2017                                  */
2018                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
2019                                 {
2020                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
2021                                 }
2022                             }
2023                             sf2_round[dimd] = 1 / sf2_round[dimd];
2024                         }
2025                     }
2026                 }
2027
2028                 zonei = zone_perm[dim_ind][zone];
2029                 if (p == 0)
2030                 {
2031                     /* Here we permutate the zones to obtain a convenient order
2032                      * for neighbor searching
2033                      */
2034                     cg0 = zone_cg_range[zonei];
2035                     cg1 = zone_cg_range[zonei + 1];
2036                 }
2037                 else
2038                 {
2039                     /* Look only at the cg's received in the previous grid pulse
2040                      */
2041                     cg1 = zone_cg_range[nzone + zone + 1];
2042                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
2043                 }
2044
2045                 const int numThreads = gmx::ssize(comm->dth);
2046 #pragma omp parallel for num_threads(numThreads) schedule(static)
2047                 for (int th = 0; th < numThreads; th++)
2048                 {
2049                     try
2050                     {
2051                         dd_comm_setup_work_t& work = comm->dth[th];
2052
2053                         /* Retain data accumulated into buffers of thread 0 */
2054                         if (th > 0)
2055                         {
2056                             clearCommSetupData(&work);
2057                         }
2058
2059                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2060                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2061
2062                         /* Get the cg's for this pulse in this zone */
2063                         get_zone_pulse_cgs(dd,
2064                                            zonei,
2065                                            zone,
2066                                            cg0_th,
2067                                            cg1_th,
2068                                            dd->globalAtomGroupIndices,
2069                                            dim,
2070                                            dim_ind,
2071                                            dim0,
2072                                            dim1,
2073                                            dim2,
2074                                            r_comm2,
2075                                            r_bcomm2,
2076                                            box,
2077                                            distanceIsTriclinic,
2078                                            normal,
2079                                            skew_fac2_d,
2080                                            skew_fac_01,
2081                                            v_d,
2082                                            v_0,
2083                                            v_1,
2084                                            &corners,
2085                                            sf2_round,
2086                                            bDistBonded,
2087                                            bBondComm,
2088                                            bDist2B,
2089                                            bDistMB,
2090                                            state->x.rvec_array(),
2091                                            fr->cginfo,
2092                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2093                                            &work);
2094                     }
2095                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2096                 } // END
2097
2098                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2099                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2100                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2101                 /* Append data of threads>=1 to the communication buffers */
2102                 for (int th = 1; th < numThreads; th++)
2103                 {
2104                     const dd_comm_setup_work_t& dth = comm->dth[th];
2105
2106                     ind->index.insert(ind->index.end(),
2107                                       dth.localAtomGroupBuffer.begin(),
2108                                       dth.localAtomGroupBuffer.end());
2109                     atomGroups.insert(
2110                             atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2111                     positions.insert(
2112                             positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2113                     comm->dth[0].nat += dth.nat;
2114                     ind->nsend[zone] += dth.nsend_zone;
2115                 }
2116             }
2117             /* Clear the counts in case we do not have pbc */
2118             for (zone = nzone_send; zone < nzone; zone++)
2119             {
2120                 ind->nsend[zone] = 0;
2121             }
2122             ind->nsend[nzone]     = ind->index.size();
2123             ind->nsend[nzone + 1] = comm->dth[0].nat;
2124             /* Communicate the number of cg's and atoms to receive */
2125             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2126
2127             if (p > 0)
2128             {
2129                 /* We can receive in place if only the last zone is not empty */
2130                 for (zone = 0; zone < nzone - 1; zone++)
2131                 {
2132                     if (ind->nrecv[zone] > 0)
2133                     {
2134                         cd->receiveInPlace = false;
2135                     }
2136                 }
2137             }
2138
2139             int receiveBufferSize = 0;
2140             if (!cd->receiveInPlace)
2141             {
2142                 receiveBufferSize = ind->nrecv[nzone];
2143             }
2144             /* These buffer are actually only needed with in-place */
2145             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2146             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2147
2148             dd_comm_setup_work_t& work = comm->dth[0];
2149
2150             /* Make space for the global cg indices */
2151             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2152             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2153             /* Communicate the global cg indices */
2154             gmx::ArrayRef<int> integerBufferRef;
2155             if (cd->receiveInPlace)
2156             {
2157                 integerBufferRef = gmx::arrayRefFromArray(
2158                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2159             }
2160             else
2161             {
2162                 integerBufferRef = globalAtomGroupBuffer.buffer;
2163             }
2164             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2165
2166             /* Make space for cg_cm */
2167             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2168
2169             /* Communicate the coordinates */
2170             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2171             if (cd->receiveInPlace)
2172             {
2173                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2174             }
2175             else
2176             {
2177                 rvecBufferRef = rvecBuffer.buffer;
2178             }
2179             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2180
2181             /* Make the charge group index */
2182             if (cd->receiveInPlace)
2183             {
2184                 zone = (p == 0 ? 0 : nzone - 1);
2185                 while (zone < nzone)
2186                 {
2187                     for (int i = 0; i < ind->nrecv[zone]; i++)
2188                     {
2189                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2190                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2191                         pos_cg++;
2192                     }
2193                     if (p == 0)
2194                     {
2195                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2196                     }
2197                     zone++;
2198                     zone_cg_range[nzone + zone] = pos_cg;
2199                 }
2200             }
2201             else
2202             {
2203                 /* This part of the code is never executed with bBondComm. */
2204                 merge_cg_buffers(nzone,
2205                                  cd,
2206                                  p,
2207                                  zone_cg_range,
2208                                  dd->globalAtomGroupIndices,
2209                                  integerBufferRef.data(),
2210                                  state->x,
2211                                  rvecBufferRef,
2212                                  fr->cginfo_mb,
2213                                  fr->cginfo);
2214                 pos_cg += ind->nrecv[nzone];
2215             }
2216             nat_tot += ind->nrecv[nzone + 1];
2217         }
2218         if (!cd->receiveInPlace)
2219         {
2220             /* Store the atom block for easy copying of communication buffers */
2221             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2222         }
2223         nzone += nzone;
2224     }
2225
2226     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2227
2228     if (!bBondComm)
2229     {
2230         /* We don't need to update cginfo, since that was alrady done above.
2231          * So we pass NULL for the forcerec.
2232          */
2233         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2234     }
2235
2236     if (debug)
2237     {
2238         fprintf(debug, "Finished setting up DD communication, zones:");
2239         for (c = 0; c < zones->n; c++)
2240         {
2241             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2242         }
2243         fprintf(debug, "\n");
2244     }
2245 }
2246
2247 //! Set boundaries for the charge group range.
2248 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2249 {
2250     for (auto& iZone : zones->iZones)
2251     {
2252         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2253         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2254                                            zones->cg_range[iZone.jZoneRange.end()]);
2255     }
2256 }
2257
2258 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2259  *
2260  * Also sets the atom density for the home zone when \p zone_start=0.
2261  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2262  * how many charge groups will move but are still part of the current range.
2263  * \todo When converting domdec to use proper classes, all these variables
2264  *       should be private and a method should return the correct count
2265  *       depending on an internal state.
2266  *
2267  * \param[in,out] dd          The domain decomposition struct
2268  * \param[in]     box         The box
2269  * \param[in]     ddbox       The domain decomposition box struct
2270  * \param[in]     zone_start  The start of the zone range to set sizes for
2271  * \param[in]     zone_end    The end of the zone range to set sizes for
2272  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2273  */
2274 static void set_zones_size(gmx_domdec_t*      dd,
2275                            matrix             box,
2276                            const gmx_ddbox_t* ddbox,
2277                            int                zone_start,
2278                            int                zone_end,
2279                            int                numMovedChargeGroupsInHomeZone)
2280 {
2281     gmx_domdec_comm_t*  comm;
2282     gmx_domdec_zones_t* zones;
2283     gmx_bool            bDistMB;
2284     int                 z, d, dim;
2285     real                rcs, rcmbs;
2286     int                 i, j;
2287     real                vol;
2288
2289     comm = dd->comm;
2290
2291     zones = &comm->zones;
2292
2293     /* Do we need to determine extra distances for multi-body bondeds? */
2294     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2295
2296     for (z = zone_start; z < zone_end; z++)
2297     {
2298         /* Copy cell limits to zone limits.
2299          * Valid for non-DD dims and non-shifted dims.
2300          */
2301         copy_rvec(comm->cell_x0, zones->size[z].x0);
2302         copy_rvec(comm->cell_x1, zones->size[z].x1);
2303     }
2304
2305     for (d = 0; d < dd->ndim; d++)
2306     {
2307         dim = dd->dim[d];
2308
2309         for (z = 0; z < zones->n; z++)
2310         {
2311             /* With a staggered grid we have different sizes
2312              * for non-shifted dimensions.
2313              */
2314             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2315             {
2316                 if (d == 1)
2317                 {
2318                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2319                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2320                 }
2321                 else if (d == 2)
2322                 {
2323                     zones->size[z].x0[dim] =
2324                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2325                                     .min0;
2326                     zones->size[z].x1[dim] =
2327                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2328                                     .max1;
2329                 }
2330             }
2331         }
2332
2333         rcs   = comm->systemInfo.cutoff;
2334         rcmbs = comm->cutoff_mbody;
2335         if (ddbox->tric_dir[dim])
2336         {
2337             rcs /= ddbox->skew_fac[dim];
2338             rcmbs /= ddbox->skew_fac[dim];
2339         }
2340
2341         /* Set the lower limit for the shifted zone dimensions */
2342         for (z = zone_start; z < zone_end; z++)
2343         {
2344             if (zones->shift[z][dim] > 0)
2345             {
2346                 dim = dd->dim[d];
2347                 if (!isDlbOn(dd->comm) || d == 0)
2348                 {
2349                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2350                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2351                 }
2352                 else
2353                 {
2354                     /* Here we take the lower limit of the zone from
2355                      * the lowest domain of the zone below.
2356                      */
2357                     if (z < 4)
2358                     {
2359                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2360                     }
2361                     else
2362                     {
2363                         if (d == 1)
2364                         {
2365                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2366                         }
2367                         else
2368                         {
2369                             zones->size[z].x0[dim] =
2370                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2371                                             .min1;
2372                         }
2373                     }
2374                     /* A temporary limit, is updated below */
2375                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2376
2377                     if (bDistMB)
2378                     {
2379                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2380                         {
2381                             if (zones->shift[zi][dim] == 0)
2382                             {
2383                                 /* This takes the whole zone into account.
2384                                  * With multiple pulses this will lead
2385                                  * to a larger zone then strictly necessary.
2386                                  */
2387                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2388                                                                   zones->size[zi].x1[dim] + rcmbs);
2389                             }
2390                         }
2391                     }
2392                 }
2393             }
2394         }
2395
2396         /* Loop over the i-zones to set the upper limit of each
2397          * j-zone they see.
2398          */
2399         for (const auto& iZone : zones->iZones)
2400         {
2401             const int zi = iZone.iZoneIndex;
2402             if (zones->shift[zi][dim] == 0)
2403             {
2404                 /* We should only use zones up to zone_end */
2405                 const auto& jZoneRangeFull = iZone.jZoneRange;
2406                 if (zone_end <= *jZoneRangeFull.begin())
2407                 {
2408                     continue;
2409                 }
2410                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2411                                                  std::min(*jZoneRangeFull.end(), zone_end));
2412                 for (int jZone : jZoneRange)
2413                 {
2414                     if (zones->shift[jZone][dim] > 0)
2415                     {
2416                         zones->size[jZone].x1[dim] =
2417                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2418                     }
2419                 }
2420             }
2421         }
2422     }
2423
2424     for (z = zone_start; z < zone_end; z++)
2425     {
2426         /* Initialization only required to keep the compiler happy */
2427         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2428         int  nc, c;
2429
2430         /* To determine the bounding box for a zone we need to find
2431          * the extreme corners of 4, 2 or 1 corners.
2432          */
2433         nc = 1 << (ddbox->nboundeddim - 1);
2434
2435         for (c = 0; c < nc; c++)
2436         {
2437             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2438             corner[XX] = 0;
2439             if ((c & 1) == 0)
2440             {
2441                 corner[YY] = zones->size[z].x0[YY];
2442             }
2443             else
2444             {
2445                 corner[YY] = zones->size[z].x1[YY];
2446             }
2447             if ((c & 2) == 0)
2448             {
2449                 corner[ZZ] = zones->size[z].x0[ZZ];
2450             }
2451             else
2452             {
2453                 corner[ZZ] = zones->size[z].x1[ZZ];
2454             }
2455             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2456                 && box[ZZ][1 - dd->dim[0]] != 0)
2457             {
2458                 /* With 1D domain decomposition the cg's are not in
2459                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2460                  * Shift the corner of the z-vector back to along the box
2461                  * vector of dimension d, so it will later end up at 0 along d.
2462                  * This can affect the location of this corner along dd->dim[0]
2463                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2464                  */
2465                 int d = 1 - dd->dim[0];
2466
2467                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2468             }
2469             /* Apply the triclinic couplings */
2470             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2471             {
2472                 for (j = XX; j < i; j++)
2473                 {
2474                     corner[j] += corner[i] * box[i][j] / box[i][i];
2475                 }
2476             }
2477             if (c == 0)
2478             {
2479                 copy_rvec(corner, corner_min);
2480                 copy_rvec(corner, corner_max);
2481             }
2482             else
2483             {
2484                 for (i = 0; i < DIM; i++)
2485                 {
2486                     corner_min[i] = std::min(corner_min[i], corner[i]);
2487                     corner_max[i] = std::max(corner_max[i], corner[i]);
2488                 }
2489             }
2490         }
2491         /* Copy the extreme cornes without offset along x */
2492         for (i = 0; i < DIM; i++)
2493         {
2494             zones->size[z].bb_x0[i] = corner_min[i];
2495             zones->size[z].bb_x1[i] = corner_max[i];
2496         }
2497         /* Add the offset along x */
2498         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2499         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2500     }
2501
2502     if (zone_start == 0)
2503     {
2504         vol = 1;
2505         for (dim = 0; dim < DIM; dim++)
2506         {
2507             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2508         }
2509         zones->dens_zone0 =
2510                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2511     }
2512
2513     if (debug)
2514     {
2515         for (z = zone_start; z < zone_end; z++)
2516         {
2517             fprintf(debug,
2518                     "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2519                     z,
2520                     zones->size[z].x0[XX],
2521                     zones->size[z].x1[XX],
2522                     zones->size[z].x0[YY],
2523                     zones->size[z].x1[YY],
2524                     zones->size[z].x0[ZZ],
2525                     zones->size[z].x1[ZZ]);
2526             fprintf(debug,
2527                     "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2528                     z,
2529                     zones->size[z].bb_x0[XX],
2530                     zones->size[z].bb_x1[XX],
2531                     zones->size[z].bb_x0[YY],
2532                     zones->size[z].bb_x1[YY],
2533                     zones->size[z].bb_x0[ZZ],
2534                     zones->size[z].bb_x1[ZZ]);
2535         }
2536     }
2537 }
2538
2539 /*! \brief Order data in \p dataToSort according to \p sort
2540  *
2541  * Note: both buffers should have at least \p sort.size() elements.
2542  */
2543 template<typename T>
2544 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2545                         gmx::ArrayRef<T>                  dataToSort,
2546                         gmx::ArrayRef<T>                  sortBuffer)
2547 {
2548     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2549     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2550                "The sorting buffer needs to be sufficiently large");
2551
2552     /* Order the data into the temporary buffer */
2553     size_t i = 0;
2554     for (const gmx_cgsort_t& entry : sort)
2555     {
2556         sortBuffer[i++] = dataToSort[entry.ind];
2557     }
2558
2559     /* Copy back to the original array */
2560     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2561 }
2562
2563 /*! \brief Order data in \p dataToSort according to \p sort
2564  *
2565  * Note: \p vectorToSort should have at least \p sort.size() elements,
2566  *       \p workVector is resized when it is too small.
2567  */
2568 template<typename T>
2569 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2570                         gmx::ArrayRef<T>                  vectorToSort,
2571                         std::vector<T>*                   workVector)
2572 {
2573     if (gmx::index(workVector->size()) < sort.ssize())
2574     {
2575         workVector->resize(sort.size());
2576     }
2577     orderVector<T>(sort, vectorToSort, *workVector);
2578 }
2579
2580 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2581 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2582 {
2583     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2584
2585     /* Using push_back() instead of this resize results in much slower code */
2586     sort->resize(atomOrder.size());
2587     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2588     size_t                      numSorted = 0;
2589     for (int i : atomOrder)
2590     {
2591         if (i >= 0)
2592         {
2593             /* The values of nsc and ind_gl are not used in this case */
2594             buffer[numSorted++].ind = i;
2595         }
2596     }
2597     sort->resize(numSorted);
2598 }
2599
2600 //! Returns the sorting state for DD.
2601 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2602 {
2603     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2604
2605     dd_sort_order_nbnxn(fr, &sort->sorted);
2606
2607     /* We alloc with the old size, since cgindex is still old */
2608     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2609
2610     /* Set the new home atom/charge group count */
2611     dd->ncg_home = sort->sorted.size();
2612     if (debug)
2613     {
2614         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2615     }
2616
2617     /* Reorder the state */
2618     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2619     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2620
2621     if (state->flags & enumValueToBitMask(StateEntry::X))
2622     {
2623         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2624     }
2625     if (state->flags & enumValueToBitMask(StateEntry::V))
2626     {
2627         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2628     }
2629     if (state->flags & enumValueToBitMask(StateEntry::Cgp))
2630     {
2631         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2632     }
2633
2634     /* Reorder the global cg index */
2635     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2636     /* Reorder the cginfo */
2637     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2638     /* Set the home atom number */
2639     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2640
2641     /* The atoms are now exactly in grid order, update the grid order */
2642     fr->nbv->setLocalAtomOrder();
2643 }
2644
2645 //! Accumulates load statistics.
2646 static void add_dd_statistics(gmx_domdec_t* dd)
2647 {
2648     gmx_domdec_comm_t* comm = dd->comm;
2649
2650     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2651     {
2652         auto range = static_cast<DDAtomRanges::Type>(i);
2653         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2654     }
2655     comm->ndecomp++;
2656 }
2657
2658 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2659 {
2660     gmx_domdec_comm_t* comm = dd->comm;
2661
2662     /* Reset all the statistics and counters for total run counting */
2663     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2664     {
2665         comm->sum_nat[i] = 0;
2666     }
2667     comm->ndecomp   = 0;
2668     comm->nload     = 0;
2669     comm->load_step = 0;
2670     comm->load_sum  = 0;
2671     comm->load_max  = 0;
2672     clear_ivec(comm->load_lim);
2673     comm->load_mdf = 0;
2674     comm->load_pme = 0;
2675 }
2676
2677 void print_dd_statistics(const t_commrec* cr, const t_inputrec& inputrec, FILE* fplog)
2678 {
2679     gmx_domdec_comm_t* comm = cr->dd->comm;
2680
2681     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2682     gmx_sumd(numRanges, comm->sum_nat, cr);
2683
2684     if (fplog == nullptr)
2685     {
2686         return;
2687     }
2688
2689     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2690
2691     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2692     {
2693         auto   range = static_cast<DDAtomRanges::Type>(i);
2694         double av    = comm->sum_nat[i] / comm->ndecomp;
2695         switch (range)
2696         {
2697             case DDAtomRanges::Type::Zones:
2698                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2699                 break;
2700             case DDAtomRanges::Type::Vsites:
2701                 if (cr->dd->vsite_comm)
2702                 {
2703                     fprintf(fplog,
2704                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2705                             (EEL_PME(inputrec.coulombtype)
2706                              || inputrec.coulombtype == CoulombInteractionType::Ewald)
2707                                     ? 3
2708                                     : 2,
2709                             av);
2710                 }
2711                 break;
2712             case DDAtomRanges::Type::Constraints:
2713                 if (cr->dd->constraint_comm)
2714                 {
2715                     fprintf(fplog,
2716                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2717                             1 + inputrec.nLincsIter,
2718                             av);
2719                 }
2720                 break;
2721             default: gmx_incons(" Unknown type for DD statistics");
2722         }
2723     }
2724     fprintf(fplog, "\n");
2725
2726     if (comm->ddSettings.recordLoad && EI_DYNAMICS(inputrec.eI))
2727     {
2728         print_dd_load_av(fplog, cr->dd);
2729     }
2730 }
2731
2732 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2733 void dd_partition_system(FILE*                     fplog,
2734                          const gmx::MDLogger&      mdlog,
2735                          int64_t                   step,
2736                          const t_commrec*          cr,
2737                          gmx_bool                  bMasterState,
2738                          int                       nstglobalcomm,
2739                          t_state*                  state_global,
2740                          const gmx_mtop_t&         top_global,
2741                          const t_inputrec&         inputrec,
2742                          gmx::ImdSession*          imdSession,
2743                          pull_t*                   pull_work,
2744                          t_state*                  state_local,
2745                          gmx::ForceBuffers*        f,
2746                          gmx::MDAtoms*             mdAtoms,
2747                          gmx_localtop_t*           top_local,
2748                          t_forcerec*               fr,
2749                          gmx::VirtualSitesHandler* vsite,
2750                          gmx::Constraints*         constr,
2751                          t_nrnb*                   nrnb,
2752                          gmx_wallcycle*            wcycle,
2753                          gmx_bool                  bVerbose)
2754 {
2755     gmx_domdec_t*      dd;
2756     gmx_domdec_comm_t* comm;
2757     gmx_ddbox_t        ddbox = { 0 };
2758     int64_t            step_pcoupl;
2759     rvec               cell_ns_x0, cell_ns_x1;
2760     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2761     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2762     gmx_bool           bRedist;
2763     ivec               np;
2764     real               grid_density;
2765     char               sbuf[22];
2766
2767     wallcycle_start(wcycle, ewcDOMDEC);
2768
2769     dd   = cr->dd;
2770     comm = dd->comm;
2771
2772     // TODO if the update code becomes accessible here, use
2773     // upd->deform for this logic.
2774     bBoxChanged = (bMasterState || inputrecDeform(&inputrec));
2775     if (inputrec.epc != PressureCoupling::No)
2776     {
2777         /* With nstpcouple > 1 pressure coupling happens.
2778          * one step after calculating the pressure.
2779          * Box scaling happens at the end of the MD step,
2780          * after the DD partitioning.
2781          * We therefore have to do DLB in the first partitioning
2782          * after an MD step where P-coupling occurred.
2783          * We need to determine the last step in which p-coupling occurred.
2784          * MRS -- need to validate this for vv?
2785          */
2786         int n = inputrec.nstpcouple;
2787         if (n == 1)
2788         {
2789             step_pcoupl = step - 1;
2790         }
2791         else
2792         {
2793             step_pcoupl = ((step - 1) / n) * n + 1;
2794         }
2795         if (step_pcoupl >= comm->partition_step)
2796         {
2797             bBoxChanged = TRUE;
2798         }
2799     }
2800
2801     bNStGlobalComm = (step % nstglobalcomm == 0);
2802
2803     if (!isDlbOn(comm))
2804     {
2805         bDoDLB = FALSE;
2806     }
2807     else
2808     {
2809         /* Should we do dynamic load balacing this step?
2810          * Since it requires (possibly expensive) global communication,
2811          * we might want to do DLB less frequently.
2812          */
2813         if (bBoxChanged || inputrec.epc != PressureCoupling::No)
2814         {
2815             bDoDLB = bBoxChanged;
2816         }
2817         else
2818         {
2819             bDoDLB = bNStGlobalComm;
2820         }
2821     }
2822
2823     /* Check if we have recorded loads on the nodes */
2824     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2825     {
2826         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2827
2828         /* Print load every nstlog, first and last step to the log file */
2829         bLogLoad = ((inputrec.nstlog > 0 && step % inputrec.nstlog == 0) || comm->n_load_collect == 0
2830                     || (inputrec.nsteps >= 0
2831                         && (step + inputrec.nstlist > inputrec.init_step + inputrec.nsteps)));
2832
2833         /* Avoid extra communication due to verbose screen output
2834          * when nstglobalcomm is set.
2835          */
2836         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2837             || (bVerbose && (inputrec.nstlist == 0 || nstglobalcomm <= inputrec.nstlist)))
2838         {
2839             get_load_distribution(dd, wcycle);
2840             if (DDMASTER(dd))
2841             {
2842                 if (bLogLoad)
2843                 {
2844                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2845                 }
2846                 if (bVerbose)
2847                 {
2848                     dd_print_load_verbose(dd);
2849                 }
2850             }
2851             comm->n_load_collect++;
2852
2853             if (isDlbOn(comm))
2854             {
2855                 if (DDMASTER(dd))
2856                 {
2857                     /* Add the measured cycles to the running average */
2858                     const float averageFactor = 0.1F;
2859                     comm->cyclesPerStepDlbExpAverage =
2860                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2861                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2862                 }
2863                 if (comm->dlbState == DlbState::onCanTurnOff
2864                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2865                 {
2866                     gmx_bool turnOffDlb;
2867                     if (DDMASTER(dd))
2868                     {
2869                         /* If the running averaged cycles with DLB are more
2870                          * than before we turned on DLB, turn off DLB.
2871                          * We will again run and check the cycles without DLB
2872                          * and we can then decide if to turn off DLB forever.
2873                          */
2874                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2875                     }
2876                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2877                     if (turnOffDlb)
2878                     {
2879                         /* To turn off DLB, we need to redistribute the atoms */
2880                         dd_collect_state(dd, state_local, state_global);
2881                         bMasterState = TRUE;
2882                         turn_off_dlb(mdlog, dd, step);
2883                     }
2884                 }
2885             }
2886             else if (bCheckWhetherToTurnDlbOn)
2887             {
2888                 gmx_bool turnOffDlbForever = FALSE;
2889                 gmx_bool turnOnDlb         = FALSE;
2890
2891                 /* Since the timings are node dependent, the master decides */
2892                 if (DDMASTER(dd))
2893                 {
2894                     /* If we recently turned off DLB, we want to check if
2895                      * performance is better without DLB. We want to do this
2896                      * ASAP to minimize the chance that external factors
2897                      * slowed down the DLB step are gone here and we
2898                      * incorrectly conclude that DLB was causing the slowdown.
2899                      * So we measure one nstlist block, no running average.
2900                      */
2901                     if (comm->haveTurnedOffDlb
2902                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2903                     {
2904                         /* After turning off DLB we ran nstlist steps in fewer
2905                          * cycles than with DLB. This likely means that DLB
2906                          * in not benefical, but this could be due to a one
2907                          * time unlucky fluctuation, so we require two such
2908                          * observations in close succession to turn off DLB
2909                          * forever.
2910                          */
2911                         if (comm->dlbSlowerPartitioningCount > 0
2912                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2913                         {
2914                             turnOffDlbForever = TRUE;
2915                         }
2916                         comm->haveTurnedOffDlb = false;
2917                         /* Register when we last measured DLB slowdown */
2918                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2919                     }
2920                     else
2921                     {
2922                         /* Here we check if the max PME rank load is more than 0.98
2923                          * the max PP force load. If so, PP DLB will not help,
2924                          * since we are (almost) limited by PME. Furthermore,
2925                          * DLB will cause a significant extra x/f redistribution
2926                          * cost on the PME ranks, which will then surely result
2927                          * in lower total performance.
2928                          */
2929                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2930                         {
2931                             turnOnDlb = FALSE;
2932                         }
2933                         else
2934                         {
2935                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2936                         }
2937                     }
2938                 }
2939                 struct
2940                 {
2941                     gmx_bool turnOffDlbForever;
2942                     gmx_bool turnOnDlb;
2943                 } bools{ turnOffDlbForever, turnOnDlb };
2944                 dd_bcast(dd, sizeof(bools), &bools);
2945                 if (bools.turnOffDlbForever)
2946                 {
2947                     turn_off_dlb_forever(mdlog, dd, step);
2948                 }
2949                 else if (bools.turnOnDlb)
2950                 {
2951                     turn_on_dlb(mdlog, dd, step);
2952                     bDoDLB = TRUE;
2953                 }
2954             }
2955         }
2956         comm->n_load_have++;
2957     }
2958
2959     bRedist = FALSE;
2960     if (bMasterState)
2961     {
2962         /* Clear the old state */
2963         clearDDStateIndices(dd, false);
2964         ncgindex_set = 0;
2965
2966         auto xGlobal = positionsFromStatePointer(state_global);
2967
2968         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2969
2970         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2971
2972         /* Ensure that we have space for the new distribution */
2973         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2974
2975         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2976
2977         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2978     }
2979     else if (state_local->ddp_count != dd->ddp_count)
2980     {
2981         if (state_local->ddp_count > dd->ddp_count)
2982         {
2983             gmx_fatal(FARGS,
2984                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2985                       ")",
2986                       state_local->ddp_count,
2987                       dd->ddp_count);
2988         }
2989
2990         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2991         {
2992             gmx_fatal(FARGS,
2993                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2994                       "state_local->ddp_count (%d)",
2995                       state_local->ddp_count_cg_gl,
2996                       state_local->ddp_count);
2997         }
2998
2999         /* Clear the old state */
3000         clearDDStateIndices(dd, false);
3001
3002         /* Restore the atom group indices from state_local */
3003         restoreAtomGroups(dd, state_local);
3004         make_dd_indices(dd, 0);
3005         ncgindex_set = dd->ncg_home;
3006
3007         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3008
3009         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
3010
3011         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3012
3013         bRedist = isDlbOn(comm);
3014     }
3015     else
3016     {
3017         /* We have the full state, only redistribute the cgs */
3018
3019         /* Clear the non-home indices */
3020         clearDDStateIndices(dd, true);
3021         ncgindex_set = 0;
3022
3023         /* To avoid global communication, we do not recompute the extent
3024          * of the system for dims without pbc. Therefore we need to copy
3025          * the previously computed values when we do not communicate.
3026          */
3027         if (!bNStGlobalComm)
3028         {
3029             copy_rvec(comm->box0, ddbox.box0);
3030             copy_rvec(comm->box_size, ddbox.box_size);
3031         }
3032         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3033
3034         bBoxChanged = TRUE;
3035         bRedist     = TRUE;
3036     }
3037     /* Copy needed for dim's without pbc when avoiding communication */
3038     copy_rvec(ddbox.box0, comm->box0);
3039     copy_rvec(ddbox.box_size, comm->box_size);
3040
3041     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3042
3043     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3044     {
3045         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3046     }
3047
3048     if (comm->systemInfo.useUpdateGroups)
3049     {
3050         comm->updateGroupsCog->addCogs(
3051                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
3052     }
3053
3054     /* Check if we should sort the charge groups */
3055     const bool bSortCG = (bMasterState || bRedist);
3056
3057     /* When repartitioning we mark atom groups that will move to neighboring
3058      * DD cells, but we do not move them right away for performance reasons.
3059      * Thus we need to keep track of how many charge groups will move for
3060      * obtaining correct local charge group / atom counts.
3061      */
3062     ncg_moved = 0;
3063     if (bRedist)
3064     {
3065         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3066
3067         ncgindex_set = dd->ncg_home;
3068         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3069
3070         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3071
3072         if (comm->systemInfo.useUpdateGroups)
3073         {
3074             comm->updateGroupsCog->addCogs(
3075                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3076                     state_local->x);
3077         }
3078
3079         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3080     }
3081
3082     // TODO: Integrate this code in the nbnxm module
3083     get_nsgrid_boundaries(ddbox.nboundeddim,
3084                           state_local->box,
3085                           dd,
3086                           &ddbox,
3087                           &comm->cell_x0,
3088                           &comm->cell_x1,
3089                           dd->ncg_home,
3090                           as_rvec_array(state_local->x.data()),
3091                           cell_ns_x0,
3092                           cell_ns_x1,
3093                           &grid_density);
3094
3095     if (bBoxChanged)
3096     {
3097         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3098     }
3099
3100     if (bSortCG)
3101     {
3102         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3103
3104         /* Sort the state on charge group position.
3105          * This enables exact restarts from this step.
3106          * It also improves performance by about 15% with larger numbers
3107          * of atoms per node.
3108          */
3109
3110         /* Fill the ns grid with the home cell,
3111          * so we can sort with the indices.
3112          */
3113         set_zones_ncg_home(dd);
3114
3115         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3116
3117         nbnxn_put_on_grid(fr->nbv.get(),
3118                           state_local->box,
3119                           0,
3120                           comm->zones.size[0].bb_x0,
3121                           comm->zones.size[0].bb_x1,
3122                           comm->updateGroupsCog.get(),
3123                           { 0, dd->ncg_home },
3124                           comm->zones.dens_zone0,
3125                           fr->cginfo,
3126                           state_local->x,
3127                           ncg_moved,
3128                           bRedist ? comm->movedBuffer.data() : nullptr);
3129
3130         if (debug)
3131         {
3132             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->ncg_home);
3133         }
3134         dd_sort_state(dd, fr, state_local);
3135
3136         /* After sorting and compacting we set the correct size */
3137         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3138
3139         /* Rebuild all the indices */
3140         dd->ga2la->clear();
3141         ncgindex_set = 0;
3142
3143         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3144     }
3145     else
3146     {
3147         /* With the group scheme the sorting array is part of the DD state,
3148          * but it just got out of sync, so mark as invalid by emptying it.
3149          */
3150         if (inputrec.cutoff_scheme == CutoffScheme::Group)
3151         {
3152             comm->sort->sorted.clear();
3153         }
3154     }
3155
3156     if (comm->systemInfo.useUpdateGroups)
3157     {
3158         /* The update groups cog's are invalid after sorting
3159          * and need to be cleared before the next partitioning anyhow.
3160          */
3161         comm->updateGroupsCog->clear();
3162     }
3163
3164     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3165
3166     /* Set the induces for the home atoms */
3167     set_zones_ncg_home(dd);
3168     make_dd_indices(dd, ncgindex_set);
3169
3170     /* Setup up the communication and communicate the coordinates */
3171     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3172
3173     /* Set the indices for the halo atoms */
3174     make_dd_indices(dd, dd->ncg_home);
3175
3176     /* Set the charge group boundaries for neighbor searching */
3177     set_cg_boundaries(&comm->zones);
3178
3179     /* When bSortCG=true, we have already set the size for zone 0 */
3180     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3181
3182     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3183
3184     /*
3185        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3186                  -1,state_local->x.rvec_array(),state_local->box);
3187      */
3188
3189     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3190
3191     /* Extract a local topology from the global topology */
3192     for (int i = 0; i < dd->ndim; i++)
3193     {
3194         np[dd->dim[i]] = comm->cd[i].numPulses();
3195     }
3196     dd_make_local_top(dd,
3197                       &comm->zones,
3198                       dd->unitCellInfo.npbcdim,
3199                       state_local->box,
3200                       comm->cellsize_min,
3201                       np,
3202                       fr,
3203                       state_local->x.rvec_array(),
3204                       top_global,
3205                       top_local);
3206
3207     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3208
3209     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3210
3211     /* Set up the special atom communication */
3212     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3213     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3214          i < static_cast<int>(DDAtomRanges::Type::Number);
3215          i++)
3216     {
3217         auto range = static_cast<DDAtomRanges::Type>(i);
3218         switch (range)
3219         {
3220             case DDAtomRanges::Type::Vsites:
3221                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3222                 {
3223                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3224                 }
3225                 break;
3226             case DDAtomRanges::Type::Constraints:
3227                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3228                 {
3229                     /* Only for inter-cg constraints we need special code */
3230                     n = dd_make_local_constraints(dd,
3231                                                   n,
3232                                                   top_global,
3233                                                   fr->cginfo.data(),
3234                                                   constr,
3235                                                   inputrec.nProjOrder,
3236                                                   top_local->idef.il);
3237                 }
3238                 break;
3239             default: gmx_incons("Unknown special atom type setup");
3240         }
3241         comm->atomRanges.setEnd(range, n);
3242     }
3243
3244     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3245
3246     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3247
3248     /* Make space for the extra coordinates for virtual site
3249      * or constraint communication.
3250      */
3251     state_local->natoms = comm->atomRanges.numAtomsTotal();
3252
3253     state_change_natoms(state_local, state_local->natoms);
3254
3255     if (vsite && vsite->numInterUpdategroupVirtualSites())
3256     {
3257         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3258     }
3259     else
3260     {
3261         if (EEL_FULL(inputrec.coulombtype) && dd->haveExclusions)
3262         {
3263             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3264         }
3265         else
3266         {
3267             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3268         }
3269     }
3270
3271     /* Set the number of atoms required for the force calculation.
3272      * Forces need to be constrained when doing energy
3273      * minimization. For simple simulations we could avoid some
3274      * allocation, zeroing and copying, but this is probably not worth
3275      * the complications and checking.
3276      */
3277     forcerec_set_ranges(fr,
3278                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3279                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3280                         nat_f_novirsum);
3281
3282     /* Update atom data for mdatoms and several algorithms */
3283     mdAlgorithmsSetupAtomData(cr, inputrec, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3284
3285     auto mdatoms = mdAtoms->mdatoms();
3286     if (!thisRankHasDuty(cr, DUTY_PME))
3287     {
3288         /* Send the charges and/or c6/sigmas to our PME only node */
3289         gmx_pme_send_parameters(cr,
3290                                 *fr->ic,
3291                                 mdatoms->nChargePerturbed != 0,
3292                                 mdatoms->nTypePerturbed != 0,
3293                                 gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
3294                                 gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr),
3295                                 gmx::arrayRefFromArray(mdatoms->sqrt_c6A, mdatoms->nr),
3296                                 gmx::arrayRefFromArray(mdatoms->sqrt_c6B, mdatoms->nr),
3297                                 gmx::arrayRefFromArray(mdatoms->sigmaA, mdatoms->nr),
3298                                 gmx::arrayRefFromArray(mdatoms->sigmaB, mdatoms->nr),
3299                                 dd_pme_maxshift_x(*dd),
3300                                 dd_pme_maxshift_y(*dd));
3301     }
3302
3303     if (dd->atomSets != nullptr)
3304     {
3305         /* Update the local atom sets */
3306         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3307     }
3308
3309     // The pull group construction can need the atom sets updated above
3310     if (inputrec.bPull)
3311     {
3312         /* Update the local pull groups */
3313         dd_make_local_pull_groups(cr, pull_work);
3314     }
3315
3316     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3317     imdSession->dd_make_local_IMD_atoms(dd);
3318
3319     add_dd_statistics(dd);
3320
3321     /* Make sure we only count the cycles for this DD partitioning */
3322     clear_dd_cycle_counts(dd);
3323
3324     /* Because the order of the atoms might have changed since
3325      * the last vsite construction, we need to communicate the constructing
3326      * atom coordinates again (for spreading the forces this MD step).
3327      */
3328     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3329
3330     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3331
3332     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3333     {
3334         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3335         write_dd_pdb("dd_dump",
3336                      step,
3337                      "dump",
3338                      top_global,
3339                      cr,
3340                      -1,
3341                      state_local->x.rvec_array(),
3342                      state_local->box);
3343     }
3344
3345     /* Store the partitioning step */
3346     comm->partition_step = step;
3347
3348     /* Increase the DD partitioning counter */
3349     dd->ddp_count++;
3350     /* The state currently matches this DD partitioning count, store it */
3351     state_local->ddp_count = dd->ddp_count;
3352     if (bMasterState)
3353     {
3354         /* The DD master node knows the complete cg distribution,
3355          * store the count so we can possibly skip the cg info communication.
3356          */
3357         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3358     }
3359
3360     if (comm->ddSettings.DD_debug > 0)
3361     {
3362         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3363         check_index_consistency(dd, top_global.natoms, "after partitioning");
3364     }
3365
3366     wallcycle_stop(wcycle, ewcDOMDEC);
3367 }
3368
3369 /*! \brief Check whether bonded interactions are missing, if appropriate */
3370 void checkNumberOfBondedInteractions(const gmx::MDLogger&           mdlog,
3371                                      t_commrec*                     cr,
3372                                      int                            totalNumberOfBondedInteractions,
3373                                      const gmx_mtop_t&              top_global,
3374                                      const gmx_localtop_t*          top_local,
3375                                      gmx::ArrayRef<const gmx::RVec> x,
3376                                      const matrix                   box,
3377                                      bool* shouldCheckNumberOfBondedInteractions)
3378 {
3379     if (*shouldCheckNumberOfBondedInteractions)
3380     {
3381         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3382         {
3383             dd_print_missing_interactions(
3384                     mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, x, box); // Does not return
3385         }
3386         *shouldCheckNumberOfBondedInteractions = false;
3387     }
3388 }