src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "gromacs/utility/arrayref.h"
  47 #include "partition.h"
  48
  49 #include "config.h"
  50
  51 #include <cassert>
  52 #include <cstdio>
  53
  54 #include <algorithm>
  55
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlb.h"
  58 #include "gromacs/domdec/dlbtiming.h"
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/ga2la.h"
  62 #include "gromacs/domdec/localatomsetmanager.h"
  63 #include "gromacs/domdec/localtopologychecker.h"
  64 #include "gromacs/domdec/mdsetup.h"
  65 #include "gromacs/domdec/nsgrid.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/gmxlib/network.h"
  68 #include "gromacs/gmxlib/nrnb.h"
  69 #include "gromacs/imd/imd.h"
  70 #include "gromacs/math/functions.h"
  71 #include "gromacs/math/vec.h"
  72 #include "gromacs/mdlib/forcerec.h"
  73 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  74 #include "gromacs/mdlib/mdatoms.h"
  75 #include "gromacs/mdlib/vsite.h"
  76 #include "gromacs/mdtypes/commrec.h"
  77 #include "gromacs/mdtypes/forcerec.h"
  78 #include "gromacs/mdtypes/inputrec.h"
  79 #include "gromacs/mdtypes/md_enums.h"
  80 #include "gromacs/mdtypes/mdatom.h"
  81 #include "gromacs/mdtypes/nblist.h"
  82 #include "gromacs/mdtypes/state.h"
  83 #include "gromacs/nbnxm/nbnxm.h"
  84 #include "gromacs/pulling/pull.h"
  85 #include "gromacs/timing/wallcycle.h"
  86 #include "gromacs/topology/mtop_util.h"
  87 #include "gromacs/topology/topology.h"
  88 #include "gromacs/utility/cstringutil.h"
  89 #include "gromacs/utility/fatalerror.h"
  90 #include "gromacs/utility/logger.h"
  91 #include "gromacs/utility/real.h"
  92 #include "gromacs/utility/smalloc.h"
  93 #include "gromacs/utility/strconvert.h"
  94 #include "gromacs/utility/stringstream.h"
  95 #include "gromacs/utility/stringutil.h"
  96 #include "gromacs/utility/textwriter.h"
  97
  98 #include "box.h"
  99 #include "cellsizes.h"
 100 #include "distribute.h"
 101 #include "domdec_constraints.h"
 102 #include "domdec_internal.h"
 103 #include "domdec_vsite.h"
 104 #include "dump.h"
 105 #include "redistribute.h"
 106 #include "utility.h"
 107
 108 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 109  *
 110  * There is a bit of overhead with DLB and it's difficult to achieve
 111  * a load imbalance of less than 2% with DLB.
 112  */
 113 #define DD_PERF_LOSS_DLB_ON 0.02
 114
 115 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 116 #define DD_PERF_LOSS_WARN 0.05
 117
 118
 119 //! Debug helper printing a DD zone
 120 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
 121 {
 122     fprintf(fp,
 123             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
 124             "%6.3f\n",
 125             d,
 126             i,
 127             j,
 128             zone->min0,
 129             zone->max1,
 130             zone->mch0,
 131             zone->mch0,
 132             zone->p1_0,
 133             zone->p1_1);
 134 }
 135
 136 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 137  * takes the extremes over all home and remote zones in the halo
 138  * and returns the results in cell_ns_x0 and cell_ns_x1.
 139  * Note: only used with the group cut-off scheme.
 140  */
 141 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
 142 {
 143     constexpr int      c_ddZoneCommMaxNumZones = 5;
 144     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 145     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 146     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 147     gmx_domdec_comm_t* comm = dd->comm;
 148
 149     rvec extr_s[2];
 150     rvec extr_r[2];
 151     for (int d = 1; d < dd->ndim; d++)
 152     {
 153         int           dim = dd->dim[d];
 154         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 155
 156         /* Copy the base sizes of the home zone */
 157         zp.min0    = cell_ns_x0[dim];
 158         zp.max1    = cell_ns_x1[dim];
 159         zp.min1    = cell_ns_x1[dim];
 160         zp.mch0    = cell_ns_x0[dim];
 161         zp.mch1    = cell_ns_x1[dim];
 162         zp.p1_0    = cell_ns_x0[dim];
 163         zp.p1_1    = cell_ns_x1[dim];
 164         zp.dataSet = 1;
 165     }
 166
 167     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 168
 169     /* Loop backward over the dimensions and aggregate the extremes
 170      * of the cell sizes.
 171      */
 172     for (int d = dd->ndim - 2; d >= 0; d--)
 173     {
 174         const int  dim      = dd->dim[d];
 175         const bool applyPbc = (dim < ddbox->npbcdim);
 176
 177         /* Use an rvec to store two reals */
 178         extr_s[d][0] = cellsizes[d + 1].fracLower;
 179         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 180         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 181
 182         int pos = 0;
 183         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 184         /* Store the extremes in the backward sending buffer,
 185          * so they get updated separately from the forward communication.
 186          */
 187         for (int d1 = d; d1 < dd->ndim - 1; d1++)
 188         {
 189             gmx_ddzone_t& buf = buf_s[pos];
 190
 191             /* We invert the order to be able to use the same loop for buf_e */
 192             buf.min0 = extr_s[d1][1];
 193             buf.max1 = extr_s[d1][0];
 194             buf.min1 = extr_s[d1][2];
 195             buf.mch0 = 0;
 196             buf.mch1 = 0;
 197             /* Store the cell corner of the dimension we communicate along */
 198             buf.p1_0    = comm->cell_x0[dim];
 199             buf.p1_1    = 0;
 200             buf.dataSet = 1;
 201             pos++;
 202         }
 203
 204         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 205         pos++;
 206
 207         if (dd->ndim == 3 && d == 0)
 208         {
 209             buf_s[pos] = comm->zone_d2[0][1];
 210             pos++;
 211             buf_s[pos] = comm->zone_d1[0];
 212             pos++;
 213         }
 214
 215         /* We only need to communicate the extremes
 216          * in the forward direction
 217          */
 218         int numPulses = comm->cd[d].numPulses();
 219         int numPulsesMin;
 220         if (applyPbc)
 221         {
 222             /* Take the minimum to avoid double communication */
 223             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
 224         }
 225         else
 226         {
 227             /* Without PBC we should really not communicate over
 228              * the boundaries, but implementing that complicates
 229              * the communication setup and therefore we simply
 230              * do all communication, but ignore some data.
 231              */
 232             numPulsesMin = numPulses;
 233         }
 234         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 235         {
 236             /* Communicate the extremes forward */
 237             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 238
 239             int numElements = dd->ndim - d - 1;
 240             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
 241
 242             if (receiveValidData)
 243             {
 244                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 245                 {
 246                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 247                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 248                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 249                 }
 250             }
 251         }
 252
 253         const int numElementsInBuffer = pos;
 254         for (int pulse = 0; pulse < numPulses; pulse++)
 255         {
 256             /* Communicate all the zone information backward */
 257             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
 258
 259             static_assert(
 260                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
 261                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 262
 263             int numReals = numElementsInBuffer * c_ddzoneNumReals;
 264             ddSendrecv(dd,
 265                        d,
 266                        dddirBackward,
 267                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 268                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 269
 270             rvec dh = { 0 };
 271             if (pulse > 0)
 272             {
 273                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 274                 {
 275                     /* Determine the decrease of maximum required
 276                      * communication height along d1 due to the distance along d,
 277                      * this avoids a lot of useless atom communication.
 278                      */
 279                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 280
 281                     real c;
 282                     if (ddbox->tric_dir[dim])
 283                     {
 284                         /* c is the off-diagonal coupling between the cell planes
 285                          * along directions d and d1.
 286                          */
 287                         c = ddbox->v[dim][dd->dim[d1]][dim];
 288                     }
 289                     else
 290                     {
 291                         c = 0;
 292                     }
 293                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
 294                     if (det > 0)
 295                     {
 296                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
 297                     }
 298                     else
 299                     {
 300                         /* A negative value signals out of range */
 301                         dh[d1] = -1;
 302                     }
 303                 }
 304             }
 305
 306             /* Accumulate the extremes over all pulses */
 307             for (int i = 0; i < numElementsInBuffer; i++)
 308             {
 309                 if (pulse == 0)
 310                 {
 311                     buf_e[i] = buf_r[i];
 312                 }
 313                 else
 314                 {
 315                     if (receiveValidData)
 316                     {
 317                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 318                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 319                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 320                     }
 321
 322                     int d1;
 323                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 324                     {
 325                         d1 = 1;
 326                     }
 327                     else
 328                     {
 329                         d1 = d + 1;
 330                     }
 331                     if (receiveValidData && dh[d1] >= 0)
 332                     {
 333                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
 334                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
 335                     }
 336                 }
 337                 /* Copy the received buffer to the send buffer,
 338                  * to pass the data through with the next pulse.
 339                  */
 340                 buf_s[i] = buf_r[i];
 341             }
 342             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
 343                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
 344             {
 345                 /* Store the extremes */
 346                 int pos = 0;
 347
 348                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 349                 {
 350                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 351                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 352                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 353                     pos++;
 354                 }
 355
 356                 if (d == 1 || (d == 0 && dd->ndim == 3))
 357                 {
 358                     for (int i = d; i < 2; i++)
 359                     {
 360                         comm->zone_d2[1 - d][i] = buf_e[pos];
 361                         pos++;
 362                     }
 363                 }
 364                 if (d == 0)
 365                 {
 366                     comm->zone_d1[1] = buf_e[pos];
 367                     pos++;
 368                 }
 369             }
 370             else
 371             {
 372                 if (d == 1 || (d == 0 && dd->ndim == 3))
 373                 {
 374                     for (int i = d; i < 2; i++)
 375                     {
 376                         comm->zone_d2[1 - d][i].dataSet = 0;
 377                     }
 378                 }
 379                 if (d == 0)
 380                 {
 381                     comm->zone_d1[1].dataSet = 0;
 382                 }
 383             }
 384         }
 385     }
 386
 387     if (dd->ndim >= 2)
 388     {
 389         int dim = dd->dim[1];
 390         for (int i = 0; i < 2; i++)
 391         {
 392             if (comm->zone_d1[i].dataSet != 0)
 393             {
 394                 if (debug)
 395                 {
 396                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 397                 }
 398                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 399                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 400             }
 401         }
 402     }
 403     if (dd->ndim >= 3)
 404     {
 405         int dim = dd->dim[2];
 406         for (int i = 0; i < 2; i++)
 407         {
 408             for (int j = 0; j < 2; j++)
 409             {
 410                 if (comm->zone_d2[i][j].dataSet != 0)
 411                 {
 412                     if (debug)
 413                     {
 414                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 415                     }
 416                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 417                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 418                 }
 419             }
 420         }
 421     }
 422     for (int d = 1; d < dd->ndim; d++)
 423     {
 424         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
 425         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
 426         if (debug)
 427         {
 428             fprintf(debug,
 429                     "Cell fraction d %d, max0 %f, min1 %f\n",
 430                     d,
 431                     cellsizes[d].fracLowerMax,
 432                     cellsizes[d].fracUpperMin);
 433         }
 434     }
 435 }
 436
 437 //! Sets the charge-group zones to be equal to the home zone.
 438 static void set_zones_ncg_home(gmx_domdec_t* dd)
 439 {
 440     gmx_domdec_zones_t* zones;
 441     int                 i;
 442
 443     zones = &dd->comm->zones;
 444
 445     zones->cg_range[0] = 0;
 446     for (i = 1; i < zones->n + 1; i++)
 447     {
 448         zones->cg_range[i] = dd->ncg_home;
 449     }
 450     /* zone_ncg1[0] should always be equal to ncg_home */
 451     dd->comm->zone_ncg1[0] = dd->ncg_home;
 452 }
 453
 454 //! Restore atom groups for the charge groups.
 455 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
 456 {
 457     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
 458
 459     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
 460
 461     globalAtomGroupIndices.resize(atomGroupsState.size());
 462
 463     /* Copy back the global charge group indices from state
 464      * and rebuild the local charge group to atom index.
 465      */
 466     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 467     {
 468         globalAtomGroupIndices[i] = atomGroupsState[i];
 469     }
 470
 471     dd->ncg_home = atomGroupsState.size();
 472     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 473
 474     set_zones_ncg_home(dd);
 475 }
 476
 477 //! Sets the cginfo structures.
 478 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
 479 {
 480     if (fr != nullptr)
 481     {
 482         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
 483         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
 484
 485         for (int cg = cg0; cg < cg1; cg++)
 486         {
 487             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 488         }
 489     }
 490 }
 491
 492 //! Makes the mappings between global and local atom indices during DD repartioning.
 493 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
 494 {
 495     const int                numZones               = dd->comm->zones.n;
 496     gmx::ArrayRef<const int> zone2cg                = dd->comm->zones.cg_range;
 497     gmx::ArrayRef<const int> zone_ncg1              = dd->comm->zone_ncg1;
 498     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
 499
 500     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
 501     gmx_ga2la_t&      ga2la             = *dd->ga2la;
 502
 503     if (zone2cg[1] != dd->ncg_home)
 504     {
 505         gmx_incons("dd->ncg_zone is not up to date");
 506     }
 507
 508     /* Make the local to global and global to local atom index */
 509     int a = atomStart;
 510     globalAtomIndices.resize(a);
 511     for (int zone = 0; zone < numZones; zone++)
 512     {
 513         int cg0;
 514         if (zone == 0)
 515         {
 516             cg0 = atomStart;
 517         }
 518         else
 519         {
 520             cg0 = zone2cg[zone];
 521         }
 522         int cg1    = zone2cg[zone + 1];
 523         int cg1_p1 = cg0 + zone_ncg1[zone];
 524
 525         for (int cg = cg0; cg < cg1; cg++)
 526         {
 527             int zone1 = zone;
 528             if (cg >= cg1_p1)
 529             {
 530                 /* Signal that this cg is from more than one pulse away */
 531                 zone1 += numZones;
 532             }
 533             int cg_gl = globalAtomGroupIndices[cg];
 534             globalAtomIndices.push_back(cg_gl);
 535             ga2la.insert(cg_gl, { a, zone1 });
 536             a++;
 537         }
 538     }
 539 }
 540
 541 //! Checks whether global and local atom indices are consistent.
 542 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
 543 {
 544     int nerr = 0;
 545
 546     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 547
 548     if (dd->comm->ddSettings.DD_debug > 1)
 549     {
 550         std::vector<int> have(natoms_sys);
 551         for (int a = 0; a < numAtomsInZones; a++)
 552         {
 553             int globalAtomIndex = dd->globalAtomIndices[a];
 554             if (have[globalAtomIndex] > 0)
 555             {
 556                 fprintf(stderr,
 557                         "DD rank %d: global atom %d occurs twice: index %d and %d\n",
 558                         dd->rank,
 559                         globalAtomIndex + 1,
 560                         have[globalAtomIndex],
 561                         a + 1);
 562             }
 563             else
 564             {
 565                 have[globalAtomIndex] = a + 1;
 566             }
 567         }
 568     }
 569
 570     std::vector<int> have(numAtomsInZones);
 571
 572     int ngl = 0;
 573     for (int i = 0; i < natoms_sys; i++)
 574     {
 575         if (const auto* entry = dd->ga2la->find(i))
 576         {
 577             const int a = entry->la;
 578             if (a >= numAtomsInZones)
 579             {
 580                 fprintf(stderr,
 581                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
 582                         "nat_tot (%d)\n",
 583                         dd->rank,
 584                         i + 1,
 585                         a + 1,
 586                         numAtomsInZones);
 587                 nerr++;
 588             }
 589             else
 590             {
 591                 have[a] = 1;
 592                 if (dd->globalAtomIndices[a] != i)
 593                 {
 594                     fprintf(stderr,
 595                             "DD rank %d: global atom %d marked as local atom %d, which has global "
 596                             "atom index %d\n",
 597                             dd->rank,
 598                             i + 1,
 599                             a + 1,
 600                             dd->globalAtomIndices[a] + 1);
 601                     nerr++;
 602                 }
 603             }
 604             ngl++;
 605         }
 606     }
 607     if (ngl != numAtomsInZones)
 608     {
 609         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
 610     }
 611     for (int a = 0; a < numAtomsInZones; a++)
 612     {
 613         if (have[a] == 0)
 614         {
 615             fprintf(stderr,
 616                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
 617                     dd->rank,
 618                     where,
 619                     a + 1,
 620                     dd->globalAtomIndices[a] + 1);
 621         }
 622     }
 623
 624     if (nerr > 0)
 625     {
 626         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
 627     }
 628 }
 629
 630 //! Clear all DD global state indices
 631 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
 632 {
 633     gmx_ga2la_t& ga2la = *dd->ga2la;
 634
 635     if (!keepLocalAtomIndices)
 636     {
 637         /* Clear the whole list without the overhead of searching */
 638         ga2la.clear();
 639     }
 640     else
 641     {
 642         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 643         for (int i = 0; i < numAtomsInZones; i++)
 644         {
 645             ga2la.erase(dd->globalAtomIndices[i]);
 646         }
 647     }
 648
 649     dd_clear_local_vsite_indices(dd);
 650
 651     if (dd->constraints)
 652     {
 653         dd_clear_local_constraint_indices(dd);
 654     }
 655 }
 656
 657 //! Return the duration of force calculations on this rank.
 658 static float dd_force_load(gmx_domdec_comm_t* comm)
 659 {
 660     float load;
 661
 662     if (comm->ddSettings.eFlop)
 663     {
 664         load = comm->flop;
 665         if (comm->ddSettings.eFlop > 1)
 666         {
 667             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
 668         }
 669     }
 670     else
 671     {
 672         load = comm->cycl[ddCyclF];
 673         if (comm->cycl_n[ddCyclF] > 1)
 674         {
 675             /* Subtract the maximum of the last n cycle counts
 676              * to get rid of possible high counts due to other sources,
 677              * for instance system activity, that would otherwise
 678              * affect the dynamic load balancing.
 679              */
 680             load -= comm->cycl_max[ddCyclF];
 681         }
 682
 683 #if GMX_MPI
 684         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 685         {
 686             float gpu_wait, gpu_wait_sum;
 687
 688             gpu_wait = comm->cycl[ddCyclWaitGPU];
 689             if (comm->cycl_n[ddCyclF] > 1)
 690             {
 691                 /* We should remove the WaitGPU time of the same MD step
 692                  * as the one with the maximum F time, since the F time
 693                  * and the wait time are not independent.
 694                  * Furthermore, the step for the max F time should be chosen
 695                  * the same on all ranks that share the same GPU.
 696                  * But to keep the code simple, we remove the average instead.
 697                  * The main reason for artificially long times at some steps
 698                  * is spurious CPU activity or MPI time, so we don't expect
 699                  * that changes in the GPU wait time matter a lot here.
 700                  */
 701                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
 702             }
 703             /* Sum the wait times over the ranks that share the same GPU */
 704             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
 705             /* Replace the wait time by the average over the ranks */
 706             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
 707         }
 708 #endif
 709     }
 710
 711     return load;
 712 }
 713
 714 //! Runs cell size checks and communicates the boundaries.
 715 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
 716 {
 717     gmx_domdec_comm_t* comm;
 718     int                dim_ind, dim;
 719
 720     comm = dd->comm;
 721
 722     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 723     {
 724         dim = dd->dim[dim_ind];
 725
 726         /* Without PBC we don't have restrictions on the outer cells */
 727         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
 728             && isDlbOn(comm)
 729             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
 730         {
 731             char buf[22];
 732             gmx_fatal(FARGS,
 733                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
 734                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
 735                       "%d %d %d",
 736                       gmx_step_str(step, buf),
 737                       dim2char(dim),
 738                       comm->cell_x1[dim] - comm->cell_x0[dim],
 739                       ddbox->skew_fac[dim],
 740                       dd->comm->cellsize_min[dim],
 741                       dd->ci[XX],
 742                       dd->ci[YY],
 743                       dd->ci[ZZ]);
 744         }
 745     }
 746
 747     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 748     {
 749         /* Communicate the boundaries and update cell_ns_x0/1 */
 750         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 751         if (isDlbOn(dd->comm) && dd->ndim > 1)
 752         {
 753             gmx::check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
 754         }
 755     }
 756 }
 757
 758 //! Compute and communicate to determine the load distribution across PP ranks.
 759 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
 760 {
 761     gmx_domdec_comm_t* comm;
 762     domdec_load_t*     load;
 763     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 764     gmx_bool           bSepPME;
 765
 766     if (debug)
 767     {
 768         fprintf(debug, "get_load_distribution start\n");
 769     }
 770
 771     wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
 772
 773     comm = dd->comm;
 774
 775     bSepPME = (dd->pme_nodeid >= 0);
 776
 777     if (dd->ndim == 0 && bSepPME)
 778     {
 779         /* Without decomposition, but with PME nodes, we need the load */
 780         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 781         comm->load[0].pme = comm->cycl[ddCyclPME];
 782     }
 783
 784     // Either we have DLB off, or we have it on and the array is large enough
 785     GMX_ASSERT(!isDlbOn(dd->comm) || static_cast<int>(dd->comm->cellsizesWithDlb.size()) == dd->ndim,
 786                "DLB cell sizes data not set up properly ");
 787     for (int d = dd->ndim - 1; d >= 0; d--)
 788     {
 789         const int dim = dd->dim[d];
 790         /* Check if we participate in the communication in this dimension */
 791         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
 792         {
 793             load = &comm->load[d];
 794             if (isDlbOn(dd->comm))
 795             {
 796                 cell_frac = comm->cellsizesWithDlb[d].fracUpper - comm->cellsizesWithDlb[d].fracLower;
 797             }
 798             int pos = 0;
 799             if (d == dd->ndim - 1)
 800             {
 801                 sbuf[pos++] = dd_force_load(comm);
 802                 sbuf[pos++] = sbuf[0];
 803                 if (isDlbOn(dd->comm))
 804                 {
 805                     sbuf[pos++] = sbuf[0];
 806                     sbuf[pos++] = cell_frac;
 807                     if (d > 0)
 808                     {
 809                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 810                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 811                     }
 812                 }
 813                 if (bSepPME)
 814                 {
 815                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 816                     sbuf[pos++] = comm->cycl[ddCyclPME];
 817                 }
 818             }
 819             else
 820             {
 821                 sbuf[pos++] = comm->load[d + 1].sum;
 822                 sbuf[pos++] = comm->load[d + 1].max;
 823                 if (isDlbOn(dd->comm))
 824                 {
 825                     sbuf[pos++] = comm->load[d + 1].sum_m;
 826                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
 827                     sbuf[pos++] = comm->load[d + 1].flags;
 828                     if (d > 0)
 829                     {
 830                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 831                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 832                     }
 833                 }
 834                 if (bSepPME)
 835                 {
 836                     sbuf[pos++] = comm->load[d + 1].mdf;
 837                     sbuf[pos++] = comm->load[d + 1].pme;
 838                 }
 839             }
 840             load->nload = pos;
 841             /* Communicate a row in DD direction d.
 842              * The communicators are setup such that the root always has rank 0.
 843              */
 844 #if GMX_MPI
 845             MPI_Gather(sbuf,
 846                        load->nload * sizeof(float),
 847                        MPI_BYTE,
 848                        load->load,
 849                        load->nload * sizeof(float),
 850                        MPI_BYTE,
 851                        0,
 852                        comm->mpi_comm_load[d]);
 853 #endif
 854             if (dd->ci[dim] == dd->master_ci[dim])
 855             {
 856                 /* We are the master along this row, process this row */
 857                 RowMaster* rowMaster = nullptr;
 858
 859                 if (isDlbOn(comm))
 860                 {
 861                     rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
 862                 }
 863                 load->sum      = 0;
 864                 load->max      = 0;
 865                 load->sum_m    = 0;
 866                 load->cvol_min = 1;
 867                 load->flags    = 0;
 868                 load->mdf      = 0;
 869                 load->pme      = 0;
 870                 int pos        = 0;
 871                 for (int i = 0; i < dd->numCells[dim]; i++)
 872                 {
 873                     load->sum += load->load[pos++];
 874                     load->max = std::max(load->max, load->load[pos]);
 875                     pos++;
 876                     if (isDlbOn(dd->comm))
 877                     {
 878                         if (rowMaster->dlbIsLimited)
 879                         {
 880                             /* This direction could not be load balanced properly,
 881                              * therefore we need to use the maximum iso the average load.
 882                              */
 883                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 884                         }
 885                         else
 886                         {
 887                             load->sum_m += load->load[pos];
 888                         }
 889                         pos++;
 890                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 891                         pos++;
 892                         if (d < dd->ndim - 1)
 893                         {
 894                             load->flags = gmx::roundToInt(load->load[pos++]);
 895                         }
 896                         if (d > 0)
 897                         {
 898                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 899                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 900                         }
 901                     }
 902                     if (bSepPME)
 903                     {
 904                         load->mdf = std::max(load->mdf, load->load[pos]);
 905                         pos++;
 906                         load->pme = std::max(load->pme, load->load[pos]);
 907                         pos++;
 908                     }
 909                 }
 910                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 911                 {
 912                     load->sum_m *= dd->numCells[dim];
 913                     load->flags |= (1 << d);
 914                 }
 915             }
 916         }
 917     }
 918
 919     if (DDMASTER(dd))
 920     {
 921         comm->nload += dd_load_count(comm);
 922         comm->load_step += comm->cycl[ddCyclStep];
 923         comm->load_sum += comm->load[0].sum;
 924         comm->load_max += comm->load[0].max;
 925         if (isDlbOn(comm))
 926         {
 927             for (int d = 0; d < dd->ndim; d++)
 928             {
 929                 if (comm->load[0].flags & (1 << d))
 930                 {
 931                     comm->load_lim[d]++;
 932                 }
 933             }
 934         }
 935         if (bSepPME)
 936         {
 937             comm->load_mdf += comm->load[0].mdf;
 938             comm->load_pme += comm->load[0].pme;
 939         }
 940     }
 941
 942     wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
 943
 944     if (debug)
 945     {
 946         fprintf(debug, "get_load_distribution finished\n");
 947     }
 948 }
 949
 950 /*! \brief Return the relative performance loss on the total run time
 951  * due to the force calculation load imbalance. */
 952 static float dd_force_load_fraction(gmx_domdec_t* dd)
 953 {
 954     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 955     {
 956         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
 957     }
 958     else
 959     {
 960         return 0;
 961     }
 962 }
 963
 964 /*! \brief Return the relative performance loss on the total run time
 965  * due to the force calculation load imbalance. */
 966 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
 967 {
 968     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 969     {
 970         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
 971     }
 972     else
 973     {
 974         return 0;
 975     }
 976 }
 977
 978 //! Print load-balance report e.g. at the end of a run.
 979 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
 980 {
 981     gmx_domdec_comm_t* comm = dd->comm;
 982
 983     /* Only the master rank prints loads and only if we measured loads */
 984     if (!DDMASTER(dd) || comm->nload == 0)
 985     {
 986         return;
 987     }
 988
 989     char buf[STRLEN];
 990     int  numPpRanks  = dd->nnodes;
 991     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
 992     int  numRanks    = numPpRanks + numPmeRanks;
 993     float lossFraction = 0;
 994
 995     /* Print the average load imbalance and performance loss */
 996     if (dd->nnodes > 1 && comm->load_sum > 0)
 997     {
 998         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
 999         lossFraction    = dd_force_imb_perf_loss(dd);
1000
1001         std::string msg = "\nDynamic load balancing report:\n";
1002         std::string dlbStateStr;
1003
1004         switch (dd->comm->dlbState)
1005         {
1006             case DlbState::offUser:
1007                 dlbStateStr = "DLB was off during the run per user request.";
1008                 break;
1009             case DlbState::offForever:
1010                 /* Currectly this can happen due to performance loss observed, cell size
1011                  * limitations or incompatibility with other settings observed during
1012                  * determineInitialDlbState(). */
1013                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1014                 break;
1015             case DlbState::offCanTurnOn:
1016                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1017                 break;
1018             case DlbState::offTemporarilyLocked:
1019                 dlbStateStr =
1020                         "DLB was locked at the end of the run due to unfinished PP-PME "
1021                         "balancing.";
1022                 break;
1023             case DlbState::onCanTurnOff:
1024                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1025                 break;
1026             case DlbState::onUser:
1027                 dlbStateStr = "DLB was permanently on during the run per user request.";
1028                 break;
1029             default: GMX_ASSERT(false, "Undocumented DLB state");
1030         }
1031
1032         msg += " " + dlbStateStr + "\n";
1033         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1034         msg += gmx::formatString(
1035                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1036                 "this.\n",
1037                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1038         msg += gmx::formatString(
1039                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1040                 lossFraction * 100);
1041         fprintf(fplog, "%s", msg.c_str());
1042         fprintf(stderr, "\n%s", msg.c_str());
1043     }
1044
1045     /* Print during what percentage of steps the  load balancing was limited */
1046     bool dlbWasLimited = false;
1047     if (isDlbOn(comm))
1048     {
1049         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1050         for (int d = 0; d < dd->ndim; d++)
1051         {
1052             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1053             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1054             if (limitPercentage >= 50)
1055             {
1056                 dlbWasLimited = true;
1057             }
1058         }
1059         sprintf(buf + strlen(buf), "\n");
1060         fprintf(fplog, "%s", buf);
1061         fprintf(stderr, "%s", buf);
1062     }
1063
1064     /* Print the performance loss due to separate PME - PP rank imbalance */
1065     float lossFractionPme = 0;
1066     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1067     {
1068         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1069         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1070         if (lossFractionPme <= 0)
1071         {
1072             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1073         }
1074         else
1075         {
1076             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1077         }
1078         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1079         fprintf(fplog, "%s", buf);
1080         fprintf(stderr, "%s", buf);
1081         sprintf(buf,
1082                 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1083                 std::fabs(lossFractionPme) * 100);
1084         fprintf(fplog, "%s", buf);
1085         fprintf(stderr, "%s", buf);
1086     }
1087     fprintf(fplog, "\n");
1088     fprintf(stderr, "\n");
1089
1090     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1091     {
1092         std::string message = gmx::formatString(
1093                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1094                 "      in the domain decomposition.\n",
1095                 lossFraction * 100);
1096
1097         bool hadSuggestion = false;
1098         if (dd->comm->dlbState == DlbState::offUser)
1099         {
1100             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1101             hadSuggestion = true;
1102         }
1103         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1104         {
1105             message +=
1106                     "      Dynamic load balancing was automatically disabled, but it might be "
1107                     "beneficial to manually tuning it on (option -dlb on.)\n";
1108             hadSuggestion = true;
1109         }
1110         else if (dlbWasLimited)
1111         {
1112             message +=
1113                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1114                     "and/or -dds).\n";
1115             hadSuggestion = true;
1116         }
1117         message += gmx::formatString(
1118                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1119                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1120                 "      considerable inhomogeneity in the simulated system.",
1121                 hadSuggestion ? "also " : "");
1122
1123         fprintf(fplog, "%s\n", message.c_str());
1124         fprintf(stderr, "%s\n", message.c_str());
1125     }
1126     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1127     {
1128         sprintf(buf,
1129                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1130                 "      had %s work to do than the PP ranks.\n"
1131                 "      You might want to %s the number of PME ranks\n"
1132                 "      or %s the cut-off and the grid spacing.\n",
1133                 std::fabs(lossFractionPme * 100),
1134                 (lossFractionPme < 0) ? "less" : "more",
1135                 (lossFractionPme < 0) ? "decrease" : "increase",
1136                 (lossFractionPme < 0) ? "decrease" : "increase");
1137         fprintf(fplog, "%s\n", buf);
1138         fprintf(stderr, "%s\n", buf);
1139     }
1140 }
1141
1142 //! Return the minimum communication volume.
1143 static float dd_vol_min(gmx_domdec_t* dd)
1144 {
1145     return dd->comm->load[0].cvol_min * dd->nnodes;
1146 }
1147
1148 //! Return the DD load flags.
1149 static int dd_load_flags(gmx_domdec_t* dd)
1150 {
1151     return dd->comm->load[0].flags;
1152 }
1153
1154 //! Return the reported load imbalance in force calculations.
1155 static float dd_f_imbal(gmx_domdec_t* dd)
1156 {
1157     if (dd->comm->load[0].sum > 0)
1158     {
1159         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1160     }
1161     else
1162     {
1163         /* Something is wrong in the cycle counting, report no load imbalance */
1164         return 0.0F;
1165     }
1166 }
1167
1168 //! Returns DD load balance report.
1169 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1170 {
1171     gmx::StringOutputStream stream;
1172     gmx::TextWriter         log(&stream);
1173
1174     int flags = dd_load_flags(dd);
1175     if (flags)
1176     {
1177         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1178         for (int d = 0; d < dd->ndim; d++)
1179         {
1180             if (flags & (1 << d))
1181             {
1182                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1183             }
1184         }
1185         log.ensureLineBreak();
1186     }
1187     log.writeString("DD  step " + gmx::toString(step));
1188     if (isDlbOn(dd->comm))
1189     {
1190         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1191     }
1192     if (dd->nnodes > 1)
1193     {
1194         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1195     }
1196     if (dd->comm->cycl_n[ddCyclPME])
1197     {
1198         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1199     }
1200     log.ensureLineBreak();
1201     return stream.toString();
1202 }
1203
1204 //! Prints DD load balance report in mdrun verbose mode.
1205 static void dd_print_load_verbose(gmx_domdec_t* dd)
1206 {
1207     if (isDlbOn(dd->comm))
1208     {
1209         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1210     }
1211     if (dd->nnodes > 1)
1212     {
1213         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1214     }
1215     if (dd->comm->cycl_n[ddCyclPME])
1216     {
1217         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1218     }
1219 }
1220
1221 //! Turns on dynamic load balancing if possible and needed.
1222 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1223 {
1224     gmx_domdec_comm_t* comm = dd->comm;
1225
1226     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1227     for (int d = 1; d < dd->ndim; d++)
1228     {
1229         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1230     }
1231
1232     /* Turn off DLB if we're too close to the cell size limit. */
1233     if (cellsize_min < comm->cellsize_limit * 1.05)
1234     {
1235         GMX_LOG(mdlog.info)
1236                 .appendTextFormatted(
1237                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1238                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1239                         "Will no longer try dynamic load balancing.",
1240                         gmx::toString(step).c_str(),
1241                         dd_force_imb_perf_loss(dd) * 100);
1242
1243         comm->dlbState = DlbState::offForever;
1244         return;
1245     }
1246
1247     GMX_LOG(mdlog.info)
1248             .appendTextFormatted(
1249                     "step %s Turning on dynamic load balancing, because the performance loss due "
1250                     "to load imbalance is %.1f %%.",
1251                     gmx::toString(step).c_str(),
1252                     dd_force_imb_perf_loss(dd) * 100);
1253     comm->dlbState = DlbState::onCanTurnOff;
1254
1255     /* Store the non-DLB performance, so we can check if DLB actually
1256      * improves performance.
1257      */
1258     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1259                        "When we turned on DLB, we should have measured cycles");
1260     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1261
1262     set_dlb_limits(dd);
1263
1264     /* We can set the required cell size info here,
1265      * so we do not need to communicate this.
1266      * The grid is completely uniform.
1267      */
1268     for (int d = 0; d < dd->ndim; d++)
1269     {
1270         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1271
1272         if (rowMaster)
1273         {
1274             comm->load[d].sum_m = comm->load[d].sum;
1275
1276             int nc = dd->numCells[dd->dim[d]];
1277             for (int i = 0; i < nc; i++)
1278             {
1279                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1280                 if (d > 0)
1281                 {
1282                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1283                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1284                 }
1285             }
1286             rowMaster->cellFrac[nc] = 1.0;
1287         }
1288     }
1289 }
1290
1291 //! Turns off dynamic load balancing (but leave it able to turn back on).
1292 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1293 {
1294     GMX_LOG(mdlog.info)
1295             .appendText(
1296                     "step " + gmx::toString(step)
1297                     + " Turning off dynamic load balancing, because it is degrading performance.");
1298     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1299     dd->comm->haveTurnedOffDlb             = true;
1300     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1301 }
1302
1303 //! Turns off dynamic load balancing permanently.
1304 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1305 {
1306     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1307                        "Can only turn off DLB forever when it was in the can-turn-on state");
1308     GMX_LOG(mdlog.info)
1309             .appendText(
1310                     "step " + gmx::toString(step)
1311                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1312     dd->comm->dlbState = DlbState::offForever;
1313 }
1314
1315 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1316 {
1317     gmx_domdec_comm_t* comm;
1318
1319     comm = cr->dd->comm;
1320
1321     /* Turn on the DLB limiting (might have been on already) */
1322     comm->bPMELoadBalDLBLimits = TRUE;
1323
1324     /* Change the cut-off limit */
1325     comm->PMELoadBal_max_cutoff = cutoff;
1326
1327     if (debug)
1328     {
1329         fprintf(debug,
1330                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1331                 "continue to fit\n",
1332                 comm->PMELoadBal_max_cutoff);
1333     }
1334 }
1335
1336 //! Merge atom buffers.
1337 static void merge_cg_buffers(int                            ncell,
1338                              gmx_domdec_comm_dim_t*         cd,
1339                              int                            pulse,
1340                              int*                           ncg_cell,
1341                              gmx::ArrayRef<int>             index_gl,
1342                              const int*                     recv_i,
1343                              gmx::ArrayRef<gmx::RVec>       x,
1344                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1345                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1346                              gmx::ArrayRef<int>             cginfo)
1347 {
1348     gmx_domdec_ind_t *ind, *ind_p;
1349     int               p, cell, c, cg, cg0, cg1, cg_gl;
1350     int               shift;
1351
1352     ind = &cd->ind[pulse];
1353
1354     /* First correct the already stored data */
1355     shift = ind->nrecv[ncell];
1356     for (cell = ncell - 1; cell >= 0; cell--)
1357     {
1358         shift -= ind->nrecv[cell];
1359         if (shift > 0)
1360         {
1361             /* Move the cg's present from previous grid pulses */
1362             cg0 = ncg_cell[ncell + cell];
1363             cg1 = ncg_cell[ncell + cell + 1];
1364             for (cg = cg1 - 1; cg >= cg0; cg--)
1365             {
1366                 index_gl[cg + shift] = index_gl[cg];
1367                 x[cg + shift]        = x[cg];
1368                 cginfo[cg + shift]   = cginfo[cg];
1369             }
1370             /* Correct the already stored send indices for the shift */
1371             for (p = 1; p <= pulse; p++)
1372             {
1373                 ind_p = &cd->ind[p];
1374                 cg0   = 0;
1375                 for (c = 0; c < cell; c++)
1376                 {
1377                     cg0 += ind_p->nsend[c];
1378                 }
1379                 cg1 = cg0 + ind_p->nsend[cell];
1380                 for (cg = cg0; cg < cg1; cg++)
1381                 {
1382                     ind_p->index[cg] += shift;
1383                 }
1384             }
1385         }
1386     }
1387
1388     /* Merge in the communicated buffers */
1389     shift = 0;
1390     cg0   = 0;
1391     for (cell = 0; cell < ncell; cell++)
1392     {
1393         cg1 = ncg_cell[ncell + cell + 1] + shift;
1394         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1395         {
1396             /* Copy this atom from the buffer */
1397             index_gl[cg1] = recv_i[cg0];
1398             x[cg1]        = recv_vr[cg0];
1399             /* Copy information */
1400             cg_gl       = index_gl[cg1];
1401             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1402             cg0++;
1403             cg1++;
1404         }
1405         shift += ind->nrecv[cell];
1406         ncg_cell[ncell + cell + 1] = cg1;
1407     }
1408 }
1409
1410 //! Makes a range partitioning for the atom groups wthin a cell
1411 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1412 {
1413     /* Store the atom block boundaries for easy copying of communication buffers
1414      */
1415     int g = atomGroupStart;
1416     for (int zone = 0; zone < nzone; zone++)
1417     {
1418         for (gmx_domdec_ind_t& ind : cd->ind)
1419         {
1420             ind.cell2at0[zone] = g;
1421             g += ind.nrecv[zone];
1422             ind.cell2at1[zone] = g;
1423         }
1424     }
1425 }
1426
1427 //! Returns whether a link is missing.
1428 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1429 {
1430     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1431     {
1432         if (!ga2la.findHome(link.a[i]))
1433         {
1434             return true;
1435         }
1436     }
1437
1438     return false;
1439 }
1440
1441 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1442 typedef struct
1443 {
1444     //! The corners for the non-bonded communication.
1445     real c[DIM][4];
1446     //! Corner for rounding.
1447     real cr0;
1448     //! Corners for rounding.
1449     real cr1[4];
1450     //! Corners for bounded communication.
1451     real bc[DIM];
1452     //! Corner for rounding for bonded communication.
1453     real bcr1;
1454 } dd_corners_t;
1455
1456 //! Determine the corners of the domain(s) we are communicating with.
1457 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1458 {
1459     const gmx_domdec_comm_t*  comm;
1460     const gmx_domdec_zones_t* zones;
1461
1462     comm = dd->comm;
1463
1464     zones = &comm->zones;
1465
1466     /* Keep the compiler happy */
1467     c->cr0  = 0;
1468     c->bcr1 = 0;
1469
1470     /* The first dimension is equal for all cells */
1471     c->c[0][0] = comm->cell_x0[dim0];
1472     if (bDistMB)
1473     {
1474         c->bc[0] = c->c[0][0];
1475     }
1476     if (dd->ndim >= 2)
1477     {
1478         dim1 = dd->dim[1];
1479         /* This cell row is only seen from the first row */
1480         c->c[1][0] = comm->cell_x0[dim1];
1481         /* All rows can see this row */
1482         c->c[1][1] = comm->cell_x0[dim1];
1483         if (isDlbOn(dd->comm))
1484         {
1485             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1486             if (bDistMB)
1487             {
1488                 /* For the multi-body distance we need the maximum */
1489                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1490             }
1491         }
1492         /* Set the upper-right corner for rounding */
1493         c->cr0 = comm->cell_x1[dim0];
1494
1495         if (dd->ndim >= 3)
1496         {
1497             dim2 = dd->dim[2];
1498             for (int j = 0; j < 4; j++)
1499             {
1500                 c->c[2][j] = comm->cell_x0[dim2];
1501             }
1502             if (isDlbOn(dd->comm))
1503             {
1504                 /* Use the maximum of the i-cells that see a j-cell */
1505                 for (const auto& iZone : zones->iZones)
1506                 {
1507                     const int iZoneIndex = iZone.iZoneIndex;
1508                     for (int jZone : iZone.jZoneRange)
1509                     {
1510                         if (jZone >= 4)
1511                         {
1512                             c->c[2][jZone - 4] = std::max(
1513                                     c->c[2][jZone - 4],
1514                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1515                                             .mch0);
1516                         }
1517                     }
1518                 }
1519                 if (bDistMB)
1520                 {
1521                     /* For the multi-body distance we need the maximum */
1522                     c->bc[2] = comm->cell_x0[dim2];
1523                     for (int i = 0; i < 2; i++)
1524                     {
1525                         for (int j = 0; j < 2; j++)
1526                         {
1527                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1528                         }
1529                     }
1530                 }
1531             }
1532
1533             /* Set the upper-right corner for rounding */
1534             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1535              * Only cell (0,0,0) can see cell 7 (1,1,1)
1536              */
1537             c->cr1[0] = comm->cell_x1[dim1];
1538             c->cr1[3] = comm->cell_x1[dim1];
1539             if (isDlbOn(dd->comm))
1540             {
1541                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1542                 if (bDistMB)
1543                 {
1544                     /* For the multi-body distance we need the maximum */
1545                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1546                 }
1547             }
1548         }
1549     }
1550 }
1551
1552 /*! \brief Add the atom groups and coordinates we need to send in this
1553  * pulse from this zone to \p localAtomGroups and \p work. */
1554 static void get_zone_pulse_groups(gmx_domdec_t*                  dd,
1555                                   int                            zonei,
1556                                   int                            zone,
1557                                   int                            cg0,
1558                                   int                            cg1,
1559                                   gmx::ArrayRef<const int>       globalAtomGroupIndices,
1560                                   int                            dim,
1561                                   int                            dim_ind,
1562                                   int                            dim0,
1563                                   int                            dim1,
1564                                   int                            dim2,
1565                                   real                           r_comm2,
1566                                   real                           r_bcomm2,
1567                                   matrix                         box,
1568                                   bool                           distanceIsTriclinic,
1569                                   rvec*                          normal,
1570                                   real                           skew_fac2_d,
1571                                   real                           skew_fac_01,
1572                                   rvec*                          v_d,
1573                                   rvec*                          v_0,
1574                                   rvec*                          v_1,
1575                                   const dd_corners_t*            c,
1576                                   const rvec                     sf2_round,
1577                                   gmx_bool                       bDistBonded,
1578                                   gmx_bool                       bBondComm,
1579                                   gmx_bool                       bDist2B,
1580                                   gmx_bool                       bDistMB,
1581                                   gmx::ArrayRef<const gmx::RVec> coordinates,
1582                                   gmx::ArrayRef<const int>       cginfo,
1583                                   std::vector<int>*              localAtomGroups,
1584                                   dd_comm_setup_work_t*          work)
1585 {
1586     gmx_domdec_comm_t* comm;
1587     gmx_bool           bScrew;
1588     gmx_bool           bDistMB_pulse;
1589     int                cg, i;
1590     real               r2, rb2, r, tric_sh;
1591     rvec               rn, rb;
1592     int                dimd;
1593     int                nsend_z, nat;
1594
1595     comm = dd->comm;
1596
1597     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1598
1599     bDistMB_pulse = (bDistMB && bDistBonded);
1600
1601     /* Unpack the work data */
1602     std::vector<int>&       ibuf = work->atomGroupBuffer;
1603     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1604     nsend_z                      = 0;
1605     nat                          = work->nat;
1606
1607     for (cg = cg0; cg < cg1; cg++)
1608     {
1609         r2  = 0;
1610         rb2 = 0;
1611         if (!distanceIsTriclinic)
1612         {
1613             /* Rectangular direction, easy */
1614             r = coordinates[cg][dim] - c->c[dim_ind][zone];
1615             if (r > 0)
1616             {
1617                 r2 += r * r;
1618             }
1619             if (bDistMB_pulse)
1620             {
1621                 r = coordinates[cg][dim] - c->bc[dim_ind];
1622                 if (r > 0)
1623                 {
1624                     rb2 += r * r;
1625                 }
1626             }
1627             /* Rounding gives at most a 16% reduction
1628              * in communicated atoms
1629              */
1630             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1631             {
1632                 r = coordinates[cg][dim0] - c->cr0;
1633                 /* This is the first dimension, so always r >= 0 */
1634                 r2 += r * r;
1635                 if (bDistMB_pulse)
1636                 {
1637                     rb2 += r * r;
1638                 }
1639             }
1640             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1641             {
1642                 r = coordinates[cg][dim1] - c->cr1[zone];
1643                 if (r > 0)
1644                 {
1645                     r2 += r * r;
1646                 }
1647                 if (bDistMB_pulse)
1648                 {
1649                     r = coordinates[cg][dim1] - c->bcr1;
1650                     if (r > 0)
1651                     {
1652                         rb2 += r * r;
1653                     }
1654                 }
1655             }
1656         }
1657         else
1658         {
1659             /* Triclinic direction, more complicated */
1660             clear_rvec(rn);
1661             clear_rvec(rb);
1662             /* Rounding, conservative as the skew_fac multiplication
1663              * will slightly underestimate the distance.
1664              */
1665             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1666             {
1667                 rn[dim0] = coordinates[cg][dim0] - c->cr0;
1668                 for (i = dim0 + 1; i < DIM; i++)
1669                 {
1670                     rn[dim0] -= coordinates[cg][i] * v_0[i][dim0];
1671                 }
1672                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1673                 if (bDistMB_pulse)
1674                 {
1675                     rb[dim0] = rn[dim0];
1676                     rb2      = r2;
1677                 }
1678                 /* Take care that the cell planes along dim0 might not
1679                  * be orthogonal to those along dim1 and dim2.
1680                  */
1681                 for (i = 1; i <= dim_ind; i++)
1682                 {
1683                     dimd = dd->dim[i];
1684                     if (normal[dim0][dimd] > 0)
1685                     {
1686                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1687                         if (bDistMB_pulse)
1688                         {
1689                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1690                         }
1691                     }
1692                 }
1693             }
1694             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1695             {
1696                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1697                 rn[dim1] += coordinates[cg][dim1] - c->cr1[zone];
1698                 tric_sh = 0;
1699                 for (i = dim1 + 1; i < DIM; i++)
1700                 {
1701                     tric_sh -= coordinates[cg][i] * v_1[i][dim1];
1702                 }
1703                 rn[dim1] += tric_sh;
1704                 if (rn[dim1] > 0)
1705                 {
1706                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1707                     /* Take care of coupling of the distances
1708                      * to the planes along dim0 and dim1 through dim2.
1709                      */
1710                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1711                     /* Take care that the cell planes along dim1
1712                      * might not be orthogonal to that along dim2.
1713                      */
1714                     if (normal[dim1][dim2] > 0)
1715                     {
1716                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1717                     }
1718                 }
1719                 if (bDistMB_pulse)
1720                 {
1721                     rb[dim1] += coordinates[cg][dim1] - c->bcr1 + tric_sh;
1722                     if (rb[dim1] > 0)
1723                     {
1724                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1725                         /* Take care of coupling of the distances
1726                          * to the planes along dim0 and dim1 through dim2.
1727                          */
1728                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1729                         /* Take care that the cell planes along dim1
1730                          * might not be orthogonal to that along dim2.
1731                          */
1732                         if (normal[dim1][dim2] > 0)
1733                         {
1734                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1735                         }
1736                     }
1737                 }
1738             }
1739             /* The distance along the communication direction */
1740             rn[dim] += coordinates[cg][dim] - c->c[dim_ind][zone];
1741             tric_sh = 0;
1742             for (i = dim + 1; i < DIM; i++)
1743             {
1744                 tric_sh -= coordinates[cg][i] * v_d[i][dim];
1745             }
1746             rn[dim] += tric_sh;
1747             if (rn[dim] > 0)
1748             {
1749                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1750                 /* Take care of coupling of the distances
1751                  * to the planes along dim0 and dim1 through dim2.
1752                  */
1753                 if (dim_ind == 1 && zonei == 1)
1754                 {
1755                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1756                 }
1757             }
1758             if (bDistMB_pulse)
1759             {
1760                 clear_rvec(rb);
1761                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1762                 rb[dim] += coordinates[cg][dim] - c->bc[dim_ind] + tric_sh;
1763                 if (rb[dim] > 0)
1764                 {
1765                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1766                     /* Take care of coupling of the distances
1767                      * to the planes along dim0 and dim1 through dim2.
1768                      */
1769                     if (dim_ind == 1 && zonei == 1)
1770                     {
1771                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1772                     }
1773                 }
1774             }
1775         }
1776
1777         if (r2 < r_comm2
1778             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1779                 && (!bBondComm
1780                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1781                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1782         {
1783             /* Store the local and global atom group indices and position */
1784             localAtomGroups->push_back(cg);
1785             ibuf.push_back(globalAtomGroupIndices[cg]);
1786             nsend_z++;
1787
1788             rvec posPbc;
1789             if (dd->ci[dim] == 0)
1790             {
1791                 /* Correct coordinates for pbc */
1792                 rvec_add(coordinates[cg], box[dim], posPbc);
1793                 if (bScrew)
1794                 {
1795                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1796                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1797                 }
1798             }
1799             else
1800             {
1801                 copy_rvec(coordinates[cg], posPbc);
1802             }
1803             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1804
1805             nat += 1;
1806         }
1807     }
1808
1809     work->nat        = nat;
1810     work->nsend_zone = nsend_z;
1811 }
1812
1813 //! Clear data.
1814 static void clearCommSetupData(dd_comm_setup_work_t* work)
1815 {
1816     work->localAtomGroupBuffer.clear();
1817     work->atomGroupBuffer.clear();
1818     work->positionBuffer.clear();
1819     work->nat        = 0;
1820     work->nsend_zone = 0;
1821 }
1822
1823 //! Prepare DD communication.
1824 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1825 {
1826     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1827     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1828     int                    c;
1829     int *                  zone_cg_range, pos_cg;
1830     gmx_domdec_comm_t*     comm;
1831     gmx_domdec_zones_t*    zones;
1832     gmx_domdec_comm_dim_t* cd;
1833     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1834     dd_corners_t           corners;
1835     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1836     real                   skew_fac2_d, skew_fac_01;
1837     rvec                   sf2_round;
1838
1839     if (debug)
1840     {
1841         fprintf(debug, "Setting up DD communication\n");
1842     }
1843
1844     comm = dd->comm;
1845
1846     if (comm->dth.empty())
1847     {
1848         /* Initialize the thread data.
1849          * This can not be done in init_domain_decomposition,
1850          * as the numbers of threads is determined later.
1851          */
1852         int numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Domdec);
1853         comm->dth.resize(numThreads);
1854     }
1855
1856     bBondComm = comm->systemInfo.filterBondedCommunication;
1857
1858     /* Do we need to determine extra distances for multi-body bondeds? */
1859     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1860
1861     /* Do we need to determine extra distances for only two-body bondeds? */
1862     bDist2B = (bBondComm && !bDistMB);
1863
1864     const real r_comm2 =
1865             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1866     const real r_bcomm2 =
1867             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1868
1869     if (debug)
1870     {
1871         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1872     }
1873
1874     zones = &comm->zones;
1875
1876     dim0 = dd->dim[0];
1877     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1878     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1879
1880     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1881
1882     /* Triclinic stuff */
1883     normal      = ddbox->normal;
1884     skew_fac_01 = 0;
1885     if (dd->ndim >= 2)
1886     {
1887         v_0 = ddbox->v[dim0];
1888         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1889         {
1890             /* Determine the coupling coefficient for the distances
1891              * to the cell planes along dim0 and dim1 through dim2.
1892              * This is required for correct rounding.
1893              */
1894             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1895             if (debug)
1896             {
1897                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1898             }
1899         }
1900     }
1901     if (dd->ndim >= 3)
1902     {
1903         v_1 = ddbox->v[dim1];
1904     }
1905
1906     zone_cg_range                        = zones->cg_range.data();
1907     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1908
1909     zone_cg_range[0]   = 0;
1910     zone_cg_range[1]   = dd->ncg_home;
1911     comm->zone_ncg1[0] = dd->ncg_home;
1912     pos_cg             = dd->ncg_home;
1913
1914     nat_tot = comm->atomRanges.numHomeAtoms();
1915     nzone   = 1;
1916     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1917     {
1918         dim = dd->dim[dim_ind];
1919         cd  = &comm->cd[dim_ind];
1920
1921         /* Check if we need to compute triclinic distances along this dim */
1922         bool distanceIsTriclinic = false;
1923         for (int i = 0; i <= dim_ind; i++)
1924         {
1925             if (ddbox->tric_dir[dd->dim[i]])
1926             {
1927                 distanceIsTriclinic = true;
1928             }
1929         }
1930
1931         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1932         {
1933             /* No pbc in this dimension, the first node should not comm. */
1934             nzone_send = 0;
1935         }
1936         else
1937         {
1938             nzone_send = nzone;
1939         }
1940
1941         v_d         = ddbox->v[dim];
1942         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1943
1944         cd->receiveInPlace = true;
1945         for (int p = 0; p < cd->numPulses(); p++)
1946         {
1947             /* Only atoms communicated in the first pulse are used
1948              * for multi-body bonded interactions or for bBondComm.
1949              */
1950             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1951
1952             gmx_domdec_ind_t* ind = &cd->ind[p];
1953
1954             /* Thread 0 writes in the global index array */
1955             ind->index.clear();
1956             clearCommSetupData(&comm->dth[0]);
1957
1958             for (zone = 0; zone < nzone_send; zone++)
1959             {
1960                 if (dim_ind > 0 && distanceIsTriclinic)
1961                 {
1962                     /* Determine slightly more optimized skew_fac's
1963                      * for rounding.
1964                      * This reduces the number of communicated atoms
1965                      * by about 10% for 3D DD of rhombic dodecahedra.
1966                      */
1967                     for (dimd = 0; dimd < dim; dimd++)
1968                     {
1969                         sf2_round[dimd] = 1;
1970                         if (ddbox->tric_dir[dimd])
1971                         {
1972                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1973                             {
1974                                 /* If we are shifted in dimension i
1975                                  * and the cell plane is tilted forward
1976                                  * in dimension i, skip this coupling.
1977                                  */
1978                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1979                                 {
1980                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1981                                 }
1982                             }
1983                             sf2_round[dimd] = 1 / sf2_round[dimd];
1984                         }
1985                     }
1986                 }
1987
1988                 zonei = zone_perm[dim_ind][zone];
1989                 if (p == 0)
1990                 {
1991                     /* Here we permutate the zones to obtain a convenient order
1992                      * for neighbor searching
1993                      */
1994                     cg0 = zone_cg_range[zonei];
1995                     cg1 = zone_cg_range[zonei + 1];
1996                 }
1997                 else
1998                 {
1999                     /* Look only at the cg's received in the previous grid pulse
2000                      */
2001                     cg1 = zone_cg_range[nzone + zone + 1];
2002                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
2003                 }
2004
2005                 const int numThreads = gmx::ssize(comm->dth);
2006 #pragma omp parallel for num_threads(numThreads) schedule(static)
2007                 for (int th = 0; th < numThreads; th++)
2008                 {
2009                     try
2010                     {
2011                         dd_comm_setup_work_t& work = comm->dth[th];
2012
2013                         /* Retain data accumulated into buffers of thread 0 */
2014                         if (th > 0)
2015                         {
2016                             clearCommSetupData(&work);
2017                         }
2018
2019                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2020                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2021
2022                         /* Get the atom groups and coordinates for this pulse in this zone */
2023                         get_zone_pulse_groups(dd,
2024                                               zonei,
2025                                               zone,
2026                                               cg0_th,
2027                                               cg1_th,
2028                                               dd->globalAtomGroupIndices,
2029                                               dim,
2030                                               dim_ind,
2031                                               dim0,
2032                                               dim1,
2033                                               dim2,
2034                                               r_comm2,
2035                                               r_bcomm2,
2036                                               box,
2037                                               distanceIsTriclinic,
2038                                               normal,
2039                                               skew_fac2_d,
2040                                               skew_fac_01,
2041                                               v_d,
2042                                               v_0,
2043                                               v_1,
2044                                               &corners,
2045                                               sf2_round,
2046                                               bDistBonded,
2047                                               bBondComm,
2048                                               bDist2B,
2049                                               bDistMB,
2050                                               state->x,
2051                                               fr->cginfo,
2052                                               th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2053                                               &work);
2054                     }
2055                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2056                 } // END
2057
2058                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2059                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2060                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2061                 /* Append data of threads>=1 to the communication buffers */
2062                 for (int th = 1; th < numThreads; th++)
2063                 {
2064                     const dd_comm_setup_work_t& dth = comm->dth[th];
2065
2066                     ind->index.insert(ind->index.end(),
2067                                       dth.localAtomGroupBuffer.begin(),
2068                                       dth.localAtomGroupBuffer.end());
2069                     atomGroups.insert(
2070                             atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2071                     positions.insert(
2072                             positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2073                     comm->dth[0].nat += dth.nat;
2074                     ind->nsend[zone] += dth.nsend_zone;
2075                 }
2076             }
2077             /* Clear the counts in case we do not have pbc */
2078             for (zone = nzone_send; zone < nzone; zone++)
2079             {
2080                 ind->nsend[zone] = 0;
2081             }
2082             ind->nsend[nzone]     = ind->index.size();
2083             ind->nsend[nzone + 1] = comm->dth[0].nat;
2084             /* Communicate the number of cg's and atoms to receive */
2085             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2086
2087             if (p > 0)
2088             {
2089                 /* We can receive in place if only the last zone is not empty */
2090                 for (zone = 0; zone < nzone - 1; zone++)
2091                 {
2092                     if (ind->nrecv[zone] > 0)
2093                     {
2094                         cd->receiveInPlace = false;
2095                     }
2096                 }
2097             }
2098
2099             int receiveBufferSize = 0;
2100             if (!cd->receiveInPlace)
2101             {
2102                 receiveBufferSize = ind->nrecv[nzone];
2103             }
2104             /* These buffer are actually only needed with in-place */
2105             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2106             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2107
2108             dd_comm_setup_work_t& work = comm->dth[0];
2109
2110             /* Make space for the global cg indices */
2111             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2112             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2113             /* Communicate the global cg indices */
2114             gmx::ArrayRef<int> integerBufferRef;
2115             if (cd->receiveInPlace)
2116             {
2117                 integerBufferRef = gmx::arrayRefFromArray(
2118                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2119             }
2120             else
2121             {
2122                 integerBufferRef = globalAtomGroupBuffer.buffer;
2123             }
2124             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2125
2126             /* Make space for cginfo */
2127             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2128
2129             /* Communicate the coordinates */
2130             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2131             if (cd->receiveInPlace)
2132             {
2133                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2134             }
2135             else
2136             {
2137                 rvecBufferRef = rvecBuffer.buffer;
2138             }
2139             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2140
2141             /* Make the charge group index */
2142             if (cd->receiveInPlace)
2143             {
2144                 zone = (p == 0 ? 0 : nzone - 1);
2145                 while (zone < nzone)
2146                 {
2147                     for (int i = 0; i < ind->nrecv[zone]; i++)
2148                     {
2149                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2150                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2151                         pos_cg++;
2152                     }
2153                     if (p == 0)
2154                     {
2155                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2156                     }
2157                     zone++;
2158                     zone_cg_range[nzone + zone] = pos_cg;
2159                 }
2160             }
2161             else
2162             {
2163                 /* This part of the code is never executed with bBondComm. */
2164                 merge_cg_buffers(nzone,
2165                                  cd,
2166                                  p,
2167                                  zone_cg_range,
2168                                  dd->globalAtomGroupIndices,
2169                                  integerBufferRef.data(),
2170                                  state->x,
2171                                  rvecBufferRef,
2172                                  fr->cginfo_mb,
2173                                  fr->cginfo);
2174                 pos_cg += ind->nrecv[nzone];
2175             }
2176             nat_tot += ind->nrecv[nzone + 1];
2177         }
2178         if (!cd->receiveInPlace)
2179         {
2180             /* Store the atom block for easy copying of communication buffers */
2181             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2182         }
2183         nzone += nzone;
2184     }
2185
2186     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2187
2188     if (!bBondComm)
2189     {
2190         /* We don't need to update cginfo, since that was alrady done above.
2191          * So we pass NULL for the forcerec.
2192          */
2193         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2194     }
2195
2196     if (debug)
2197     {
2198         fprintf(debug, "Finished setting up DD communication, zones:");
2199         for (c = 0; c < zones->n; c++)
2200         {
2201             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2202         }
2203         fprintf(debug, "\n");
2204     }
2205 }
2206
2207 //! Set boundaries for the charge group range.
2208 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2209 {
2210     for (auto& iZone : zones->iZones)
2211     {
2212         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2213         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2214                                            zones->cg_range[iZone.jZoneRange.end()]);
2215     }
2216 }
2217
2218 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2219  *
2220  * Also sets the atom density for the home zone when \p zone_start=0.
2221  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2222  * how many charge groups will move but are still part of the current range.
2223  * \todo When converting domdec to use proper classes, all these variables
2224  *       should be private and a method should return the correct count
2225  *       depending on an internal state.
2226  *
2227  * \param[in,out] dd          The domain decomposition struct
2228  * \param[in]     box         The box
2229  * \param[in]     ddbox       The domain decomposition box struct
2230  * \param[in]     zone_start  The start of the zone range to set sizes for
2231  * \param[in]     zone_end    The end of the zone range to set sizes for
2232  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2233  */
2234 static void set_zones_size(gmx_domdec_t*      dd,
2235                            matrix             box,
2236                            const gmx_ddbox_t* ddbox,
2237                            int                zone_start,
2238                            int                zone_end,
2239                            int                numMovedChargeGroupsInHomeZone)
2240 {
2241     gmx_domdec_comm_t*  comm;
2242     gmx_domdec_zones_t* zones;
2243     gmx_bool            bDistMB;
2244     int                 z, d, dim;
2245     real                rcs, rcmbs;
2246     int                 i, j;
2247     real                vol;
2248
2249     comm = dd->comm;
2250
2251     zones = &comm->zones;
2252
2253     /* Do we need to determine extra distances for multi-body bondeds? */
2254     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2255
2256     for (z = zone_start; z < zone_end; z++)
2257     {
2258         /* Copy cell limits to zone limits.
2259          * Valid for non-DD dims and non-shifted dims.
2260          */
2261         copy_rvec(comm->cell_x0, zones->size[z].x0);
2262         copy_rvec(comm->cell_x1, zones->size[z].x1);
2263     }
2264
2265     for (d = 0; d < dd->ndim; d++)
2266     {
2267         dim = dd->dim[d];
2268
2269         for (z = 0; z < zones->n; z++)
2270         {
2271             /* With a staggered grid we have different sizes
2272              * for non-shifted dimensions.
2273              */
2274             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2275             {
2276                 if (d == 1)
2277                 {
2278                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2279                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2280                 }
2281                 else if (d == 2)
2282                 {
2283                     zones->size[z].x0[dim] =
2284                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2285                                     .min0;
2286                     zones->size[z].x1[dim] =
2287                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2288                                     .max1;
2289                 }
2290             }
2291         }
2292
2293         rcs   = comm->systemInfo.cutoff;
2294         rcmbs = comm->cutoff_mbody;
2295         if (ddbox->tric_dir[dim])
2296         {
2297             rcs /= ddbox->skew_fac[dim];
2298             rcmbs /= ddbox->skew_fac[dim];
2299         }
2300
2301         /* Set the lower limit for the shifted zone dimensions */
2302         for (z = zone_start; z < zone_end; z++)
2303         {
2304             if (zones->shift[z][dim] > 0)
2305             {
2306                 dim = dd->dim[d];
2307                 if (!isDlbOn(dd->comm) || d == 0)
2308                 {
2309                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2310                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2311                 }
2312                 else
2313                 {
2314                     /* Here we take the lower limit of the zone from
2315                      * the lowest domain of the zone below.
2316                      */
2317                     if (z < 4)
2318                     {
2319                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2320                     }
2321                     else
2322                     {
2323                         if (d == 1)
2324                         {
2325                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2326                         }
2327                         else
2328                         {
2329                             zones->size[z].x0[dim] =
2330                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2331                                             .min1;
2332                         }
2333                     }
2334                     /* A temporary limit, is updated below */
2335                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2336
2337                     if (bDistMB)
2338                     {
2339                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2340                         {
2341                             if (zones->shift[zi][dim] == 0)
2342                             {
2343                                 /* This takes the whole zone into account.
2344                                  * With multiple pulses this will lead
2345                                  * to a larger zone then strictly necessary.
2346                                  */
2347                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2348                                                                   zones->size[zi].x1[dim] + rcmbs);
2349                             }
2350                         }
2351                     }
2352                 }
2353             }
2354         }
2355
2356         /* Loop over the i-zones to set the upper limit of each
2357          * j-zone they see.
2358          */
2359         for (const auto& iZone : zones->iZones)
2360         {
2361             const int zi = iZone.iZoneIndex;
2362             if (zones->shift[zi][dim] == 0)
2363             {
2364                 /* We should only use zones up to zone_end */
2365                 const auto& jZoneRangeFull = iZone.jZoneRange;
2366                 if (zone_end <= *jZoneRangeFull.begin())
2367                 {
2368                     continue;
2369                 }
2370                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2371                                                  std::min(*jZoneRangeFull.end(), zone_end));
2372                 for (int jZone : jZoneRange)
2373                 {
2374                     if (zones->shift[jZone][dim] > 0)
2375                     {
2376                         zones->size[jZone].x1[dim] =
2377                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2378                     }
2379                 }
2380             }
2381         }
2382     }
2383
2384     for (z = zone_start; z < zone_end; z++)
2385     {
2386         /* Initialization only required to keep the compiler happy */
2387         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2388         int  nc, c;
2389
2390         /* To determine the bounding box for a zone we need to find
2391          * the extreme corners of 4, 2 or 1 corners.
2392          */
2393         nc = 1 << (ddbox->nboundeddim - 1);
2394
2395         for (c = 0; c < nc; c++)
2396         {
2397             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2398             corner[XX] = 0;
2399             if ((c & 1) == 0)
2400             {
2401                 corner[YY] = zones->size[z].x0[YY];
2402             }
2403             else
2404             {
2405                 corner[YY] = zones->size[z].x1[YY];
2406             }
2407             if ((c & 2) == 0)
2408             {
2409                 corner[ZZ] = zones->size[z].x0[ZZ];
2410             }
2411             else
2412             {
2413                 corner[ZZ] = zones->size[z].x1[ZZ];
2414             }
2415             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2416                 && box[ZZ][1 - dd->dim[0]] != 0)
2417             {
2418                 /* With 1D domain decomposition the cg's are not in
2419                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2420                  * Shift the corner of the z-vector back to along the box
2421                  * vector of dimension d, so it will later end up at 0 along d.
2422                  * This can affect the location of this corner along dd->dim[0]
2423                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2424                  */
2425                 int d = 1 - dd->dim[0];
2426
2427                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2428             }
2429             /* Apply the triclinic couplings */
2430             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2431             {
2432                 for (j = XX; j < i; j++)
2433                 {
2434                     corner[j] += corner[i] * box[i][j] / box[i][i];
2435                 }
2436             }
2437             if (c == 0)
2438             {
2439                 copy_rvec(corner, corner_min);
2440                 copy_rvec(corner, corner_max);
2441             }
2442             else
2443             {
2444                 for (i = 0; i < DIM; i++)
2445                 {
2446                     corner_min[i] = std::min(corner_min[i], corner[i]);
2447                     corner_max[i] = std::max(corner_max[i], corner[i]);
2448                 }
2449             }
2450         }
2451         /* Copy the extreme cornes without offset along x */
2452         for (i = 0; i < DIM; i++)
2453         {
2454             zones->size[z].bb_x0[i] = corner_min[i];
2455             zones->size[z].bb_x1[i] = corner_max[i];
2456         }
2457         /* Add the offset along x */
2458         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2459         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2460     }
2461
2462     if (zone_start == 0)
2463     {
2464         vol = 1;
2465         for (dim = 0; dim < DIM; dim++)
2466         {
2467             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2468         }
2469         zones->dens_zone0 =
2470                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2471     }
2472
2473     if (debug)
2474     {
2475         for (z = zone_start; z < zone_end; z++)
2476         {
2477             fprintf(debug,
2478                     "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2479                     z,
2480                     zones->size[z].x0[XX],
2481                     zones->size[z].x1[XX],
2482                     zones->size[z].x0[YY],
2483                     zones->size[z].x1[YY],
2484                     zones->size[z].x0[ZZ],
2485                     zones->size[z].x1[ZZ]);
2486             fprintf(debug,
2487                     "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2488                     z,
2489                     zones->size[z].bb_x0[XX],
2490                     zones->size[z].bb_x1[XX],
2491                     zones->size[z].bb_x0[YY],
2492                     zones->size[z].bb_x1[YY],
2493                     zones->size[z].bb_x0[ZZ],
2494                     zones->size[z].bb_x1[ZZ]);
2495         }
2496     }
2497 }
2498
2499 /*! \brief Order data in \p dataToSort according to \p sort
2500  *
2501  * Note: both buffers should have at least \p sort.size() elements.
2502  */
2503 template<typename T>
2504 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2505                         gmx::ArrayRef<T>                  dataToSort,
2506                         gmx::ArrayRef<T>                  sortBuffer)
2507 {
2508     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2509     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2510                "The sorting buffer needs to be sufficiently large");
2511
2512     /* Order the data into the temporary buffer */
2513     size_t i = 0;
2514     for (const gmx_cgsort_t& entry : sort)
2515     {
2516         sortBuffer[i++] = dataToSort[entry.ind];
2517     }
2518
2519     /* Copy back to the original array */
2520     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2521 }
2522
2523 /*! \brief Order data in \p dataToSort according to \p sort
2524  *
2525  * Note: \p vectorToSort should have at least \p sort.size() elements,
2526  *       \p workVector is resized when it is too small.
2527  */
2528 template<typename T>
2529 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2530                         gmx::ArrayRef<T>                  vectorToSort,
2531                         std::vector<T>*                   workVector)
2532 {
2533     if (gmx::index(workVector->size()) < sort.ssize())
2534     {
2535         workVector->resize(sort.size());
2536     }
2537     orderVector<T>(sort, vectorToSort, *workVector);
2538 }
2539
2540 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2541 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2542 {
2543     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2544
2545     /* Using push_back() instead of this resize results in much slower code */
2546     sort->resize(atomOrder.size());
2547     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2548     size_t                      numSorted = 0;
2549     for (int i : atomOrder)
2550     {
2551         if (i >= 0)
2552         {
2553             /* The values of nsc and ind_gl are not used in this case */
2554             buffer[numSorted++].ind = i;
2555         }
2556     }
2557     sort->resize(numSorted);
2558 }
2559
2560 //! Returns the sorting state for DD.
2561 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2562 {
2563     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2564
2565     dd_sort_order_nbnxn(fr, &sort->sorted);
2566
2567     /* We alloc with the old size, since cgindex is still old */
2568     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2569
2570     /* Set the new home atom/charge group count */
2571     dd->ncg_home = sort->sorted.size();
2572     if (debug)
2573     {
2574         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2575     }
2576
2577     /* Reorder the state */
2578     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2579     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2580
2581     if (state->flags & enumValueToBitMask(StateEntry::X))
2582     {
2583         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2584     }
2585     if (state->flags & enumValueToBitMask(StateEntry::V))
2586     {
2587         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2588     }
2589     if (state->flags & enumValueToBitMask(StateEntry::Cgp))
2590     {
2591         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2592     }
2593
2594     /* Reorder the global cg index */
2595     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2596     /* Reorder the cginfo */
2597     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2598     /* Set the home atom number */
2599     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2600
2601     /* The atoms are now exactly in grid order, update the grid order */
2602     fr->nbv->setLocalAtomOrder();
2603 }
2604
2605 //! Accumulates load statistics.
2606 static void add_dd_statistics(gmx_domdec_t* dd)
2607 {
2608     gmx_domdec_comm_t* comm = dd->comm;
2609
2610     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2611     {
2612         auto range = static_cast<DDAtomRanges::Type>(i);
2613         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2614     }
2615     comm->ndecomp++;
2616 }
2617
2618 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2619 {
2620     gmx_domdec_comm_t* comm = dd->comm;
2621
2622     /* Reset all the statistics and counters for total run counting */
2623     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2624     {
2625         comm->sum_nat[i] = 0;
2626     }
2627     comm->ndecomp   = 0;
2628     comm->nload     = 0;
2629     comm->load_step = 0;
2630     comm->load_sum  = 0;
2631     comm->load_max  = 0;
2632     clear_ivec(comm->load_lim);
2633     comm->load_mdf = 0;
2634     comm->load_pme = 0;
2635 }
2636
2637 namespace gmx
2638 {
2639
2640 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, bool bFatal)
2641 {
2642     gmx_domdec_comm_t* comm    = dd->comm;
2643     bool               invalid = false;
2644
2645     for (int d = 1; d < dd->ndim; d++)
2646     {
2647         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
2648         const int                 dim       = dd->dim[d];
2649         const real                limit     = grid_jump_limit(comm, cutoff, d);
2650         real                      bfac      = ddbox->box_size[dim];
2651         if (ddbox->tric_dir[dim])
2652         {
2653             bfac *= ddbox->skew_fac[dim];
2654         }
2655         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
2656             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
2657         {
2658             invalid = true;
2659
2660             if (bFatal)
2661             {
2662                 char buf[22];
2663
2664                 /* This error should never be triggered under normal
2665                  * circumstances, but you never know ...
2666                  */
2667                 gmx_fatal(FARGS,
2668                           "step %s: The domain decomposition grid has shifted too much in the "
2669                           "%c-direction around cell %d %d %d. This should not have happened. "
2670                           "Running with fewer ranks might avoid this issue.",
2671                           gmx_step_str(step, buf),
2672                           dim2char(dim),
2673                           dd->ci[XX],
2674                           dd->ci[YY],
2675                           dd->ci[ZZ]);
2676             }
2677         }
2678     }
2679
2680     return invalid;
2681 }
2682
2683 void print_dd_statistics(const t_commrec* cr, const t_inputrec& inputrec, FILE* fplog)
2684 {
2685     gmx_domdec_comm_t* comm = cr->dd->comm;
2686
2687     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2688     gmx_sumd(numRanges, comm->sum_nat, cr);
2689
2690     if (fplog == nullptr)
2691     {
2692         return;
2693     }
2694
2695     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2696
2697     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2698     {
2699         auto   range = static_cast<DDAtomRanges::Type>(i);
2700         double av    = comm->sum_nat[i] / comm->ndecomp;
2701         switch (range)
2702         {
2703             case DDAtomRanges::Type::Zones:
2704                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2705                 break;
2706             case DDAtomRanges::Type::Vsites:
2707                 if (cr->dd->vsite_comm)
2708                 {
2709                     fprintf(fplog,
2710                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2711                             (EEL_PME(inputrec.coulombtype)
2712                              || inputrec.coulombtype == CoulombInteractionType::Ewald)
2713                                     ? 3
2714                                     : 2,
2715                             av);
2716                 }
2717                 break;
2718             case DDAtomRanges::Type::Constraints:
2719                 if (cr->dd->constraint_comm)
2720                 {
2721                     fprintf(fplog,
2722                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2723                             1 + inputrec.nLincsIter,
2724                             av);
2725                 }
2726                 break;
2727             default: gmx_incons(" Unknown type for DD statistics");
2728         }
2729     }
2730     fprintf(fplog, "\n");
2731
2732     if (comm->ddSettings.recordLoad && EI_DYNAMICS(inputrec.eI))
2733     {
2734         print_dd_load_av(fplog, cr->dd);
2735     }
2736 }
2737
2738 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2739 void dd_partition_system(FILE*                     fplog,
2740                          const gmx::MDLogger&      mdlog,
2741                          int64_t                   step,
2742                          const t_commrec*          cr,
2743                          bool                      bMasterState,
2744                          int                       nstglobalcomm,
2745                          t_state*                  state_global,
2746                          const gmx_mtop_t&         top_global,
2747                          const t_inputrec&         inputrec,
2748                          gmx::ImdSession*          imdSession,
2749                          pull_t*                   pull_work,
2750                          t_state*                  state_local,
2751                          gmx::ForceBuffers*        f,
2752                          gmx::MDAtoms*             mdAtoms,
2753                          gmx_localtop_t*           top_local,
2754                          t_forcerec*               fr,
2755                          gmx::VirtualSitesHandler* vsite,
2756                          gmx::Constraints*         constr,
2757                          t_nrnb*                   nrnb,
2758                          gmx_wallcycle*            wcycle,
2759                          bool                      bVerbose)
2760 {
2761     gmx_ddbox_t ddbox = { 0 };
2762     int         ncgindex_set;
2763     char        sbuf[22];
2764
2765     wallcycle_start(wcycle, WallCycleCounter::Domdec);
2766
2767     gmx_domdec_t*      dd   = cr->dd;
2768     gmx_domdec_comm_t* comm = dd->comm;
2769
2770     // TODO if the update code becomes accessible here, use
2771     // upd->deform for this logic.
2772     bool bBoxChanged = (bMasterState || inputrecDeform(&inputrec));
2773     if (inputrec.epc != PressureCoupling::No)
2774     {
2775         /* With nstpcouple > 1 pressure coupling happens.
2776          * one step after calculating the pressure.
2777          * Box scaling happens at the end of the MD step,
2778          * after the DD partitioning.
2779          * We therefore have to do DLB in the first partitioning
2780          * after an MD step where P-coupling occurred.
2781          * We need to determine the last step in which p-coupling occurred.
2782          * MRS -- need to validate this for vv?
2783          */
2784         int     n = inputrec.nstpcouple;
2785         int64_t step_pcoupl;
2786         if (n == 1)
2787         {
2788             step_pcoupl = step - 1;
2789         }
2790         else
2791         {
2792             step_pcoupl = ((step - 1) / n) * n + 1;
2793         }
2794         if (step_pcoupl >= comm->partition_step)
2795         {
2796             bBoxChanged = true;
2797         }
2798     }
2799
2800     bool bNStGlobalComm = (step % nstglobalcomm == 0);
2801     bool bDoDLB;
2802     if (!isDlbOn(comm))
2803     {
2804         bDoDLB = false;
2805     }
2806     else
2807     {
2808         /* Should we do dynamic load balacing this step?
2809          * Since it requires (possibly expensive) global communication,
2810          * we might want to do DLB less frequently.
2811          */
2812         if (bBoxChanged || inputrec.epc != PressureCoupling::No)
2813         {
2814             bDoDLB = bBoxChanged;
2815         }
2816         else
2817         {
2818             bDoDLB = bNStGlobalComm;
2819         }
2820     }
2821
2822     /* Check if we have recorded loads on the nodes */
2823     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2824     {
2825         bool bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2826
2827         /* Print load every nstlog, first and last step to the log file */
2828         bool bLogLoad = ((inputrec.nstlog > 0 && step % inputrec.nstlog == 0) || comm->n_load_collect == 0
2829                          || (inputrec.nsteps >= 0
2830                              && (step + inputrec.nstlist > inputrec.init_step + inputrec.nsteps)));
2831
2832         /* Avoid extra communication due to verbose screen output
2833          * when nstglobalcomm is set.
2834          */
2835         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2836             || (bVerbose && (inputrec.nstlist == 0 || nstglobalcomm <= inputrec.nstlist)))
2837         {
2838             get_load_distribution(dd, wcycle);
2839             if (DDMASTER(dd))
2840             {
2841                 if (bLogLoad)
2842                 {
2843                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2844                 }
2845                 if (bVerbose)
2846                 {
2847                     dd_print_load_verbose(dd);
2848                 }
2849             }
2850             comm->n_load_collect++;
2851
2852             if (isDlbOn(comm))
2853             {
2854                 if (DDMASTER(dd))
2855                 {
2856                     /* Add the measured cycles to the running average */
2857                     const float averageFactor = 0.1F;
2858                     comm->cyclesPerStepDlbExpAverage =
2859                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2860                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2861                 }
2862                 if (comm->dlbState == DlbState::onCanTurnOff
2863                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2864                 {
2865                     bool turnOffDlb;
2866                     if (DDMASTER(dd))
2867                     {
2868                         /* If the running averaged cycles with DLB are more
2869                          * than before we turned on DLB, turn off DLB.
2870                          * We will again run and check the cycles without DLB
2871                          * and we can then decide if to turn off DLB forever.
2872                          */
2873                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2874                     }
2875                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2876                     if (turnOffDlb)
2877                     {
2878                         /* To turn off DLB, we need to redistribute the atoms */
2879                         dd_collect_state(dd, state_local, state_global);
2880                         bMasterState = true;
2881                         turn_off_dlb(mdlog, dd, step);
2882                     }
2883                 }
2884             }
2885             else if (bCheckWhetherToTurnDlbOn)
2886             {
2887                 bool turnOffDlbForever = false;
2888                 bool turnOnDlb         = false;
2889
2890                 /* Since the timings are node dependent, the master decides */
2891                 if (DDMASTER(dd))
2892                 {
2893                     /* If we recently turned off DLB, we want to check if
2894                      * performance is better without DLB. We want to do this
2895                      * ASAP to minimize the chance that external factors
2896                      * slowed down the DLB step are gone here and we
2897                      * incorrectly conclude that DLB was causing the slowdown.
2898                      * So we measure one nstlist block, no running average.
2899                      */
2900                     if (comm->haveTurnedOffDlb
2901                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2902                     {
2903                         /* After turning off DLB we ran nstlist steps in fewer
2904                          * cycles than with DLB. This likely means that DLB
2905                          * in not benefical, but this could be due to a one
2906                          * time unlucky fluctuation, so we require two such
2907                          * observations in close succession to turn off DLB
2908                          * forever.
2909                          */
2910                         if (comm->dlbSlowerPartitioningCount > 0
2911                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2912                         {
2913                             turnOffDlbForever = true;
2914                         }
2915                         comm->haveTurnedOffDlb = false;
2916                         /* Register when we last measured DLB slowdown */
2917                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2918                     }
2919                     else
2920                     {
2921                         /* Here we check if the max PME rank load is more than 0.98
2922                          * the max PP force load. If so, PP DLB will not help,
2923                          * since we are (almost) limited by PME. Furthermore,
2924                          * DLB will cause a significant extra x/f redistribution
2925                          * cost on the PME ranks, which will then surely result
2926                          * in lower total performance.
2927                          */
2928                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2929                         {
2930                             turnOnDlb = false;
2931                         }
2932                         else
2933                         {
2934                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2935                         }
2936                     }
2937                 }
2938                 struct
2939                 {
2940                     bool turnOffDlbForever;
2941                     bool turnOnDlb;
2942                 } bools{ turnOffDlbForever, turnOnDlb };
2943                 dd_bcast(dd, sizeof(bools), &bools);
2944                 if (bools.turnOffDlbForever)
2945                 {
2946                     turn_off_dlb_forever(mdlog, dd, step);
2947                 }
2948                 else if (bools.turnOnDlb)
2949                 {
2950                     turn_on_dlb(mdlog, dd, step);
2951                     bDoDLB = true;
2952                 }
2953             }
2954         }
2955         comm->n_load_have++;
2956     }
2957
2958     bool bRedist = false;
2959     if (bMasterState)
2960     {
2961         /* Clear the old state */
2962         clearDDStateIndices(dd, false);
2963         ncgindex_set = 0;
2964
2965         auto xGlobal = positionsFromStatePointer(state_global);
2966
2967         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2968
2969         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2970
2971         /* Ensure that we have space for the new distribution */
2972         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2973
2974         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2975
2976         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2977     }
2978     else if (state_local->ddp_count != dd->ddp_count)
2979     {
2980         if (state_local->ddp_count > dd->ddp_count)
2981         {
2982             gmx_fatal(FARGS,
2983                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2984                       ")",
2985                       state_local->ddp_count,
2986                       dd->ddp_count);
2987         }
2988
2989         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2990         {
2991             gmx_fatal(FARGS,
2992                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2993                       "state_local->ddp_count (%d)",
2994                       state_local->ddp_count_cg_gl,
2995                       state_local->ddp_count);
2996         }
2997
2998         /* Clear the old state */
2999         clearDDStateIndices(dd, false);
3000
3001         /* Restore the atom group indices from state_local */
3002         restoreAtomGroups(dd, state_local);
3003         make_dd_indices(dd, 0);
3004         ncgindex_set = dd->ncg_home;
3005
3006         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3007
3008         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
3009
3010         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3011
3012         bRedist = isDlbOn(comm);
3013     }
3014     else
3015     {
3016         /* We have the full state, only redistribute the cgs */
3017
3018         /* Clear the non-home indices */
3019         clearDDStateIndices(dd, true);
3020         ncgindex_set = 0;
3021
3022         /* To avoid global communication, we do not recompute the extent
3023          * of the system for dims without pbc. Therefore we need to copy
3024          * the previously computed values when we do not communicate.
3025          */
3026         if (!bNStGlobalComm)
3027         {
3028             copy_rvec(comm->box0, ddbox.box0);
3029             copy_rvec(comm->box_size, ddbox.box_size);
3030         }
3031         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3032
3033         bBoxChanged = true;
3034         bRedist     = true;
3035     }
3036     /* Copy needed for dim's without pbc when avoiding communication */
3037     copy_rvec(ddbox.box0, comm->box0);
3038     copy_rvec(ddbox.box_size, comm->box_size);
3039
3040     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3041
3042     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3043     {
3044         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3045     }
3046
3047     if (comm->systemInfo.useUpdateGroups)
3048     {
3049         comm->updateGroupsCog->addCogs(
3050                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
3051     }
3052
3053     /* Check if we should sort the charge groups */
3054     const bool bSortCG = (bMasterState || bRedist);
3055
3056     /* When repartitioning we mark atom groups that will move to neighboring
3057      * DD cells, but we do not move them right away for performance reasons.
3058      * Thus we need to keep track of how many charge groups will move for
3059      * obtaining correct local charge group / atom counts.
3060      */
3061     int ncg_moved = 0;
3062     if (bRedist)
3063     {
3064         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
3065
3066         ncgindex_set = dd->ncg_home;
3067         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3068
3069         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3070
3071         if (comm->systemInfo.useUpdateGroups)
3072         {
3073             comm->updateGroupsCog->addCogs(
3074                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3075                     state_local->x);
3076         }
3077
3078         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
3079     }
3080
3081     RVec cell_ns_x0, cell_ns_x1;
3082     get_nsgrid_boundaries(ddbox.nboundeddim,
3083                           state_local->box,
3084                           dd,
3085                           &ddbox,
3086                           &comm->cell_x0,
3087                           &comm->cell_x1,
3088                           dd->ncg_home,
3089                           as_rvec_array(state_local->x.data()),
3090                           cell_ns_x0,
3091                           cell_ns_x1);
3092
3093     if (bBoxChanged)
3094     {
3095         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3096     }
3097
3098     if (bSortCG)
3099     {
3100         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
3101
3102         /* Sort the state on charge group position.
3103          * This enables exact restarts from this step.
3104          * It also improves performance by about 15% with larger numbers
3105          * of atoms per node.
3106          */
3107
3108         /* Fill the ns grid with the home cell,
3109          * so we can sort with the indices.
3110          */
3111         set_zones_ncg_home(dd);
3112
3113         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3114
3115         nbnxn_put_on_grid(fr->nbv.get(),
3116                           state_local->box,
3117                           0,
3118                           comm->zones.size[0].bb_x0,
3119                           comm->zones.size[0].bb_x1,
3120                           comm->updateGroupsCog.get(),
3121                           { 0, dd->ncg_home },
3122                           comm->zones.dens_zone0,
3123                           fr->cginfo,
3124                           state_local->x,
3125                           ncg_moved,
3126                           bRedist ? comm->movedBuffer.data() : nullptr);
3127
3128         if (debug)
3129         {
3130             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->ncg_home);
3131         }
3132         dd_sort_state(dd, fr, state_local);
3133
3134         /* After sorting and compacting we set the correct size */
3135         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3136
3137         /* Rebuild all the indices */
3138         dd->ga2la->clear();
3139         ncgindex_set = 0;
3140
3141         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
3142     }
3143     else
3144     {
3145         /* With the group scheme the sorting array is part of the DD state,
3146          * but it just got out of sync, so mark as invalid by emptying it.
3147          */
3148         if (inputrec.cutoff_scheme == CutoffScheme::Group)
3149         {
3150             comm->sort->sorted.clear();
3151         }
3152     }
3153
3154     if (comm->systemInfo.useUpdateGroups)
3155     {
3156         /* The update groups cog's are invalid after sorting
3157          * and need to be cleared before the next partitioning anyhow.
3158          */
3159         comm->updateGroupsCog->clear();
3160     }
3161
3162     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
3163
3164     /* Set the induces for the home atoms */
3165     set_zones_ncg_home(dd);
3166     make_dd_indices(dd, ncgindex_set);
3167
3168     /* Setup up the communication and communicate the coordinates */
3169     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3170
3171     /* Set the indices for the halo atoms */
3172     make_dd_indices(dd, dd->ncg_home);
3173
3174     /* Set the charge group boundaries for neighbor searching */
3175     set_cg_boundaries(&comm->zones);
3176
3177     /* When bSortCG=true, we have already set the size for zone 0 */
3178     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3179
3180     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
3181
3182     /*
3183        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3184                  -1,state_local->x.rvec_array(),state_local->box);
3185      */
3186
3187     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
3188
3189     /* Extract a local topology from the global topology */
3190     IVec numPulses;
3191     for (int i = 0; i < dd->ndim; i++)
3192     {
3193         numPulses[dd->dim[i]] = comm->cd[i].numPulses();
3194     }
3195     int numBondedInteractionsToReduce = dd_make_local_top(dd,
3196                                                           &comm->zones,
3197                                                           dd->unitCellInfo.npbcdim,
3198                                                           state_local->box,
3199                                                           comm->cellsize_min,
3200                                                           numPulses,
3201                                                           fr,
3202                                                           state_local->x,
3203                                                           top_global,
3204                                                           top_local);
3205     scheduleCheckOfLocalTopology(dd, numBondedInteractionsToReduce);
3206
3207     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
3208
3209     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
3210
3211     /* Set up the special atom communication */
3212     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3213     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3214          i < static_cast<int>(DDAtomRanges::Type::Number);
3215          i++)
3216     {
3217         auto range = static_cast<DDAtomRanges::Type>(i);
3218         switch (range)
3219         {
3220             case DDAtomRanges::Type::Vsites:
3221                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3222                 {
3223                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3224                 }
3225                 break;
3226             case DDAtomRanges::Type::Constraints:
3227                 if (dd->comm->systemInfo.mayHaveSplitConstraints || dd->comm->systemInfo.mayHaveSplitSettles)
3228                 {
3229                     /* Only for inter-cg constraints we need special code */
3230                     n = dd_make_local_constraints(dd,
3231                                                   n,
3232                                                   top_global,
3233                                                   fr->cginfo.data(),
3234                                                   constr,
3235                                                   inputrec.nProjOrder,
3236                                                   top_local->idef.il);
3237                 }
3238                 break;
3239             default: gmx_incons("Unknown special atom type setup");
3240         }
3241         comm->atomRanges.setEnd(range, n);
3242     }
3243
3244     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
3245
3246     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
3247
3248     /* Make space for the extra coordinates for virtual site
3249      * or constraint communication.
3250      */
3251     state_local->natoms = comm->atomRanges.numAtomsTotal();
3252
3253     state_change_natoms(state_local, state_local->natoms);
3254
3255     int nat_f_novirsum;
3256     if (vsite && vsite->numInterUpdategroupVirtualSites())
3257     {
3258         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3259     }
3260     else
3261     {
3262         if (EEL_FULL(inputrec.coulombtype) && dd->haveExclusions)
3263         {
3264             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3265         }
3266         else
3267         {
3268             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3269         }
3270     }
3271
3272     /* Set the number of atoms required for the force calculation.
3273      * Forces need to be constrained when doing energy
3274      * minimization. For simple simulations we could avoid some
3275      * allocation, zeroing and copying, but this is probably not worth
3276      * the complications and checking.
3277      */
3278     forcerec_set_ranges(fr,
3279                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3280                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3281                         nat_f_novirsum);
3282
3283     /* Update atom data for mdatoms and several algorithms */
3284     mdAlgorithmsSetupAtomData(cr, inputrec, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3285
3286     auto* mdatoms = mdAtoms->mdatoms();
3287     if (!thisRankHasDuty(cr, DUTY_PME))
3288     {
3289         /* Send the charges and/or c6/sigmas to our PME only node */
3290         gmx_pme_send_parameters(
3291                 cr,
3292                 *fr->ic,
3293                 mdatoms->nChargePerturbed != 0,
3294                 mdatoms->nTypePerturbed != 0,
3295                 mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
3296                                  : gmx::ArrayRef<real>{},
3297                 mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
3298                                  : gmx::ArrayRef<real>{},
3299                 mdatoms->sqrt_c6A ? gmx::arrayRefFromArray(mdatoms->sqrt_c6A, mdatoms->nr)
3300                                   : gmx::ArrayRef<real>{},
3301                 mdatoms->sqrt_c6B ? gmx::arrayRefFromArray(mdatoms->sqrt_c6B, mdatoms->nr)
3302                                   : gmx::ArrayRef<real>{},
3303                 mdatoms->sigmaA ? gmx::arrayRefFromArray(mdatoms->sigmaA, mdatoms->nr)
3304                                 : gmx::ArrayRef<real>{},
3305                 mdatoms->sigmaB ? gmx::arrayRefFromArray(mdatoms->sigmaB, mdatoms->nr)
3306                                 : gmx::ArrayRef<real>{},
3307                 dd_pme_maxshift_x(*dd),
3308                 dd_pme_maxshift_y(*dd));
3309     }
3310
3311     if (dd->atomSets != nullptr)
3312     {
3313         /* Update the local atom sets */
3314         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3315     }
3316
3317     // The pull group construction can need the atom sets updated above
3318     if (inputrec.bPull)
3319     {
3320         /* Update the local pull groups */
3321         dd_make_local_pull_groups(cr, pull_work);
3322     }
3323
3324     /* Update the local atoms to be communicated via the IMD protocol if bIMD is true. */
3325     imdSession->dd_make_local_IMD_atoms(dd);
3326
3327     add_dd_statistics(dd);
3328
3329     /* Make sure we only count the cycles for this DD partitioning */
3330     clear_dd_cycle_counts(dd);
3331
3332     /* Because the order of the atoms might have changed since
3333      * the last vsite construction, we need to communicate the constructing
3334      * atom coordinates again (for spreading the forces this MD step).
3335      */
3336     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3337
3338     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
3339
3340     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3341     {
3342         dd_move_x(dd, state_local->box, state_local->x, nullptr);
3343         write_dd_pdb("dd_dump",
3344                      step,
3345                      "dump",
3346                      top_global,
3347                      cr,
3348                      -1,
3349                      state_local->x.rvec_array(),
3350                      state_local->box);
3351     }
3352
3353     /* Store the partitioning step */
3354     comm->partition_step = step;
3355
3356     /* Increase the DD partitioning counter */
3357     dd->ddp_count++;
3358     /* The state currently matches this DD partitioning count, store it */
3359     state_local->ddp_count = dd->ddp_count;
3360     if (bMasterState)
3361     {
3362         /* The DD master node knows the complete cg distribution,
3363          * store the count so we can possibly skip the cg info communication.
3364          */
3365         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3366     }
3367
3368     if (comm->ddSettings.DD_debug > 0)
3369     {
3370         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3371         check_index_consistency(dd, top_global.natoms, "after partitioning");
3372     }
3373
3374     wallcycle_stop(wcycle, WallCycleCounter::Domdec);
3375 }
3376
3377 } // namespace gmx