src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "gromacs/utility/arrayref.h"
  47 #include "partition.h"
  48
  49 #include "config.h"
  50
  51 #include <cassert>
  52 #include <cstdio>
  53
  54 #include <algorithm>
  55
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlb.h"
  58 #include "gromacs/domdec/dlbtiming.h"
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/ga2la.h"
  62 #include "gromacs/domdec/localatomsetmanager.h"
  63 #include "gromacs/domdec/mdsetup.h"
  64 #include "gromacs/domdec/nsgrid.h"
  65 #include "gromacs/ewald/pme_pp.h"
  66 #include "gromacs/gmxlib/network.h"
  67 #include "gromacs/gmxlib/nrnb.h"
  68 #include "gromacs/imd/imd.h"
  69 #include "gromacs/math/functions.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/mdlib/forcerec.h"
  72 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  73 #include "gromacs/mdlib/mdatoms.h"
  74 #include "gromacs/mdlib/vsite.h"
  75 #include "gromacs/mdtypes/commrec.h"
  76 #include "gromacs/mdtypes/forcerec.h"
  77 #include "gromacs/mdtypes/inputrec.h"
  78 #include "gromacs/mdtypes/md_enums.h"
  79 #include "gromacs/mdtypes/mdatom.h"
  80 #include "gromacs/mdtypes/nblist.h"
  81 #include "gromacs/mdtypes/state.h"
  82 #include "gromacs/nbnxm/nbnxm.h"
  83 #include "gromacs/pulling/pull.h"
  84 #include "gromacs/timing/wallcycle.h"
  85 #include "gromacs/topology/mtop_util.h"
  86 #include "gromacs/topology/topology.h"
  87 #include "gromacs/utility/cstringutil.h"
  88 #include "gromacs/utility/fatalerror.h"
  89 #include "gromacs/utility/logger.h"
  90 #include "gromacs/utility/real.h"
  91 #include "gromacs/utility/smalloc.h"
  92 #include "gromacs/utility/strconvert.h"
  93 #include "gromacs/utility/stringstream.h"
  94 #include "gromacs/utility/stringutil.h"
  95 #include "gromacs/utility/textwriter.h"
  96
  97 #include "box.h"
  98 #include "cellsizes.h"
  99 #include "distribute.h"
 100 #include "domdec_constraints.h"
 101 #include "domdec_internal.h"
 102 #include "domdec_vsite.h"
 103 #include "dump.h"
 104 #include "redistribute.h"
 105 #include "utility.h"
 106
 107 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 108  *
 109  * There is a bit of overhead with DLB and it's difficult to achieve
 110  * a load imbalance of less than 2% with DLB.
 111  */
 112 #define DD_PERF_LOSS_DLB_ON 0.02
 113
 114 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 115 #define DD_PERF_LOSS_WARN 0.05
 116
 117
 118 //! Debug helper printing a DD zone
 119 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
 120 {
 121     fprintf(fp,
 122             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
 123             "%6.3f\n",
 124             d,
 125             i,
 126             j,
 127             zone->min0,
 128             zone->max1,
 129             zone->mch0,
 130             zone->mch0,
 131             zone->p1_0,
 132             zone->p1_1);
 133 }
 134
 135 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 136  * takes the extremes over all home and remote zones in the halo
 137  * and returns the results in cell_ns_x0 and cell_ns_x1.
 138  * Note: only used with the group cut-off scheme.
 139  */
 140 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
 141 {
 142     constexpr int      c_ddZoneCommMaxNumZones = 5;
 143     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 144     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 145     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 146     gmx_domdec_comm_t* comm = dd->comm;
 147
 148     rvec extr_s[2];
 149     rvec extr_r[2];
 150     for (int d = 1; d < dd->ndim; d++)
 151     {
 152         int           dim = dd->dim[d];
 153         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 154
 155         /* Copy the base sizes of the home zone */
 156         zp.min0    = cell_ns_x0[dim];
 157         zp.max1    = cell_ns_x1[dim];
 158         zp.min1    = cell_ns_x1[dim];
 159         zp.mch0    = cell_ns_x0[dim];
 160         zp.mch1    = cell_ns_x1[dim];
 161         zp.p1_0    = cell_ns_x0[dim];
 162         zp.p1_1    = cell_ns_x1[dim];
 163         zp.dataSet = 1;
 164     }
 165
 166     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 167
 168     /* Loop backward over the dimensions and aggregate the extremes
 169      * of the cell sizes.
 170      */
 171     for (int d = dd->ndim - 2; d >= 0; d--)
 172     {
 173         const int  dim      = dd->dim[d];
 174         const bool applyPbc = (dim < ddbox->npbcdim);
 175
 176         /* Use an rvec to store two reals */
 177         extr_s[d][0] = cellsizes[d + 1].fracLower;
 178         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 179         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 180
 181         int pos = 0;
 182         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 183         /* Store the extremes in the backward sending buffer,
 184          * so they get updated separately from the forward communication.
 185          */
 186         for (int d1 = d; d1 < dd->ndim - 1; d1++)
 187         {
 188             gmx_ddzone_t& buf = buf_s[pos];
 189
 190             /* We invert the order to be able to use the same loop for buf_e */
 191             buf.min0 = extr_s[d1][1];
 192             buf.max1 = extr_s[d1][0];
 193             buf.min1 = extr_s[d1][2];
 194             buf.mch0 = 0;
 195             buf.mch1 = 0;
 196             /* Store the cell corner of the dimension we communicate along */
 197             buf.p1_0    = comm->cell_x0[dim];
 198             buf.p1_1    = 0;
 199             buf.dataSet = 1;
 200             pos++;
 201         }
 202
 203         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 204         pos++;
 205
 206         if (dd->ndim == 3 && d == 0)
 207         {
 208             buf_s[pos] = comm->zone_d2[0][1];
 209             pos++;
 210             buf_s[pos] = comm->zone_d1[0];
 211             pos++;
 212         }
 213
 214         /* We only need to communicate the extremes
 215          * in the forward direction
 216          */
 217         int numPulses = comm->cd[d].numPulses();
 218         int numPulsesMin;
 219         if (applyPbc)
 220         {
 221             /* Take the minimum to avoid double communication */
 222             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
 223         }
 224         else
 225         {
 226             /* Without PBC we should really not communicate over
 227              * the boundaries, but implementing that complicates
 228              * the communication setup and therefore we simply
 229              * do all communication, but ignore some data.
 230              */
 231             numPulsesMin = numPulses;
 232         }
 233         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 234         {
 235             /* Communicate the extremes forward */
 236             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 237
 238             int numElements = dd->ndim - d - 1;
 239             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
 240
 241             if (receiveValidData)
 242             {
 243                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 244                 {
 245                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 246                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 247                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 248                 }
 249             }
 250         }
 251
 252         const int numElementsInBuffer = pos;
 253         for (int pulse = 0; pulse < numPulses; pulse++)
 254         {
 255             /* Communicate all the zone information backward */
 256             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
 257
 258             static_assert(
 259                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
 260                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 261
 262             int numReals = numElementsInBuffer * c_ddzoneNumReals;
 263             ddSendrecv(dd,
 264                        d,
 265                        dddirBackward,
 266                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 267                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 268
 269             rvec dh = { 0 };
 270             if (pulse > 0)
 271             {
 272                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 273                 {
 274                     /* Determine the decrease of maximum required
 275                      * communication height along d1 due to the distance along d,
 276                      * this avoids a lot of useless atom communication.
 277                      */
 278                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 279
 280                     real c;
 281                     if (ddbox->tric_dir[dim])
 282                     {
 283                         /* c is the off-diagonal coupling between the cell planes
 284                          * along directions d and d1.
 285                          */
 286                         c = ddbox->v[dim][dd->dim[d1]][dim];
 287                     }
 288                     else
 289                     {
 290                         c = 0;
 291                     }
 292                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
 293                     if (det > 0)
 294                     {
 295                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
 296                     }
 297                     else
 298                     {
 299                         /* A negative value signals out of range */
 300                         dh[d1] = -1;
 301                     }
 302                 }
 303             }
 304
 305             /* Accumulate the extremes over all pulses */
 306             for (int i = 0; i < numElementsInBuffer; i++)
 307             {
 308                 if (pulse == 0)
 309                 {
 310                     buf_e[i] = buf_r[i];
 311                 }
 312                 else
 313                 {
 314                     if (receiveValidData)
 315                     {
 316                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 317                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 318                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 319                     }
 320
 321                     int d1;
 322                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 323                     {
 324                         d1 = 1;
 325                     }
 326                     else
 327                     {
 328                         d1 = d + 1;
 329                     }
 330                     if (receiveValidData && dh[d1] >= 0)
 331                     {
 332                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
 333                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
 334                     }
 335                 }
 336                 /* Copy the received buffer to the send buffer,
 337                  * to pass the data through with the next pulse.
 338                  */
 339                 buf_s[i] = buf_r[i];
 340             }
 341             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
 342                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
 343             {
 344                 /* Store the extremes */
 345                 int pos = 0;
 346
 347                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 348                 {
 349                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 350                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 351                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 352                     pos++;
 353                 }
 354
 355                 if (d == 1 || (d == 0 && dd->ndim == 3))
 356                 {
 357                     for (int i = d; i < 2; i++)
 358                     {
 359                         comm->zone_d2[1 - d][i] = buf_e[pos];
 360                         pos++;
 361                     }
 362                 }
 363                 if (d == 0)
 364                 {
 365                     comm->zone_d1[1] = buf_e[pos];
 366                     pos++;
 367                 }
 368             }
 369             else
 370             {
 371                 if (d == 1 || (d == 0 && dd->ndim == 3))
 372                 {
 373                     for (int i = d; i < 2; i++)
 374                     {
 375                         comm->zone_d2[1 - d][i].dataSet = 0;
 376                     }
 377                 }
 378                 if (d == 0)
 379                 {
 380                     comm->zone_d1[1].dataSet = 0;
 381                 }
 382             }
 383         }
 384     }
 385
 386     if (dd->ndim >= 2)
 387     {
 388         int dim = dd->dim[1];
 389         for (int i = 0; i < 2; i++)
 390         {
 391             if (comm->zone_d1[i].dataSet != 0)
 392             {
 393                 if (debug)
 394                 {
 395                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 396                 }
 397                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 398                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 399             }
 400         }
 401     }
 402     if (dd->ndim >= 3)
 403     {
 404         int dim = dd->dim[2];
 405         for (int i = 0; i < 2; i++)
 406         {
 407             for (int j = 0; j < 2; j++)
 408             {
 409                 if (comm->zone_d2[i][j].dataSet != 0)
 410                 {
 411                     if (debug)
 412                     {
 413                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 414                     }
 415                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 416                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 417                 }
 418             }
 419         }
 420     }
 421     for (int d = 1; d < dd->ndim; d++)
 422     {
 423         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
 424         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
 425         if (debug)
 426         {
 427             fprintf(debug,
 428                     "Cell fraction d %d, max0 %f, min1 %f\n",
 429                     d,
 430                     cellsizes[d].fracLowerMax,
 431                     cellsizes[d].fracUpperMin);
 432         }
 433     }
 434 }
 435
 436 //! Sets the charge-group zones to be equal to the home zone.
 437 static void set_zones_ncg_home(gmx_domdec_t* dd)
 438 {
 439     gmx_domdec_zones_t* zones;
 440     int                 i;
 441
 442     zones = &dd->comm->zones;
 443
 444     zones->cg_range[0] = 0;
 445     for (i = 1; i < zones->n + 1; i++)
 446     {
 447         zones->cg_range[i] = dd->ncg_home;
 448     }
 449     /* zone_ncg1[0] should always be equal to ncg_home */
 450     dd->comm->zone_ncg1[0] = dd->ncg_home;
 451 }
 452
 453 //! Restore atom groups for the charge groups.
 454 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
 455 {
 456     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
 457
 458     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
 459
 460     globalAtomGroupIndices.resize(atomGroupsState.size());
 461
 462     /* Copy back the global charge group indices from state
 463      * and rebuild the local charge group to atom index.
 464      */
 465     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 466     {
 467         globalAtomGroupIndices[i] = atomGroupsState[i];
 468     }
 469
 470     dd->ncg_home = atomGroupsState.size();
 471     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 472
 473     set_zones_ncg_home(dd);
 474 }
 475
 476 //! Sets the cginfo structures.
 477 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
 478 {
 479     if (fr != nullptr)
 480     {
 481         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
 482         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
 483
 484         for (int cg = cg0; cg < cg1; cg++)
 485         {
 486             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 487         }
 488     }
 489 }
 490
 491 //! Makes the mappings between global and local atom indices during DD repartioning.
 492 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
 493 {
 494     const int                numZones               = dd->comm->zones.n;
 495     gmx::ArrayRef<const int> zone2cg                = dd->comm->zones.cg_range;
 496     gmx::ArrayRef<const int> zone_ncg1              = dd->comm->zone_ncg1;
 497     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
 498
 499     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
 500     gmx_ga2la_t&      ga2la             = *dd->ga2la;
 501
 502     if (zone2cg[1] != dd->ncg_home)
 503     {
 504         gmx_incons("dd->ncg_zone is not up to date");
 505     }
 506
 507     /* Make the local to global and global to local atom index */
 508     int a = atomStart;
 509     globalAtomIndices.resize(a);
 510     for (int zone = 0; zone < numZones; zone++)
 511     {
 512         int cg0;
 513         if (zone == 0)
 514         {
 515             cg0 = atomStart;
 516         }
 517         else
 518         {
 519             cg0 = zone2cg[zone];
 520         }
 521         int cg1    = zone2cg[zone + 1];
 522         int cg1_p1 = cg0 + zone_ncg1[zone];
 523
 524         for (int cg = cg0; cg < cg1; cg++)
 525         {
 526             int zone1 = zone;
 527             if (cg >= cg1_p1)
 528             {
 529                 /* Signal that this cg is from more than one pulse away */
 530                 zone1 += numZones;
 531             }
 532             int cg_gl = globalAtomGroupIndices[cg];
 533             globalAtomIndices.push_back(cg_gl);
 534             ga2la.insert(cg_gl, { a, zone1 });
 535             a++;
 536         }
 537     }
 538 }
 539
 540 //! Checks whether global and local atom indices are consistent.
 541 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
 542 {
 543     int nerr = 0;
 544
 545     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 546
 547     if (dd->comm->ddSettings.DD_debug > 1)
 548     {
 549         std::vector<int> have(natoms_sys);
 550         for (int a = 0; a < numAtomsInZones; a++)
 551         {
 552             int globalAtomIndex = dd->globalAtomIndices[a];
 553             if (have[globalAtomIndex] > 0)
 554             {
 555                 fprintf(stderr,
 556                         "DD rank %d: global atom %d occurs twice: index %d and %d\n",
 557                         dd->rank,
 558                         globalAtomIndex + 1,
 559                         have[globalAtomIndex],
 560                         a + 1);
 561             }
 562             else
 563             {
 564                 have[globalAtomIndex] = a + 1;
 565             }
 566         }
 567     }
 568
 569     std::vector<int> have(numAtomsInZones);
 570
 571     int ngl = 0;
 572     for (int i = 0; i < natoms_sys; i++)
 573     {
 574         if (const auto* entry = dd->ga2la->find(i))
 575         {
 576             const int a = entry->la;
 577             if (a >= numAtomsInZones)
 578             {
 579                 fprintf(stderr,
 580                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
 581                         "nat_tot (%d)\n",
 582                         dd->rank,
 583                         i + 1,
 584                         a + 1,
 585                         numAtomsInZones);
 586                 nerr++;
 587             }
 588             else
 589             {
 590                 have[a] = 1;
 591                 if (dd->globalAtomIndices[a] != i)
 592                 {
 593                     fprintf(stderr,
 594                             "DD rank %d: global atom %d marked as local atom %d, which has global "
 595                             "atom index %d\n",
 596                             dd->rank,
 597                             i + 1,
 598                             a + 1,
 599                             dd->globalAtomIndices[a] + 1);
 600                     nerr++;
 601                 }
 602             }
 603             ngl++;
 604         }
 605     }
 606     if (ngl != numAtomsInZones)
 607     {
 608         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
 609     }
 610     for (int a = 0; a < numAtomsInZones; a++)
 611     {
 612         if (have[a] == 0)
 613         {
 614             fprintf(stderr,
 615                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
 616                     dd->rank,
 617                     where,
 618                     a + 1,
 619                     dd->globalAtomIndices[a] + 1);
 620         }
 621     }
 622
 623     if (nerr > 0)
 624     {
 625         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
 626     }
 627 }
 628
 629 //! Clear all DD global state indices
 630 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
 631 {
 632     gmx_ga2la_t& ga2la = *dd->ga2la;
 633
 634     if (!keepLocalAtomIndices)
 635     {
 636         /* Clear the whole list without the overhead of searching */
 637         ga2la.clear();
 638     }
 639     else
 640     {
 641         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 642         for (int i = 0; i < numAtomsInZones; i++)
 643         {
 644             ga2la.erase(dd->globalAtomIndices[i]);
 645         }
 646     }
 647
 648     dd_clear_local_vsite_indices(dd);
 649
 650     if (dd->constraints)
 651     {
 652         dd_clear_local_constraint_indices(dd);
 653     }
 654 }
 655
 656 //! Return the duration of force calculations on this rank.
 657 static float dd_force_load(gmx_domdec_comm_t* comm)
 658 {
 659     float load;
 660
 661     if (comm->ddSettings.eFlop)
 662     {
 663         load = comm->flop;
 664         if (comm->ddSettings.eFlop > 1)
 665         {
 666             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
 667         }
 668     }
 669     else
 670     {
 671         load = comm->cycl[ddCyclF];
 672         if (comm->cycl_n[ddCyclF] > 1)
 673         {
 674             /* Subtract the maximum of the last n cycle counts
 675              * to get rid of possible high counts due to other sources,
 676              * for instance system activity, that would otherwise
 677              * affect the dynamic load balancing.
 678              */
 679             load -= comm->cycl_max[ddCyclF];
 680         }
 681
 682 #if GMX_MPI
 683         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 684         {
 685             float gpu_wait, gpu_wait_sum;
 686
 687             gpu_wait = comm->cycl[ddCyclWaitGPU];
 688             if (comm->cycl_n[ddCyclF] > 1)
 689             {
 690                 /* We should remove the WaitGPU time of the same MD step
 691                  * as the one with the maximum F time, since the F time
 692                  * and the wait time are not independent.
 693                  * Furthermore, the step for the max F time should be chosen
 694                  * the same on all ranks that share the same GPU.
 695                  * But to keep the code simple, we remove the average instead.
 696                  * The main reason for artificially long times at some steps
 697                  * is spurious CPU activity or MPI time, so we don't expect
 698                  * that changes in the GPU wait time matter a lot here.
 699                  */
 700                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
 701             }
 702             /* Sum the wait times over the ranks that share the same GPU */
 703             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
 704             /* Replace the wait time by the average over the ranks */
 705             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
 706         }
 707 #endif
 708     }
 709
 710     return load;
 711 }
 712
 713 //! Runs cell size checks and communicates the boundaries.
 714 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
 715 {
 716     gmx_domdec_comm_t* comm;
 717     int                dim_ind, dim;
 718
 719     comm = dd->comm;
 720
 721     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 722     {
 723         dim = dd->dim[dim_ind];
 724
 725         /* Without PBC we don't have restrictions on the outer cells */
 726         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
 727             && isDlbOn(comm)
 728             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
 729         {
 730             char buf[22];
 731             gmx_fatal(FARGS,
 732                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
 733                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
 734                       "%d %d %d",
 735                       gmx_step_str(step, buf),
 736                       dim2char(dim),
 737                       comm->cell_x1[dim] - comm->cell_x0[dim],
 738                       ddbox->skew_fac[dim],
 739                       dd->comm->cellsize_min[dim],
 740                       dd->ci[XX],
 741                       dd->ci[YY],
 742                       dd->ci[ZZ]);
 743         }
 744     }
 745
 746     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 747     {
 748         /* Communicate the boundaries and update cell_ns_x0/1 */
 749         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 750         if (isDlbOn(dd->comm) && dd->ndim > 1)
 751         {
 752             gmx::check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
 753         }
 754     }
 755 }
 756
 757 //! Compute and communicate to determine the load distribution across PP ranks.
 758 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
 759 {
 760     gmx_domdec_comm_t* comm;
 761     domdec_load_t*     load;
 762     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 763     gmx_bool           bSepPME;
 764
 765     if (debug)
 766     {
 767         fprintf(debug, "get_load_distribution start\n");
 768     }
 769
 770     wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
 771
 772     comm = dd->comm;
 773
 774     bSepPME = (dd->pme_nodeid >= 0);
 775
 776     if (dd->ndim == 0 && bSepPME)
 777     {
 778         /* Without decomposition, but with PME nodes, we need the load */
 779         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 780         comm->load[0].pme = comm->cycl[ddCyclPME];
 781     }
 782
 783     // Either we have DLB off, or we have it on and the array is large enough
 784     GMX_ASSERT(!isDlbOn(dd->comm) || static_cast<int>(dd->comm->cellsizesWithDlb.size()) == dd->ndim,
 785                "DLB cell sizes data not set up properly ");
 786     for (int d = dd->ndim - 1; d >= 0; d--)
 787     {
 788         const int dim = dd->dim[d];
 789         /* Check if we participate in the communication in this dimension */
 790         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
 791         {
 792             load = &comm->load[d];
 793             if (isDlbOn(dd->comm))
 794             {
 795                 cell_frac = comm->cellsizesWithDlb[d].fracUpper - comm->cellsizesWithDlb[d].fracLower;
 796             }
 797             int pos = 0;
 798             if (d == dd->ndim - 1)
 799             {
 800                 sbuf[pos++] = dd_force_load(comm);
 801                 sbuf[pos++] = sbuf[0];
 802                 if (isDlbOn(dd->comm))
 803                 {
 804                     sbuf[pos++] = sbuf[0];
 805                     sbuf[pos++] = cell_frac;
 806                     if (d > 0)
 807                     {
 808                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 809                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 810                     }
 811                 }
 812                 if (bSepPME)
 813                 {
 814                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 815                     sbuf[pos++] = comm->cycl[ddCyclPME];
 816                 }
 817             }
 818             else
 819             {
 820                 sbuf[pos++] = comm->load[d + 1].sum;
 821                 sbuf[pos++] = comm->load[d + 1].max;
 822                 if (isDlbOn(dd->comm))
 823                 {
 824                     sbuf[pos++] = comm->load[d + 1].sum_m;
 825                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
 826                     sbuf[pos++] = comm->load[d + 1].flags;
 827                     if (d > 0)
 828                     {
 829                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
 830                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
 831                     }
 832                 }
 833                 if (bSepPME)
 834                 {
 835                     sbuf[pos++] = comm->load[d + 1].mdf;
 836                     sbuf[pos++] = comm->load[d + 1].pme;
 837                 }
 838             }
 839             load->nload = pos;
 840             /* Communicate a row in DD direction d.
 841              * The communicators are setup such that the root always has rank 0.
 842              */
 843 #if GMX_MPI
 844             MPI_Gather(sbuf,
 845                        load->nload * sizeof(float),
 846                        MPI_BYTE,
 847                        load->load,
 848                        load->nload * sizeof(float),
 849                        MPI_BYTE,
 850                        0,
 851                        comm->mpi_comm_load[d]);
 852 #endif
 853             if (dd->ci[dim] == dd->master_ci[dim])
 854             {
 855                 /* We are the master along this row, process this row */
 856                 RowMaster* rowMaster = nullptr;
 857
 858                 if (isDlbOn(comm))
 859                 {
 860                     rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
 861                 }
 862                 load->sum      = 0;
 863                 load->max      = 0;
 864                 load->sum_m    = 0;
 865                 load->cvol_min = 1;
 866                 load->flags    = 0;
 867                 load->mdf      = 0;
 868                 load->pme      = 0;
 869                 int pos        = 0;
 870                 for (int i = 0; i < dd->numCells[dim]; i++)
 871                 {
 872                     load->sum += load->load[pos++];
 873                     load->max = std::max(load->max, load->load[pos]);
 874                     pos++;
 875                     if (isDlbOn(dd->comm))
 876                     {
 877                         if (rowMaster->dlbIsLimited)
 878                         {
 879                             /* This direction could not be load balanced properly,
 880                              * therefore we need to use the maximum iso the average load.
 881                              */
 882                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 883                         }
 884                         else
 885                         {
 886                             load->sum_m += load->load[pos];
 887                         }
 888                         pos++;
 889                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 890                         pos++;
 891                         if (d < dd->ndim - 1)
 892                         {
 893                             load->flags = gmx::roundToInt(load->load[pos++]);
 894                         }
 895                         if (d > 0)
 896                         {
 897                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 898                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 899                         }
 900                     }
 901                     if (bSepPME)
 902                     {
 903                         load->mdf = std::max(load->mdf, load->load[pos]);
 904                         pos++;
 905                         load->pme = std::max(load->pme, load->load[pos]);
 906                         pos++;
 907                     }
 908                 }
 909                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 910                 {
 911                     load->sum_m *= dd->numCells[dim];
 912                     load->flags |= (1 << d);
 913                 }
 914             }
 915         }
 916     }
 917
 918     if (DDMASTER(dd))
 919     {
 920         comm->nload += dd_load_count(comm);
 921         comm->load_step += comm->cycl[ddCyclStep];
 922         comm->load_sum += comm->load[0].sum;
 923         comm->load_max += comm->load[0].max;
 924         if (isDlbOn(comm))
 925         {
 926             for (int d = 0; d < dd->ndim; d++)
 927             {
 928                 if (comm->load[0].flags & (1 << d))
 929                 {
 930                     comm->load_lim[d]++;
 931                 }
 932             }
 933         }
 934         if (bSepPME)
 935         {
 936             comm->load_mdf += comm->load[0].mdf;
 937             comm->load_pme += comm->load[0].pme;
 938         }
 939     }
 940
 941     wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
 942
 943     if (debug)
 944     {
 945         fprintf(debug, "get_load_distribution finished\n");
 946     }
 947 }
 948
 949 /*! \brief Return the relative performance loss on the total run time
 950  * due to the force calculation load imbalance. */
 951 static float dd_force_load_fraction(gmx_domdec_t* dd)
 952 {
 953     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 954     {
 955         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
 956     }
 957     else
 958     {
 959         return 0;
 960     }
 961 }
 962
 963 /*! \brief Return the relative performance loss on the total run time
 964  * due to the force calculation load imbalance. */
 965 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
 966 {
 967     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 968     {
 969         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
 970     }
 971     else
 972     {
 973         return 0;
 974     }
 975 }
 976
 977 //! Print load-balance report e.g. at the end of a run.
 978 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
 979 {
 980     gmx_domdec_comm_t* comm = dd->comm;
 981
 982     /* Only the master rank prints loads and only if we measured loads */
 983     if (!DDMASTER(dd) || comm->nload == 0)
 984     {
 985         return;
 986     }
 987
 988     char buf[STRLEN];
 989     int  numPpRanks  = dd->nnodes;
 990     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
 991     int  numRanks    = numPpRanks + numPmeRanks;
 992     float lossFraction = 0;
 993
 994     /* Print the average load imbalance and performance loss */
 995     if (dd->nnodes > 1 && comm->load_sum > 0)
 996     {
 997         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
 998         lossFraction    = dd_force_imb_perf_loss(dd);
 999
1000         std::string msg = "\nDynamic load balancing report:\n";
1001         std::string dlbStateStr;
1002
1003         switch (dd->comm->dlbState)
1004         {
1005             case DlbState::offUser:
1006                 dlbStateStr = "DLB was off during the run per user request.";
1007                 break;
1008             case DlbState::offForever:
1009                 /* Currectly this can happen due to performance loss observed, cell size
1010                  * limitations or incompatibility with other settings observed during
1011                  * determineInitialDlbState(). */
1012                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1013                 break;
1014             case DlbState::offCanTurnOn:
1015                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1016                 break;
1017             case DlbState::offTemporarilyLocked:
1018                 dlbStateStr =
1019                         "DLB was locked at the end of the run due to unfinished PP-PME "
1020                         "balancing.";
1021                 break;
1022             case DlbState::onCanTurnOff:
1023                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1024                 break;
1025             case DlbState::onUser:
1026                 dlbStateStr = "DLB was permanently on during the run per user request.";
1027                 break;
1028             default: GMX_ASSERT(false, "Undocumented DLB state");
1029         }
1030
1031         msg += " " + dlbStateStr + "\n";
1032         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1033         msg += gmx::formatString(
1034                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1035                 "this.\n",
1036                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1037         msg += gmx::formatString(
1038                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1039                 lossFraction * 100);
1040         fprintf(fplog, "%s", msg.c_str());
1041         fprintf(stderr, "\n%s", msg.c_str());
1042     }
1043
1044     /* Print during what percentage of steps the  load balancing was limited */
1045     bool dlbWasLimited = false;
1046     if (isDlbOn(comm))
1047     {
1048         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1049         for (int d = 0; d < dd->ndim; d++)
1050         {
1051             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1052             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1053             if (limitPercentage >= 50)
1054             {
1055                 dlbWasLimited = true;
1056             }
1057         }
1058         sprintf(buf + strlen(buf), "\n");
1059         fprintf(fplog, "%s", buf);
1060         fprintf(stderr, "%s", buf);
1061     }
1062
1063     /* Print the performance loss due to separate PME - PP rank imbalance */
1064     float lossFractionPme = 0;
1065     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1066     {
1067         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1068         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1069         if (lossFractionPme <= 0)
1070         {
1071             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1072         }
1073         else
1074         {
1075             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1076         }
1077         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1078         fprintf(fplog, "%s", buf);
1079         fprintf(stderr, "%s", buf);
1080         sprintf(buf,
1081                 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1082                 std::fabs(lossFractionPme) * 100);
1083         fprintf(fplog, "%s", buf);
1084         fprintf(stderr, "%s", buf);
1085     }
1086     fprintf(fplog, "\n");
1087     fprintf(stderr, "\n");
1088
1089     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1090     {
1091         std::string message = gmx::formatString(
1092                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1093                 "      in the domain decomposition.\n",
1094                 lossFraction * 100);
1095
1096         bool hadSuggestion = false;
1097         if (dd->comm->dlbState == DlbState::offUser)
1098         {
1099             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1100             hadSuggestion = true;
1101         }
1102         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1103         {
1104             message +=
1105                     "      Dynamic load balancing was automatically disabled, but it might be "
1106                     "beneficial to manually tuning it on (option -dlb on.)\n";
1107             hadSuggestion = true;
1108         }
1109         else if (dlbWasLimited)
1110         {
1111             message +=
1112                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1113                     "and/or -dds).\n";
1114             hadSuggestion = true;
1115         }
1116         message += gmx::formatString(
1117                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1118                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1119                 "      considerable inhomogeneity in the simulated system.",
1120                 hadSuggestion ? "also " : "");
1121
1122         fprintf(fplog, "%s\n", message.c_str());
1123         fprintf(stderr, "%s\n", message.c_str());
1124     }
1125     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1126     {
1127         sprintf(buf,
1128                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1129                 "      had %s work to do than the PP ranks.\n"
1130                 "      You might want to %s the number of PME ranks\n"
1131                 "      or %s the cut-off and the grid spacing.\n",
1132                 std::fabs(lossFractionPme * 100),
1133                 (lossFractionPme < 0) ? "less" : "more",
1134                 (lossFractionPme < 0) ? "decrease" : "increase",
1135                 (lossFractionPme < 0) ? "decrease" : "increase");
1136         fprintf(fplog, "%s\n", buf);
1137         fprintf(stderr, "%s\n", buf);
1138     }
1139 }
1140
1141 //! Return the minimum communication volume.
1142 static float dd_vol_min(gmx_domdec_t* dd)
1143 {
1144     return dd->comm->load[0].cvol_min * dd->nnodes;
1145 }
1146
1147 //! Return the DD load flags.
1148 static int dd_load_flags(gmx_domdec_t* dd)
1149 {
1150     return dd->comm->load[0].flags;
1151 }
1152
1153 //! Return the reported load imbalance in force calculations.
1154 static float dd_f_imbal(gmx_domdec_t* dd)
1155 {
1156     if (dd->comm->load[0].sum > 0)
1157     {
1158         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1159     }
1160     else
1161     {
1162         /* Something is wrong in the cycle counting, report no load imbalance */
1163         return 0.0F;
1164     }
1165 }
1166
1167 //! Returns DD load balance report.
1168 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1169 {
1170     gmx::StringOutputStream stream;
1171     gmx::TextWriter         log(&stream);
1172
1173     int flags = dd_load_flags(dd);
1174     if (flags)
1175     {
1176         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1177         for (int d = 0; d < dd->ndim; d++)
1178         {
1179             if (flags & (1 << d))
1180             {
1181                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1182             }
1183         }
1184         log.ensureLineBreak();
1185     }
1186     log.writeString("DD  step " + gmx::toString(step));
1187     if (isDlbOn(dd->comm))
1188     {
1189         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1190     }
1191     if (dd->nnodes > 1)
1192     {
1193         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1194     }
1195     if (dd->comm->cycl_n[ddCyclPME])
1196     {
1197         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1198     }
1199     log.ensureLineBreak();
1200     return stream.toString();
1201 }
1202
1203 //! Prints DD load balance report in mdrun verbose mode.
1204 static void dd_print_load_verbose(gmx_domdec_t* dd)
1205 {
1206     if (isDlbOn(dd->comm))
1207     {
1208         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1209     }
1210     if (dd->nnodes > 1)
1211     {
1212         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1213     }
1214     if (dd->comm->cycl_n[ddCyclPME])
1215     {
1216         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1217     }
1218 }
1219
1220 //! Turns on dynamic load balancing if possible and needed.
1221 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1222 {
1223     gmx_domdec_comm_t* comm = dd->comm;
1224
1225     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1226     for (int d = 1; d < dd->ndim; d++)
1227     {
1228         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1229     }
1230
1231     /* Turn off DLB if we're too close to the cell size limit. */
1232     if (cellsize_min < comm->cellsize_limit * 1.05)
1233     {
1234         GMX_LOG(mdlog.info)
1235                 .appendTextFormatted(
1236                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1237                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1238                         "Will no longer try dynamic load balancing.",
1239                         gmx::toString(step).c_str(),
1240                         dd_force_imb_perf_loss(dd) * 100);
1241
1242         comm->dlbState = DlbState::offForever;
1243         return;
1244     }
1245
1246     GMX_LOG(mdlog.info)
1247             .appendTextFormatted(
1248                     "step %s Turning on dynamic load balancing, because the performance loss due "
1249                     "to load imbalance is %.1f %%.",
1250                     gmx::toString(step).c_str(),
1251                     dd_force_imb_perf_loss(dd) * 100);
1252     comm->dlbState = DlbState::onCanTurnOff;
1253
1254     /* Store the non-DLB performance, so we can check if DLB actually
1255      * improves performance.
1256      */
1257     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1258                        "When we turned on DLB, we should have measured cycles");
1259     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1260
1261     set_dlb_limits(dd);
1262
1263     /* We can set the required cell size info here,
1264      * so we do not need to communicate this.
1265      * The grid is completely uniform.
1266      */
1267     for (int d = 0; d < dd->ndim; d++)
1268     {
1269         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1270
1271         if (rowMaster)
1272         {
1273             comm->load[d].sum_m = comm->load[d].sum;
1274
1275             int nc = dd->numCells[dd->dim[d]];
1276             for (int i = 0; i < nc; i++)
1277             {
1278                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1279                 if (d > 0)
1280                 {
1281                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1282                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1283                 }
1284             }
1285             rowMaster->cellFrac[nc] = 1.0;
1286         }
1287     }
1288 }
1289
1290 //! Turns off dynamic load balancing (but leave it able to turn back on).
1291 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1292 {
1293     GMX_LOG(mdlog.info)
1294             .appendText(
1295                     "step " + gmx::toString(step)
1296                     + " Turning off dynamic load balancing, because it is degrading performance.");
1297     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1298     dd->comm->haveTurnedOffDlb             = true;
1299     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1300 }
1301
1302 //! Turns off dynamic load balancing permanently.
1303 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1304 {
1305     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1306                        "Can only turn off DLB forever when it was in the can-turn-on state");
1307     GMX_LOG(mdlog.info)
1308             .appendText(
1309                     "step " + gmx::toString(step)
1310                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1311     dd->comm->dlbState = DlbState::offForever;
1312 }
1313
1314 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1315 {
1316     gmx_domdec_comm_t* comm;
1317
1318     comm = cr->dd->comm;
1319
1320     /* Turn on the DLB limiting (might have been on already) */
1321     comm->bPMELoadBalDLBLimits = TRUE;
1322
1323     /* Change the cut-off limit */
1324     comm->PMELoadBal_max_cutoff = cutoff;
1325
1326     if (debug)
1327     {
1328         fprintf(debug,
1329                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1330                 "continue to fit\n",
1331                 comm->PMELoadBal_max_cutoff);
1332     }
1333 }
1334
1335 //! Merge atom buffers.
1336 static void merge_cg_buffers(int                            ncell,
1337                              gmx_domdec_comm_dim_t*         cd,
1338                              int                            pulse,
1339                              int*                           ncg_cell,
1340                              gmx::ArrayRef<int>             index_gl,
1341                              const int*                     recv_i,
1342                              gmx::ArrayRef<gmx::RVec>       x,
1343                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1344                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1345                              gmx::ArrayRef<int>             cginfo)
1346 {
1347     gmx_domdec_ind_t *ind, *ind_p;
1348     int               p, cell, c, cg, cg0, cg1, cg_gl;
1349     int               shift;
1350
1351     ind = &cd->ind[pulse];
1352
1353     /* First correct the already stored data */
1354     shift = ind->nrecv[ncell];
1355     for (cell = ncell - 1; cell >= 0; cell--)
1356     {
1357         shift -= ind->nrecv[cell];
1358         if (shift > 0)
1359         {
1360             /* Move the cg's present from previous grid pulses */
1361             cg0 = ncg_cell[ncell + cell];
1362             cg1 = ncg_cell[ncell + cell + 1];
1363             for (cg = cg1 - 1; cg >= cg0; cg--)
1364             {
1365                 index_gl[cg + shift] = index_gl[cg];
1366                 x[cg + shift]        = x[cg];
1367                 cginfo[cg + shift]   = cginfo[cg];
1368             }
1369             /* Correct the already stored send indices for the shift */
1370             for (p = 1; p <= pulse; p++)
1371             {
1372                 ind_p = &cd->ind[p];
1373                 cg0   = 0;
1374                 for (c = 0; c < cell; c++)
1375                 {
1376                     cg0 += ind_p->nsend[c];
1377                 }
1378                 cg1 = cg0 + ind_p->nsend[cell];
1379                 for (cg = cg0; cg < cg1; cg++)
1380                 {
1381                     ind_p->index[cg] += shift;
1382                 }
1383             }
1384         }
1385     }
1386
1387     /* Merge in the communicated buffers */
1388     shift = 0;
1389     cg0   = 0;
1390     for (cell = 0; cell < ncell; cell++)
1391     {
1392         cg1 = ncg_cell[ncell + cell + 1] + shift;
1393         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1394         {
1395             /* Copy this atom from the buffer */
1396             index_gl[cg1] = recv_i[cg0];
1397             x[cg1]        = recv_vr[cg0];
1398             /* Copy information */
1399             cg_gl       = index_gl[cg1];
1400             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1401             cg0++;
1402             cg1++;
1403         }
1404         shift += ind->nrecv[cell];
1405         ncg_cell[ncell + cell + 1] = cg1;
1406     }
1407 }
1408
1409 //! Makes a range partitioning for the atom groups wthin a cell
1410 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1411 {
1412     /* Store the atom block boundaries for easy copying of communication buffers
1413      */
1414     int g = atomGroupStart;
1415     for (int zone = 0; zone < nzone; zone++)
1416     {
1417         for (gmx_domdec_ind_t& ind : cd->ind)
1418         {
1419             ind.cell2at0[zone] = g;
1420             g += ind.nrecv[zone];
1421             ind.cell2at1[zone] = g;
1422         }
1423     }
1424 }
1425
1426 //! Returns whether a link is missing.
1427 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1428 {
1429     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1430     {
1431         if (!ga2la.findHome(link.a[i]))
1432         {
1433             return true;
1434         }
1435     }
1436
1437     return false;
1438 }
1439
1440 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1441 typedef struct
1442 {
1443     //! The corners for the non-bonded communication.
1444     real c[DIM][4];
1445     //! Corner for rounding.
1446     real cr0;
1447     //! Corners for rounding.
1448     real cr1[4];
1449     //! Corners for bounded communication.
1450     real bc[DIM];
1451     //! Corner for rounding for bonded communication.
1452     real bcr1;
1453 } dd_corners_t;
1454
1455 //! Determine the corners of the domain(s) we are communicating with.
1456 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1457 {
1458     const gmx_domdec_comm_t*  comm;
1459     const gmx_domdec_zones_t* zones;
1460
1461     comm = dd->comm;
1462
1463     zones = &comm->zones;
1464
1465     /* Keep the compiler happy */
1466     c->cr0  = 0;
1467     c->bcr1 = 0;
1468
1469     /* The first dimension is equal for all cells */
1470     c->c[0][0] = comm->cell_x0[dim0];
1471     if (bDistMB)
1472     {
1473         c->bc[0] = c->c[0][0];
1474     }
1475     if (dd->ndim >= 2)
1476     {
1477         dim1 = dd->dim[1];
1478         /* This cell row is only seen from the first row */
1479         c->c[1][0] = comm->cell_x0[dim1];
1480         /* All rows can see this row */
1481         c->c[1][1] = comm->cell_x0[dim1];
1482         if (isDlbOn(dd->comm))
1483         {
1484             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1485             if (bDistMB)
1486             {
1487                 /* For the multi-body distance we need the maximum */
1488                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1489             }
1490         }
1491         /* Set the upper-right corner for rounding */
1492         c->cr0 = comm->cell_x1[dim0];
1493
1494         if (dd->ndim >= 3)
1495         {
1496             dim2 = dd->dim[2];
1497             for (int j = 0; j < 4; j++)
1498             {
1499                 c->c[2][j] = comm->cell_x0[dim2];
1500             }
1501             if (isDlbOn(dd->comm))
1502             {
1503                 /* Use the maximum of the i-cells that see a j-cell */
1504                 for (const auto& iZone : zones->iZones)
1505                 {
1506                     const int iZoneIndex = iZone.iZoneIndex;
1507                     for (int jZone : iZone.jZoneRange)
1508                     {
1509                         if (jZone >= 4)
1510                         {
1511                             c->c[2][jZone - 4] = std::max(
1512                                     c->c[2][jZone - 4],
1513                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1514                                             .mch0);
1515                         }
1516                     }
1517                 }
1518                 if (bDistMB)
1519                 {
1520                     /* For the multi-body distance we need the maximum */
1521                     c->bc[2] = comm->cell_x0[dim2];
1522                     for (int i = 0; i < 2; i++)
1523                     {
1524                         for (int j = 0; j < 2; j++)
1525                         {
1526                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1527                         }
1528                     }
1529                 }
1530             }
1531
1532             /* Set the upper-right corner for rounding */
1533             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1534              * Only cell (0,0,0) can see cell 7 (1,1,1)
1535              */
1536             c->cr1[0] = comm->cell_x1[dim1];
1537             c->cr1[3] = comm->cell_x1[dim1];
1538             if (isDlbOn(dd->comm))
1539             {
1540                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1541                 if (bDistMB)
1542                 {
1543                     /* For the multi-body distance we need the maximum */
1544                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1545                 }
1546             }
1547         }
1548     }
1549 }
1550
1551 /*! \brief Add the atom groups and coordinates we need to send in this
1552  * pulse from this zone to \p localAtomGroups and \p work. */
1553 static void get_zone_pulse_groups(gmx_domdec_t*                  dd,
1554                                   int                            zonei,
1555                                   int                            zone,
1556                                   int                            cg0,
1557                                   int                            cg1,
1558                                   gmx::ArrayRef<const int>       globalAtomGroupIndices,
1559                                   int                            dim,
1560                                   int                            dim_ind,
1561                                   int                            dim0,
1562                                   int                            dim1,
1563                                   int                            dim2,
1564                                   real                           r_comm2,
1565                                   real                           r_bcomm2,
1566                                   matrix                         box,
1567                                   bool                           distanceIsTriclinic,
1568                                   rvec*                          normal,
1569                                   real                           skew_fac2_d,
1570                                   real                           skew_fac_01,
1571                                   rvec*                          v_d,
1572                                   rvec*                          v_0,
1573                                   rvec*                          v_1,
1574                                   const dd_corners_t*            c,
1575                                   const rvec                     sf2_round,
1576                                   gmx_bool                       bDistBonded,
1577                                   gmx_bool                       bBondComm,
1578                                   gmx_bool                       bDist2B,
1579                                   gmx_bool                       bDistMB,
1580                                   gmx::ArrayRef<const gmx::RVec> coordinates,
1581                                   gmx::ArrayRef<const int>       cginfo,
1582                                   std::vector<int>*              localAtomGroups,
1583                                   dd_comm_setup_work_t*          work)
1584 {
1585     gmx_domdec_comm_t* comm;
1586     gmx_bool           bScrew;
1587     gmx_bool           bDistMB_pulse;
1588     int                cg, i;
1589     real               r2, rb2, r, tric_sh;
1590     rvec               rn, rb;
1591     int                dimd;
1592     int                nsend_z, nat;
1593
1594     comm = dd->comm;
1595
1596     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1597
1598     bDistMB_pulse = (bDistMB && bDistBonded);
1599
1600     /* Unpack the work data */
1601     std::vector<int>&       ibuf = work->atomGroupBuffer;
1602     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1603     nsend_z                      = 0;
1604     nat                          = work->nat;
1605
1606     for (cg = cg0; cg < cg1; cg++)
1607     {
1608         r2  = 0;
1609         rb2 = 0;
1610         if (!distanceIsTriclinic)
1611         {
1612             /* Rectangular direction, easy */
1613             r = coordinates[cg][dim] - c->c[dim_ind][zone];
1614             if (r > 0)
1615             {
1616                 r2 += r * r;
1617             }
1618             if (bDistMB_pulse)
1619             {
1620                 r = coordinates[cg][dim] - c->bc[dim_ind];
1621                 if (r > 0)
1622                 {
1623                     rb2 += r * r;
1624                 }
1625             }
1626             /* Rounding gives at most a 16% reduction
1627              * in communicated atoms
1628              */
1629             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1630             {
1631                 r = coordinates[cg][dim0] - c->cr0;
1632                 /* This is the first dimension, so always r >= 0 */
1633                 r2 += r * r;
1634                 if (bDistMB_pulse)
1635                 {
1636                     rb2 += r * r;
1637                 }
1638             }
1639             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1640             {
1641                 r = coordinates[cg][dim1] - c->cr1[zone];
1642                 if (r > 0)
1643                 {
1644                     r2 += r * r;
1645                 }
1646                 if (bDistMB_pulse)
1647                 {
1648                     r = coordinates[cg][dim1] - c->bcr1;
1649                     if (r > 0)
1650                     {
1651                         rb2 += r * r;
1652                     }
1653                 }
1654             }
1655         }
1656         else
1657         {
1658             /* Triclinic direction, more complicated */
1659             clear_rvec(rn);
1660             clear_rvec(rb);
1661             /* Rounding, conservative as the skew_fac multiplication
1662              * will slightly underestimate the distance.
1663              */
1664             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1665             {
1666                 rn[dim0] = coordinates[cg][dim0] - c->cr0;
1667                 for (i = dim0 + 1; i < DIM; i++)
1668                 {
1669                     rn[dim0] -= coordinates[cg][i] * v_0[i][dim0];
1670                 }
1671                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1672                 if (bDistMB_pulse)
1673                 {
1674                     rb[dim0] = rn[dim0];
1675                     rb2      = r2;
1676                 }
1677                 /* Take care that the cell planes along dim0 might not
1678                  * be orthogonal to those along dim1 and dim2.
1679                  */
1680                 for (i = 1; i <= dim_ind; i++)
1681                 {
1682                     dimd = dd->dim[i];
1683                     if (normal[dim0][dimd] > 0)
1684                     {
1685                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1686                         if (bDistMB_pulse)
1687                         {
1688                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1689                         }
1690                     }
1691                 }
1692             }
1693             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1694             {
1695                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1696                 rn[dim1] += coordinates[cg][dim1] - c->cr1[zone];
1697                 tric_sh = 0;
1698                 for (i = dim1 + 1; i < DIM; i++)
1699                 {
1700                     tric_sh -= coordinates[cg][i] * v_1[i][dim1];
1701                 }
1702                 rn[dim1] += tric_sh;
1703                 if (rn[dim1] > 0)
1704                 {
1705                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1706                     /* Take care of coupling of the distances
1707                      * to the planes along dim0 and dim1 through dim2.
1708                      */
1709                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1710                     /* Take care that the cell planes along dim1
1711                      * might not be orthogonal to that along dim2.
1712                      */
1713                     if (normal[dim1][dim2] > 0)
1714                     {
1715                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1716                     }
1717                 }
1718                 if (bDistMB_pulse)
1719                 {
1720                     rb[dim1] += coordinates[cg][dim1] - c->bcr1 + tric_sh;
1721                     if (rb[dim1] > 0)
1722                     {
1723                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1724                         /* Take care of coupling of the distances
1725                          * to the planes along dim0 and dim1 through dim2.
1726                          */
1727                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1728                         /* Take care that the cell planes along dim1
1729                          * might not be orthogonal to that along dim2.
1730                          */
1731                         if (normal[dim1][dim2] > 0)
1732                         {
1733                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1734                         }
1735                     }
1736                 }
1737             }
1738             /* The distance along the communication direction */
1739             rn[dim] += coordinates[cg][dim] - c->c[dim_ind][zone];
1740             tric_sh = 0;
1741             for (i = dim + 1; i < DIM; i++)
1742             {
1743                 tric_sh -= coordinates[cg][i] * v_d[i][dim];
1744             }
1745             rn[dim] += tric_sh;
1746             if (rn[dim] > 0)
1747             {
1748                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1749                 /* Take care of coupling of the distances
1750                  * to the planes along dim0 and dim1 through dim2.
1751                  */
1752                 if (dim_ind == 1 && zonei == 1)
1753                 {
1754                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1755                 }
1756             }
1757             if (bDistMB_pulse)
1758             {
1759                 clear_rvec(rb);
1760                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1761                 rb[dim] += coordinates[cg][dim] - c->bc[dim_ind] + tric_sh;
1762                 if (rb[dim] > 0)
1763                 {
1764                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1765                     /* Take care of coupling of the distances
1766                      * to the planes along dim0 and dim1 through dim2.
1767                      */
1768                     if (dim_ind == 1 && zonei == 1)
1769                     {
1770                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1771                     }
1772                 }
1773             }
1774         }
1775
1776         if (r2 < r_comm2
1777             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1778                 && (!bBondComm
1779                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1780                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1781         {
1782             /* Store the local and global atom group indices and position */
1783             localAtomGroups->push_back(cg);
1784             ibuf.push_back(globalAtomGroupIndices[cg]);
1785             nsend_z++;
1786
1787             rvec posPbc;
1788             if (dd->ci[dim] == 0)
1789             {
1790                 /* Correct coordinates for pbc */
1791                 rvec_add(coordinates[cg], box[dim], posPbc);
1792                 if (bScrew)
1793                 {
1794                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1795                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1796                 }
1797             }
1798             else
1799             {
1800                 copy_rvec(coordinates[cg], posPbc);
1801             }
1802             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1803
1804             nat += 1;
1805         }
1806     }
1807
1808     work->nat        = nat;
1809     work->nsend_zone = nsend_z;
1810 }
1811
1812 //! Clear data.
1813 static void clearCommSetupData(dd_comm_setup_work_t* work)
1814 {
1815     work->localAtomGroupBuffer.clear();
1816     work->atomGroupBuffer.clear();
1817     work->positionBuffer.clear();
1818     work->nat        = 0;
1819     work->nsend_zone = 0;
1820 }
1821
1822 //! Prepare DD communication.
1823 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1824 {
1825     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1826     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1827     int                    c;
1828     int *                  zone_cg_range, pos_cg;
1829     gmx_domdec_comm_t*     comm;
1830     gmx_domdec_zones_t*    zones;
1831     gmx_domdec_comm_dim_t* cd;
1832     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1833     dd_corners_t           corners;
1834     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1835     real                   skew_fac2_d, skew_fac_01;
1836     rvec                   sf2_round;
1837
1838     if (debug)
1839     {
1840         fprintf(debug, "Setting up DD communication\n");
1841     }
1842
1843     comm = dd->comm;
1844
1845     if (comm->dth.empty())
1846     {
1847         /* Initialize the thread data.
1848          * This can not be done in init_domain_decomposition,
1849          * as the numbers of threads is determined later.
1850          */
1851         int numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Domdec);
1852         comm->dth.resize(numThreads);
1853     }
1854
1855     bBondComm = comm->systemInfo.filterBondedCommunication;
1856
1857     /* Do we need to determine extra distances for multi-body bondeds? */
1858     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1859
1860     /* Do we need to determine extra distances for only two-body bondeds? */
1861     bDist2B = (bBondComm && !bDistMB);
1862
1863     const real r_comm2 =
1864             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1865     const real r_bcomm2 =
1866             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1867
1868     if (debug)
1869     {
1870         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1871     }
1872
1873     zones = &comm->zones;
1874
1875     dim0 = dd->dim[0];
1876     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1877     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1878
1879     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1880
1881     /* Triclinic stuff */
1882     normal      = ddbox->normal;
1883     skew_fac_01 = 0;
1884     if (dd->ndim >= 2)
1885     {
1886         v_0 = ddbox->v[dim0];
1887         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1888         {
1889             /* Determine the coupling coefficient for the distances
1890              * to the cell planes along dim0 and dim1 through dim2.
1891              * This is required for correct rounding.
1892              */
1893             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1894             if (debug)
1895             {
1896                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1897             }
1898         }
1899     }
1900     if (dd->ndim >= 3)
1901     {
1902         v_1 = ddbox->v[dim1];
1903     }
1904
1905     zone_cg_range                        = zones->cg_range.data();
1906     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1907
1908     zone_cg_range[0]   = 0;
1909     zone_cg_range[1]   = dd->ncg_home;
1910     comm->zone_ncg1[0] = dd->ncg_home;
1911     pos_cg             = dd->ncg_home;
1912
1913     nat_tot = comm->atomRanges.numHomeAtoms();
1914     nzone   = 1;
1915     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1916     {
1917         dim = dd->dim[dim_ind];
1918         cd  = &comm->cd[dim_ind];
1919
1920         /* Check if we need to compute triclinic distances along this dim */
1921         bool distanceIsTriclinic = false;
1922         for (int i = 0; i <= dim_ind; i++)
1923         {
1924             if (ddbox->tric_dir[dd->dim[i]])
1925             {
1926                 distanceIsTriclinic = true;
1927             }
1928         }
1929
1930         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1931         {
1932             /* No pbc in this dimension, the first node should not comm. */
1933             nzone_send = 0;
1934         }
1935         else
1936         {
1937             nzone_send = nzone;
1938         }
1939
1940         v_d         = ddbox->v[dim];
1941         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1942
1943         cd->receiveInPlace = true;
1944         for (int p = 0; p < cd->numPulses(); p++)
1945         {
1946             /* Only atoms communicated in the first pulse are used
1947              * for multi-body bonded interactions or for bBondComm.
1948              */
1949             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1950
1951             gmx_domdec_ind_t* ind = &cd->ind[p];
1952
1953             /* Thread 0 writes in the global index array */
1954             ind->index.clear();
1955             clearCommSetupData(&comm->dth[0]);
1956
1957             for (zone = 0; zone < nzone_send; zone++)
1958             {
1959                 if (dim_ind > 0 && distanceIsTriclinic)
1960                 {
1961                     /* Determine slightly more optimized skew_fac's
1962                      * for rounding.
1963                      * This reduces the number of communicated atoms
1964                      * by about 10% for 3D DD of rhombic dodecahedra.
1965                      */
1966                     for (dimd = 0; dimd < dim; dimd++)
1967                     {
1968                         sf2_round[dimd] = 1;
1969                         if (ddbox->tric_dir[dimd])
1970                         {
1971                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1972                             {
1973                                 /* If we are shifted in dimension i
1974                                  * and the cell plane is tilted forward
1975                                  * in dimension i, skip this coupling.
1976                                  */
1977                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1978                                 {
1979                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1980                                 }
1981                             }
1982                             sf2_round[dimd] = 1 / sf2_round[dimd];
1983                         }
1984                     }
1985                 }
1986
1987                 zonei = zone_perm[dim_ind][zone];
1988                 if (p == 0)
1989                 {
1990                     /* Here we permutate the zones to obtain a convenient order
1991                      * for neighbor searching
1992                      */
1993                     cg0 = zone_cg_range[zonei];
1994                     cg1 = zone_cg_range[zonei + 1];
1995                 }
1996                 else
1997                 {
1998                     /* Look only at the cg's received in the previous grid pulse
1999                      */
2000                     cg1 = zone_cg_range[nzone + zone + 1];
2001                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
2002                 }
2003
2004                 const int numThreads = gmx::ssize(comm->dth);
2005 #pragma omp parallel for num_threads(numThreads) schedule(static)
2006                 for (int th = 0; th < numThreads; th++)
2007                 {
2008                     try
2009                     {
2010                         dd_comm_setup_work_t& work = comm->dth[th];
2011
2012                         /* Retain data accumulated into buffers of thread 0 */
2013                         if (th > 0)
2014                         {
2015                             clearCommSetupData(&work);
2016                         }
2017
2018                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2019                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2020
2021                         /* Get the atom groups and coordinates for this pulse in this zone */
2022                         get_zone_pulse_groups(dd,
2023                                               zonei,
2024                                               zone,
2025                                               cg0_th,
2026                                               cg1_th,
2027                                               dd->globalAtomGroupIndices,
2028                                               dim,
2029                                               dim_ind,
2030                                               dim0,
2031                                               dim1,
2032                                               dim2,
2033                                               r_comm2,
2034                                               r_bcomm2,
2035                                               box,
2036                                               distanceIsTriclinic,
2037                                               normal,
2038                                               skew_fac2_d,
2039                                               skew_fac_01,
2040                                               v_d,
2041                                               v_0,
2042                                               v_1,
2043                                               &corners,
2044                                               sf2_round,
2045                                               bDistBonded,
2046                                               bBondComm,
2047                                               bDist2B,
2048                                               bDistMB,
2049                                               state->x,
2050                                               fr->cginfo,
2051                                               th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2052                                               &work);
2053                     }
2054                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2055                 } // END
2056
2057                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2058                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2059                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2060                 /* Append data of threads>=1 to the communication buffers */
2061                 for (int th = 1; th < numThreads; th++)
2062                 {
2063                     const dd_comm_setup_work_t& dth = comm->dth[th];
2064
2065                     ind->index.insert(ind->index.end(),
2066                                       dth.localAtomGroupBuffer.begin(),
2067                                       dth.localAtomGroupBuffer.end());
2068                     atomGroups.insert(
2069                             atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2070                     positions.insert(
2071                             positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2072                     comm->dth[0].nat += dth.nat;
2073                     ind->nsend[zone] += dth.nsend_zone;
2074                 }
2075             }
2076             /* Clear the counts in case we do not have pbc */
2077             for (zone = nzone_send; zone < nzone; zone++)
2078             {
2079                 ind->nsend[zone] = 0;
2080             }
2081             ind->nsend[nzone]     = ind->index.size();
2082             ind->nsend[nzone + 1] = comm->dth[0].nat;
2083             /* Communicate the number of cg's and atoms to receive */
2084             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2085
2086             if (p > 0)
2087             {
2088                 /* We can receive in place if only the last zone is not empty */
2089                 for (zone = 0; zone < nzone - 1; zone++)
2090                 {
2091                     if (ind->nrecv[zone] > 0)
2092                     {
2093                         cd->receiveInPlace = false;
2094                     }
2095                 }
2096             }
2097
2098             int receiveBufferSize = 0;
2099             if (!cd->receiveInPlace)
2100             {
2101                 receiveBufferSize = ind->nrecv[nzone];
2102             }
2103             /* These buffer are actually only needed with in-place */
2104             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2105             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2106
2107             dd_comm_setup_work_t& work = comm->dth[0];
2108
2109             /* Make space for the global cg indices */
2110             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2111             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2112             /* Communicate the global cg indices */
2113             gmx::ArrayRef<int> integerBufferRef;
2114             if (cd->receiveInPlace)
2115             {
2116                 integerBufferRef = gmx::arrayRefFromArray(
2117                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2118             }
2119             else
2120             {
2121                 integerBufferRef = globalAtomGroupBuffer.buffer;
2122             }
2123             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2124
2125             /* Make space for cginfo */
2126             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2127
2128             /* Communicate the coordinates */
2129             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2130             if (cd->receiveInPlace)
2131             {
2132                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2133             }
2134             else
2135             {
2136                 rvecBufferRef = rvecBuffer.buffer;
2137             }
2138             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2139
2140             /* Make the charge group index */
2141             if (cd->receiveInPlace)
2142             {
2143                 zone = (p == 0 ? 0 : nzone - 1);
2144                 while (zone < nzone)
2145                 {
2146                     for (int i = 0; i < ind->nrecv[zone]; i++)
2147                     {
2148                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2149                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2150                         pos_cg++;
2151                     }
2152                     if (p == 0)
2153                     {
2154                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2155                     }
2156                     zone++;
2157                     zone_cg_range[nzone + zone] = pos_cg;
2158                 }
2159             }
2160             else
2161             {
2162                 /* This part of the code is never executed with bBondComm. */
2163                 merge_cg_buffers(nzone,
2164                                  cd,
2165                                  p,
2166                                  zone_cg_range,
2167                                  dd->globalAtomGroupIndices,
2168                                  integerBufferRef.data(),
2169                                  state->x,
2170                                  rvecBufferRef,
2171                                  fr->cginfo_mb,
2172                                  fr->cginfo);
2173                 pos_cg += ind->nrecv[nzone];
2174             }
2175             nat_tot += ind->nrecv[nzone + 1];
2176         }
2177         if (!cd->receiveInPlace)
2178         {
2179             /* Store the atom block for easy copying of communication buffers */
2180             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2181         }
2182         nzone += nzone;
2183     }
2184
2185     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2186
2187     if (!bBondComm)
2188     {
2189         /* We don't need to update cginfo, since that was alrady done above.
2190          * So we pass NULL for the forcerec.
2191          */
2192         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2193     }
2194
2195     if (debug)
2196     {
2197         fprintf(debug, "Finished setting up DD communication, zones:");
2198         for (c = 0; c < zones->n; c++)
2199         {
2200             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2201         }
2202         fprintf(debug, "\n");
2203     }
2204 }
2205
2206 //! Set boundaries for the charge group range.
2207 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2208 {
2209     for (auto& iZone : zones->iZones)
2210     {
2211         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2212         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2213                                            zones->cg_range[iZone.jZoneRange.end()]);
2214     }
2215 }
2216
2217 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2218  *
2219  * Also sets the atom density for the home zone when \p zone_start=0.
2220  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2221  * how many charge groups will move but are still part of the current range.
2222  * \todo When converting domdec to use proper classes, all these variables
2223  *       should be private and a method should return the correct count
2224  *       depending on an internal state.
2225  *
2226  * \param[in,out] dd          The domain decomposition struct
2227  * \param[in]     box         The box
2228  * \param[in]     ddbox       The domain decomposition box struct
2229  * \param[in]     zone_start  The start of the zone range to set sizes for
2230  * \param[in]     zone_end    The end of the zone range to set sizes for
2231  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2232  */
2233 static void set_zones_size(gmx_domdec_t*      dd,
2234                            matrix             box,
2235                            const gmx_ddbox_t* ddbox,
2236                            int                zone_start,
2237                            int                zone_end,
2238                            int                numMovedChargeGroupsInHomeZone)
2239 {
2240     gmx_domdec_comm_t*  comm;
2241     gmx_domdec_zones_t* zones;
2242     gmx_bool            bDistMB;
2243     int                 z, d, dim;
2244     real                rcs, rcmbs;
2245     int                 i, j;
2246     real                vol;
2247
2248     comm = dd->comm;
2249
2250     zones = &comm->zones;
2251
2252     /* Do we need to determine extra distances for multi-body bondeds? */
2253     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2254
2255     for (z = zone_start; z < zone_end; z++)
2256     {
2257         /* Copy cell limits to zone limits.
2258          * Valid for non-DD dims and non-shifted dims.
2259          */
2260         copy_rvec(comm->cell_x0, zones->size[z].x0);
2261         copy_rvec(comm->cell_x1, zones->size[z].x1);
2262     }
2263
2264     for (d = 0; d < dd->ndim; d++)
2265     {
2266         dim = dd->dim[d];
2267
2268         for (z = 0; z < zones->n; z++)
2269         {
2270             /* With a staggered grid we have different sizes
2271              * for non-shifted dimensions.
2272              */
2273             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2274             {
2275                 if (d == 1)
2276                 {
2277                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2278                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2279                 }
2280                 else if (d == 2)
2281                 {
2282                     zones->size[z].x0[dim] =
2283                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2284                                     .min0;
2285                     zones->size[z].x1[dim] =
2286                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2287                                     .max1;
2288                 }
2289             }
2290         }
2291
2292         rcs   = comm->systemInfo.cutoff;
2293         rcmbs = comm->cutoff_mbody;
2294         if (ddbox->tric_dir[dim])
2295         {
2296             rcs /= ddbox->skew_fac[dim];
2297             rcmbs /= ddbox->skew_fac[dim];
2298         }
2299
2300         /* Set the lower limit for the shifted zone dimensions */
2301         for (z = zone_start; z < zone_end; z++)
2302         {
2303             if (zones->shift[z][dim] > 0)
2304             {
2305                 dim = dd->dim[d];
2306                 if (!isDlbOn(dd->comm) || d == 0)
2307                 {
2308                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2309                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2310                 }
2311                 else
2312                 {
2313                     /* Here we take the lower limit of the zone from
2314                      * the lowest domain of the zone below.
2315                      */
2316                     if (z < 4)
2317                     {
2318                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2319                     }
2320                     else
2321                     {
2322                         if (d == 1)
2323                         {
2324                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2325                         }
2326                         else
2327                         {
2328                             zones->size[z].x0[dim] =
2329                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2330                                             .min1;
2331                         }
2332                     }
2333                     /* A temporary limit, is updated below */
2334                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2335
2336                     if (bDistMB)
2337                     {
2338                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2339                         {
2340                             if (zones->shift[zi][dim] == 0)
2341                             {
2342                                 /* This takes the whole zone into account.
2343                                  * With multiple pulses this will lead
2344                                  * to a larger zone then strictly necessary.
2345                                  */
2346                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2347                                                                   zones->size[zi].x1[dim] + rcmbs);
2348                             }
2349                         }
2350                     }
2351                 }
2352             }
2353         }
2354
2355         /* Loop over the i-zones to set the upper limit of each
2356          * j-zone they see.
2357          */
2358         for (const auto& iZone : zones->iZones)
2359         {
2360             const int zi = iZone.iZoneIndex;
2361             if (zones->shift[zi][dim] == 0)
2362             {
2363                 /* We should only use zones up to zone_end */
2364                 const auto& jZoneRangeFull = iZone.jZoneRange;
2365                 if (zone_end <= *jZoneRangeFull.begin())
2366                 {
2367                     continue;
2368                 }
2369                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2370                                                  std::min(*jZoneRangeFull.end(), zone_end));
2371                 for (int jZone : jZoneRange)
2372                 {
2373                     if (zones->shift[jZone][dim] > 0)
2374                     {
2375                         zones->size[jZone].x1[dim] =
2376                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2377                     }
2378                 }
2379             }
2380         }
2381     }
2382
2383     for (z = zone_start; z < zone_end; z++)
2384     {
2385         /* Initialization only required to keep the compiler happy */
2386         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2387         int  nc, c;
2388
2389         /* To determine the bounding box for a zone we need to find
2390          * the extreme corners of 4, 2 or 1 corners.
2391          */
2392         nc = 1 << (ddbox->nboundeddim - 1);
2393
2394         for (c = 0; c < nc; c++)
2395         {
2396             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2397             corner[XX] = 0;
2398             if ((c & 1) == 0)
2399             {
2400                 corner[YY] = zones->size[z].x0[YY];
2401             }
2402             else
2403             {
2404                 corner[YY] = zones->size[z].x1[YY];
2405             }
2406             if ((c & 2) == 0)
2407             {
2408                 corner[ZZ] = zones->size[z].x0[ZZ];
2409             }
2410             else
2411             {
2412                 corner[ZZ] = zones->size[z].x1[ZZ];
2413             }
2414             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2415                 && box[ZZ][1 - dd->dim[0]] != 0)
2416             {
2417                 /* With 1D domain decomposition the cg's are not in
2418                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2419                  * Shift the corner of the z-vector back to along the box
2420                  * vector of dimension d, so it will later end up at 0 along d.
2421                  * This can affect the location of this corner along dd->dim[0]
2422                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2423                  */
2424                 int d = 1 - dd->dim[0];
2425
2426                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2427             }
2428             /* Apply the triclinic couplings */
2429             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2430             {
2431                 for (j = XX; j < i; j++)
2432                 {
2433                     corner[j] += corner[i] * box[i][j] / box[i][i];
2434                 }
2435             }
2436             if (c == 0)
2437             {
2438                 copy_rvec(corner, corner_min);
2439                 copy_rvec(corner, corner_max);
2440             }
2441             else
2442             {
2443                 for (i = 0; i < DIM; i++)
2444                 {
2445                     corner_min[i] = std::min(corner_min[i], corner[i]);
2446                     corner_max[i] = std::max(corner_max[i], corner[i]);
2447                 }
2448             }
2449         }
2450         /* Copy the extreme cornes without offset along x */
2451         for (i = 0; i < DIM; i++)
2452         {
2453             zones->size[z].bb_x0[i] = corner_min[i];
2454             zones->size[z].bb_x1[i] = corner_max[i];
2455         }
2456         /* Add the offset along x */
2457         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2458         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2459     }
2460
2461     if (zone_start == 0)
2462     {
2463         vol = 1;
2464         for (dim = 0; dim < DIM; dim++)
2465         {
2466             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2467         }
2468         zones->dens_zone0 =
2469                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2470     }
2471
2472     if (debug)
2473     {
2474         for (z = zone_start; z < zone_end; z++)
2475         {
2476             fprintf(debug,
2477                     "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2478                     z,
2479                     zones->size[z].x0[XX],
2480                     zones->size[z].x1[XX],
2481                     zones->size[z].x0[YY],
2482                     zones->size[z].x1[YY],
2483                     zones->size[z].x0[ZZ],
2484                     zones->size[z].x1[ZZ]);
2485             fprintf(debug,
2486                     "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2487                     z,
2488                     zones->size[z].bb_x0[XX],
2489                     zones->size[z].bb_x1[XX],
2490                     zones->size[z].bb_x0[YY],
2491                     zones->size[z].bb_x1[YY],
2492                     zones->size[z].bb_x0[ZZ],
2493                     zones->size[z].bb_x1[ZZ]);
2494         }
2495     }
2496 }
2497
2498 /*! \brief Order data in \p dataToSort according to \p sort
2499  *
2500  * Note: both buffers should have at least \p sort.size() elements.
2501  */
2502 template<typename T>
2503 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2504                         gmx::ArrayRef<T>                  dataToSort,
2505                         gmx::ArrayRef<T>                  sortBuffer)
2506 {
2507     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2508     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2509                "The sorting buffer needs to be sufficiently large");
2510
2511     /* Order the data into the temporary buffer */
2512     size_t i = 0;
2513     for (const gmx_cgsort_t& entry : sort)
2514     {
2515         sortBuffer[i++] = dataToSort[entry.ind];
2516     }
2517
2518     /* Copy back to the original array */
2519     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2520 }
2521
2522 /*! \brief Order data in \p dataToSort according to \p sort
2523  *
2524  * Note: \p vectorToSort should have at least \p sort.size() elements,
2525  *       \p workVector is resized when it is too small.
2526  */
2527 template<typename T>
2528 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2529                         gmx::ArrayRef<T>                  vectorToSort,
2530                         std::vector<T>*                   workVector)
2531 {
2532     if (gmx::index(workVector->size()) < sort.ssize())
2533     {
2534         workVector->resize(sort.size());
2535     }
2536     orderVector<T>(sort, vectorToSort, *workVector);
2537 }
2538
2539 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2540 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2541 {
2542     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2543
2544     /* Using push_back() instead of this resize results in much slower code */
2545     sort->resize(atomOrder.size());
2546     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2547     size_t                      numSorted = 0;
2548     for (int i : atomOrder)
2549     {
2550         if (i >= 0)
2551         {
2552             /* The values of nsc and ind_gl are not used in this case */
2553             buffer[numSorted++].ind = i;
2554         }
2555     }
2556     sort->resize(numSorted);
2557 }
2558
2559 //! Returns the sorting state for DD.
2560 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2561 {
2562     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2563
2564     dd_sort_order_nbnxn(fr, &sort->sorted);
2565
2566     /* We alloc with the old size, since cgindex is still old */
2567     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2568
2569     /* Set the new home atom/charge group count */
2570     dd->ncg_home = sort->sorted.size();
2571     if (debug)
2572     {
2573         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2574     }
2575
2576     /* Reorder the state */
2577     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2578     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2579
2580     if (state->flags & enumValueToBitMask(StateEntry::X))
2581     {
2582         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2583     }
2584     if (state->flags & enumValueToBitMask(StateEntry::V))
2585     {
2586         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2587     }
2588     if (state->flags & enumValueToBitMask(StateEntry::Cgp))
2589     {
2590         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2591     }
2592
2593     /* Reorder the global cg index */
2594     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2595     /* Reorder the cginfo */
2596     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2597     /* Set the home atom number */
2598     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2599
2600     /* The atoms are now exactly in grid order, update the grid order */
2601     fr->nbv->setLocalAtomOrder();
2602 }
2603
2604 //! Accumulates load statistics.
2605 static void add_dd_statistics(gmx_domdec_t* dd)
2606 {
2607     gmx_domdec_comm_t* comm = dd->comm;
2608
2609     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2610     {
2611         auto range = static_cast<DDAtomRanges::Type>(i);
2612         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2613     }
2614     comm->ndecomp++;
2615 }
2616
2617 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2618 {
2619     gmx_domdec_comm_t* comm = dd->comm;
2620
2621     /* Reset all the statistics and counters for total run counting */
2622     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2623     {
2624         comm->sum_nat[i] = 0;
2625     }
2626     comm->ndecomp   = 0;
2627     comm->nload     = 0;
2628     comm->load_step = 0;
2629     comm->load_sum  = 0;
2630     comm->load_max  = 0;
2631     clear_ivec(comm->load_lim);
2632     comm->load_mdf = 0;
2633     comm->load_pme = 0;
2634 }
2635
2636 namespace gmx
2637 {
2638
2639 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, bool bFatal)
2640 {
2641     gmx_domdec_comm_t* comm    = dd->comm;
2642     bool               invalid = false;
2643
2644     for (int d = 1; d < dd->ndim; d++)
2645     {
2646         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
2647         const int                 dim       = dd->dim[d];
2648         const real                limit     = grid_jump_limit(comm, cutoff, d);
2649         real                      bfac      = ddbox->box_size[dim];
2650         if (ddbox->tric_dir[dim])
2651         {
2652             bfac *= ddbox->skew_fac[dim];
2653         }
2654         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
2655             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
2656         {
2657             invalid = true;
2658
2659             if (bFatal)
2660             {
2661                 char buf[22];
2662
2663                 /* This error should never be triggered under normal
2664                  * circumstances, but you never know ...
2665                  */
2666                 gmx_fatal(FARGS,
2667                           "step %s: The domain decomposition grid has shifted too much in the "
2668                           "%c-direction around cell %d %d %d. This should not have happened. "
2669                           "Running with fewer ranks might avoid this issue.",
2670                           gmx_step_str(step, buf),
2671                           dim2char(dim),
2672                           dd->ci[XX],
2673                           dd->ci[YY],
2674                           dd->ci[ZZ]);
2675             }
2676         }
2677     }
2678
2679     return invalid;
2680 }
2681
2682 void print_dd_statistics(const t_commrec* cr, const t_inputrec& inputrec, FILE* fplog)
2683 {
2684     gmx_domdec_comm_t* comm = cr->dd->comm;
2685
2686     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2687     gmx_sumd(numRanges, comm->sum_nat, cr);
2688
2689     if (fplog == nullptr)
2690     {
2691         return;
2692     }
2693
2694     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2695
2696     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2697     {
2698         auto   range = static_cast<DDAtomRanges::Type>(i);
2699         double av    = comm->sum_nat[i] / comm->ndecomp;
2700         switch (range)
2701         {
2702             case DDAtomRanges::Type::Zones:
2703                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2704                 break;
2705             case DDAtomRanges::Type::Vsites:
2706                 if (cr->dd->vsite_comm)
2707                 {
2708                     fprintf(fplog,
2709                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2710                             (EEL_PME(inputrec.coulombtype)
2711                              || inputrec.coulombtype == CoulombInteractionType::Ewald)
2712                                     ? 3
2713                                     : 2,
2714                             av);
2715                 }
2716                 break;
2717             case DDAtomRanges::Type::Constraints:
2718                 if (cr->dd->constraint_comm)
2719                 {
2720                     fprintf(fplog,
2721                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2722                             1 + inputrec.nLincsIter,
2723                             av);
2724                 }
2725                 break;
2726             default: gmx_incons(" Unknown type for DD statistics");
2727         }
2728     }
2729     fprintf(fplog, "\n");
2730
2731     if (comm->ddSettings.recordLoad && EI_DYNAMICS(inputrec.eI))
2732     {
2733         print_dd_load_av(fplog, cr->dd);
2734     }
2735 }
2736
2737 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2738 void dd_partition_system(FILE*                     fplog,
2739                          const gmx::MDLogger&      mdlog,
2740                          int64_t                   step,
2741                          const t_commrec*          cr,
2742                          bool                      bMasterState,
2743                          int                       nstglobalcomm,
2744                          t_state*                  state_global,
2745                          const gmx_mtop_t&         top_global,
2746                          const t_inputrec&         inputrec,
2747                          gmx::ImdSession*          imdSession,
2748                          pull_t*                   pull_work,
2749                          t_state*                  state_local,
2750                          gmx::ForceBuffers*        f,
2751                          gmx::MDAtoms*             mdAtoms,
2752                          gmx_localtop_t*           top_local,
2753                          t_forcerec*               fr,
2754                          gmx::VirtualSitesHandler* vsite,
2755                          gmx::Constraints*         constr,
2756                          t_nrnb*                   nrnb,
2757                          gmx_wallcycle*            wcycle,
2758                          bool                      bVerbose)
2759 {
2760     gmx_ddbox_t ddbox = { 0 };
2761     int         ncgindex_set;
2762     char        sbuf[22];
2763
2764     wallcycle_start(wcycle, WallCycleCounter::Domdec);
2765
2766     gmx_domdec_t*      dd   = cr->dd;
2767     gmx_domdec_comm_t* comm = dd->comm;
2768
2769     // TODO if the update code becomes accessible here, use
2770     // upd->deform for this logic.
2771     bool bBoxChanged = (bMasterState || inputrecDeform(&inputrec));
2772     if (inputrec.epc != PressureCoupling::No)
2773     {
2774         /* With nstpcouple > 1 pressure coupling happens.
2775          * one step after calculating the pressure.
2776          * Box scaling happens at the end of the MD step,
2777          * after the DD partitioning.
2778          * We therefore have to do DLB in the first partitioning
2779          * after an MD step where P-coupling occurred.
2780          * We need to determine the last step in which p-coupling occurred.
2781          * MRS -- need to validate this for vv?
2782          */
2783         int     n = inputrec.nstpcouple;
2784         int64_t step_pcoupl;
2785         if (n == 1)
2786         {
2787             step_pcoupl = step - 1;
2788         }
2789         else
2790         {
2791             step_pcoupl = ((step - 1) / n) * n + 1;
2792         }
2793         if (step_pcoupl >= comm->partition_step)
2794         {
2795             bBoxChanged = true;
2796         }
2797     }
2798
2799     bool bNStGlobalComm = (step % nstglobalcomm == 0);
2800     bool bDoDLB;
2801     if (!isDlbOn(comm))
2802     {
2803         bDoDLB = false;
2804     }
2805     else
2806     {
2807         /* Should we do dynamic load balacing this step?
2808          * Since it requires (possibly expensive) global communication,
2809          * we might want to do DLB less frequently.
2810          */
2811         if (bBoxChanged || inputrec.epc != PressureCoupling::No)
2812         {
2813             bDoDLB = bBoxChanged;
2814         }
2815         else
2816         {
2817             bDoDLB = bNStGlobalComm;
2818         }
2819     }
2820
2821     /* Check if we have recorded loads on the nodes */
2822     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2823     {
2824         bool bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2825
2826         /* Print load every nstlog, first and last step to the log file */
2827         bool bLogLoad = ((inputrec.nstlog > 0 && step % inputrec.nstlog == 0) || comm->n_load_collect == 0
2828                          || (inputrec.nsteps >= 0
2829                              && (step + inputrec.nstlist > inputrec.init_step + inputrec.nsteps)));
2830
2831         /* Avoid extra communication due to verbose screen output
2832          * when nstglobalcomm is set.
2833          */
2834         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2835             || (bVerbose && (inputrec.nstlist == 0 || nstglobalcomm <= inputrec.nstlist)))
2836         {
2837             get_load_distribution(dd, wcycle);
2838             if (DDMASTER(dd))
2839             {
2840                 if (bLogLoad)
2841                 {
2842                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2843                 }
2844                 if (bVerbose)
2845                 {
2846                     dd_print_load_verbose(dd);
2847                 }
2848             }
2849             comm->n_load_collect++;
2850
2851             if (isDlbOn(comm))
2852             {
2853                 if (DDMASTER(dd))
2854                 {
2855                     /* Add the measured cycles to the running average */
2856                     const float averageFactor = 0.1F;
2857                     comm->cyclesPerStepDlbExpAverage =
2858                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2859                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2860                 }
2861                 if (comm->dlbState == DlbState::onCanTurnOff
2862                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2863                 {
2864                     bool turnOffDlb;
2865                     if (DDMASTER(dd))
2866                     {
2867                         /* If the running averaged cycles with DLB are more
2868                          * than before we turned on DLB, turn off DLB.
2869                          * We will again run and check the cycles without DLB
2870                          * and we can then decide if to turn off DLB forever.
2871                          */
2872                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2873                     }
2874                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2875                     if (turnOffDlb)
2876                     {
2877                         /* To turn off DLB, we need to redistribute the atoms */
2878                         dd_collect_state(dd, state_local, state_global);
2879                         bMasterState = true;
2880                         turn_off_dlb(mdlog, dd, step);
2881                     }
2882                 }
2883             }
2884             else if (bCheckWhetherToTurnDlbOn)
2885             {
2886                 bool turnOffDlbForever = false;
2887                 bool turnOnDlb         = false;
2888
2889                 /* Since the timings are node dependent, the master decides */
2890                 if (DDMASTER(dd))
2891                 {
2892                     /* If we recently turned off DLB, we want to check if
2893                      * performance is better without DLB. We want to do this
2894                      * ASAP to minimize the chance that external factors
2895                      * slowed down the DLB step are gone here and we
2896                      * incorrectly conclude that DLB was causing the slowdown.
2897                      * So we measure one nstlist block, no running average.
2898                      */
2899                     if (comm->haveTurnedOffDlb
2900                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2901                     {
2902                         /* After turning off DLB we ran nstlist steps in fewer
2903                          * cycles than with DLB. This likely means that DLB
2904                          * in not benefical, but this could be due to a one
2905                          * time unlucky fluctuation, so we require two such
2906                          * observations in close succession to turn off DLB
2907                          * forever.
2908                          */
2909                         if (comm->dlbSlowerPartitioningCount > 0
2910                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2911                         {
2912                             turnOffDlbForever = true;
2913                         }
2914                         comm->haveTurnedOffDlb = false;
2915                         /* Register when we last measured DLB slowdown */
2916                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2917                     }
2918                     else
2919                     {
2920                         /* Here we check if the max PME rank load is more than 0.98
2921                          * the max PP force load. If so, PP DLB will not help,
2922                          * since we are (almost) limited by PME. Furthermore,
2923                          * DLB will cause a significant extra x/f redistribution
2924                          * cost on the PME ranks, which will then surely result
2925                          * in lower total performance.
2926                          */
2927                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2928                         {
2929                             turnOnDlb = false;
2930                         }
2931                         else
2932                         {
2933                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2934                         }
2935                     }
2936                 }
2937                 struct
2938                 {
2939                     bool turnOffDlbForever;
2940                     bool turnOnDlb;
2941                 } bools{ turnOffDlbForever, turnOnDlb };
2942                 dd_bcast(dd, sizeof(bools), &bools);
2943                 if (bools.turnOffDlbForever)
2944                 {
2945                     turn_off_dlb_forever(mdlog, dd, step);
2946                 }
2947                 else if (bools.turnOnDlb)
2948                 {
2949                     turn_on_dlb(mdlog, dd, step);
2950                     bDoDLB = true;
2951                 }
2952             }
2953         }
2954         comm->n_load_have++;
2955     }
2956
2957     bool bRedist = false;
2958     if (bMasterState)
2959     {
2960         /* Clear the old state */
2961         clearDDStateIndices(dd, false);
2962         ncgindex_set = 0;
2963
2964         auto xGlobal = positionsFromStatePointer(state_global);
2965
2966         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2967
2968         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2969
2970         /* Ensure that we have space for the new distribution */
2971         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2972
2973         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2974
2975         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2976     }
2977     else if (state_local->ddp_count != dd->ddp_count)
2978     {
2979         if (state_local->ddp_count > dd->ddp_count)
2980         {
2981             gmx_fatal(FARGS,
2982                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2983                       ")",
2984                       state_local->ddp_count,
2985                       dd->ddp_count);
2986         }
2987
2988         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2989         {
2990             gmx_fatal(FARGS,
2991                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2992                       "state_local->ddp_count (%d)",
2993                       state_local->ddp_count_cg_gl,
2994                       state_local->ddp_count);
2995         }
2996
2997         /* Clear the old state */
2998         clearDDStateIndices(dd, false);
2999
3000         /* Restore the atom group indices from state_local */
3001         restoreAtomGroups(dd, state_local);
3002         make_dd_indices(dd, 0);
3003         ncgindex_set = dd->ncg_home;
3004
3005         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3006
3007         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
3008
3009         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3010
3011         bRedist = isDlbOn(comm);
3012     }
3013     else
3014     {
3015         /* We have the full state, only redistribute the cgs */
3016
3017         /* Clear the non-home indices */
3018         clearDDStateIndices(dd, true);
3019         ncgindex_set = 0;
3020
3021         /* To avoid global communication, we do not recompute the extent
3022          * of the system for dims without pbc. Therefore we need to copy
3023          * the previously computed values when we do not communicate.
3024          */
3025         if (!bNStGlobalComm)
3026         {
3027             copy_rvec(comm->box0, ddbox.box0);
3028             copy_rvec(comm->box_size, ddbox.box_size);
3029         }
3030         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3031
3032         bBoxChanged = true;
3033         bRedist     = true;
3034     }
3035     /* Copy needed for dim's without pbc when avoiding communication */
3036     copy_rvec(ddbox.box0, comm->box0);
3037     copy_rvec(ddbox.box_size, comm->box_size);
3038
3039     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3040
3041     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3042     {
3043         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3044     }
3045
3046     if (comm->systemInfo.useUpdateGroups)
3047     {
3048         comm->updateGroupsCog->addCogs(
3049                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
3050     }
3051
3052     /* Check if we should sort the charge groups */
3053     const bool bSortCG = (bMasterState || bRedist);
3054
3055     /* When repartitioning we mark atom groups that will move to neighboring
3056      * DD cells, but we do not move them right away for performance reasons.
3057      * Thus we need to keep track of how many charge groups will move for
3058      * obtaining correct local charge group / atom counts.
3059      */
3060     int ncg_moved = 0;
3061     if (bRedist)
3062     {
3063         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
3064
3065         ncgindex_set = dd->ncg_home;
3066         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3067
3068         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3069
3070         if (comm->systemInfo.useUpdateGroups)
3071         {
3072             comm->updateGroupsCog->addCogs(
3073                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3074                     state_local->x);
3075         }
3076
3077         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
3078     }
3079
3080     RVec cell_ns_x0, cell_ns_x1;
3081     get_nsgrid_boundaries(ddbox.nboundeddim,
3082                           state_local->box,
3083                           dd,
3084                           &ddbox,
3085                           &comm->cell_x0,
3086                           &comm->cell_x1,
3087                           dd->ncg_home,
3088                           as_rvec_array(state_local->x.data()),
3089                           cell_ns_x0,
3090                           cell_ns_x1);
3091
3092     if (bBoxChanged)
3093     {
3094         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3095     }
3096
3097     if (bSortCG)
3098     {
3099         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
3100
3101         /* Sort the state on charge group position.
3102          * This enables exact restarts from this step.
3103          * It also improves performance by about 15% with larger numbers
3104          * of atoms per node.
3105          */
3106
3107         /* Fill the ns grid with the home cell,
3108          * so we can sort with the indices.
3109          */
3110         set_zones_ncg_home(dd);
3111
3112         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3113
3114         nbnxn_put_on_grid(fr->nbv.get(),
3115                           state_local->box,
3116                           0,
3117                           comm->zones.size[0].bb_x0,
3118                           comm->zones.size[0].bb_x1,
3119                           comm->updateGroupsCog.get(),
3120                           { 0, dd->ncg_home },
3121                           comm->zones.dens_zone0,
3122                           fr->cginfo,
3123                           state_local->x,
3124                           ncg_moved,
3125                           bRedist ? comm->movedBuffer.data() : nullptr);
3126
3127         if (debug)
3128         {
3129             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->ncg_home);
3130         }
3131         dd_sort_state(dd, fr, state_local);
3132
3133         /* After sorting and compacting we set the correct size */
3134         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3135
3136         /* Rebuild all the indices */
3137         dd->ga2la->clear();
3138         ncgindex_set = 0;
3139
3140         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
3141     }
3142     else
3143     {
3144         /* With the group scheme the sorting array is part of the DD state,
3145          * but it just got out of sync, so mark as invalid by emptying it.
3146          */
3147         if (inputrec.cutoff_scheme == CutoffScheme::Group)
3148         {
3149             comm->sort->sorted.clear();
3150         }
3151     }
3152
3153     if (comm->systemInfo.useUpdateGroups)
3154     {
3155         /* The update groups cog's are invalid after sorting
3156          * and need to be cleared before the next partitioning anyhow.
3157          */
3158         comm->updateGroupsCog->clear();
3159     }
3160
3161     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
3162
3163     /* Set the induces for the home atoms */
3164     set_zones_ncg_home(dd);
3165     make_dd_indices(dd, ncgindex_set);
3166
3167     /* Setup up the communication and communicate the coordinates */
3168     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3169
3170     /* Set the indices for the halo atoms */
3171     make_dd_indices(dd, dd->ncg_home);
3172
3173     /* Set the charge group boundaries for neighbor searching */
3174     set_cg_boundaries(&comm->zones);
3175
3176     /* When bSortCG=true, we have already set the size for zone 0 */
3177     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3178
3179     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
3180
3181     /*
3182        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3183                  -1,state_local->x.rvec_array(),state_local->box);
3184      */
3185
3186     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
3187
3188     /* Extract a local topology from the global topology */
3189     IVec numPulses;
3190     for (int i = 0; i < dd->ndim; i++)
3191     {
3192         numPulses[dd->dim[i]] = comm->cd[i].numPulses();
3193     }
3194     dd_make_local_top(dd,
3195                       &comm->zones,
3196                       dd->unitCellInfo.npbcdim,
3197                       state_local->box,
3198                       comm->cellsize_min,
3199                       numPulses,
3200                       fr,
3201                       state_local->x,
3202                       top_global,
3203                       top_local);
3204
3205     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
3206
3207     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
3208
3209     /* Set up the special atom communication */
3210     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3211     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3212          i < static_cast<int>(DDAtomRanges::Type::Number);
3213          i++)
3214     {
3215         auto range = static_cast<DDAtomRanges::Type>(i);
3216         switch (range)
3217         {
3218             case DDAtomRanges::Type::Vsites:
3219                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3220                 {
3221                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3222                 }
3223                 break;
3224             case DDAtomRanges::Type::Constraints:
3225                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3226                 {
3227                     /* Only for inter-cg constraints we need special code */
3228                     n = dd_make_local_constraints(dd,
3229                                                   n,
3230                                                   top_global,
3231                                                   fr->cginfo.data(),
3232                                                   constr,
3233                                                   inputrec.nProjOrder,
3234                                                   top_local->idef.il);
3235                 }
3236                 break;
3237             default: gmx_incons("Unknown special atom type setup");
3238         }
3239         comm->atomRanges.setEnd(range, n);
3240     }
3241
3242     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
3243
3244     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
3245
3246     /* Make space for the extra coordinates for virtual site
3247      * or constraint communication.
3248      */
3249     state_local->natoms = comm->atomRanges.numAtomsTotal();
3250
3251     state_change_natoms(state_local, state_local->natoms);
3252
3253     int nat_f_novirsum;
3254     if (vsite && vsite->numInterUpdategroupVirtualSites())
3255     {
3256         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3257     }
3258     else
3259     {
3260         if (EEL_FULL(inputrec.coulombtype) && dd->haveExclusions)
3261         {
3262             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3263         }
3264         else
3265         {
3266             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3267         }
3268     }
3269
3270     /* Set the number of atoms required for the force calculation.
3271      * Forces need to be constrained when doing energy
3272      * minimization. For simple simulations we could avoid some
3273      * allocation, zeroing and copying, but this is probably not worth
3274      * the complications and checking.
3275      */
3276     forcerec_set_ranges(fr,
3277                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3278                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3279                         nat_f_novirsum);
3280
3281     /* Update atom data for mdatoms and several algorithms */
3282     mdAlgorithmsSetupAtomData(cr, inputrec, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3283
3284     auto* mdatoms = mdAtoms->mdatoms();
3285     if (!thisRankHasDuty(cr, DUTY_PME))
3286     {
3287         /* Send the charges and/or c6/sigmas to our PME only node */
3288         gmx_pme_send_parameters(cr,
3289                                 *fr->ic,
3290                                 mdatoms->nChargePerturbed != 0,
3291                                 mdatoms->nTypePerturbed != 0,
3292                                 gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
3293                                 mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
3294                                                  : gmx::ArrayRef<real>{},
3295                                 gmx::arrayRefFromArray(mdatoms->sqrt_c6A, mdatoms->nr),
3296                                 mdatoms->sqrt_c6B ? gmx::arrayRefFromArray(mdatoms->sqrt_c6B, mdatoms->nr)
3297                                                   : gmx::ArrayRef<real>{},
3298                                 gmx::arrayRefFromArray(mdatoms->sigmaA, mdatoms->nr),
3299                                 mdatoms->sigmaB ? gmx::arrayRefFromArray(mdatoms->sigmaB, mdatoms->nr)
3300                                                 : gmx::ArrayRef<real>{},
3301                                 dd_pme_maxshift_x(*dd),
3302                                 dd_pme_maxshift_y(*dd));
3303     }
3304
3305     if (dd->atomSets != nullptr)
3306     {
3307         /* Update the local atom sets */
3308         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3309     }
3310
3311     // The pull group construction can need the atom sets updated above
3312     if (inputrec.bPull)
3313     {
3314         /* Update the local pull groups */
3315         dd_make_local_pull_groups(cr, pull_work);
3316     }
3317
3318     /* Update the local atoms to be communicated via the IMD protocol if bIMD is true. */
3319     imdSession->dd_make_local_IMD_atoms(dd);
3320
3321     add_dd_statistics(dd);
3322
3323     /* Make sure we only count the cycles for this DD partitioning */
3324     clear_dd_cycle_counts(dd);
3325
3326     /* Because the order of the atoms might have changed since
3327      * the last vsite construction, we need to communicate the constructing
3328      * atom coordinates again (for spreading the forces this MD step).
3329      */
3330     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3331
3332     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
3333
3334     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3335     {
3336         dd_move_x(dd, state_local->box, state_local->x, nullptr);
3337         write_dd_pdb("dd_dump",
3338                      step,
3339                      "dump",
3340                      top_global,
3341                      cr,
3342                      -1,
3343                      state_local->x.rvec_array(),
3344                      state_local->box);
3345     }
3346
3347     /* Store the partitioning step */
3348     comm->partition_step = step;
3349
3350     /* Increase the DD partitioning counter */
3351     dd->ddp_count++;
3352     /* The state currently matches this DD partitioning count, store it */
3353     state_local->ddp_count = dd->ddp_count;
3354     if (bMasterState)
3355     {
3356         /* The DD master node knows the complete cg distribution,
3357          * store the count so we can possibly skip the cg info communication.
3358          */
3359         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3360     }
3361
3362     if (comm->ddSettings.DD_debug > 0)
3363     {
3364         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3365         check_index_consistency(dd, top_global.natoms, "after partitioning");
3366     }
3367
3368     wallcycle_stop(wcycle, WallCycleCounter::Domdec);
3369 }
3370
3371 } // namespace gmx