src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "partition.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/domdec/collect.h"
  56 #include "gromacs/domdec/dlb.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/ga2la.h"
  61 #include "gromacs/domdec/localatomsetmanager.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/ewald/pme_pp.h"
  64 #include "gromacs/gmxlib/network.h"
  65 #include "gromacs/gmxlib/nrnb.h"
  66 #include "gromacs/imd/imd.h"
  67 #include "gromacs/math/functions.h"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/mdlib/forcerec.h"
  70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  71 #include "gromacs/mdlib/mdatoms.h"
  72 #include "gromacs/mdlib/nsgrid.h"
  73 #include "gromacs/mdlib/vsite.h"
  74 #include "gromacs/mdtypes/commrec.h"
  75 #include "gromacs/mdtypes/forcerec.h"
  76 #include "gromacs/mdtypes/inputrec.h"
  77 #include "gromacs/mdtypes/md_enums.h"
  78 #include "gromacs/mdtypes/mdatom.h"
  79 #include "gromacs/mdtypes/nblist.h"
  80 #include "gromacs/mdtypes/state.h"
  81 #include "gromacs/nbnxm/nbnxm.h"
  82 #include "gromacs/pulling/pull.h"
  83 #include "gromacs/timing/wallcycle.h"
  84 #include "gromacs/topology/mtop_util.h"
  85 #include "gromacs/topology/topology.h"
  86 #include "gromacs/utility/cstringutil.h"
  87 #include "gromacs/utility/fatalerror.h"
  88 #include "gromacs/utility/logger.h"
  89 #include "gromacs/utility/real.h"
  90 #include "gromacs/utility/smalloc.h"
  91 #include "gromacs/utility/strconvert.h"
  92 #include "gromacs/utility/stringstream.h"
  93 #include "gromacs/utility/stringutil.h"
  94 #include "gromacs/utility/textwriter.h"
  95
  96 #include "box.h"
  97 #include "cellsizes.h"
  98 #include "distribute.h"
  99 #include "domdec_constraints.h"
 100 #include "domdec_internal.h"
 101 #include "domdec_vsite.h"
 102 #include "dump.h"
 103 #include "redistribute.h"
 104 #include "utility.h"
 105
 106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 107  *
 108  * There is a bit of overhead with DLB and it's difficult to achieve
 109  * a load imbalance of less than 2% with DLB.
 110  */
 111 #define DD_PERF_LOSS_DLB_ON 0.02
 112
 113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 114 #define DD_PERF_LOSS_WARN 0.05
 115
 116
 117 //! Debug helper printing a DD zone
 118 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
 119 {
 120     fprintf(fp,
 121             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
 122             "%6.3f\n",
 123             d, i, j, zone->min0, zone->max1, zone->mch0, zone->mch0, zone->p1_0, zone->p1_1);
 124 }
 125
 126 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 127  * takes the extremes over all home and remote zones in the halo
 128  * and returns the results in cell_ns_x0 and cell_ns_x1.
 129  * Note: only used with the group cut-off scheme.
 130  */
 131 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
 132 {
 133     constexpr int      c_ddZoneCommMaxNumZones = 5;
 134     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 135     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 136     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 137     gmx_domdec_comm_t* comm = dd->comm;
 138
 139     rvec extr_s[2];
 140     rvec extr_r[2];
 141     for (int d = 1; d < dd->ndim; d++)
 142     {
 143         int           dim = dd->dim[d];
 144         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 145
 146         /* Copy the base sizes of the home zone */
 147         zp.min0    = cell_ns_x0[dim];
 148         zp.max1    = cell_ns_x1[dim];
 149         zp.min1    = cell_ns_x1[dim];
 150         zp.mch0    = cell_ns_x0[dim];
 151         zp.mch1    = cell_ns_x1[dim];
 152         zp.p1_0    = cell_ns_x0[dim];
 153         zp.p1_1    = cell_ns_x1[dim];
 154         zp.dataSet = 1;
 155     }
 156
 157     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 158
 159     /* Loop backward over the dimensions and aggregate the extremes
 160      * of the cell sizes.
 161      */
 162     for (int d = dd->ndim - 2; d >= 0; d--)
 163     {
 164         const int  dim      = dd->dim[d];
 165         const bool applyPbc = (dim < ddbox->npbcdim);
 166
 167         /* Use an rvec to store two reals */
 168         extr_s[d][0] = cellsizes[d + 1].fracLower;
 169         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 170         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 171
 172         int pos = 0;
 173         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 174         /* Store the extremes in the backward sending buffer,
 175          * so they get updated separately from the forward communication.
 176          */
 177         for (int d1 = d; d1 < dd->ndim - 1; d1++)
 178         {
 179             gmx_ddzone_t& buf = buf_s[pos];
 180
 181             /* We invert the order to be able to use the same loop for buf_e */
 182             buf.min0 = extr_s[d1][1];
 183             buf.max1 = extr_s[d1][0];
 184             buf.min1 = extr_s[d1][2];
 185             buf.mch0 = 0;
 186             buf.mch1 = 0;
 187             /* Store the cell corner of the dimension we communicate along */
 188             buf.p1_0    = comm->cell_x0[dim];
 189             buf.p1_1    = 0;
 190             buf.dataSet = 1;
 191             pos++;
 192         }
 193
 194         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 195         pos++;
 196
 197         if (dd->ndim == 3 && d == 0)
 198         {
 199             buf_s[pos] = comm->zone_d2[0][1];
 200             pos++;
 201             buf_s[pos] = comm->zone_d1[0];
 202             pos++;
 203         }
 204
 205         /* We only need to communicate the extremes
 206          * in the forward direction
 207          */
 208         int numPulses = comm->cd[d].numPulses();
 209         int numPulsesMin;
 210         if (applyPbc)
 211         {
 212             /* Take the minimum to avoid double communication */
 213             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
 214         }
 215         else
 216         {
 217             /* Without PBC we should really not communicate over
 218              * the boundaries, but implementing that complicates
 219              * the communication setup and therefore we simply
 220              * do all communication, but ignore some data.
 221              */
 222             numPulsesMin = numPulses;
 223         }
 224         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 225         {
 226             /* Communicate the extremes forward */
 227             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 228
 229             int numElements = dd->ndim - d - 1;
 230             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
 231
 232             if (receiveValidData)
 233             {
 234                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 235                 {
 236                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 237                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 238                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 239                 }
 240             }
 241         }
 242
 243         const int numElementsInBuffer = pos;
 244         for (int pulse = 0; pulse < numPulses; pulse++)
 245         {
 246             /* Communicate all the zone information backward */
 247             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
 248
 249             static_assert(
 250                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
 251                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 252
 253             int numReals = numElementsInBuffer * c_ddzoneNumReals;
 254             ddSendrecv(dd, d, dddirBackward, gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 255                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 256
 257             rvec dh = { 0 };
 258             if (pulse > 0)
 259             {
 260                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 261                 {
 262                     /* Determine the decrease of maximum required
 263                      * communication height along d1 due to the distance along d,
 264                      * this avoids a lot of useless atom communication.
 265                      */
 266                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 267
 268                     real c;
 269                     if (ddbox->tric_dir[dim])
 270                     {
 271                         /* c is the off-diagonal coupling between the cell planes
 272                          * along directions d and d1.
 273                          */
 274                         c = ddbox->v[dim][dd->dim[d1]][dim];
 275                     }
 276                     else
 277                     {
 278                         c = 0;
 279                     }
 280                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
 281                     if (det > 0)
 282                     {
 283                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
 284                     }
 285                     else
 286                     {
 287                         /* A negative value signals out of range */
 288                         dh[d1] = -1;
 289                     }
 290                 }
 291             }
 292
 293             /* Accumulate the extremes over all pulses */
 294             for (int i = 0; i < numElementsInBuffer; i++)
 295             {
 296                 if (pulse == 0)
 297                 {
 298                     buf_e[i] = buf_r[i];
 299                 }
 300                 else
 301                 {
 302                     if (receiveValidData)
 303                     {
 304                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 305                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 306                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 307                     }
 308
 309                     int d1;
 310                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 311                     {
 312                         d1 = 1;
 313                     }
 314                     else
 315                     {
 316                         d1 = d + 1;
 317                     }
 318                     if (receiveValidData && dh[d1] >= 0)
 319                     {
 320                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
 321                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
 322                     }
 323                 }
 324                 /* Copy the received buffer to the send buffer,
 325                  * to pass the data through with the next pulse.
 326                  */
 327                 buf_s[i] = buf_r[i];
 328             }
 329             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
 330                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
 331             {
 332                 /* Store the extremes */
 333                 int pos = 0;
 334
 335                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 336                 {
 337                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 338                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 339                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 340                     pos++;
 341                 }
 342
 343                 if (d == 1 || (d == 0 && dd->ndim == 3))
 344                 {
 345                     for (int i = d; i < 2; i++)
 346                     {
 347                         comm->zone_d2[1 - d][i] = buf_e[pos];
 348                         pos++;
 349                     }
 350                 }
 351                 if (d == 0)
 352                 {
 353                     comm->zone_d1[1] = buf_e[pos];
 354                     pos++;
 355                 }
 356             }
 357             else
 358             {
 359                 if (d == 1 || (d == 0 && dd->ndim == 3))
 360                 {
 361                     for (int i = d; i < 2; i++)
 362                     {
 363                         comm->zone_d2[1 - d][i].dataSet = 0;
 364                     }
 365                 }
 366                 if (d == 0)
 367                 {
 368                     comm->zone_d1[1].dataSet = 0;
 369                 }
 370             }
 371         }
 372     }
 373
 374     if (dd->ndim >= 2)
 375     {
 376         int dim = dd->dim[1];
 377         for (int i = 0; i < 2; i++)
 378         {
 379             if (comm->zone_d1[i].dataSet != 0)
 380             {
 381                 if (debug)
 382                 {
 383                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 384                 }
 385                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 386                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 387             }
 388         }
 389     }
 390     if (dd->ndim >= 3)
 391     {
 392         int dim = dd->dim[2];
 393         for (int i = 0; i < 2; i++)
 394         {
 395             for (int j = 0; j < 2; j++)
 396             {
 397                 if (comm->zone_d2[i][j].dataSet != 0)
 398                 {
 399                     if (debug)
 400                     {
 401                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 402                     }
 403                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 404                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 405                 }
 406             }
 407         }
 408     }
 409     for (int d = 1; d < dd->ndim; d++)
 410     {
 411         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
 412         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
 413         if (debug)
 414         {
 415             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", d, cellsizes[d].fracLowerMax,
 416                     cellsizes[d].fracUpperMin);
 417         }
 418     }
 419 }
 420
 421 //! Sets the charge-group zones to be equal to the home zone.
 422 static void set_zones_ncg_home(gmx_domdec_t* dd)
 423 {
 424     gmx_domdec_zones_t* zones;
 425     int                 i;
 426
 427     zones = &dd->comm->zones;
 428
 429     zones->cg_range[0] = 0;
 430     for (i = 1; i < zones->n + 1; i++)
 431     {
 432         zones->cg_range[i] = dd->ncg_home;
 433     }
 434     /* zone_ncg1[0] should always be equal to ncg_home */
 435     dd->comm->zone_ncg1[0] = dd->ncg_home;
 436 }
 437
 438 //! Restore atom groups for the charge groups.
 439 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
 440 {
 441     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
 442
 443     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
 444
 445     globalAtomGroupIndices.resize(atomGroupsState.size());
 446
 447     /* Copy back the global charge group indices from state
 448      * and rebuild the local charge group to atom index.
 449      */
 450     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 451     {
 452         globalAtomGroupIndices[i] = atomGroupsState[i];
 453     }
 454
 455     dd->ncg_home = atomGroupsState.size();
 456     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 457
 458     set_zones_ncg_home(dd);
 459 }
 460
 461 //! Sets the cginfo structures.
 462 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
 463 {
 464     if (fr != nullptr)
 465     {
 466         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
 467         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
 468
 469         for (int cg = cg0; cg < cg1; cg++)
 470         {
 471             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 472         }
 473     }
 474 }
 475
 476 //! Makes the mappings between global and local atom indices during DD repartioning.
 477 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
 478 {
 479     const int                numZones               = dd->comm->zones.n;
 480     const int*               zone2cg                = dd->comm->zones.cg_range;
 481     const int*               zone_ncg1              = dd->comm->zone_ncg1;
 482     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
 483
 484     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
 485     gmx_ga2la_t&      ga2la             = *dd->ga2la;
 486
 487     if (zone2cg[1] != dd->ncg_home)
 488     {
 489         gmx_incons("dd->ncg_zone is not up to date");
 490     }
 491
 492     /* Make the local to global and global to local atom index */
 493     int a = atomStart;
 494     globalAtomIndices.resize(a);
 495     for (int zone = 0; zone < numZones; zone++)
 496     {
 497         int cg0;
 498         if (zone == 0)
 499         {
 500             cg0 = atomStart;
 501         }
 502         else
 503         {
 504             cg0 = zone2cg[zone];
 505         }
 506         int cg1    = zone2cg[zone + 1];
 507         int cg1_p1 = cg0 + zone_ncg1[zone];
 508
 509         for (int cg = cg0; cg < cg1; cg++)
 510         {
 511             int zone1 = zone;
 512             if (cg >= cg1_p1)
 513             {
 514                 /* Signal that this cg is from more than one pulse away */
 515                 zone1 += numZones;
 516             }
 517             int cg_gl = globalAtomGroupIndices[cg];
 518             globalAtomIndices.push_back(cg_gl);
 519             ga2la.insert(cg_gl, { a, zone1 });
 520             a++;
 521         }
 522     }
 523 }
 524
 525 //! Checks whether global and local atom indices are consistent.
 526 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
 527 {
 528     int nerr = 0;
 529
 530     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 531
 532     if (dd->comm->ddSettings.DD_debug > 1)
 533     {
 534         std::vector<int> have(natoms_sys);
 535         for (int a = 0; a < numAtomsInZones; a++)
 536         {
 537             int globalAtomIndex = dd->globalAtomIndices[a];
 538             if (have[globalAtomIndex] > 0)
 539             {
 540                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n",
 541                         dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a + 1);
 542             }
 543             else
 544             {
 545                 have[globalAtomIndex] = a + 1;
 546             }
 547         }
 548     }
 549
 550     std::vector<int> have(numAtomsInZones);
 551
 552     int ngl = 0;
 553     for (int i = 0; i < natoms_sys; i++)
 554     {
 555         if (const auto entry = dd->ga2la->find(i))
 556         {
 557             const int a = entry->la;
 558             if (a >= numAtomsInZones)
 559             {
 560                 fprintf(stderr,
 561                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
 562                         "nat_tot (%d)\n",
 563                         dd->rank, i + 1, a + 1, numAtomsInZones);
 564                 nerr++;
 565             }
 566             else
 567             {
 568                 have[a] = 1;
 569                 if (dd->globalAtomIndices[a] != i)
 570                 {
 571                     fprintf(stderr,
 572                             "DD rank %d: global atom %d marked as local atom %d, which has global "
 573                             "atom index %d\n",
 574                             dd->rank, i + 1, a + 1, dd->globalAtomIndices[a] + 1);
 575                     nerr++;
 576                 }
 577             }
 578             ngl++;
 579         }
 580     }
 581     if (ngl != numAtomsInZones)
 582     {
 583         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where,
 584                 ngl, numAtomsInZones);
 585     }
 586     for (int a = 0; a < numAtomsInZones; a++)
 587     {
 588         if (have[a] == 0)
 589         {
 590             fprintf(stderr, "DD rank %d, %s: local atom %d, global %d has no global index\n",
 591                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
 592         }
 593     }
 594
 595     if (nerr > 0)
 596     {
 597         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
 598     }
 599 }
 600
 601 //! Clear all DD global state indices
 602 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
 603 {
 604     gmx_ga2la_t& ga2la = *dd->ga2la;
 605
 606     if (!keepLocalAtomIndices)
 607     {
 608         /* Clear the whole list without the overhead of searching */
 609         ga2la.clear();
 610     }
 611     else
 612     {
 613         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 614         for (int i = 0; i < numAtomsInZones; i++)
 615         {
 616             ga2la.erase(dd->globalAtomIndices[i]);
 617         }
 618     }
 619
 620     dd_clear_local_vsite_indices(dd);
 621
 622     if (dd->constraints)
 623     {
 624         dd_clear_local_constraint_indices(dd);
 625     }
 626 }
 627
 628 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, gmx_bool bFatal)
 629 {
 630     gmx_domdec_comm_t* comm    = dd->comm;
 631     bool               invalid = false;
 632
 633     for (int d = 1; d < dd->ndim; d++)
 634     {
 635         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
 636         const int                 dim       = dd->dim[d];
 637         const real                limit     = grid_jump_limit(comm, cutoff, d);
 638         real                      bfac      = ddbox->box_size[dim];
 639         if (ddbox->tric_dir[dim])
 640         {
 641             bfac *= ddbox->skew_fac[dim];
 642         }
 643         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
 644             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
 645         {
 646             invalid = true;
 647
 648             if (bFatal)
 649             {
 650                 char buf[22];
 651
 652                 /* This error should never be triggered under normal
 653                  * circumstances, but you never know ...
 654                  */
 655                 gmx_fatal(FARGS,
 656                           "step %s: The domain decomposition grid has shifted too much in the "
 657                           "%c-direction around cell %d %d %d. This should not have happened. "
 658                           "Running with fewer ranks might avoid this issue.",
 659                           gmx_step_str(step, buf), dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 660             }
 661         }
 662     }
 663
 664     return invalid;
 665 }
 666 //! Return the duration of force calculations on this rank.
 667 static float dd_force_load(gmx_domdec_comm_t* comm)
 668 {
 669     float load;
 670
 671     if (comm->ddSettings.eFlop)
 672     {
 673         load = comm->flop;
 674         if (comm->ddSettings.eFlop > 1)
 675         {
 676             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
 677         }
 678     }
 679     else
 680     {
 681         load = comm->cycl[ddCyclF];
 682         if (comm->cycl_n[ddCyclF] > 1)
 683         {
 684             /* Subtract the maximum of the last n cycle counts
 685              * to get rid of possible high counts due to other sources,
 686              * for instance system activity, that would otherwise
 687              * affect the dynamic load balancing.
 688              */
 689             load -= comm->cycl_max[ddCyclF];
 690         }
 691
 692 #if GMX_MPI
 693         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 694         {
 695             float gpu_wait, gpu_wait_sum;
 696
 697             gpu_wait = comm->cycl[ddCyclWaitGPU];
 698             if (comm->cycl_n[ddCyclF] > 1)
 699             {
 700                 /* We should remove the WaitGPU time of the same MD step
 701                  * as the one with the maximum F time, since the F time
 702                  * and the wait time are not independent.
 703                  * Furthermore, the step for the max F time should be chosen
 704                  * the same on all ranks that share the same GPU.
 705                  * But to keep the code simple, we remove the average instead.
 706                  * The main reason for artificially long times at some steps
 707                  * is spurious CPU activity or MPI time, so we don't expect
 708                  * that changes in the GPU wait time matter a lot here.
 709                  */
 710                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
 711             }
 712             /* Sum the wait times over the ranks that share the same GPU */
 713             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
 714             /* Replace the wait time by the average over the ranks */
 715             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
 716         }
 717 #endif
 718     }
 719
 720     return load;
 721 }
 722
 723 //! Runs cell size checks and communicates the boundaries.
 724 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
 725 {
 726     gmx_domdec_comm_t* comm;
 727     int                dim_ind, dim;
 728
 729     comm = dd->comm;
 730
 731     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 732     {
 733         dim = dd->dim[dim_ind];
 734
 735         /* Without PBC we don't have restrictions on the outer cells */
 736         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
 737             && isDlbOn(comm)
 738             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
 739         {
 740             char buf[22];
 741             gmx_fatal(FARGS,
 742                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
 743                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
 744                       "%d %d %d",
 745                       gmx_step_str(step, buf), dim2char(dim),
 746                       comm->cell_x1[dim] - comm->cell_x0[dim], ddbox->skew_fac[dim],
 747                       dd->comm->cellsize_min[dim], dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 748         }
 749     }
 750
 751     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 752     {
 753         /* Communicate the boundaries and update cell_ns_x0/1 */
 754         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 755         if (isDlbOn(dd->comm) && dd->ndim > 1)
 756         {
 757             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
 758         }
 759     }
 760 }
 761
 762 //! Compute and communicate to determine the load distribution across PP ranks.
 763 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
 764 {
 765     gmx_domdec_comm_t* comm;
 766     domdec_load_t*     load;
 767     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 768     gmx_bool           bSepPME;
 769
 770     if (debug)
 771     {
 772         fprintf(debug, "get_load_distribution start\n");
 773     }
 774
 775     wallcycle_start(wcycle, ewcDDCOMMLOAD);
 776
 777     comm = dd->comm;
 778
 779     bSepPME = (dd->pme_nodeid >= 0);
 780
 781     if (dd->ndim == 0 && bSepPME)
 782     {
 783         /* Without decomposition, but with PME nodes, we need the load */
 784         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 785         comm->load[0].pme = comm->cycl[ddCyclPME];
 786     }
 787
 788     for (int d = dd->ndim - 1; d >= 0; d--)
 789     {
 790         const DDCellsizesWithDlb* cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
 791         const int                 dim       = dd->dim[d];
 792         /* Check if we participate in the communication in this dimension */
 793         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
 794         {
 795             load = &comm->load[d];
 796             if (isDlbOn(dd->comm))
 797             {
 798                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
 799             }
 800             int pos = 0;
 801             if (d == dd->ndim - 1)
 802             {
 803                 sbuf[pos++] = dd_force_load(comm);
 804                 sbuf[pos++] = sbuf[0];
 805                 if (isDlbOn(dd->comm))
 806                 {
 807                     sbuf[pos++] = sbuf[0];
 808                     sbuf[pos++] = cell_frac;
 809                     if (d > 0)
 810                     {
 811                         sbuf[pos++] = cellsizes->fracLowerMax;
 812                         sbuf[pos++] = cellsizes->fracUpperMin;
 813                     }
 814                 }
 815                 if (bSepPME)
 816                 {
 817                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 818                     sbuf[pos++] = comm->cycl[ddCyclPME];
 819                 }
 820             }
 821             else
 822             {
 823                 sbuf[pos++] = comm->load[d + 1].sum;
 824                 sbuf[pos++] = comm->load[d + 1].max;
 825                 if (isDlbOn(dd->comm))
 826                 {
 827                     sbuf[pos++] = comm->load[d + 1].sum_m;
 828                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
 829                     sbuf[pos++] = comm->load[d + 1].flags;
 830                     if (d > 0)
 831                     {
 832                         sbuf[pos++] = cellsizes->fracLowerMax;
 833                         sbuf[pos++] = cellsizes->fracUpperMin;
 834                     }
 835                 }
 836                 if (bSepPME)
 837                 {
 838                     sbuf[pos++] = comm->load[d + 1].mdf;
 839                     sbuf[pos++] = comm->load[d + 1].pme;
 840                 }
 841             }
 842             load->nload = pos;
 843             /* Communicate a row in DD direction d.
 844              * The communicators are setup such that the root always has rank 0.
 845              */
 846 #if GMX_MPI
 847             MPI_Gather(sbuf, load->nload * sizeof(float), MPI_BYTE, load->load,
 848                        load->nload * sizeof(float), MPI_BYTE, 0, comm->mpi_comm_load[d]);
 849 #endif
 850             if (dd->ci[dim] == dd->master_ci[dim])
 851             {
 852                 /* We are the master along this row, process this row */
 853                 RowMaster* rowMaster = nullptr;
 854
 855                 if (isDlbOn(comm))
 856                 {
 857                     rowMaster = cellsizes->rowMaster.get();
 858                 }
 859                 load->sum      = 0;
 860                 load->max      = 0;
 861                 load->sum_m    = 0;
 862                 load->cvol_min = 1;
 863                 load->flags    = 0;
 864                 load->mdf      = 0;
 865                 load->pme      = 0;
 866                 int pos        = 0;
 867                 for (int i = 0; i < dd->numCells[dim]; i++)
 868                 {
 869                     load->sum += load->load[pos++];
 870                     load->max = std::max(load->max, load->load[pos]);
 871                     pos++;
 872                     if (isDlbOn(dd->comm))
 873                     {
 874                         if (rowMaster->dlbIsLimited)
 875                         {
 876                             /* This direction could not be load balanced properly,
 877                              * therefore we need to use the maximum iso the average load.
 878                              */
 879                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 880                         }
 881                         else
 882                         {
 883                             load->sum_m += load->load[pos];
 884                         }
 885                         pos++;
 886                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 887                         pos++;
 888                         if (d < dd->ndim - 1)
 889                         {
 890                             load->flags = gmx::roundToInt(load->load[pos++]);
 891                         }
 892                         if (d > 0)
 893                         {
 894                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 895                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 896                         }
 897                     }
 898                     if (bSepPME)
 899                     {
 900                         load->mdf = std::max(load->mdf, load->load[pos]);
 901                         pos++;
 902                         load->pme = std::max(load->pme, load->load[pos]);
 903                         pos++;
 904                     }
 905                 }
 906                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 907                 {
 908                     load->sum_m *= dd->numCells[dim];
 909                     load->flags |= (1 << d);
 910                 }
 911             }
 912         }
 913     }
 914
 915     if (DDMASTER(dd))
 916     {
 917         comm->nload += dd_load_count(comm);
 918         comm->load_step += comm->cycl[ddCyclStep];
 919         comm->load_sum += comm->load[0].sum;
 920         comm->load_max += comm->load[0].max;
 921         if (isDlbOn(comm))
 922         {
 923             for (int d = 0; d < dd->ndim; d++)
 924             {
 925                 if (comm->load[0].flags & (1 << d))
 926                 {
 927                     comm->load_lim[d]++;
 928                 }
 929             }
 930         }
 931         if (bSepPME)
 932         {
 933             comm->load_mdf += comm->load[0].mdf;
 934             comm->load_pme += comm->load[0].pme;
 935         }
 936     }
 937
 938     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 939
 940     if (debug)
 941     {
 942         fprintf(debug, "get_load_distribution finished\n");
 943     }
 944 }
 945
 946 /*! \brief Return the relative performance loss on the total run time
 947  * due to the force calculation load imbalance. */
 948 static float dd_force_load_fraction(gmx_domdec_t* dd)
 949 {
 950     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 951     {
 952         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
 953     }
 954     else
 955     {
 956         return 0;
 957     }
 958 }
 959
 960 /*! \brief Return the relative performance loss on the total run time
 961  * due to the force calculation load imbalance. */
 962 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
 963 {
 964     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 965     {
 966         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
 967     }
 968     else
 969     {
 970         return 0;
 971     }
 972 }
 973
 974 //! Print load-balance report e.g. at the end of a run.
 975 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
 976 {
 977     gmx_domdec_comm_t* comm = dd->comm;
 978
 979     /* Only the master rank prints loads and only if we measured loads */
 980     if (!DDMASTER(dd) || comm->nload == 0)
 981     {
 982         return;
 983     }
 984
 985     char buf[STRLEN];
 986     int  numPpRanks  = dd->nnodes;
 987     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
 988     int  numRanks    = numPpRanks + numPmeRanks;
 989     float lossFraction = 0;
 990
 991     /* Print the average load imbalance and performance loss */
 992     if (dd->nnodes > 1 && comm->load_sum > 0)
 993     {
 994         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
 995         lossFraction    = dd_force_imb_perf_loss(dd);
 996
 997         std::string msg = "\nDynamic load balancing report:\n";
 998         std::string dlbStateStr;
 999
1000         switch (dd->comm->dlbState)
1001         {
1002             case DlbState::offUser:
1003                 dlbStateStr = "DLB was off during the run per user request.";
1004                 break;
1005             case DlbState::offForever:
1006                 /* Currectly this can happen due to performance loss observed, cell size
1007                  * limitations or incompatibility with other settings observed during
1008                  * determineInitialDlbState(). */
1009                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1010                 break;
1011             case DlbState::offCanTurnOn:
1012                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1013                 break;
1014             case DlbState::offTemporarilyLocked:
1015                 dlbStateStr =
1016                         "DLB was locked at the end of the run due to unfinished PP-PME "
1017                         "balancing.";
1018                 break;
1019             case DlbState::onCanTurnOff:
1020                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1021                 break;
1022             case DlbState::onUser:
1023                 dlbStateStr = "DLB was permanently on during the run per user request.";
1024                 break;
1025             default: GMX_ASSERT(false, "Undocumented DLB state");
1026         }
1027
1028         msg += " " + dlbStateStr + "\n";
1029         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1030         msg += gmx::formatString(
1031                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1032                 "this.\n",
1033                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1034         msg += gmx::formatString(
1035                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1036                 lossFraction * 100);
1037         fprintf(fplog, "%s", msg.c_str());
1038         fprintf(stderr, "\n%s", msg.c_str());
1039     }
1040
1041     /* Print during what percentage of steps the  load balancing was limited */
1042     bool dlbWasLimited = false;
1043     if (isDlbOn(comm))
1044     {
1045         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1046         for (int d = 0; d < dd->ndim; d++)
1047         {
1048             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1049             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1050             if (limitPercentage >= 50)
1051             {
1052                 dlbWasLimited = true;
1053             }
1054         }
1055         sprintf(buf + strlen(buf), "\n");
1056         fprintf(fplog, "%s", buf);
1057         fprintf(stderr, "%s", buf);
1058     }
1059
1060     /* Print the performance loss due to separate PME - PP rank imbalance */
1061     float lossFractionPme = 0;
1062     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1063     {
1064         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1065         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1066         if (lossFractionPme <= 0)
1067         {
1068             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1069         }
1070         else
1071         {
1072             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1073         }
1074         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1075         fprintf(fplog, "%s", buf);
1076         fprintf(stderr, "%s", buf);
1077         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1078                 std::fabs(lossFractionPme) * 100);
1079         fprintf(fplog, "%s", buf);
1080         fprintf(stderr, "%s", buf);
1081     }
1082     fprintf(fplog, "\n");
1083     fprintf(stderr, "\n");
1084
1085     if (lossFraction >= DD_PERF_LOSS_WARN)
1086     {
1087         std::string message = gmx::formatString(
1088                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1089                 "      in the domain decomposition.\n",
1090                 lossFraction * 100);
1091
1092         bool hadSuggestion = false;
1093         if (!isDlbOn(comm))
1094         {
1095             message += "      You might want to use dynamic load balancing (option -dlb.)\n";
1096             hadSuggestion = true;
1097         }
1098         else if (dlbWasLimited)
1099         {
1100             message +=
1101                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1102                     "and/or -dds).\n";
1103             hadSuggestion = true;
1104         }
1105         message += gmx::formatString(
1106                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1107                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1108                 "      considerable inhomogeneity in the simulated system.",
1109                 hadSuggestion ? "also " : "");
1110
1111
1112         fprintf(fplog, "%s\n", message.c_str());
1113         fprintf(stderr, "%s\n", message.c_str());
1114     }
1115     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1116     {
1117         sprintf(buf,
1118                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1119                 "      had %s work to do than the PP ranks.\n"
1120                 "      You might want to %s the number of PME ranks\n"
1121                 "      or %s the cut-off and the grid spacing.\n",
1122                 std::fabs(lossFractionPme * 100), (lossFractionPme < 0) ? "less" : "more",
1123                 (lossFractionPme < 0) ? "decrease" : "increase",
1124                 (lossFractionPme < 0) ? "decrease" : "increase");
1125         fprintf(fplog, "%s\n", buf);
1126         fprintf(stderr, "%s\n", buf);
1127     }
1128 }
1129
1130 //! Return the minimum communication volume.
1131 static float dd_vol_min(gmx_domdec_t* dd)
1132 {
1133     return dd->comm->load[0].cvol_min * dd->nnodes;
1134 }
1135
1136 //! Return the DD load flags.
1137 static int dd_load_flags(gmx_domdec_t* dd)
1138 {
1139     return dd->comm->load[0].flags;
1140 }
1141
1142 //! Return the reported load imbalance in force calculations.
1143 static float dd_f_imbal(gmx_domdec_t* dd)
1144 {
1145     if (dd->comm->load[0].sum > 0)
1146     {
1147         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1148     }
1149     else
1150     {
1151         /* Something is wrong in the cycle counting, report no load imbalance */
1152         return 0.0F;
1153     }
1154 }
1155
1156 //! Returns DD load balance report.
1157 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1158 {
1159     gmx::StringOutputStream stream;
1160     gmx::TextWriter         log(&stream);
1161
1162     int flags = dd_load_flags(dd);
1163     if (flags)
1164     {
1165         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1166         for (int d = 0; d < dd->ndim; d++)
1167         {
1168             if (flags & (1 << d))
1169             {
1170                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1171             }
1172         }
1173         log.ensureLineBreak();
1174     }
1175     log.writeString("DD  step " + gmx::toString(step));
1176     if (isDlbOn(dd->comm))
1177     {
1178         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1179     }
1180     if (dd->nnodes > 1)
1181     {
1182         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1183     }
1184     if (dd->comm->cycl_n[ddCyclPME])
1185     {
1186         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1187     }
1188     log.ensureLineBreak();
1189     return stream.toString();
1190 }
1191
1192 //! Prints DD load balance report in mdrun verbose mode.
1193 static void dd_print_load_verbose(gmx_domdec_t* dd)
1194 {
1195     if (isDlbOn(dd->comm))
1196     {
1197         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1198     }
1199     if (dd->nnodes > 1)
1200     {
1201         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1202     }
1203     if (dd->comm->cycl_n[ddCyclPME])
1204     {
1205         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1206     }
1207 }
1208
1209 //! Turns on dynamic load balancing if possible and needed.
1210 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1211 {
1212     gmx_domdec_comm_t* comm = dd->comm;
1213
1214     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1215     for (int d = 1; d < dd->ndim; d++)
1216     {
1217         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1218     }
1219
1220     /* Turn off DLB if we're too close to the cell size limit. */
1221     if (cellsize_min < comm->cellsize_limit * 1.05)
1222     {
1223         GMX_LOG(mdlog.info)
1224                 .appendTextFormatted(
1225                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1226                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1227                         "Will no longer try dynamic load balancing.",
1228                         gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1229
1230         comm->dlbState = DlbState::offForever;
1231         return;
1232     }
1233
1234     GMX_LOG(mdlog.info)
1235             .appendTextFormatted(
1236                     "step %s Turning on dynamic load balancing, because the performance loss due "
1237                     "to load imbalance is %.1f %%.",
1238                     gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1239     comm->dlbState = DlbState::onCanTurnOff;
1240
1241     /* Store the non-DLB performance, so we can check if DLB actually
1242      * improves performance.
1243      */
1244     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1245                        "When we turned on DLB, we should have measured cycles");
1246     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1247
1248     set_dlb_limits(dd);
1249
1250     /* We can set the required cell size info here,
1251      * so we do not need to communicate this.
1252      * The grid is completely uniform.
1253      */
1254     for (int d = 0; d < dd->ndim; d++)
1255     {
1256         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1257
1258         if (rowMaster)
1259         {
1260             comm->load[d].sum_m = comm->load[d].sum;
1261
1262             int nc = dd->numCells[dd->dim[d]];
1263             for (int i = 0; i < nc; i++)
1264             {
1265                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1266                 if (d > 0)
1267                 {
1268                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1269                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1270                 }
1271             }
1272             rowMaster->cellFrac[nc] = 1.0;
1273         }
1274     }
1275 }
1276
1277 //! Turns off dynamic load balancing (but leave it able to turn back on).
1278 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1279 {
1280     GMX_LOG(mdlog.info)
1281             .appendText(
1282                     "step " + gmx::toString(step)
1283                     + " Turning off dynamic load balancing, because it is degrading performance.");
1284     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1285     dd->comm->haveTurnedOffDlb             = true;
1286     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1287 }
1288
1289 //! Turns off dynamic load balancing permanently.
1290 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1291 {
1292     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1293                        "Can only turn off DLB forever when it was in the can-turn-on state");
1294     GMX_LOG(mdlog.info)
1295             .appendText(
1296                     "step " + gmx::toString(step)
1297                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1298     dd->comm->dlbState = DlbState::offForever;
1299 }
1300
1301 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1302 {
1303     gmx_domdec_comm_t* comm;
1304
1305     comm = cr->dd->comm;
1306
1307     /* Turn on the DLB limiting (might have been on already) */
1308     comm->bPMELoadBalDLBLimits = TRUE;
1309
1310     /* Change the cut-off limit */
1311     comm->PMELoadBal_max_cutoff = cutoff;
1312
1313     if (debug)
1314     {
1315         fprintf(debug,
1316                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1317                 "continue to fit\n",
1318                 comm->PMELoadBal_max_cutoff);
1319     }
1320 }
1321
1322 //! Merge atom buffers.
1323 static void merge_cg_buffers(int                            ncell,
1324                              gmx_domdec_comm_dim_t*         cd,
1325                              int                            pulse,
1326                              int*                           ncg_cell,
1327                              gmx::ArrayRef<int>             index_gl,
1328                              const int*                     recv_i,
1329                              gmx::ArrayRef<gmx::RVec>       x,
1330                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1331                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1332                              gmx::ArrayRef<int>             cginfo)
1333 {
1334     gmx_domdec_ind_t *ind, *ind_p;
1335     int               p, cell, c, cg, cg0, cg1, cg_gl;
1336     int               shift;
1337
1338     ind = &cd->ind[pulse];
1339
1340     /* First correct the already stored data */
1341     shift = ind->nrecv[ncell];
1342     for (cell = ncell - 1; cell >= 0; cell--)
1343     {
1344         shift -= ind->nrecv[cell];
1345         if (shift > 0)
1346         {
1347             /* Move the cg's present from previous grid pulses */
1348             cg0 = ncg_cell[ncell + cell];
1349             cg1 = ncg_cell[ncell + cell + 1];
1350             for (cg = cg1 - 1; cg >= cg0; cg--)
1351             {
1352                 index_gl[cg + shift] = index_gl[cg];
1353                 x[cg + shift]        = x[cg];
1354                 cginfo[cg + shift]   = cginfo[cg];
1355             }
1356             /* Correct the already stored send indices for the shift */
1357             for (p = 1; p <= pulse; p++)
1358             {
1359                 ind_p = &cd->ind[p];
1360                 cg0   = 0;
1361                 for (c = 0; c < cell; c++)
1362                 {
1363                     cg0 += ind_p->nsend[c];
1364                 }
1365                 cg1 = cg0 + ind_p->nsend[cell];
1366                 for (cg = cg0; cg < cg1; cg++)
1367                 {
1368                     ind_p->index[cg] += shift;
1369                 }
1370             }
1371         }
1372     }
1373
1374     /* Merge in the communicated buffers */
1375     shift = 0;
1376     cg0   = 0;
1377     for (cell = 0; cell < ncell; cell++)
1378     {
1379         cg1 = ncg_cell[ncell + cell + 1] + shift;
1380         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1381         {
1382             /* Copy this atom from the buffer */
1383             index_gl[cg1] = recv_i[cg0];
1384             x[cg1]        = recv_vr[cg0];
1385             /* Copy information */
1386             cg_gl       = index_gl[cg1];
1387             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1388             cg0++;
1389             cg1++;
1390         }
1391         shift += ind->nrecv[cell];
1392         ncg_cell[ncell + cell + 1] = cg1;
1393     }
1394 }
1395
1396 //! Makes a range partitioning for the atom groups wthin a cell
1397 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1398 {
1399     /* Store the atom block boundaries for easy copying of communication buffers
1400      */
1401     int g = atomGroupStart;
1402     for (int zone = 0; zone < nzone; zone++)
1403     {
1404         for (gmx_domdec_ind_t& ind : cd->ind)
1405         {
1406             ind.cell2at0[zone] = g;
1407             g += ind.nrecv[zone];
1408             ind.cell2at1[zone] = g;
1409         }
1410     }
1411 }
1412
1413 //! Returns whether a link is missing.
1414 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1415 {
1416     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1417     {
1418         if (!ga2la.findHome(link.a[i]))
1419         {
1420             return true;
1421         }
1422     }
1423
1424     return false;
1425 }
1426
1427 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1428 typedef struct
1429 {
1430     //! The corners for the non-bonded communication.
1431     real c[DIM][4];
1432     //! Corner for rounding.
1433     real cr0;
1434     //! Corners for rounding.
1435     real cr1[4];
1436     //! Corners for bounded communication.
1437     real bc[DIM];
1438     //! Corner for rounding for bonded communication.
1439     real bcr1;
1440 } dd_corners_t;
1441
1442 //! Determine the corners of the domain(s) we are communicating with.
1443 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1444 {
1445     const gmx_domdec_comm_t*  comm;
1446     const gmx_domdec_zones_t* zones;
1447
1448     comm = dd->comm;
1449
1450     zones = &comm->zones;
1451
1452     /* Keep the compiler happy */
1453     c->cr0  = 0;
1454     c->bcr1 = 0;
1455
1456     /* The first dimension is equal for all cells */
1457     c->c[0][0] = comm->cell_x0[dim0];
1458     if (bDistMB)
1459     {
1460         c->bc[0] = c->c[0][0];
1461     }
1462     if (dd->ndim >= 2)
1463     {
1464         dim1 = dd->dim[1];
1465         /* This cell row is only seen from the first row */
1466         c->c[1][0] = comm->cell_x0[dim1];
1467         /* All rows can see this row */
1468         c->c[1][1] = comm->cell_x0[dim1];
1469         if (isDlbOn(dd->comm))
1470         {
1471             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1472             if (bDistMB)
1473             {
1474                 /* For the multi-body distance we need the maximum */
1475                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1476             }
1477         }
1478         /* Set the upper-right corner for rounding */
1479         c->cr0 = comm->cell_x1[dim0];
1480
1481         if (dd->ndim >= 3)
1482         {
1483             dim2 = dd->dim[2];
1484             for (int j = 0; j < 4; j++)
1485             {
1486                 c->c[2][j] = comm->cell_x0[dim2];
1487             }
1488             if (isDlbOn(dd->comm))
1489             {
1490                 /* Use the maximum of the i-cells that see a j-cell */
1491                 for (const auto& iZone : zones->iZones)
1492                 {
1493                     const int iZoneIndex = iZone.iZoneIndex;
1494                     for (int jZone : iZone.jZoneRange)
1495                     {
1496                         if (jZone >= 4)
1497                         {
1498                             c->c[2][jZone - 4] = std::max(
1499                                     c->c[2][jZone - 4],
1500                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1501                                             .mch0);
1502                         }
1503                     }
1504                 }
1505                 if (bDistMB)
1506                 {
1507                     /* For the multi-body distance we need the maximum */
1508                     c->bc[2] = comm->cell_x0[dim2];
1509                     for (int i = 0; i < 2; i++)
1510                     {
1511                         for (int j = 0; j < 2; j++)
1512                         {
1513                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1514                         }
1515                     }
1516                 }
1517             }
1518
1519             /* Set the upper-right corner for rounding */
1520             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1521              * Only cell (0,0,0) can see cell 7 (1,1,1)
1522              */
1523             c->cr1[0] = comm->cell_x1[dim1];
1524             c->cr1[3] = comm->cell_x1[dim1];
1525             if (isDlbOn(dd->comm))
1526             {
1527                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1528                 if (bDistMB)
1529                 {
1530                     /* For the multi-body distance we need the maximum */
1531                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1532                 }
1533             }
1534         }
1535     }
1536 }
1537
1538 /*! \brief Add the atom groups we need to send in this pulse from this
1539  * zone to \p localAtomGroups and \p work. */
1540 static void get_zone_pulse_cgs(gmx_domdec_t*            dd,
1541                                int                      zonei,
1542                                int                      zone,
1543                                int                      cg0,
1544                                int                      cg1,
1545                                gmx::ArrayRef<const int> globalAtomGroupIndices,
1546                                int                      dim,
1547                                int                      dim_ind,
1548                                int                      dim0,
1549                                int                      dim1,
1550                                int                      dim2,
1551                                real                     r_comm2,
1552                                real                     r_bcomm2,
1553                                matrix                   box,
1554                                bool                     distanceIsTriclinic,
1555                                rvec*                    normal,
1556                                real                     skew_fac2_d,
1557                                real                     skew_fac_01,
1558                                rvec*                    v_d,
1559                                rvec*                    v_0,
1560                                rvec*                    v_1,
1561                                const dd_corners_t*      c,
1562                                const rvec               sf2_round,
1563                                gmx_bool                 bDistBonded,
1564                                gmx_bool                 bBondComm,
1565                                gmx_bool                 bDist2B,
1566                                gmx_bool                 bDistMB,
1567                                rvec*                    cg_cm,
1568                                gmx::ArrayRef<const int> cginfo,
1569                                std::vector<int>*        localAtomGroups,
1570                                dd_comm_setup_work_t*    work)
1571 {
1572     gmx_domdec_comm_t* comm;
1573     gmx_bool           bScrew;
1574     gmx_bool           bDistMB_pulse;
1575     int                cg, i;
1576     real               r2, rb2, r, tric_sh;
1577     rvec               rn, rb;
1578     int                dimd;
1579     int                nsend_z, nat;
1580
1581     comm = dd->comm;
1582
1583     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1584
1585     bDistMB_pulse = (bDistMB && bDistBonded);
1586
1587     /* Unpack the work data */
1588     std::vector<int>&       ibuf = work->atomGroupBuffer;
1589     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1590     nsend_z                      = 0;
1591     nat                          = work->nat;
1592
1593     for (cg = cg0; cg < cg1; cg++)
1594     {
1595         r2  = 0;
1596         rb2 = 0;
1597         if (!distanceIsTriclinic)
1598         {
1599             /* Rectangular direction, easy */
1600             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1601             if (r > 0)
1602             {
1603                 r2 += r * r;
1604             }
1605             if (bDistMB_pulse)
1606             {
1607                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1608                 if (r > 0)
1609                 {
1610                     rb2 += r * r;
1611                 }
1612             }
1613             /* Rounding gives at most a 16% reduction
1614              * in communicated atoms
1615              */
1616             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1617             {
1618                 r = cg_cm[cg][dim0] - c->cr0;
1619                 /* This is the first dimension, so always r >= 0 */
1620                 r2 += r * r;
1621                 if (bDistMB_pulse)
1622                 {
1623                     rb2 += r * r;
1624                 }
1625             }
1626             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1627             {
1628                 r = cg_cm[cg][dim1] - c->cr1[zone];
1629                 if (r > 0)
1630                 {
1631                     r2 += r * r;
1632                 }
1633                 if (bDistMB_pulse)
1634                 {
1635                     r = cg_cm[cg][dim1] - c->bcr1;
1636                     if (r > 0)
1637                     {
1638                         rb2 += r * r;
1639                     }
1640                 }
1641             }
1642         }
1643         else
1644         {
1645             /* Triclinic direction, more complicated */
1646             clear_rvec(rn);
1647             clear_rvec(rb);
1648             /* Rounding, conservative as the skew_fac multiplication
1649              * will slightly underestimate the distance.
1650              */
1651             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1652             {
1653                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1654                 for (i = dim0 + 1; i < DIM; i++)
1655                 {
1656                     rn[dim0] -= cg_cm[cg][i] * v_0[i][dim0];
1657                 }
1658                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1659                 if (bDistMB_pulse)
1660                 {
1661                     rb[dim0] = rn[dim0];
1662                     rb2      = r2;
1663                 }
1664                 /* Take care that the cell planes along dim0 might not
1665                  * be orthogonal to those along dim1 and dim2.
1666                  */
1667                 for (i = 1; i <= dim_ind; i++)
1668                 {
1669                     dimd = dd->dim[i];
1670                     if (normal[dim0][dimd] > 0)
1671                     {
1672                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1673                         if (bDistMB_pulse)
1674                         {
1675                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1676                         }
1677                     }
1678                 }
1679             }
1680             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1681             {
1682                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1683                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1684                 tric_sh = 0;
1685                 for (i = dim1 + 1; i < DIM; i++)
1686                 {
1687                     tric_sh -= cg_cm[cg][i] * v_1[i][dim1];
1688                 }
1689                 rn[dim1] += tric_sh;
1690                 if (rn[dim1] > 0)
1691                 {
1692                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1693                     /* Take care of coupling of the distances
1694                      * to the planes along dim0 and dim1 through dim2.
1695                      */
1696                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1697                     /* Take care that the cell planes along dim1
1698                      * might not be orthogonal to that along dim2.
1699                      */
1700                     if (normal[dim1][dim2] > 0)
1701                     {
1702                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1703                     }
1704                 }
1705                 if (bDistMB_pulse)
1706                 {
1707                     rb[dim1] += cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1708                     if (rb[dim1] > 0)
1709                     {
1710                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1711                         /* Take care of coupling of the distances
1712                          * to the planes along dim0 and dim1 through dim2.
1713                          */
1714                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1715                         /* Take care that the cell planes along dim1
1716                          * might not be orthogonal to that along dim2.
1717                          */
1718                         if (normal[dim1][dim2] > 0)
1719                         {
1720                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1721                         }
1722                     }
1723                 }
1724             }
1725             /* The distance along the communication direction */
1726             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1727             tric_sh = 0;
1728             for (i = dim + 1; i < DIM; i++)
1729             {
1730                 tric_sh -= cg_cm[cg][i] * v_d[i][dim];
1731             }
1732             rn[dim] += tric_sh;
1733             if (rn[dim] > 0)
1734             {
1735                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1736                 /* Take care of coupling of the distances
1737                  * to the planes along dim0 and dim1 through dim2.
1738                  */
1739                 if (dim_ind == 1 && zonei == 1)
1740                 {
1741                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1742                 }
1743             }
1744             if (bDistMB_pulse)
1745             {
1746                 clear_rvec(rb);
1747                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1748                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1749                 if (rb[dim] > 0)
1750                 {
1751                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1752                     /* Take care of coupling of the distances
1753                      * to the planes along dim0 and dim1 through dim2.
1754                      */
1755                     if (dim_ind == 1 && zonei == 1)
1756                     {
1757                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1758                     }
1759                 }
1760             }
1761         }
1762
1763         if (r2 < r_comm2
1764             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1765                 && (!bBondComm
1766                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1767                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1768         {
1769             /* Store the local and global atom group indices and position */
1770             localAtomGroups->push_back(cg);
1771             ibuf.push_back(globalAtomGroupIndices[cg]);
1772             nsend_z++;
1773
1774             rvec posPbc;
1775             if (dd->ci[dim] == 0)
1776             {
1777                 /* Correct cg_cm for pbc */
1778                 rvec_add(cg_cm[cg], box[dim], posPbc);
1779                 if (bScrew)
1780                 {
1781                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1782                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1783                 }
1784             }
1785             else
1786             {
1787                 copy_rvec(cg_cm[cg], posPbc);
1788             }
1789             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1790
1791             nat += 1;
1792         }
1793     }
1794
1795     work->nat        = nat;
1796     work->nsend_zone = nsend_z;
1797 }
1798
1799 //! Clear data.
1800 static void clearCommSetupData(dd_comm_setup_work_t* work)
1801 {
1802     work->localAtomGroupBuffer.clear();
1803     work->atomGroupBuffer.clear();
1804     work->positionBuffer.clear();
1805     work->nat        = 0;
1806     work->nsend_zone = 0;
1807 }
1808
1809 //! Prepare DD communication.
1810 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1811 {
1812     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1813     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1814     int                    c;
1815     int *                  zone_cg_range, pos_cg;
1816     gmx_domdec_comm_t*     comm;
1817     gmx_domdec_zones_t*    zones;
1818     gmx_domdec_comm_dim_t* cd;
1819     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1820     dd_corners_t           corners;
1821     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1822     real                   skew_fac2_d, skew_fac_01;
1823     rvec                   sf2_round;
1824
1825     if (debug)
1826     {
1827         fprintf(debug, "Setting up DD communication\n");
1828     }
1829
1830     comm = dd->comm;
1831
1832     if (comm->dth.empty())
1833     {
1834         /* Initialize the thread data.
1835          * This can not be done in init_domain_decomposition,
1836          * as the numbers of threads is determined later.
1837          */
1838         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1839         comm->dth.resize(numThreads);
1840     }
1841
1842     bBondComm = comm->systemInfo.filterBondedCommunication;
1843
1844     /* Do we need to determine extra distances for multi-body bondeds? */
1845     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1846
1847     /* Do we need to determine extra distances for only two-body bondeds? */
1848     bDist2B = (bBondComm && !bDistMB);
1849
1850     const real r_comm2 =
1851             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1852     const real r_bcomm2 =
1853             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1854
1855     if (debug)
1856     {
1857         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1858     }
1859
1860     zones = &comm->zones;
1861
1862     dim0 = dd->dim[0];
1863     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1864     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1865
1866     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1867
1868     /* Triclinic stuff */
1869     normal      = ddbox->normal;
1870     skew_fac_01 = 0;
1871     if (dd->ndim >= 2)
1872     {
1873         v_0 = ddbox->v[dim0];
1874         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1875         {
1876             /* Determine the coupling coefficient for the distances
1877              * to the cell planes along dim0 and dim1 through dim2.
1878              * This is required for correct rounding.
1879              */
1880             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1881             if (debug)
1882             {
1883                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1884             }
1885         }
1886     }
1887     if (dd->ndim >= 3)
1888     {
1889         v_1 = ddbox->v[dim1];
1890     }
1891
1892     zone_cg_range                        = zones->cg_range;
1893     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1894
1895     zone_cg_range[0]   = 0;
1896     zone_cg_range[1]   = dd->ncg_home;
1897     comm->zone_ncg1[0] = dd->ncg_home;
1898     pos_cg             = dd->ncg_home;
1899
1900     nat_tot = comm->atomRanges.numHomeAtoms();
1901     nzone   = 1;
1902     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1903     {
1904         dim = dd->dim[dim_ind];
1905         cd  = &comm->cd[dim_ind];
1906
1907         /* Check if we need to compute triclinic distances along this dim */
1908         bool distanceIsTriclinic = false;
1909         for (int i = 0; i <= dim_ind; i++)
1910         {
1911             if (ddbox->tric_dir[dd->dim[i]])
1912             {
1913                 distanceIsTriclinic = true;
1914             }
1915         }
1916
1917         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1918         {
1919             /* No pbc in this dimension, the first node should not comm. */
1920             nzone_send = 0;
1921         }
1922         else
1923         {
1924             nzone_send = nzone;
1925         }
1926
1927         v_d         = ddbox->v[dim];
1928         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1929
1930         cd->receiveInPlace = true;
1931         for (int p = 0; p < cd->numPulses(); p++)
1932         {
1933             /* Only atoms communicated in the first pulse are used
1934              * for multi-body bonded interactions or for bBondComm.
1935              */
1936             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1937
1938             gmx_domdec_ind_t* ind = &cd->ind[p];
1939
1940             /* Thread 0 writes in the global index array */
1941             ind->index.clear();
1942             clearCommSetupData(&comm->dth[0]);
1943
1944             for (zone = 0; zone < nzone_send; zone++)
1945             {
1946                 if (dim_ind > 0 && distanceIsTriclinic)
1947                 {
1948                     /* Determine slightly more optimized skew_fac's
1949                      * for rounding.
1950                      * This reduces the number of communicated atoms
1951                      * by about 10% for 3D DD of rhombic dodecahedra.
1952                      */
1953                     for (dimd = 0; dimd < dim; dimd++)
1954                     {
1955                         sf2_round[dimd] = 1;
1956                         if (ddbox->tric_dir[dimd])
1957                         {
1958                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1959                             {
1960                                 /* If we are shifted in dimension i
1961                                  * and the cell plane is tilted forward
1962                                  * in dimension i, skip this coupling.
1963                                  */
1964                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1965                                 {
1966                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1967                                 }
1968                             }
1969                             sf2_round[dimd] = 1 / sf2_round[dimd];
1970                         }
1971                     }
1972                 }
1973
1974                 zonei = zone_perm[dim_ind][zone];
1975                 if (p == 0)
1976                 {
1977                     /* Here we permutate the zones to obtain a convenient order
1978                      * for neighbor searching
1979                      */
1980                     cg0 = zone_cg_range[zonei];
1981                     cg1 = zone_cg_range[zonei + 1];
1982                 }
1983                 else
1984                 {
1985                     /* Look only at the cg's received in the previous grid pulse
1986                      */
1987                     cg1 = zone_cg_range[nzone + zone + 1];
1988                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
1989                 }
1990
1991                 const int numThreads = gmx::ssize(comm->dth);
1992 #pragma omp parallel for num_threads(numThreads) schedule(static)
1993                 for (int th = 0; th < numThreads; th++)
1994                 {
1995                     try
1996                     {
1997                         dd_comm_setup_work_t& work = comm->dth[th];
1998
1999                         /* Retain data accumulated into buffers of thread 0 */
2000                         if (th > 0)
2001                         {
2002                             clearCommSetupData(&work);
2003                         }
2004
2005                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2006                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2007
2008                         /* Get the cg's for this pulse in this zone */
2009                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, dd->globalAtomGroupIndices,
2010                                            dim, dim_ind, dim0, dim1, dim2, r_comm2, r_bcomm2, box,
2011                                            distanceIsTriclinic, normal, skew_fac2_d, skew_fac_01,
2012                                            v_d, v_0, v_1, &corners, sf2_round, bDistBonded, bBondComm,
2013                                            bDist2B, bDistMB, state->x.rvec_array(), fr->cginfo,
2014                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer, &work);
2015                     }
2016                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2017                 } // END
2018
2019                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2020                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2021                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2022                 /* Append data of threads>=1 to the communication buffers */
2023                 for (int th = 1; th < numThreads; th++)
2024                 {
2025                     const dd_comm_setup_work_t& dth = comm->dth[th];
2026
2027                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(),
2028                                       dth.localAtomGroupBuffer.end());
2029                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(),
2030                                       dth.atomGroupBuffer.end());
2031                     positions.insert(positions.end(), dth.positionBuffer.begin(),
2032                                      dth.positionBuffer.end());
2033                     comm->dth[0].nat += dth.nat;
2034                     ind->nsend[zone] += dth.nsend_zone;
2035                 }
2036             }
2037             /* Clear the counts in case we do not have pbc */
2038             for (zone = nzone_send; zone < nzone; zone++)
2039             {
2040                 ind->nsend[zone] = 0;
2041             }
2042             ind->nsend[nzone]     = ind->index.size();
2043             ind->nsend[nzone + 1] = comm->dth[0].nat;
2044             /* Communicate the number of cg's and atoms to receive */
2045             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2046
2047             if (p > 0)
2048             {
2049                 /* We can receive in place if only the last zone is not empty */
2050                 for (zone = 0; zone < nzone - 1; zone++)
2051                 {
2052                     if (ind->nrecv[zone] > 0)
2053                     {
2054                         cd->receiveInPlace = false;
2055                     }
2056                 }
2057             }
2058
2059             int receiveBufferSize = 0;
2060             if (!cd->receiveInPlace)
2061             {
2062                 receiveBufferSize = ind->nrecv[nzone];
2063             }
2064             /* These buffer are actually only needed with in-place */
2065             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2066             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2067
2068             dd_comm_setup_work_t& work = comm->dth[0];
2069
2070             /* Make space for the global cg indices */
2071             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2072             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2073             /* Communicate the global cg indices */
2074             gmx::ArrayRef<int> integerBufferRef;
2075             if (cd->receiveInPlace)
2076             {
2077                 integerBufferRef = gmx::arrayRefFromArray(
2078                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2079             }
2080             else
2081             {
2082                 integerBufferRef = globalAtomGroupBuffer.buffer;
2083             }
2084             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2085
2086             /* Make space for cg_cm */
2087             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2088
2089             /* Communicate the coordinates */
2090             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2091             if (cd->receiveInPlace)
2092             {
2093                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2094             }
2095             else
2096             {
2097                 rvecBufferRef = rvecBuffer.buffer;
2098             }
2099             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2100
2101             /* Make the charge group index */
2102             if (cd->receiveInPlace)
2103             {
2104                 zone = (p == 0 ? 0 : nzone - 1);
2105                 while (zone < nzone)
2106                 {
2107                     for (int i = 0; i < ind->nrecv[zone]; i++)
2108                     {
2109                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2110                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2111                         pos_cg++;
2112                     }
2113                     if (p == 0)
2114                     {
2115                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2116                     }
2117                     zone++;
2118                     zone_cg_range[nzone + zone] = pos_cg;
2119                 }
2120             }
2121             else
2122             {
2123                 /* This part of the code is never executed with bBondComm. */
2124                 merge_cg_buffers(nzone, cd, p, zone_cg_range, dd->globalAtomGroupIndices,
2125                                  integerBufferRef.data(), state->x, rvecBufferRef, fr->cginfo_mb,
2126                                  fr->cginfo);
2127                 pos_cg += ind->nrecv[nzone];
2128             }
2129             nat_tot += ind->nrecv[nzone + 1];
2130         }
2131         if (!cd->receiveInPlace)
2132         {
2133             /* Store the atom block for easy copying of communication buffers */
2134             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2135         }
2136         nzone += nzone;
2137     }
2138
2139     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2140
2141     if (!bBondComm)
2142     {
2143         /* We don't need to update cginfo, since that was alrady done above.
2144          * So we pass NULL for the forcerec.
2145          */
2146         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2147     }
2148
2149     if (debug)
2150     {
2151         fprintf(debug, "Finished setting up DD communication, zones:");
2152         for (c = 0; c < zones->n; c++)
2153         {
2154             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2155         }
2156         fprintf(debug, "\n");
2157     }
2158 }
2159
2160 //! Set boundaries for the charge group range.
2161 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2162 {
2163     for (auto& iZone : zones->iZones)
2164     {
2165         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2166         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2167                                            zones->cg_range[iZone.jZoneRange.end()]);
2168     }
2169 }
2170
2171 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2172  *
2173  * Also sets the atom density for the home zone when \p zone_start=0.
2174  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2175  * how many charge groups will move but are still part of the current range.
2176  * \todo When converting domdec to use proper classes, all these variables
2177  *       should be private and a method should return the correct count
2178  *       depending on an internal state.
2179  *
2180  * \param[in,out] dd          The domain decomposition struct
2181  * \param[in]     box         The box
2182  * \param[in]     ddbox       The domain decomposition box struct
2183  * \param[in]     zone_start  The start of the zone range to set sizes for
2184  * \param[in]     zone_end    The end of the zone range to set sizes for
2185  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2186  */
2187 static void set_zones_size(gmx_domdec_t*      dd,
2188                            matrix             box,
2189                            const gmx_ddbox_t* ddbox,
2190                            int                zone_start,
2191                            int                zone_end,
2192                            int                numMovedChargeGroupsInHomeZone)
2193 {
2194     gmx_domdec_comm_t*  comm;
2195     gmx_domdec_zones_t* zones;
2196     gmx_bool            bDistMB;
2197     int                 z, d, dim;
2198     real                rcs, rcmbs;
2199     int                 i, j;
2200     real                vol;
2201
2202     comm = dd->comm;
2203
2204     zones = &comm->zones;
2205
2206     /* Do we need to determine extra distances for multi-body bondeds? */
2207     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2208
2209     for (z = zone_start; z < zone_end; z++)
2210     {
2211         /* Copy cell limits to zone limits.
2212          * Valid for non-DD dims and non-shifted dims.
2213          */
2214         copy_rvec(comm->cell_x0, zones->size[z].x0);
2215         copy_rvec(comm->cell_x1, zones->size[z].x1);
2216     }
2217
2218     for (d = 0; d < dd->ndim; d++)
2219     {
2220         dim = dd->dim[d];
2221
2222         for (z = 0; z < zones->n; z++)
2223         {
2224             /* With a staggered grid we have different sizes
2225              * for non-shifted dimensions.
2226              */
2227             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2228             {
2229                 if (d == 1)
2230                 {
2231                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2232                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2233                 }
2234                 else if (d == 2)
2235                 {
2236                     zones->size[z].x0[dim] =
2237                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2238                                     .min0;
2239                     zones->size[z].x1[dim] =
2240                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2241                                     .max1;
2242                 }
2243             }
2244         }
2245
2246         rcs   = comm->systemInfo.cutoff;
2247         rcmbs = comm->cutoff_mbody;
2248         if (ddbox->tric_dir[dim])
2249         {
2250             rcs /= ddbox->skew_fac[dim];
2251             rcmbs /= ddbox->skew_fac[dim];
2252         }
2253
2254         /* Set the lower limit for the shifted zone dimensions */
2255         for (z = zone_start; z < zone_end; z++)
2256         {
2257             if (zones->shift[z][dim] > 0)
2258             {
2259                 dim = dd->dim[d];
2260                 if (!isDlbOn(dd->comm) || d == 0)
2261                 {
2262                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2263                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2264                 }
2265                 else
2266                 {
2267                     /* Here we take the lower limit of the zone from
2268                      * the lowest domain of the zone below.
2269                      */
2270                     if (z < 4)
2271                     {
2272                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2273                     }
2274                     else
2275                     {
2276                         if (d == 1)
2277                         {
2278                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2279                         }
2280                         else
2281                         {
2282                             zones->size[z].x0[dim] =
2283                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2284                                             .min1;
2285                         }
2286                     }
2287                     /* A temporary limit, is updated below */
2288                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2289
2290                     if (bDistMB)
2291                     {
2292                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2293                         {
2294                             if (zones->shift[zi][dim] == 0)
2295                             {
2296                                 /* This takes the whole zone into account.
2297                                  * With multiple pulses this will lead
2298                                  * to a larger zone then strictly necessary.
2299                                  */
2300                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2301                                                                   zones->size[zi].x1[dim] + rcmbs);
2302                             }
2303                         }
2304                     }
2305                 }
2306             }
2307         }
2308
2309         /* Loop over the i-zones to set the upper limit of each
2310          * j-zone they see.
2311          */
2312         for (const auto& iZone : zones->iZones)
2313         {
2314             const int zi = iZone.iZoneIndex;
2315             if (zones->shift[zi][dim] == 0)
2316             {
2317                 /* We should only use zones up to zone_end */
2318                 const auto& jZoneRangeFull = iZone.jZoneRange;
2319                 if (zone_end <= *jZoneRangeFull.begin())
2320                 {
2321                     continue;
2322                 }
2323                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2324                                                  std::min(*jZoneRangeFull.end(), zone_end));
2325                 for (int jZone : jZoneRange)
2326                 {
2327                     if (zones->shift[jZone][dim] > 0)
2328                     {
2329                         zones->size[jZone].x1[dim] =
2330                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2331                     }
2332                 }
2333             }
2334         }
2335     }
2336
2337     for (z = zone_start; z < zone_end; z++)
2338     {
2339         /* Initialization only required to keep the compiler happy */
2340         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2341         int  nc, c;
2342
2343         /* To determine the bounding box for a zone we need to find
2344          * the extreme corners of 4, 2 or 1 corners.
2345          */
2346         nc = 1 << (ddbox->nboundeddim - 1);
2347
2348         for (c = 0; c < nc; c++)
2349         {
2350             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2351             corner[XX] = 0;
2352             if ((c & 1) == 0)
2353             {
2354                 corner[YY] = zones->size[z].x0[YY];
2355             }
2356             else
2357             {
2358                 corner[YY] = zones->size[z].x1[YY];
2359             }
2360             if ((c & 2) == 0)
2361             {
2362                 corner[ZZ] = zones->size[z].x0[ZZ];
2363             }
2364             else
2365             {
2366                 corner[ZZ] = zones->size[z].x1[ZZ];
2367             }
2368             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2369                 && box[ZZ][1 - dd->dim[0]] != 0)
2370             {
2371                 /* With 1D domain decomposition the cg's are not in
2372                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2373                  * Shift the corner of the z-vector back to along the box
2374                  * vector of dimension d, so it will later end up at 0 along d.
2375                  * This can affect the location of this corner along dd->dim[0]
2376                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2377                  */
2378                 int d = 1 - dd->dim[0];
2379
2380                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2381             }
2382             /* Apply the triclinic couplings */
2383             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2384             {
2385                 for (j = XX; j < i; j++)
2386                 {
2387                     corner[j] += corner[i] * box[i][j] / box[i][i];
2388                 }
2389             }
2390             if (c == 0)
2391             {
2392                 copy_rvec(corner, corner_min);
2393                 copy_rvec(corner, corner_max);
2394             }
2395             else
2396             {
2397                 for (i = 0; i < DIM; i++)
2398                 {
2399                     corner_min[i] = std::min(corner_min[i], corner[i]);
2400                     corner_max[i] = std::max(corner_max[i], corner[i]);
2401                 }
2402             }
2403         }
2404         /* Copy the extreme cornes without offset along x */
2405         for (i = 0; i < DIM; i++)
2406         {
2407             zones->size[z].bb_x0[i] = corner_min[i];
2408             zones->size[z].bb_x1[i] = corner_max[i];
2409         }
2410         /* Add the offset along x */
2411         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2412         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2413     }
2414
2415     if (zone_start == 0)
2416     {
2417         vol = 1;
2418         for (dim = 0; dim < DIM; dim++)
2419         {
2420             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2421         }
2422         zones->dens_zone0 =
2423                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2424     }
2425
2426     if (debug)
2427     {
2428         for (z = zone_start; z < zone_end; z++)
2429         {
2430             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2431                     zones->size[z].x0[XX], zones->size[z].x1[XX], zones->size[z].x0[YY],
2432                     zones->size[z].x1[YY], zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2433             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2434                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], zones->size[z].bb_x0[YY],
2435                     zones->size[z].bb_x1[YY], zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2436         }
2437     }
2438 }
2439
2440 /*! \brief Order data in \p dataToSort according to \p sort
2441  *
2442  * Note: both buffers should have at least \p sort.size() elements.
2443  */
2444 template<typename T>
2445 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2446                         gmx::ArrayRef<T>                  dataToSort,
2447                         gmx::ArrayRef<T>                  sortBuffer)
2448 {
2449     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2450     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2451                "The sorting buffer needs to be sufficiently large");
2452
2453     /* Order the data into the temporary buffer */
2454     size_t i = 0;
2455     for (const gmx_cgsort_t& entry : sort)
2456     {
2457         sortBuffer[i++] = dataToSort[entry.ind];
2458     }
2459
2460     /* Copy back to the original array */
2461     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2462 }
2463
2464 /*! \brief Order data in \p dataToSort according to \p sort
2465  *
2466  * Note: \p vectorToSort should have at least \p sort.size() elements,
2467  *       \p workVector is resized when it is too small.
2468  */
2469 template<typename T>
2470 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2471                         gmx::ArrayRef<T>                  vectorToSort,
2472                         std::vector<T>*                   workVector)
2473 {
2474     if (gmx::index(workVector->size()) < sort.ssize())
2475     {
2476         workVector->resize(sort.size());
2477     }
2478     orderVector<T>(sort, vectorToSort, *workVector);
2479 }
2480
2481 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2482 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2483 {
2484     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2485
2486     /* Using push_back() instead of this resize results in much slower code */
2487     sort->resize(atomOrder.size());
2488     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2489     size_t                      numSorted = 0;
2490     for (int i : atomOrder)
2491     {
2492         if (i >= 0)
2493         {
2494             /* The values of nsc and ind_gl are not used in this case */
2495             buffer[numSorted++].ind = i;
2496         }
2497     }
2498     sort->resize(numSorted);
2499 }
2500
2501 //! Returns the sorting state for DD.
2502 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2503 {
2504     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2505
2506     dd_sort_order_nbnxn(fr, &sort->sorted);
2507
2508     /* We alloc with the old size, since cgindex is still old */
2509     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2510
2511     /* Set the new home atom/charge group count */
2512     dd->ncg_home = sort->sorted.size();
2513     if (debug)
2514     {
2515         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2516     }
2517
2518     /* Reorder the state */
2519     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2520     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2521
2522     if (state->flags & (1 << estX))
2523     {
2524         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2525     }
2526     if (state->flags & (1 << estV))
2527     {
2528         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2529     }
2530     if (state->flags & (1 << estCGP))
2531     {
2532         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2533     }
2534
2535     /* Reorder the global cg index */
2536     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2537     /* Reorder the cginfo */
2538     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2539     /* Set the home atom number */
2540     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2541
2542     /* The atoms are now exactly in grid order, update the grid order */
2543     fr->nbv->setLocalAtomOrder();
2544 }
2545
2546 //! Accumulates load statistics.
2547 static void add_dd_statistics(gmx_domdec_t* dd)
2548 {
2549     gmx_domdec_comm_t* comm = dd->comm;
2550
2551     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2552     {
2553         auto range = static_cast<DDAtomRanges::Type>(i);
2554         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2555     }
2556     comm->ndecomp++;
2557 }
2558
2559 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2560 {
2561     gmx_domdec_comm_t* comm = dd->comm;
2562
2563     /* Reset all the statistics and counters for total run counting */
2564     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2565     {
2566         comm->sum_nat[i] = 0;
2567     }
2568     comm->ndecomp   = 0;
2569     comm->nload     = 0;
2570     comm->load_step = 0;
2571     comm->load_sum  = 0;
2572     comm->load_max  = 0;
2573     clear_ivec(comm->load_lim);
2574     comm->load_mdf = 0;
2575     comm->load_pme = 0;
2576 }
2577
2578 void print_dd_statistics(const t_commrec* cr, const t_inputrec* ir, FILE* fplog)
2579 {
2580     gmx_domdec_comm_t* comm = cr->dd->comm;
2581
2582     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2583     gmx_sumd(numRanges, comm->sum_nat, cr);
2584
2585     if (fplog == nullptr)
2586     {
2587         return;
2588     }
2589
2590     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2591
2592     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2593     {
2594         auto   range = static_cast<DDAtomRanges::Type>(i);
2595         double av    = comm->sum_nat[i] / comm->ndecomp;
2596         switch (range)
2597         {
2598             case DDAtomRanges::Type::Zones:
2599                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2600                 break;
2601             case DDAtomRanges::Type::Vsites:
2602                 if (cr->dd->vsite_comm)
2603                 {
2604                     fprintf(fplog, " av. #atoms communicated per step for vsites: %d x %.1f\n",
2605                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2, av);
2606                 }
2607                 break;
2608             case DDAtomRanges::Type::Constraints:
2609                 if (cr->dd->constraint_comm)
2610                 {
2611                     fprintf(fplog, " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2612                             1 + ir->nLincsIter, av);
2613                 }
2614                 break;
2615             default: gmx_incons(" Unknown type for DD statistics");
2616         }
2617     }
2618     fprintf(fplog, "\n");
2619
2620     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2621     {
2622         print_dd_load_av(fplog, cr->dd);
2623     }
2624 }
2625
2626 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2627 void dd_partition_system(FILE*                        fplog,
2628                          const gmx::MDLogger&         mdlog,
2629                          int64_t                      step,
2630                          const t_commrec*             cr,
2631                          gmx_bool                     bMasterState,
2632                          int                          nstglobalcomm,
2633                          t_state*                     state_global,
2634                          const gmx_mtop_t&            top_global,
2635                          const t_inputrec*            ir,
2636                          gmx::ImdSession*             imdSession,
2637                          pull_t*                      pull_work,
2638                          t_state*                     state_local,
2639                          PaddedHostVector<gmx::RVec>* f,
2640                          gmx::MDAtoms*                mdAtoms,
2641                          gmx_localtop_t*              top_local,
2642                          t_forcerec*                  fr,
2643                          gmx::VirtualSitesHandler*    vsite,
2644                          gmx::Constraints*            constr,
2645                          t_nrnb*                      nrnb,
2646                          gmx_wallcycle*               wcycle,
2647                          gmx_bool                     bVerbose)
2648 {
2649     gmx_domdec_t*      dd;
2650     gmx_domdec_comm_t* comm;
2651     gmx_ddbox_t        ddbox = { 0 };
2652     int64_t            step_pcoupl;
2653     rvec               cell_ns_x0, cell_ns_x1;
2654     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2655     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2656     gmx_bool           bRedist;
2657     ivec               np;
2658     real               grid_density;
2659     char               sbuf[22];
2660
2661     wallcycle_start(wcycle, ewcDOMDEC);
2662
2663     dd   = cr->dd;
2664     comm = dd->comm;
2665
2666     // TODO if the update code becomes accessible here, use
2667     // upd->deform for this logic.
2668     bBoxChanged = (bMasterState || inputrecDeform(ir));
2669     if (ir->epc != epcNO)
2670     {
2671         /* With nstpcouple > 1 pressure coupling happens.
2672          * one step after calculating the pressure.
2673          * Box scaling happens at the end of the MD step,
2674          * after the DD partitioning.
2675          * We therefore have to do DLB in the first partitioning
2676          * after an MD step where P-coupling occurred.
2677          * We need to determine the last step in which p-coupling occurred.
2678          * MRS -- need to validate this for vv?
2679          */
2680         int n = ir->nstpcouple;
2681         if (n == 1)
2682         {
2683             step_pcoupl = step - 1;
2684         }
2685         else
2686         {
2687             step_pcoupl = ((step - 1) / n) * n + 1;
2688         }
2689         if (step_pcoupl >= comm->partition_step)
2690         {
2691             bBoxChanged = TRUE;
2692         }
2693     }
2694
2695     bNStGlobalComm = (step % nstglobalcomm == 0);
2696
2697     if (!isDlbOn(comm))
2698     {
2699         bDoDLB = FALSE;
2700     }
2701     else
2702     {
2703         /* Should we do dynamic load balacing this step?
2704          * Since it requires (possibly expensive) global communication,
2705          * we might want to do DLB less frequently.
2706          */
2707         if (bBoxChanged || ir->epc != epcNO)
2708         {
2709             bDoDLB = bBoxChanged;
2710         }
2711         else
2712         {
2713             bDoDLB = bNStGlobalComm;
2714         }
2715     }
2716
2717     /* Check if we have recorded loads on the nodes */
2718     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2719     {
2720         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2721
2722         /* Print load every nstlog, first and last step to the log file */
2723         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || comm->n_load_collect == 0
2724                     || (ir->nsteps >= 0 && (step + ir->nstlist > ir->init_step + ir->nsteps)));
2725
2726         /* Avoid extra communication due to verbose screen output
2727          * when nstglobalcomm is set.
2728          */
2729         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2730             || (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2731         {
2732             get_load_distribution(dd, wcycle);
2733             if (DDMASTER(dd))
2734             {
2735                 if (bLogLoad)
2736                 {
2737                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2738                 }
2739                 if (bVerbose)
2740                 {
2741                     dd_print_load_verbose(dd);
2742                 }
2743             }
2744             comm->n_load_collect++;
2745
2746             if (isDlbOn(comm))
2747             {
2748                 if (DDMASTER(dd))
2749                 {
2750                     /* Add the measured cycles to the running average */
2751                     const float averageFactor = 0.1F;
2752                     comm->cyclesPerStepDlbExpAverage =
2753                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2754                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2755                 }
2756                 if (comm->dlbState == DlbState::onCanTurnOff
2757                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2758                 {
2759                     gmx_bool turnOffDlb;
2760                     if (DDMASTER(dd))
2761                     {
2762                         /* If the running averaged cycles with DLB are more
2763                          * than before we turned on DLB, turn off DLB.
2764                          * We will again run and check the cycles without DLB
2765                          * and we can then decide if to turn off DLB forever.
2766                          */
2767                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2768                     }
2769                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2770                     if (turnOffDlb)
2771                     {
2772                         /* To turn off DLB, we need to redistribute the atoms */
2773                         dd_collect_state(dd, state_local, state_global);
2774                         bMasterState = TRUE;
2775                         turn_off_dlb(mdlog, dd, step);
2776                     }
2777                 }
2778             }
2779             else if (bCheckWhetherToTurnDlbOn)
2780             {
2781                 gmx_bool turnOffDlbForever = FALSE;
2782                 gmx_bool turnOnDlb         = FALSE;
2783
2784                 /* Since the timings are node dependent, the master decides */
2785                 if (DDMASTER(dd))
2786                 {
2787                     /* If we recently turned off DLB, we want to check if
2788                      * performance is better without DLB. We want to do this
2789                      * ASAP to minimize the chance that external factors
2790                      * slowed down the DLB step are gone here and we
2791                      * incorrectly conclude that DLB was causing the slowdown.
2792                      * So we measure one nstlist block, no running average.
2793                      */
2794                     if (comm->haveTurnedOffDlb
2795                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2796                     {
2797                         /* After turning off DLB we ran nstlist steps in fewer
2798                          * cycles than with DLB. This likely means that DLB
2799                          * in not benefical, but this could be due to a one
2800                          * time unlucky fluctuation, so we require two such
2801                          * observations in close succession to turn off DLB
2802                          * forever.
2803                          */
2804                         if (comm->dlbSlowerPartitioningCount > 0
2805                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2806                         {
2807                             turnOffDlbForever = TRUE;
2808                         }
2809                         comm->haveTurnedOffDlb = false;
2810                         /* Register when we last measured DLB slowdown */
2811                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2812                     }
2813                     else
2814                     {
2815                         /* Here we check if the max PME rank load is more than 0.98
2816                          * the max PP force load. If so, PP DLB will not help,
2817                          * since we are (almost) limited by PME. Furthermore,
2818                          * DLB will cause a significant extra x/f redistribution
2819                          * cost on the PME ranks, which will then surely result
2820                          * in lower total performance.
2821                          */
2822                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2823                         {
2824                             turnOnDlb = FALSE;
2825                         }
2826                         else
2827                         {
2828                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2829                         }
2830                     }
2831                 }
2832                 struct
2833                 {
2834                     gmx_bool turnOffDlbForever;
2835                     gmx_bool turnOnDlb;
2836                 } bools{ turnOffDlbForever, turnOnDlb };
2837                 dd_bcast(dd, sizeof(bools), &bools);
2838                 if (bools.turnOffDlbForever)
2839                 {
2840                     turn_off_dlb_forever(mdlog, dd, step);
2841                 }
2842                 else if (bools.turnOnDlb)
2843                 {
2844                     turn_on_dlb(mdlog, dd, step);
2845                     bDoDLB = TRUE;
2846                 }
2847             }
2848         }
2849         comm->n_load_have++;
2850     }
2851
2852     bRedist = FALSE;
2853     if (bMasterState)
2854     {
2855         /* Clear the old state */
2856         clearDDStateIndices(dd, false);
2857         ncgindex_set = 0;
2858
2859         auto xGlobal = positionsFromStatePointer(state_global);
2860
2861         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2862
2863         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2864
2865         /* Ensure that we have space for the new distribution */
2866         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2867
2868         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2869
2870         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2871     }
2872     else if (state_local->ddp_count != dd->ddp_count)
2873     {
2874         if (state_local->ddp_count > dd->ddp_count)
2875         {
2876             gmx_fatal(FARGS,
2877                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2878                       ")",
2879                       state_local->ddp_count, dd->ddp_count);
2880         }
2881
2882         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2883         {
2884             gmx_fatal(FARGS,
2885                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2886                       "state_local->ddp_count (%d)",
2887                       state_local->ddp_count_cg_gl, state_local->ddp_count);
2888         }
2889
2890         /* Clear the old state */
2891         clearDDStateIndices(dd, false);
2892
2893         /* Restore the atom group indices from state_local */
2894         restoreAtomGroups(dd, state_local);
2895         make_dd_indices(dd, 0);
2896         ncgindex_set = dd->ncg_home;
2897
2898         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2899
2900         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2901
2902         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
2903
2904         bRedist = isDlbOn(comm);
2905     }
2906     else
2907     {
2908         /* We have the full state, only redistribute the cgs */
2909
2910         /* Clear the non-home indices */
2911         clearDDStateIndices(dd, true);
2912         ncgindex_set = 0;
2913
2914         /* To avoid global communication, we do not recompute the extent
2915          * of the system for dims without pbc. Therefore we need to copy
2916          * the previously computed values when we do not communicate.
2917          */
2918         if (!bNStGlobalComm)
2919         {
2920             copy_rvec(comm->box0, ddbox.box0);
2921             copy_rvec(comm->box_size, ddbox.box_size);
2922         }
2923         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
2924
2925         bBoxChanged = TRUE;
2926         bRedist     = TRUE;
2927     }
2928     /* Copy needed for dim's without pbc when avoiding communication */
2929     copy_rvec(ddbox.box0, comm->box0);
2930     copy_rvec(ddbox.box_size, comm->box_size);
2931
2932     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
2933
2934     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
2935     {
2936         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
2937     }
2938
2939     if (comm->systemInfo.useUpdateGroups)
2940     {
2941         comm->updateGroupsCog->addCogs(
2942                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
2943     }
2944
2945     /* Check if we should sort the charge groups */
2946     const bool bSortCG = (bMasterState || bRedist);
2947
2948     /* When repartitioning we mark atom groups that will move to neighboring
2949      * DD cells, but we do not move them right away for performance reasons.
2950      * Thus we need to keep track of how many charge groups will move for
2951      * obtaining correct local charge group / atom counts.
2952      */
2953     ncg_moved = 0;
2954     if (bRedist)
2955     {
2956         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
2957
2958         ncgindex_set = dd->ncg_home;
2959         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
2960
2961         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
2962
2963         if (comm->systemInfo.useUpdateGroups)
2964         {
2965             comm->updateGroupsCog->addCogs(
2966                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
2967                     state_local->x);
2968         }
2969
2970         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
2971     }
2972
2973     // TODO: Integrate this code in the nbnxm module
2974     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, dd, &ddbox, &comm->cell_x0,
2975                           &comm->cell_x1, dd->ncg_home, as_rvec_array(state_local->x.data()),
2976                           cell_ns_x0, cell_ns_x1, &grid_density);
2977
2978     if (bBoxChanged)
2979     {
2980         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
2981     }
2982
2983     if (bSortCG)
2984     {
2985         wallcycle_sub_start(wcycle, ewcsDD_GRID);
2986
2987         /* Sort the state on charge group position.
2988          * This enables exact restarts from this step.
2989          * It also improves performance by about 15% with larger numbers
2990          * of atoms per node.
2991          */
2992
2993         /* Fill the ns grid with the home cell,
2994          * so we can sort with the indices.
2995          */
2996         set_zones_ncg_home(dd);
2997
2998         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
2999
3000         nbnxn_put_on_grid(fr->nbv.get(), state_local->box, 0, comm->zones.size[0].bb_x0,
3001                           comm->zones.size[0].bb_x1, comm->updateGroupsCog.get(),
3002                           { 0, dd->ncg_home }, comm->zones.dens_zone0, fr->cginfo, state_local->x,
3003                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3004
3005         if (debug)
3006         {
3007             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf),
3008                     dd->ncg_home);
3009         }
3010         dd_sort_state(dd, fr, state_local);
3011
3012         /* After sorting and compacting we set the correct size */
3013         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3014
3015         /* Rebuild all the indices */
3016         dd->ga2la->clear();
3017         ncgindex_set = 0;
3018
3019         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3020     }
3021     else
3022     {
3023         /* With the group scheme the sorting array is part of the DD state,
3024          * but it just got out of sync, so mark as invalid by emptying it.
3025          */
3026         if (ir->cutoff_scheme == ecutsGROUP)
3027         {
3028             comm->sort->sorted.clear();
3029         }
3030     }
3031
3032     if (comm->systemInfo.useUpdateGroups)
3033     {
3034         /* The update groups cog's are invalid after sorting
3035          * and need to be cleared before the next partitioning anyhow.
3036          */
3037         comm->updateGroupsCog->clear();
3038     }
3039
3040     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3041
3042     /* Set the induces for the home atoms */
3043     set_zones_ncg_home(dd);
3044     make_dd_indices(dd, ncgindex_set);
3045
3046     /* Setup up the communication and communicate the coordinates */
3047     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3048
3049     /* Set the indices for the halo atoms */
3050     make_dd_indices(dd, dd->ncg_home);
3051
3052     /* Set the charge group boundaries for neighbor searching */
3053     set_cg_boundaries(&comm->zones);
3054
3055     if (fr->cutoff_scheme == ecutsVERLET)
3056     {
3057         /* When bSortCG=true, we have already set the size for zone 0 */
3058         set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3059     }
3060
3061     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3062
3063     /*
3064        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3065                  -1,state_local->x.rvec_array(),state_local->box);
3066      */
3067
3068     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3069
3070     /* Extract a local topology from the global topology */
3071     for (int i = 0; i < dd->ndim; i++)
3072     {
3073         np[dd->dim[i]] = comm->cd[i].numPulses();
3074     }
3075     dd_make_local_top(dd, &comm->zones, dd->unitCellInfo.npbcdim, state_local->box,
3076                       comm->cellsize_min, np, fr, state_local->x.rvec_array(), top_global, top_local);
3077
3078     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3079
3080     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3081
3082     /* Set up the special atom communication */
3083     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3084     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3085          i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3086     {
3087         auto range = static_cast<DDAtomRanges::Type>(i);
3088         switch (range)
3089         {
3090             case DDAtomRanges::Type::Vsites:
3091                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3092                 {
3093                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3094                 }
3095                 break;
3096             case DDAtomRanges::Type::Constraints:
3097                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3098                 {
3099                     /* Only for inter-cg constraints we need special code */
3100                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(), constr,
3101                                                   ir->nProjOrder, top_local->idef.il);
3102                 }
3103                 break;
3104             default: gmx_incons("Unknown special atom type setup");
3105         }
3106         comm->atomRanges.setEnd(range, n);
3107     }
3108
3109     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3110
3111     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3112
3113     /* Make space for the extra coordinates for virtual site
3114      * or constraint communication.
3115      */
3116     state_local->natoms = comm->atomRanges.numAtomsTotal();
3117
3118     state_change_natoms(state_local, state_local->natoms);
3119
3120     if (fr->haveDirectVirialContributions)
3121     {
3122         if (vsite && vsite->numInterUpdategroupVirtualSites())
3123         {
3124             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3125         }
3126         else
3127         {
3128             if (EEL_FULL(ir->coulombtype) && dd->haveExclusions)
3129             {
3130                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3131             }
3132             else
3133             {
3134                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3135             }
3136         }
3137     }
3138     else
3139     {
3140         nat_f_novirsum = 0;
3141     }
3142
3143     /* Set the number of atoms required for the force calculation.
3144      * Forces need to be constrained when doing energy
3145      * minimization. For simple simulations we could avoid some
3146      * allocation, zeroing and copying, but this is probably not worth
3147      * the complications and checking.
3148      */
3149     forcerec_set_ranges(fr, comm->atomRanges.end(DDAtomRanges::Type::Zones),
3150                         comm->atomRanges.end(DDAtomRanges::Type::Constraints), nat_f_novirsum);
3151
3152     /* Update atom data for mdatoms and several algorithms */
3153     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3154
3155     auto mdatoms = mdAtoms->mdatoms();
3156     if (!thisRankHasDuty(cr, DUTY_PME))
3157     {
3158         /* Send the charges and/or c6/sigmas to our PME only node */
3159         gmx_pme_send_parameters(cr, fr->ic, mdatoms->nChargePerturbed != 0,
3160                                 mdatoms->nTypePerturbed != 0, mdatoms->chargeA, mdatoms->chargeB,
3161                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B, mdatoms->sigmaA,
3162                                 mdatoms->sigmaB, dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3163     }
3164
3165     if (ir->bPull)
3166     {
3167         /* Update the local pull groups */
3168         dd_make_local_pull_groups(cr, pull_work);
3169     }
3170
3171     if (dd->atomSets != nullptr)
3172     {
3173         /* Update the local atom sets */
3174         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3175     }
3176
3177     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3178     imdSession->dd_make_local_IMD_atoms(dd);
3179
3180     add_dd_statistics(dd);
3181
3182     /* Make sure we only count the cycles for this DD partitioning */
3183     clear_dd_cycle_counts(dd);
3184
3185     /* Because the order of the atoms might have changed since
3186      * the last vsite construction, we need to communicate the constructing
3187      * atom coordinates again (for spreading the forces this MD step).
3188      */
3189     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3190
3191     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3192
3193     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3194     {
3195         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3196         write_dd_pdb("dd_dump", step, "dump", &top_global, cr, -1, state_local->x.rvec_array(),
3197                      state_local->box);
3198     }
3199
3200     /* Store the partitioning step */
3201     comm->partition_step = step;
3202
3203     /* Increase the DD partitioning counter */
3204     dd->ddp_count++;
3205     /* The state currently matches this DD partitioning count, store it */
3206     state_local->ddp_count = dd->ddp_count;
3207     if (bMasterState)
3208     {
3209         /* The DD master node knows the complete cg distribution,
3210          * store the count so we can possibly skip the cg info communication.
3211          */
3212         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3213     }
3214
3215     if (comm->ddSettings.DD_debug > 0)
3216     {
3217         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3218         check_index_consistency(dd, top_global.natoms, "after partitioning");
3219     }
3220
3221     wallcycle_stop(wcycle, ewcDOMDEC);
3222 }
3223
3224 /*! \brief Check whether bonded interactions are missing, if appropriate */
3225 void checkNumberOfBondedInteractions(const gmx::MDLogger&           mdlog,
3226                                      t_commrec*                     cr,
3227                                      int                            totalNumberOfBondedInteractions,
3228                                      const gmx_mtop_t*              top_global,
3229                                      const gmx_localtop_t*          top_local,
3230                                      gmx::ArrayRef<const gmx::RVec> x,
3231                                      const matrix                   box,
3232                                      bool* shouldCheckNumberOfBondedInteractions)
3233 {
3234     if (*shouldCheckNumberOfBondedInteractions)
3235     {
3236         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3237         {
3238             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
3239                                           top_local, x, box); // Does not return
3240         }
3241         *shouldCheckNumberOfBondedInteractions = false;
3242     }
3243 }