src/gromacs/domdec/partition.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file defines functions for mdrun to call to make a new
  38  * domain decomposition, and check it.
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup module_domdec
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "partition.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/domdec/collect.h"
  56 #include "gromacs/domdec/dlb.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/ga2la.h"
  61 #include "gromacs/domdec/localatomsetmanager.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/ewald/pme.h"
  64 #include "gromacs/gmxlib/network.h"
  65 #include "gromacs/gmxlib/nrnb.h"
  66 #include "gromacs/imd/imd.h"
  67 #include "gromacs/math/functions.h"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/mdlib/forcerec.h"
  70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  71 #include "gromacs/mdlib/mdatoms.h"
  72 #include "gromacs/mdlib/nsgrid.h"
  73 #include "gromacs/mdlib/vsite.h"
  74 #include "gromacs/mdtypes/commrec.h"
  75 #include "gromacs/mdtypes/forcerec.h"
  76 #include "gromacs/mdtypes/inputrec.h"
  77 #include "gromacs/mdtypes/md_enums.h"
  78 #include "gromacs/mdtypes/nblist.h"
  79 #include "gromacs/mdtypes/state.h"
  80 #include "gromacs/nbnxm/nbnxm.h"
  81 #include "gromacs/pulling/pull.h"
  82 #include "gromacs/timing/wallcycle.h"
  83 #include "gromacs/topology/mtop_util.h"
  84 #include "gromacs/topology/topology.h"
  85 #include "gromacs/utility/cstringutil.h"
  86 #include "gromacs/utility/fatalerror.h"
  87 #include "gromacs/utility/logger.h"
  88 #include "gromacs/utility/real.h"
  89 #include "gromacs/utility/smalloc.h"
  90 #include "gromacs/utility/strconvert.h"
  91 #include "gromacs/utility/stringstream.h"
  92 #include "gromacs/utility/stringutil.h"
  93 #include "gromacs/utility/textwriter.h"
  94
  95 #include "box.h"
  96 #include "cellsizes.h"
  97 #include "distribute.h"
  98 #include "domdec_constraints.h"
  99 #include "domdec_internal.h"
 100 #include "domdec_vsite.h"
 101 #include "dump.h"
 102 #include "redistribute.h"
 103 #include "utility.h"
 104
 105 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 106  *
 107  * There is a bit of overhead with DLB and it's difficult to achieve
 108  * a load imbalance of less than 2% with DLB.
 109  */
 110 #define DD_PERF_LOSS_DLB_ON  0.02
 111
 112 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
 113 #define DD_PERF_LOSS_WARN    0.05
 114
 115
 116 //! Debug helper printing a DD zone
 117 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 118 {
 119     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 120             d, i, j,
 121             zone->min0, zone->max1,
 122             zone->mch0, zone->mch0,
 123             zone->p1_0, zone->p1_1);
 124 }
 125
 126 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 127  * takes the extremes over all home and remote zones in the halo
 128  * and returns the results in cell_ns_x0 and cell_ns_x1.
 129  * Note: only used with the group cut-off scheme.
 130  */
 131 static void dd_move_cellx(gmx_domdec_t      *dd,
 132                           const gmx_ddbox_t *ddbox,
 133                           rvec               cell_ns_x0,
 134                           rvec               cell_ns_x1)
 135 {
 136     constexpr int      c_ddZoneCommMaxNumZones = 5;
 137     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
 138     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
 139     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
 140     gmx_domdec_comm_t *comm = dd->comm;
 141
 142     rvec               extr_s[2];
 143     rvec               extr_r[2];
 144     for (int d = 1; d < dd->ndim; d++)
 145     {
 146         int           dim = dd->dim[d];
 147         gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 148
 149         /* Copy the base sizes of the home zone */
 150         zp.min0    = cell_ns_x0[dim];
 151         zp.max1    = cell_ns_x1[dim];
 152         zp.min1    = cell_ns_x1[dim];
 153         zp.mch0    = cell_ns_x0[dim];
 154         zp.mch1    = cell_ns_x1[dim];
 155         zp.p1_0    = cell_ns_x0[dim];
 156         zp.p1_1    = cell_ns_x1[dim];
 157         zp.dataSet = 1;
 158     }
 159
 160     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 161
 162     /* Loop backward over the dimensions and aggregate the extremes
 163      * of the cell sizes.
 164      */
 165     for (int d = dd->ndim - 2; d >= 0; d--)
 166     {
 167         const int  dim      = dd->dim[d];
 168         const bool applyPbc = (dim < ddbox->npbcdim);
 169
 170         /* Use an rvec to store two reals */
 171         extr_s[d][0] = cellsizes[d + 1].fracLower;
 172         extr_s[d][1] = cellsizes[d + 1].fracUpper;
 173         extr_s[d][2] = cellsizes[d + 1].fracUpper;
 174
 175         int pos = 0;
 176         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
 177         /* Store the extremes in the backward sending buffer,
 178          * so they get updated separately from the forward communication.
 179          */
 180         for (int d1 = d; d1 < dd->ndim-1; d1++)
 181         {
 182             gmx_ddzone_t &buf = buf_s[pos];
 183
 184             /* We invert the order to be able to use the same loop for buf_e */
 185             buf.min0    = extr_s[d1][1];
 186             buf.max1    = extr_s[d1][0];
 187             buf.min1    = extr_s[d1][2];
 188             buf.mch0    = 0;
 189             buf.mch1    = 0;
 190             /* Store the cell corner of the dimension we communicate along */
 191             buf.p1_0    = comm->cell_x0[dim];
 192             buf.p1_1    = 0;
 193             buf.dataSet = 1;
 194             pos++;
 195         }
 196
 197         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 198         pos++;
 199
 200         if (dd->ndim == 3 && d == 0)
 201         {
 202             buf_s[pos] = comm->zone_d2[0][1];
 203             pos++;
 204             buf_s[pos] = comm->zone_d1[0];
 205             pos++;
 206         }
 207
 208         /* We only need to communicate the extremes
 209          * in the forward direction
 210          */
 211         int numPulses = comm->cd[d].numPulses();
 212         int numPulsesMin;
 213         if (applyPbc)
 214         {
 215             /* Take the minimum to avoid double communication */
 216             numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
 217         }
 218         else
 219         {
 220             /* Without PBC we should really not communicate over
 221              * the boundaries, but implementing that complicates
 222              * the communication setup and therefore we simply
 223              * do all communication, but ignore some data.
 224              */
 225             numPulsesMin = numPulses;
 226         }
 227         for (int pulse = 0; pulse < numPulsesMin; pulse++)
 228         {
 229             /* Communicate the extremes forward */
 230             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 231
 232             int  numElements      = dd->ndim - d - 1;
 233             ddSendrecv(dd, d, dddirForward,
 234                        extr_s + d, numElements,
 235                        extr_r + d, numElements);
 236
 237             if (receiveValidData)
 238             {
 239                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
 240                 {
 241                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
 242                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
 243                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
 244                 }
 245             }
 246         }
 247
 248         const int numElementsInBuffer = pos;
 249         for (int pulse = 0; pulse < numPulses; pulse++)
 250         {
 251             /* Communicate all the zone information backward */
 252             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
 253
 254             static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 255
 256             int numReals = numElementsInBuffer*c_ddzoneNumReals;
 257             ddSendrecv(dd, d, dddirBackward,
 258                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
 259                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
 260
 261             rvec dh = { 0 };
 262             if (pulse > 0)
 263             {
 264                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
 265                 {
 266                     /* Determine the decrease of maximum required
 267                      * communication height along d1 due to the distance along d,
 268                      * this avoids a lot of useless atom communication.
 269                      */
 270                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 271
 272                     real c;
 273                     if (ddbox->tric_dir[dim])
 274                     {
 275                         /* c is the off-diagonal coupling between the cell planes
 276                          * along directions d and d1.
 277                          */
 278                         c = ddbox->v[dim][dd->dim[d1]][dim];
 279                     }
 280                     else
 281                     {
 282                         c = 0;
 283                     }
 284                     real det = (1 + c*c)*gmx::square(comm->systemInfo.cutoff) - dist_d*dist_d;
 285                     if (det > 0)
 286                     {
 287                         dh[d1] = comm->systemInfo.cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
 288                     }
 289                     else
 290                     {
 291                         /* A negative value signals out of range */
 292                         dh[d1] = -1;
 293                     }
 294                 }
 295             }
 296
 297             /* Accumulate the extremes over all pulses */
 298             for (int i = 0; i < numElementsInBuffer; i++)
 299             {
 300                 if (pulse == 0)
 301                 {
 302                     buf_e[i] = buf_r[i];
 303                 }
 304                 else
 305                 {
 306                     if (receiveValidData)
 307                     {
 308                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
 309                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
 310                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
 311                     }
 312
 313                     int d1;
 314                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
 315                     {
 316                         d1 = 1;
 317                     }
 318                     else
 319                     {
 320                         d1 = d + 1;
 321                     }
 322                     if (receiveValidData && dh[d1] >= 0)
 323                     {
 324                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 325                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 326                     }
 327                 }
 328                 /* Copy the received buffer to the send buffer,
 329                  * to pass the data through with the next pulse.
 330                  */
 331                 buf_s[i] = buf_r[i];
 332             }
 333             if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
 334                 (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
 335             {
 336                 /* Store the extremes */
 337                 int pos = 0;
 338
 339                 for (int d1 = d; d1 < dd->ndim-1; d1++)
 340                 {
 341                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
 342                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
 343                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
 344                     pos++;
 345                 }
 346
 347                 if (d == 1 || (d == 0 && dd->ndim == 3))
 348                 {
 349                     for (int i = d; i < 2; i++)
 350                     {
 351                         comm->zone_d2[1-d][i] = buf_e[pos];
 352                         pos++;
 353                     }
 354                 }
 355                 if (d == 0)
 356                 {
 357                     comm->zone_d1[1] = buf_e[pos];
 358                     pos++;
 359                 }
 360             }
 361             else
 362             {
 363                 if (d == 1 || (d == 0 && dd->ndim == 3))
 364                 {
 365                     for (int i = d; i < 2; i++)
 366                     {
 367                         comm->zone_d2[1 - d][i].dataSet = 0;
 368                     }
 369                 }
 370                 if (d == 0)
 371                 {
 372                     comm->zone_d1[1].dataSet = 0;
 373                 }
 374             }
 375         }
 376     }
 377
 378     if (dd->ndim >= 2)
 379     {
 380         int dim = dd->dim[1];
 381         for (int i = 0; i < 2; i++)
 382         {
 383             if (comm->zone_d1[i].dataSet != 0)
 384             {
 385                 if (debug)
 386                 {
 387                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 388                 }
 389                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 390                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 391             }
 392         }
 393     }
 394     if (dd->ndim >= 3)
 395     {
 396         int dim = dd->dim[2];
 397         for (int i = 0; i < 2; i++)
 398         {
 399             for (int j = 0; j < 2; j++)
 400             {
 401                 if (comm->zone_d2[i][j].dataSet != 0)
 402                 {
 403                     if (debug)
 404                     {
 405                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 406                     }
 407                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 408                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 409                 }
 410             }
 411         }
 412     }
 413     for (int d = 1; d < dd->ndim; d++)
 414     {
 415         cellsizes[d].fracLowerMax = extr_s[d-1][0];
 416         cellsizes[d].fracUpperMin = extr_s[d-1][1];
 417         if (debug)
 418         {
 419             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 420                     d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
 421         }
 422     }
 423 }
 424
 425 //! Sets the charge-group zones to be equal to the home zone.
 426 static void set_zones_ncg_home(gmx_domdec_t *dd)
 427 {
 428     gmx_domdec_zones_t *zones;
 429     int                 i;
 430
 431     zones = &dd->comm->zones;
 432
 433     zones->cg_range[0] = 0;
 434     for (i = 1; i < zones->n+1; i++)
 435     {
 436         zones->cg_range[i] = dd->ncg_home;
 437     }
 438     /* zone_ncg1[0] should always be equal to ncg_home */
 439     dd->comm->zone_ncg1[0] = dd->ncg_home;
 440 }
 441
 442 //! Restore atom groups for the charge groups.
 443 static void restoreAtomGroups(gmx_domdec_t  *dd,
 444                               const t_state *state)
 445 {
 446     gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;
 447
 448     std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
 449
 450     globalAtomGroupIndices.resize(atomGroupsState.size());
 451
 452     /* Copy back the global charge group indices from state
 453      * and rebuild the local charge group to atom index.
 454      */
 455     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
 456     {
 457         globalAtomGroupIndices[i] = atomGroupsState[i];
 458     }
 459
 460     dd->ncg_home = atomGroupsState.size();
 461     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
 462
 463     set_zones_ncg_home(dd);
 464 }
 465
 466 //! Sets the cginfo structures.
 467 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
 468                           t_forcerec *fr)
 469 {
 470     if (fr != nullptr)
 471     {
 472         const cginfo_mb_t *cginfo_mb = fr->cginfo_mb;
 473         gmx::ArrayRef<int> cginfo    = fr->cginfo;
 474
 475         for (int cg = cg0; cg < cg1; cg++)
 476         {
 477             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 478         }
 479     }
 480 }
 481
 482 //! Makes the mappings between global and local atom indices during DD repartioning.
 483 static void make_dd_indices(gmx_domdec_t *dd,
 484                             const int     atomStart)
 485 {
 486     const int                 numZones               = dd->comm->zones.n;
 487     const int                *zone2cg                = dd->comm->zones.cg_range;
 488     const int                *zone_ncg1              = dd->comm->zone_ncg1;
 489     gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
 490
 491     std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
 492     gmx_ga2la_t              &ga2la                  = *dd->ga2la;
 493
 494     if (zone2cg[1] != dd->ncg_home)
 495     {
 496         gmx_incons("dd->ncg_zone is not up to date");
 497     }
 498
 499     /* Make the local to global and global to local atom index */
 500     int a = atomStart;
 501     globalAtomIndices.resize(a);
 502     for (int zone = 0; zone < numZones; zone++)
 503     {
 504         int cg0;
 505         if (zone == 0)
 506         {
 507             cg0 = atomStart;
 508         }
 509         else
 510         {
 511             cg0 = zone2cg[zone];
 512         }
 513         int cg1    = zone2cg[zone+1];
 514         int cg1_p1 = cg0 + zone_ncg1[zone];
 515
 516         for (int cg = cg0; cg < cg1; cg++)
 517         {
 518             int zone1 = zone;
 519             if (cg >= cg1_p1)
 520             {
 521                 /* Signal that this cg is from more than one pulse away */
 522                 zone1 += numZones;
 523             }
 524             int cg_gl = globalAtomGroupIndices[cg];
 525             globalAtomIndices.push_back(cg_gl);
 526             ga2la.insert(cg_gl, {a, zone1});
 527             a++;
 528         }
 529     }
 530 }
 531
 532 //! Checks whether global and local atom indices are consistent.
 533 static void check_index_consistency(const gmx_domdec_t *dd,
 534                                     int                 natoms_sys,
 535                                     const char         *where)
 536 {
 537     int       nerr = 0;
 538
 539     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 540
 541     if (dd->comm->ddSettings.DD_debug > 1)
 542     {
 543         std::vector<int> have(natoms_sys);
 544         for (int a = 0; a < numAtomsInZones; a++)
 545         {
 546             int globalAtomIndex = dd->globalAtomIndices[a];
 547             if (have[globalAtomIndex] > 0)
 548             {
 549                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
 550             }
 551             else
 552             {
 553                 have[globalAtomIndex] = a + 1;
 554             }
 555         }
 556     }
 557
 558     std::vector<int> have(numAtomsInZones);
 559
 560     int              ngl = 0;
 561     for (int i = 0; i < natoms_sys; i++)
 562     {
 563         if (const auto entry = dd->ga2la->find(i))
 564         {
 565             const int a = entry->la;
 566             if (a >= numAtomsInZones)
 567             {
 568                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
 569                 nerr++;
 570             }
 571             else
 572             {
 573                 have[a] = 1;
 574                 if (dd->globalAtomIndices[a] != i)
 575                 {
 576                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
 577                     nerr++;
 578                 }
 579             }
 580             ngl++;
 581         }
 582     }
 583     if (ngl != numAtomsInZones)
 584     {
 585         fprintf(stderr,
 586                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
 587                 dd->rank, where, ngl, numAtomsInZones);
 588     }
 589     for (int a = 0; a < numAtomsInZones; a++)
 590     {
 591         if (have[a] == 0)
 592         {
 593             fprintf(stderr,
 594                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
 595                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
 596         }
 597     }
 598
 599     if (nerr > 0)
 600     {
 601         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
 602                   dd->rank, where, nerr);
 603     }
 604 }
 605
 606 //! Clear all DD global state indices
 607 static void clearDDStateIndices(gmx_domdec_t *dd,
 608                                 const bool    keepLocalAtomIndices)
 609 {
 610     gmx_ga2la_t &ga2la = *dd->ga2la;
 611
 612     if (!keepLocalAtomIndices)
 613     {
 614         /* Clear the whole list without the overhead of searching */
 615         ga2la.clear();
 616     }
 617     else
 618     {
 619         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 620         for (int i = 0; i < numAtomsInZones; i++)
 621         {
 622             ga2la.erase(dd->globalAtomIndices[i]);
 623         }
 624     }
 625
 626     dd_clear_local_vsite_indices(dd);
 627
 628     if (dd->constraints)
 629     {
 630         dd_clear_local_constraint_indices(dd);
 631     }
 632 }
 633
 634 bool check_grid_jump(int64_t             step,
 635                      const gmx_domdec_t *dd,
 636                      real                cutoff,
 637                      const gmx_ddbox_t  *ddbox,
 638                      gmx_bool            bFatal)
 639 {
 640     gmx_domdec_comm_t *comm    = dd->comm;
 641     bool               invalid = false;
 642
 643     for (int d = 1; d < dd->ndim; d++)
 644     {
 645         const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
 646         const int                 dim       = dd->dim[d];
 647         const real                limit     = grid_jump_limit(comm, cutoff, d);
 648         real                      bfac      = ddbox->box_size[dim];
 649         if (ddbox->tric_dir[dim])
 650         {
 651             bfac *= ddbox->skew_fac[dim];
 652         }
 653         if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
 654             (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
 655         {
 656             invalid = true;
 657
 658             if (bFatal)
 659             {
 660                 char buf[22];
 661
 662                 /* This error should never be triggered under normal
 663                  * circumstances, but you never know ...
 664                  */
 665                 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
 666                           gmx_step_str(step, buf),
 667                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 668             }
 669         }
 670     }
 671
 672     return invalid;
 673 }
 674 //! Return the duration of force calculations on this rank.
 675 static float dd_force_load(gmx_domdec_comm_t *comm)
 676 {
 677     float load;
 678
 679     if (comm->ddSettings.eFlop)
 680     {
 681         load = comm->flop;
 682         if (comm->ddSettings.eFlop > 1)
 683         {
 684             load *= 1.0 + (comm->ddSettings.eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 685         }
 686     }
 687     else
 688     {
 689         load = comm->cycl[ddCyclF];
 690         if (comm->cycl_n[ddCyclF] > 1)
 691         {
 692             /* Subtract the maximum of the last n cycle counts
 693              * to get rid of possible high counts due to other sources,
 694              * for instance system activity, that would otherwise
 695              * affect the dynamic load balancing.
 696              */
 697             load -= comm->cycl_max[ddCyclF];
 698         }
 699
 700 #if GMX_MPI
 701         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 702         {
 703             float gpu_wait, gpu_wait_sum;
 704
 705             gpu_wait = comm->cycl[ddCyclWaitGPU];
 706             if (comm->cycl_n[ddCyclF] > 1)
 707             {
 708                 /* We should remove the WaitGPU time of the same MD step
 709                  * as the one with the maximum F time, since the F time
 710                  * and the wait time are not independent.
 711                  * Furthermore, the step for the max F time should be chosen
 712                  * the same on all ranks that share the same GPU.
 713                  * But to keep the code simple, we remove the average instead.
 714                  * The main reason for artificially long times at some steps
 715                  * is spurious CPU activity or MPI time, so we don't expect
 716                  * that changes in the GPU wait time matter a lot here.
 717                  */
 718                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
 719             }
 720             /* Sum the wait times over the ranks that share the same GPU */
 721             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
 722                           comm->mpi_comm_gpu_shared);
 723             /* Replace the wait time by the average over the ranks */
 724             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
 725         }
 726 #endif
 727     }
 728
 729     return load;
 730 }
 731
 732 //! Runs cell size checks and communicates the boundaries.
 733 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 734                                   gmx_ddbox_t *ddbox,
 735                                   rvec cell_ns_x0, rvec cell_ns_x1,
 736                                   int64_t step)
 737 {
 738     gmx_domdec_comm_t *comm;
 739     int                dim_ind, dim;
 740
 741     comm = dd->comm;
 742
 743     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 744     {
 745         dim = dd->dim[dim_ind];
 746
 747         /* Without PBC we don't have restrictions on the outer cells */
 748         if (!(dim >= ddbox->npbcdim &&
 749               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 750             isDlbOn(comm) &&
 751             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 752             comm->cellsize_min[dim])
 753         {
 754             char buf[22];
 755             gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 756                       gmx_step_str(step, buf), dim2char(dim),
 757                       comm->cell_x1[dim] - comm->cell_x0[dim],
 758                       ddbox->skew_fac[dim],
 759                       dd->comm->cellsize_min[dim],
 760                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 761         }
 762     }
 763
 764     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 765     {
 766         /* Communicate the boundaries and update cell_ns_x0/1 */
 767         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 768         if (isDlbOn(dd->comm) && dd->ndim > 1)
 769         {
 770             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
 771         }
 772     }
 773 }
 774
 775 //! Compute and communicate to determine the load distribution across PP ranks.
 776 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 777 {
 778     gmx_domdec_comm_t *comm;
 779     domdec_load_t     *load;
 780     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 781     gmx_bool           bSepPME;
 782
 783     if (debug)
 784     {
 785         fprintf(debug, "get_load_distribution start\n");
 786     }
 787
 788     wallcycle_start(wcycle, ewcDDCOMMLOAD);
 789
 790     comm = dd->comm;
 791
 792     bSepPME = (dd->pme_nodeid >= 0);
 793
 794     if (dd->ndim == 0 && bSepPME)
 795     {
 796         /* Without decomposition, but with PME nodes, we need the load */
 797         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
 798         comm->load[0].pme = comm->cycl[ddCyclPME];
 799     }
 800
 801     for (int d = dd->ndim - 1; d >= 0; d--)
 802     {
 803         const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
 804         const int                 dim       = dd->dim[d];
 805         /* Check if we participate in the communication in this dimension */
 806         if (d == dd->ndim-1 ||
 807             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 808         {
 809             load = &comm->load[d];
 810             if (isDlbOn(dd->comm))
 811             {
 812                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
 813             }
 814             int pos = 0;
 815             if (d == dd->ndim-1)
 816             {
 817                 sbuf[pos++] = dd_force_load(comm);
 818                 sbuf[pos++] = sbuf[0];
 819                 if (isDlbOn(dd->comm))
 820                 {
 821                     sbuf[pos++] = sbuf[0];
 822                     sbuf[pos++] = cell_frac;
 823                     if (d > 0)
 824                     {
 825                         sbuf[pos++] = cellsizes->fracLowerMax;
 826                         sbuf[pos++] = cellsizes->fracUpperMin;
 827                     }
 828                 }
 829                 if (bSepPME)
 830                 {
 831                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 832                     sbuf[pos++] = comm->cycl[ddCyclPME];
 833                 }
 834             }
 835             else
 836             {
 837                 sbuf[pos++] = comm->load[d+1].sum;
 838                 sbuf[pos++] = comm->load[d+1].max;
 839                 if (isDlbOn(dd->comm))
 840                 {
 841                     sbuf[pos++] = comm->load[d+1].sum_m;
 842                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 843                     sbuf[pos++] = comm->load[d+1].flags;
 844                     if (d > 0)
 845                     {
 846                         sbuf[pos++] = cellsizes->fracLowerMax;
 847                         sbuf[pos++] = cellsizes->fracUpperMin;
 848                     }
 849                 }
 850                 if (bSepPME)
 851                 {
 852                     sbuf[pos++] = comm->load[d+1].mdf;
 853                     sbuf[pos++] = comm->load[d+1].pme;
 854                 }
 855             }
 856             load->nload = pos;
 857             /* Communicate a row in DD direction d.
 858              * The communicators are setup such that the root always has rank 0.
 859              */
 860 #if GMX_MPI
 861             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 862                        load->load, load->nload*sizeof(float), MPI_BYTE,
 863                        0, comm->mpi_comm_load[d]);
 864 #endif
 865             if (dd->ci[dim] == dd->master_ci[dim])
 866             {
 867                 /* We are the master along this row, process this row */
 868                 RowMaster *rowMaster = nullptr;
 869
 870                 if (isDlbOn(comm))
 871                 {
 872                     rowMaster = cellsizes->rowMaster.get();
 873                 }
 874                 load->sum      = 0;
 875                 load->max      = 0;
 876                 load->sum_m    = 0;
 877                 load->cvol_min = 1;
 878                 load->flags    = 0;
 879                 load->mdf      = 0;
 880                 load->pme      = 0;
 881                 int pos        = 0;
 882                 for (int i = 0; i < dd->nc[dim]; i++)
 883                 {
 884                     load->sum += load->load[pos++];
 885                     load->max  = std::max(load->max, load->load[pos]);
 886                     pos++;
 887                     if (isDlbOn(dd->comm))
 888                     {
 889                         if (rowMaster->dlbIsLimited)
 890                         {
 891                             /* This direction could not be load balanced properly,
 892                              * therefore we need to use the maximum iso the average load.
 893                              */
 894                             load->sum_m = std::max(load->sum_m, load->load[pos]);
 895                         }
 896                         else
 897                         {
 898                             load->sum_m += load->load[pos];
 899                         }
 900                         pos++;
 901                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
 902                         pos++;
 903                         if (d < dd->ndim-1)
 904                         {
 905                             load->flags = gmx::roundToInt(load->load[pos++]);
 906                         }
 907                         if (d > 0)
 908                         {
 909                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
 910                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
 911                         }
 912                     }
 913                     if (bSepPME)
 914                     {
 915                         load->mdf = std::max(load->mdf, load->load[pos]);
 916                         pos++;
 917                         load->pme = std::max(load->pme, load->load[pos]);
 918                         pos++;
 919                     }
 920                 }
 921                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
 922                 {
 923                     load->sum_m *= dd->nc[dim];
 924                     load->flags |= (1<<d);
 925                 }
 926             }
 927         }
 928     }
 929
 930     if (DDMASTER(dd))
 931     {
 932         comm->nload      += dd_load_count(comm);
 933         comm->load_step  += comm->cycl[ddCyclStep];
 934         comm->load_sum   += comm->load[0].sum;
 935         comm->load_max   += comm->load[0].max;
 936         if (isDlbOn(comm))
 937         {
 938             for (int d = 0; d < dd->ndim; d++)
 939             {
 940                 if (comm->load[0].flags & (1<<d))
 941                 {
 942                     comm->load_lim[d]++;
 943                 }
 944             }
 945         }
 946         if (bSepPME)
 947         {
 948             comm->load_mdf += comm->load[0].mdf;
 949             comm->load_pme += comm->load[0].pme;
 950         }
 951     }
 952
 953     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 954
 955     if (debug)
 956     {
 957         fprintf(debug, "get_load_distribution finished\n");
 958     }
 959 }
 960
 961 /*! \brief Return the relative performance loss on the total run time
 962  * due to the force calculation load imbalance. */
 963 static float dd_force_load_fraction(gmx_domdec_t *dd)
 964 {
 965     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 966     {
 967         return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
 968     }
 969     else
 970     {
 971         return 0;
 972     }
 973 }
 974
 975 /*! \brief Return the relative performance loss on the total run time
 976  * due to the force calculation load imbalance. */
 977 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 978 {
 979     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
 980     {
 981         return
 982             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 983             (dd->comm->load_step*dd->nnodes);
 984     }
 985     else
 986     {
 987         return 0;
 988     }
 989 }
 990
 991 //! Print load-balance report e.g. at the end of a run.
 992 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 993 {
 994     gmx_domdec_comm_t *comm = dd->comm;
 995
 996     /* Only the master rank prints loads and only if we measured loads */
 997     if (!DDMASTER(dd) || comm->nload == 0)
 998     {
 999         return;
1000     }
1001
1002     char  buf[STRLEN];
1003     int   numPpRanks   = dd->nnodes;
1004     int   numPmeRanks  = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
1005     int   numRanks     = numPpRanks + numPmeRanks;
1006     float lossFraction = 0;
1007
1008     /* Print the average load imbalance and performance loss */
1009     if (dd->nnodes > 1 && comm->load_sum > 0)
1010     {
1011         float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
1012         lossFraction    = dd_force_imb_perf_loss(dd);
1013
1014         std::string msg         = "\nDynamic load balancing report:\n";
1015         std::string dlbStateStr;
1016
1017         switch (dd->comm->dlbState)
1018         {
1019             case DlbState::offUser:
1020                 dlbStateStr = "DLB was off during the run per user request.";
1021                 break;
1022             case DlbState::offForever:
1023                 /* Currectly this can happen due to performance loss observed, cell size
1024                  * limitations or incompatibility with other settings observed during
1025                  * determineInitialDlbState(). */
1026                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1027                 break;
1028             case DlbState::offCanTurnOn:
1029                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1030                 break;
1031             case DlbState::offTemporarilyLocked:
1032                 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1033                 break;
1034             case DlbState::onCanTurnOff:
1035                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1036                 break;
1037             case DlbState::onUser:
1038                 dlbStateStr = "DLB was permanently on during the run per user request.";
1039                 break;
1040             default:
1041                 GMX_ASSERT(false, "Undocumented DLB state");
1042         }
1043
1044         msg += " " + dlbStateStr + "\n";
1045         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
1046         msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1047                                  gmx::roundToInt(dd_force_load_fraction(dd)*100));
1048         msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1049                                  lossFraction*100);
1050         fprintf(fplog, "%s", msg.c_str());
1051         fprintf(stderr, "\n%s", msg.c_str());
1052     }
1053
1054     /* Print during what percentage of steps the  load balancing was limited */
1055     bool dlbWasLimited = false;
1056     if (isDlbOn(comm))
1057     {
1058         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1059         for (int d = 0; d < dd->ndim; d++)
1060         {
1061             int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
1062             sprintf(buf+strlen(buf), " %c %d %%",
1063                     dim2char(dd->dim[d]), limitPercentage);
1064             if (limitPercentage >= 50)
1065             {
1066                 dlbWasLimited = true;
1067             }
1068         }
1069         sprintf(buf + strlen(buf), "\n");
1070         fprintf(fplog, "%s", buf);
1071         fprintf(stderr, "%s", buf);
1072     }
1073
1074     /* Print the performance loss due to separate PME - PP rank imbalance */
1075     float lossFractionPme = 0;
1076     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1077     {
1078         float pmeForceRatio = comm->load_pme/comm->load_mdf;
1079         lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
1080         if (lossFractionPme <= 0)
1081         {
1082             lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
1083         }
1084         else
1085         {
1086             lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
1087         }
1088         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1089         fprintf(fplog, "%s", buf);
1090         fprintf(stderr, "%s", buf);
1091         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
1092         fprintf(fplog, "%s", buf);
1093         fprintf(stderr, "%s", buf);
1094     }
1095     fprintf(fplog, "\n");
1096     fprintf(stderr, "\n");
1097
1098     if (lossFraction >= DD_PERF_LOSS_WARN)
1099     {
1100         std::string message = gmx::formatString(
1101                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1102                     "      in the domain decomposition.\n", lossFraction*100);
1103
1104         bool hadSuggestion = false;
1105         if (!isDlbOn(comm))
1106         {
1107             message      += "      You might want to use dynamic load balancing (option -dlb.)\n";
1108             hadSuggestion = true;
1109         }
1110         else if (dlbWasLimited)
1111         {
1112             message      += "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1113             hadSuggestion = true;
1114         }
1115         message += gmx::formatString(
1116                     "      You can %sconsider manually changing the decomposition (option -dd);\n"
1117                     "      e.g. by using fewer domains along the box dimension in which there is\n"
1118                     "      considerable inhomogeneity in the simulated system.",
1119                     hadSuggestion ? "also " : "");
1120
1121
1122         fprintf(fplog, "%s\n", message.c_str());
1123         fprintf(stderr, "%s\n", message.c_str());
1124     }
1125     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1126     {
1127         sprintf(buf,
1128                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1129                 "      had %s work to do than the PP ranks.\n"
1130                 "      You might want to %s the number of PME ranks\n"
1131                 "      or %s the cut-off and the grid spacing.\n",
1132                 std::fabs(lossFractionPme*100),
1133                 (lossFractionPme < 0) ? "less"     : "more",
1134                 (lossFractionPme < 0) ? "decrease" : "increase",
1135                 (lossFractionPme < 0) ? "decrease" : "increase");
1136         fprintf(fplog, "%s\n", buf);
1137         fprintf(stderr, "%s\n", buf);
1138     }
1139 }
1140
1141 //! Return the minimum communication volume.
1142 static float dd_vol_min(gmx_domdec_t *dd)
1143 {
1144     return dd->comm->load[0].cvol_min*dd->nnodes;
1145 }
1146
1147 //! Return the DD load flags.
1148 static int dd_load_flags(gmx_domdec_t *dd)
1149 {
1150     return dd->comm->load[0].flags;
1151 }
1152
1153 //! Return the reported load imbalance in force calculations.
1154 static float dd_f_imbal(gmx_domdec_t *dd)
1155 {
1156     if (dd->comm->load[0].sum > 0)
1157     {
1158         return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0F;
1159     }
1160     else
1161     {
1162         /* Something is wrong in the cycle counting, report no load imbalance */
1163         return 0.0F;
1164     }
1165 }
1166
1167 //! Returns DD load balance report.
1168 static std::string
1169 dd_print_load(gmx_domdec_t *dd,
1170               int64_t       step)
1171 {
1172     gmx::StringOutputStream stream;
1173     gmx::TextWriter         log(&stream);
1174
1175     int                     flags = dd_load_flags(dd);
1176     if (flags)
1177     {
1178         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1179         for (int d = 0; d < dd->ndim; d++)
1180         {
1181             if (flags & (1<<d))
1182             {
1183                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1184             }
1185         }
1186         log.ensureLineBreak();
1187     }
1188     log.writeString("DD  step " + gmx::toString(step));
1189     if (isDlbOn(dd->comm))
1190     {
1191         log.writeStringFormatted("  vol min/aver %5.3f%c",
1192                                  dd_vol_min(dd), flags ? '!' : ' ');
1193     }
1194     if (dd->nnodes > 1)
1195     {
1196         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
1197     }
1198     if (dd->comm->cycl_n[ddCyclPME])
1199     {
1200         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1201     }
1202     log.ensureLineBreak();
1203     return stream.toString();
1204 }
1205
1206 //! Prints DD load balance report in mdrun verbose mode.
1207 static void dd_print_load_verbose(gmx_domdec_t *dd)
1208 {
1209     if (isDlbOn(dd->comm))
1210     {
1211         fprintf(stderr, "vol %4.2f%c ",
1212                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1213     }
1214     if (dd->nnodes > 1)
1215     {
1216         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
1217     }
1218     if (dd->comm->cycl_n[ddCyclPME])
1219     {
1220         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1221     }
1222 }
1223
1224 //! Turns on dynamic load balancing if possible and needed.
1225 static void turn_on_dlb(const gmx::MDLogger &mdlog,
1226                         gmx_domdec_t        *dd,
1227                         int64_t              step)
1228 {
1229     gmx_domdec_comm_t *comm         = dd->comm;
1230
1231     real               cellsize_min = comm->cellsize_min[dd->dim[0]];
1232     for (int d = 1; d < dd->ndim; d++)
1233     {
1234         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1235     }
1236
1237     /* Turn off DLB if we're too close to the cell size limit. */
1238     if (cellsize_min < comm->cellsize_limit*1.05)
1239     {
1240         GMX_LOG(mdlog.info).appendTextFormatted(
1241                 "step %s Measured %.1f %% performance loss due to load imbalance, "
1242                 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1243                 "Will no longer try dynamic load balancing.",
1244                 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1245
1246         comm->dlbState = DlbState::offForever;
1247         return;
1248     }
1249
1250     GMX_LOG(mdlog.info).appendTextFormatted(
1251             "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1252             gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1253     comm->dlbState = DlbState::onCanTurnOff;
1254
1255     /* Store the non-DLB performance, so we can check if DLB actually
1256      * improves performance.
1257      */
1258     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
1259     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
1260
1261     set_dlb_limits(dd);
1262
1263     /* We can set the required cell size info here,
1264      * so we do not need to communicate this.
1265      * The grid is completely uniform.
1266      */
1267     for (int d = 0; d < dd->ndim; d++)
1268     {
1269         RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1270
1271         if (rowMaster)
1272         {
1273             comm->load[d].sum_m = comm->load[d].sum;
1274
1275             int nc = dd->nc[dd->dim[d]];
1276             for (int i = 0; i < nc; i++)
1277             {
1278                 rowMaster->cellFrac[i] = i/static_cast<real>(nc);
1279                 if (d > 0)
1280                 {
1281                     rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
1282                     rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
1283                 }
1284             }
1285             rowMaster->cellFrac[nc] = 1.0;
1286         }
1287     }
1288 }
1289
1290 //! Turns off dynamic load balancing (but leave it able to turn back on).
1291 static void turn_off_dlb(const gmx::MDLogger &mdlog,
1292                          gmx_domdec_t        *dd,
1293                          int64_t              step)
1294 {
1295     GMX_LOG(mdlog.info).appendText(
1296             "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
1297     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1298     dd->comm->haveTurnedOffDlb             = true;
1299     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1300 }
1301
1302 //! Turns off dynamic load balancing permanently.
1303 static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
1304                                  gmx_domdec_t        *dd,
1305                                  int64_t              step)
1306 {
1307     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
1308     GMX_LOG(mdlog.info).appendText(
1309             "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
1310     dd->comm->dlbState = DlbState::offForever;
1311 }
1312
1313 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
1314 {
1315     gmx_domdec_comm_t *comm;
1316
1317     comm = cr->dd->comm;
1318
1319     /* Turn on the DLB limiting (might have been on already) */
1320     comm->bPMELoadBalDLBLimits = TRUE;
1321
1322     /* Change the cut-off limit */
1323     comm->PMELoadBal_max_cutoff = cutoff;
1324
1325     if (debug)
1326     {
1327         fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1328                 comm->PMELoadBal_max_cutoff);
1329     }
1330 }
1331
1332 //! Merge atom buffers.
1333 static void merge_cg_buffers(int ncell,
1334                              gmx_domdec_comm_dim_t *cd, int pulse,
1335                              int  *ncg_cell,
1336                              gmx::ArrayRef<int> index_gl,
1337                              const int  *recv_i,
1338                              gmx::ArrayRef<gmx::RVec> x,
1339                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1340                              cginfo_mb_t *cginfo_mb,
1341                              gmx::ArrayRef<int> cginfo)
1342 {
1343     gmx_domdec_ind_t *ind, *ind_p;
1344     int               p, cell, c, cg, cg0, cg1, cg_gl;
1345     int               shift;
1346
1347     ind = &cd->ind[pulse];
1348
1349     /* First correct the already stored data */
1350     shift = ind->nrecv[ncell];
1351     for (cell = ncell-1; cell >= 0; cell--)
1352     {
1353         shift -= ind->nrecv[cell];
1354         if (shift > 0)
1355         {
1356             /* Move the cg's present from previous grid pulses */
1357             cg0                = ncg_cell[ncell+cell];
1358             cg1                = ncg_cell[ncell+cell+1];
1359             for (cg = cg1-1; cg >= cg0; cg--)
1360             {
1361                 index_gl[cg + shift] = index_gl[cg];
1362                 x[cg + shift]        = x[cg];
1363                 cginfo[cg + shift]   = cginfo[cg];
1364             }
1365             /* Correct the already stored send indices for the shift */
1366             for (p = 1; p <= pulse; p++)
1367             {
1368                 ind_p = &cd->ind[p];
1369                 cg0   = 0;
1370                 for (c = 0; c < cell; c++)
1371                 {
1372                     cg0 += ind_p->nsend[c];
1373                 }
1374                 cg1 = cg0 + ind_p->nsend[cell];
1375                 for (cg = cg0; cg < cg1; cg++)
1376                 {
1377                     ind_p->index[cg] += shift;
1378                 }
1379             }
1380         }
1381     }
1382
1383     /* Merge in the communicated buffers */
1384     shift    = 0;
1385     cg0      = 0;
1386     for (cell = 0; cell < ncell; cell++)
1387     {
1388         cg1 = ncg_cell[ncell+cell+1] + shift;
1389         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1390         {
1391             /* Copy this atom from the buffer */
1392             index_gl[cg1] = recv_i[cg0];
1393             x[cg1]        = recv_vr[cg0];
1394             /* Copy information */
1395             cg_gl          = index_gl[cg1];
1396             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
1397             cg0++;
1398             cg1++;
1399         }
1400         shift                 += ind->nrecv[cell];
1401         ncg_cell[ncell+cell+1] = cg1;
1402     }
1403 }
1404
1405 //! Makes a range partitioning for the atom groups wthin a cell
1406 static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
1407                                int                           nzone,
1408                                int                           atomGroupStart)
1409 {
1410     /* Store the atom block boundaries for easy copying of communication buffers
1411      */
1412     int g = atomGroupStart;
1413     for (int zone = 0; zone < nzone; zone++)
1414     {
1415         for (gmx_domdec_ind_t &ind : cd->ind)
1416         {
1417             ind.cell2at0[zone]  = g;
1418             g                  += ind.nrecv[zone];
1419             ind.cell2at1[zone]  = g;
1420         }
1421     }
1422 }
1423
1424 //! Returns whether a link is missing.
1425 static gmx_bool missing_link(const t_blocka    &link,
1426                              const int          globalAtomIndex,
1427                              const gmx_ga2la_t &ga2la)
1428 {
1429     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1430     {
1431         if (!ga2la.findHome(link.a[i]))
1432         {
1433             return true;
1434         }
1435     }
1436
1437     return false;
1438 }
1439
1440 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1441 typedef struct {
1442     //! The corners for the non-bonded communication.
1443     real c[DIM][4];
1444     //! Corner for rounding.
1445     real cr0;
1446     //! Corners for rounding.
1447     real cr1[4];
1448     //! Corners for bounded communication.
1449     real bc[DIM];
1450     //! Corner for rounding for bonded communication.
1451     real bcr1;
1452 } dd_corners_t;
1453
1454 //! Determine the corners of the domain(s) we are communicating with.
1455 static void
1456 set_dd_corners(const gmx_domdec_t *dd,
1457                int dim0, int dim1, int dim2,
1458                gmx_bool bDistMB,
1459                dd_corners_t *c)
1460 {
1461     const gmx_domdec_comm_t  *comm;
1462     const gmx_domdec_zones_t *zones;
1463
1464     comm = dd->comm;
1465
1466     zones = &comm->zones;
1467
1468     /* Keep the compiler happy */
1469     c->cr0  = 0;
1470     c->bcr1 = 0;
1471
1472     /* The first dimension is equal for all cells */
1473     c->c[0][0] = comm->cell_x0[dim0];
1474     if (bDistMB)
1475     {
1476         c->bc[0] = c->c[0][0];
1477     }
1478     if (dd->ndim >= 2)
1479     {
1480         dim1 = dd->dim[1];
1481         /* This cell row is only seen from the first row */
1482         c->c[1][0] = comm->cell_x0[dim1];
1483         /* All rows can see this row */
1484         c->c[1][1] = comm->cell_x0[dim1];
1485         if (isDlbOn(dd->comm))
1486         {
1487             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1488             if (bDistMB)
1489             {
1490                 /* For the multi-body distance we need the maximum */
1491                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1492             }
1493         }
1494         /* Set the upper-right corner for rounding */
1495         c->cr0 = comm->cell_x1[dim0];
1496
1497         if (dd->ndim >= 3)
1498         {
1499             dim2 = dd->dim[2];
1500             for (int j = 0; j < 4; j++)
1501             {
1502                 c->c[2][j] = comm->cell_x0[dim2];
1503             }
1504             if (isDlbOn(dd->comm))
1505             {
1506                 /* Use the maximum of the i-cells that see a j-cell */
1507                 for (const auto &iZone : zones->iZones)
1508                 {
1509                     const int iZoneIndex = iZone.iZoneIndex;
1510                     for (int jZone : iZone.jZoneRange)
1511                     {
1512                         if (jZone >= 4)
1513                         {
1514                             c->c[2][jZone - 4] =
1515                                 std::max(c->c[2][jZone - 4],
1516                                          comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]].mch0);
1517                         }
1518                     }
1519                 }
1520                 if (bDistMB)
1521                 {
1522                     /* For the multi-body distance we need the maximum */
1523                     c->bc[2] = comm->cell_x0[dim2];
1524                     for (int i = 0; i < 2; i++)
1525                     {
1526                         for (int j = 0; j < 2; j++)
1527                         {
1528                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1529                         }
1530                     }
1531                 }
1532             }
1533
1534             /* Set the upper-right corner for rounding */
1535             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1536              * Only cell (0,0,0) can see cell 7 (1,1,1)
1537              */
1538             c->cr1[0] = comm->cell_x1[dim1];
1539             c->cr1[3] = comm->cell_x1[dim1];
1540             if (isDlbOn(dd->comm))
1541             {
1542                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1543                 if (bDistMB)
1544                 {
1545                     /* For the multi-body distance we need the maximum */
1546                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1547                 }
1548             }
1549         }
1550     }
1551 }
1552
1553 /*! \brief Add the atom groups we need to send in this pulse from this
1554  * zone to \p localAtomGroups and \p work. */
1555 static void
1556 get_zone_pulse_cgs(gmx_domdec_t *dd,
1557                    int zonei, int zone,
1558                    int cg0, int cg1,
1559                    gmx::ArrayRef<const int> globalAtomGroupIndices,
1560                    int dim, int dim_ind,
1561                    int dim0, int dim1, int dim2,
1562                    real r_comm2, real r_bcomm2,
1563                    matrix box,
1564                    bool distanceIsTriclinic,
1565                    rvec *normal,
1566                    real skew_fac2_d, real skew_fac_01,
1567                    rvec *v_d, rvec *v_0, rvec *v_1,
1568                    const dd_corners_t *c,
1569                    const rvec sf2_round,
1570                    gmx_bool bDistBonded,
1571                    gmx_bool bBondComm,
1572                    gmx_bool bDist2B,
1573                    gmx_bool bDistMB,
1574                    rvec *cg_cm,
1575                    gmx::ArrayRef<const int> cginfo,
1576                    std::vector<int>     *localAtomGroups,
1577                    dd_comm_setup_work_t *work)
1578 {
1579     gmx_domdec_comm_t *comm;
1580     gmx_bool           bScrew;
1581     gmx_bool           bDistMB_pulse;
1582     int                cg, i;
1583     real               r2, rb2, r, tric_sh;
1584     rvec               rn, rb;
1585     int                dimd;
1586     int                nsend_z, nat;
1587
1588     comm = dd->comm;
1589
1590     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1591
1592     bDistMB_pulse = (bDistMB && bDistBonded);
1593
1594     /* Unpack the work data */
1595     std::vector<int>       &ibuf = work->atomGroupBuffer;
1596     std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1597     nsend_z                      = 0;
1598     nat                          = work->nat;
1599
1600     for (cg = cg0; cg < cg1; cg++)
1601     {
1602         r2  = 0;
1603         rb2 = 0;
1604         if (!distanceIsTriclinic)
1605         {
1606             /* Rectangular direction, easy */
1607             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1608             if (r > 0)
1609             {
1610                 r2 += r*r;
1611             }
1612             if (bDistMB_pulse)
1613             {
1614                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1615                 if (r > 0)
1616                 {
1617                     rb2 += r*r;
1618                 }
1619             }
1620             /* Rounding gives at most a 16% reduction
1621              * in communicated atoms
1622              */
1623             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1624             {
1625                 r = cg_cm[cg][dim0] - c->cr0;
1626                 /* This is the first dimension, so always r >= 0 */
1627                 r2 += r*r;
1628                 if (bDistMB_pulse)
1629                 {
1630                     rb2 += r*r;
1631                 }
1632             }
1633             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1634             {
1635                 r = cg_cm[cg][dim1] - c->cr1[zone];
1636                 if (r > 0)
1637                 {
1638                     r2 += r*r;
1639                 }
1640                 if (bDistMB_pulse)
1641                 {
1642                     r = cg_cm[cg][dim1] - c->bcr1;
1643                     if (r > 0)
1644                     {
1645                         rb2 += r*r;
1646                     }
1647                 }
1648             }
1649         }
1650         else
1651         {
1652             /* Triclinic direction, more complicated */
1653             clear_rvec(rn);
1654             clear_rvec(rb);
1655             /* Rounding, conservative as the skew_fac multiplication
1656              * will slightly underestimate the distance.
1657              */
1658             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1659             {
1660                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1661                 for (i = dim0+1; i < DIM; i++)
1662                 {
1663                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1664                 }
1665                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1666                 if (bDistMB_pulse)
1667                 {
1668                     rb[dim0] = rn[dim0];
1669                     rb2      = r2;
1670                 }
1671                 /* Take care that the cell planes along dim0 might not
1672                  * be orthogonal to those along dim1 and dim2.
1673                  */
1674                 for (i = 1; i <= dim_ind; i++)
1675                 {
1676                     dimd = dd->dim[i];
1677                     if (normal[dim0][dimd] > 0)
1678                     {
1679                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1680                         if (bDistMB_pulse)
1681                         {
1682                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1683                         }
1684                     }
1685                 }
1686             }
1687             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1688             {
1689                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1690                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1691                 tric_sh   = 0;
1692                 for (i = dim1+1; i < DIM; i++)
1693                 {
1694                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1695                 }
1696                 rn[dim1] += tric_sh;
1697                 if (rn[dim1] > 0)
1698                 {
1699                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1700                     /* Take care of coupling of the distances
1701                      * to the planes along dim0 and dim1 through dim2.
1702                      */
1703                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1704                     /* Take care that the cell planes along dim1
1705                      * might not be orthogonal to that along dim2.
1706                      */
1707                     if (normal[dim1][dim2] > 0)
1708                     {
1709                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1710                     }
1711                 }
1712                 if (bDistMB_pulse)
1713                 {
1714                     rb[dim1] +=
1715                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1716                     if (rb[dim1] > 0)
1717                     {
1718                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1719                         /* Take care of coupling of the distances
1720                          * to the planes along dim0 and dim1 through dim2.
1721                          */
1722                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1723                         /* Take care that the cell planes along dim1
1724                          * might not be orthogonal to that along dim2.
1725                          */
1726                         if (normal[dim1][dim2] > 0)
1727                         {
1728                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1729                         }
1730                     }
1731                 }
1732             }
1733             /* The distance along the communication direction */
1734             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1735             tric_sh  = 0;
1736             for (i = dim+1; i < DIM; i++)
1737             {
1738                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1739             }
1740             rn[dim] += tric_sh;
1741             if (rn[dim] > 0)
1742             {
1743                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1744                 /* Take care of coupling of the distances
1745                  * to the planes along dim0 and dim1 through dim2.
1746                  */
1747                 if (dim_ind == 1 && zonei == 1)
1748                 {
1749                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1750                 }
1751             }
1752             if (bDistMB_pulse)
1753             {
1754                 clear_rvec(rb);
1755                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1756                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1757                 if (rb[dim] > 0)
1758                 {
1759                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1760                     /* Take care of coupling of the distances
1761                      * to the planes along dim0 and dim1 through dim2.
1762                      */
1763                     if (dim_ind == 1 && zonei == 1)
1764                     {
1765                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1766                     }
1767                 }
1768             }
1769         }
1770
1771         if (r2 < r_comm2 ||
1772             (bDistBonded &&
1773              ((bDistMB && rb2 < r_bcomm2) ||
1774               (bDist2B && r2  < r_bcomm2)) &&
1775              (!bBondComm ||
1776               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1777                missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg],
1778                             *dd->ga2la)))))
1779         {
1780             /* Store the local and global atom group indices and position */
1781             localAtomGroups->push_back(cg);
1782             ibuf.push_back(globalAtomGroupIndices[cg]);
1783             nsend_z++;
1784
1785             rvec posPbc;
1786             if (dd->ci[dim] == 0)
1787             {
1788                 /* Correct cg_cm for pbc */
1789                 rvec_add(cg_cm[cg], box[dim], posPbc);
1790                 if (bScrew)
1791                 {
1792                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1793                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1794                 }
1795             }
1796             else
1797             {
1798                 copy_rvec(cg_cm[cg], posPbc);
1799             }
1800             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1801
1802             nat += 1;
1803         }
1804     }
1805
1806     work->nat        = nat;
1807     work->nsend_zone = nsend_z;
1808 }
1809
1810 //! Clear data.
1811 static void clearCommSetupData(dd_comm_setup_work_t *work)
1812 {
1813     work->localAtomGroupBuffer.clear();
1814     work->atomGroupBuffer.clear();
1815     work->positionBuffer.clear();
1816     work->nat        = 0;
1817     work->nsend_zone = 0;
1818 }
1819
1820 //! Prepare DD communication.
1821 static void setup_dd_communication(gmx_domdec_t *dd,
1822                                    matrix box, gmx_ddbox_t *ddbox,
1823                                    t_forcerec *fr,
1824                                    t_state *state,
1825                                    PaddedHostVector<gmx::RVec> *f)
1826 {
1827     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1828     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1829     int                    c;
1830     int                   *zone_cg_range, pos_cg;
1831     gmx_domdec_comm_t     *comm;
1832     gmx_domdec_zones_t    *zones;
1833     gmx_domdec_comm_dim_t *cd;
1834     cginfo_mb_t           *cginfo_mb;
1835     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1836     dd_corners_t           corners;
1837     rvec                  *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1838     real                   skew_fac2_d, skew_fac_01;
1839     rvec                   sf2_round;
1840
1841     if (debug)
1842     {
1843         fprintf(debug, "Setting up DD communication\n");
1844     }
1845
1846     comm  = dd->comm;
1847
1848     if (comm->dth.empty())
1849     {
1850         /* Initialize the thread data.
1851          * This can not be done in init_domain_decomposition,
1852          * as the numbers of threads is determined later.
1853          */
1854         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1855         comm->dth.resize(numThreads);
1856     }
1857
1858     bBondComm = comm->systemInfo.filterBondedCommunication;
1859
1860     /* Do we need to determine extra distances for multi-body bondeds? */
1861     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1862
1863     /* Do we need to determine extra distances for only two-body bondeds? */
1864     bDist2B = (bBondComm && !bDistMB);
1865
1866     const real r_comm2  = gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1867     const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1868
1869     if (debug)
1870     {
1871         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1872     }
1873
1874     zones = &comm->zones;
1875
1876     dim0 = dd->dim[0];
1877     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1878     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1879
1880     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1881
1882     /* Triclinic stuff */
1883     normal      = ddbox->normal;
1884     skew_fac_01 = 0;
1885     if (dd->ndim >= 2)
1886     {
1887         v_0 = ddbox->v[dim0];
1888         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1889         {
1890             /* Determine the coupling coefficient for the distances
1891              * to the cell planes along dim0 and dim1 through dim2.
1892              * This is required for correct rounding.
1893              */
1894             skew_fac_01 =
1895                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
1896             if (debug)
1897             {
1898                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1899             }
1900         }
1901     }
1902     if (dd->ndim >= 3)
1903     {
1904         v_1 = ddbox->v[dim1];
1905     }
1906
1907     zone_cg_range = zones->cg_range;
1908     cginfo_mb     = fr->cginfo_mb;
1909
1910     zone_cg_range[0]   = 0;
1911     zone_cg_range[1]   = dd->ncg_home;
1912     comm->zone_ncg1[0] = dd->ncg_home;
1913     pos_cg             = dd->ncg_home;
1914
1915     nat_tot = comm->atomRanges.numHomeAtoms();
1916     nzone   = 1;
1917     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1918     {
1919         dim = dd->dim[dim_ind];
1920         cd  = &comm->cd[dim_ind];
1921
1922         /* Check if we need to compute triclinic distances along this dim */
1923         bool distanceIsTriclinic = false;
1924         for (int i = 0; i <= dim_ind; i++)
1925         {
1926             if (ddbox->tric_dir[dd->dim[i]])
1927             {
1928                 distanceIsTriclinic = true;
1929             }
1930         }
1931
1932         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1933         {
1934             /* No pbc in this dimension, the first node should not comm. */
1935             nzone_send = 0;
1936         }
1937         else
1938         {
1939             nzone_send = nzone;
1940         }
1941
1942         v_d                = ddbox->v[dim];
1943         skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
1944
1945         cd->receiveInPlace = true;
1946         for (int p = 0; p < cd->numPulses(); p++)
1947         {
1948             /* Only atoms communicated in the first pulse are used
1949              * for multi-body bonded interactions or for bBondComm.
1950              */
1951             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1952
1953             gmx_domdec_ind_t *ind = &cd->ind[p];
1954
1955             /* Thread 0 writes in the global index array */
1956             ind->index.clear();
1957             clearCommSetupData(&comm->dth[0]);
1958
1959             for (zone = 0; zone < nzone_send; zone++)
1960             {
1961                 if (dim_ind > 0 && distanceIsTriclinic)
1962                 {
1963                     /* Determine slightly more optimized skew_fac's
1964                      * for rounding.
1965                      * This reduces the number of communicated atoms
1966                      * by about 10% for 3D DD of rhombic dodecahedra.
1967                      */
1968                     for (dimd = 0; dimd < dim; dimd++)
1969                     {
1970                         sf2_round[dimd] = 1;
1971                         if (ddbox->tric_dir[dimd])
1972                         {
1973                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1974                             {
1975                                 /* If we are shifted in dimension i
1976                                  * and the cell plane is tilted forward
1977                                  * in dimension i, skip this coupling.
1978                                  */
1979                                 if (!(zones->shift[nzone+zone][i] &&
1980                                       ddbox->v[dimd][i][dimd] >= 0))
1981                                 {
1982                                     sf2_round[dimd] +=
1983                                         gmx::square(ddbox->v[dimd][i][dimd]);
1984                                 }
1985                             }
1986                             sf2_round[dimd] = 1/sf2_round[dimd];
1987                         }
1988                     }
1989                 }
1990
1991                 zonei = zone_perm[dim_ind][zone];
1992                 if (p == 0)
1993                 {
1994                     /* Here we permutate the zones to obtain a convenient order
1995                      * for neighbor searching
1996                      */
1997                     cg0 = zone_cg_range[zonei];
1998                     cg1 = zone_cg_range[zonei+1];
1999                 }
2000                 else
2001                 {
2002                     /* Look only at the cg's received in the previous grid pulse
2003                      */
2004                     cg1 = zone_cg_range[nzone+zone+1];
2005                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2006                 }
2007
2008                 const int numThreads = gmx::ssize(comm->dth);
2009 #pragma omp parallel for num_threads(numThreads) schedule(static)
2010                 for (int th = 0; th < numThreads; th++)
2011                 {
2012                     try
2013                     {
2014                         dd_comm_setup_work_t &work = comm->dth[th];
2015
2016                         /* Retain data accumulated into buffers of thread 0 */
2017                         if (th > 0)
2018                         {
2019                             clearCommSetupData(&work);
2020                         }
2021
2022                         int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
2023                         int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2024
2025                         /* Get the cg's for this pulse in this zone */
2026                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2027                                            dd->globalAtomGroupIndices,
2028                                            dim, dim_ind, dim0, dim1, dim2,
2029                                            r_comm2, r_bcomm2,
2030                                            box, distanceIsTriclinic,
2031                                            normal, skew_fac2_d, skew_fac_01,
2032                                            v_d, v_0, v_1, &corners, sf2_round,
2033                                            bDistBonded, bBondComm,
2034                                            bDist2B, bDistMB,
2035                                            state->x.rvec_array(),
2036                                            fr->cginfo,
2037                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2038                                            &work);
2039                     }
2040                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2041                 } // END
2042
2043                 std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
2044                 std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
2045                 ind->nsend[zone]  = comm->dth[0].nsend_zone;
2046                 /* Append data of threads>=1 to the communication buffers */
2047                 for (int th = 1; th < numThreads; th++)
2048                 {
2049                     const dd_comm_setup_work_t &dth = comm->dth[th];
2050
2051                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2052                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2053                     positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2054                     comm->dth[0].nat += dth.nat;
2055                     ind->nsend[zone] += dth.nsend_zone;
2056                 }
2057             }
2058             /* Clear the counts in case we do not have pbc */
2059             for (zone = nzone_send; zone < nzone; zone++)
2060             {
2061                 ind->nsend[zone] = 0;
2062             }
2063             ind->nsend[nzone]     = ind->index.size();
2064             ind->nsend[nzone + 1] = comm->dth[0].nat;
2065             /* Communicate the number of cg's and atoms to receive */
2066             ddSendrecv(dd, dim_ind, dddirBackward,
2067                        ind->nsend, nzone+2,
2068                        ind->nrecv, nzone+2);
2069
2070             if (p > 0)
2071             {
2072                 /* We can receive in place if only the last zone is not empty */
2073                 for (zone = 0; zone < nzone-1; zone++)
2074                 {
2075                     if (ind->nrecv[zone] > 0)
2076                     {
2077                         cd->receiveInPlace = false;
2078                     }
2079                 }
2080             }
2081
2082             int receiveBufferSize = 0;
2083             if (!cd->receiveInPlace)
2084             {
2085                 receiveBufferSize = ind->nrecv[nzone];
2086             }
2087             /* These buffer are actually only needed with in-place */
2088             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2089             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2090
2091             dd_comm_setup_work_t     &work = comm->dth[0];
2092
2093             /* Make space for the global cg indices */
2094             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2095             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2096             /* Communicate the global cg indices */
2097             gmx::ArrayRef<int> integerBufferRef;
2098             if (cd->receiveInPlace)
2099             {
2100                 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2101             }
2102             else
2103             {
2104                 integerBufferRef = globalAtomGroupBuffer.buffer;
2105             }
2106             ddSendrecv<int>(dd, dim_ind, dddirBackward,
2107                             work.atomGroupBuffer, integerBufferRef);
2108
2109             /* Make space for cg_cm */
2110             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2111
2112             /* Communicate the coordinates */
2113             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2114             if (cd->receiveInPlace)
2115             {
2116                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2117             }
2118             else
2119             {
2120                 rvecBufferRef = rvecBuffer.buffer;
2121             }
2122             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2123                                   work.positionBuffer, rvecBufferRef);
2124
2125             /* Make the charge group index */
2126             if (cd->receiveInPlace)
2127             {
2128                 zone = (p == 0 ? 0 : nzone - 1);
2129                 while (zone < nzone)
2130                 {
2131                     for (int i = 0; i < ind->nrecv[zone]; i++)
2132                     {
2133                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2134                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2135                         pos_cg++;
2136                     }
2137                     if (p == 0)
2138                     {
2139                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2140                     }
2141                     zone++;
2142                     zone_cg_range[nzone+zone] = pos_cg;
2143                 }
2144             }
2145             else
2146             {
2147                 /* This part of the code is never executed with bBondComm. */
2148                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2149                                  dd->globalAtomGroupIndices, integerBufferRef.data(),
2150                                  state->x, rvecBufferRef,
2151                                  fr->cginfo_mb, fr->cginfo);
2152                 pos_cg += ind->nrecv[nzone];
2153             }
2154             nat_tot += ind->nrecv[nzone+1];
2155         }
2156         if (!cd->receiveInPlace)
2157         {
2158             /* Store the atom block for easy copying of communication buffers */
2159             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2160         }
2161         nzone += nzone;
2162     }
2163
2164     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2165
2166     if (!bBondComm)
2167     {
2168         /* We don't need to update cginfo, since that was alrady done above.
2169          * So we pass NULL for the forcerec.
2170          */
2171         dd_set_cginfo(dd->globalAtomGroupIndices,
2172                       dd->ncg_home, dd->globalAtomGroupIndices.size(),
2173                       nullptr);
2174     }
2175
2176     if (debug)
2177     {
2178         fprintf(debug, "Finished setting up DD communication, zones:");
2179         for (c = 0; c < zones->n; c++)
2180         {
2181             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2182         }
2183         fprintf(debug, "\n");
2184     }
2185 }
2186
2187 //! Set boundaries for the charge group range.
2188 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2189 {
2190     for (auto &iZone : zones->iZones)
2191     {
2192         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2193         iZone.jAtomRange =
2194             gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2195                             zones->cg_range[iZone.jZoneRange.end()]);
2196     }
2197 }
2198
2199 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2200  *
2201  * Also sets the atom density for the home zone when \p zone_start=0.
2202  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2203  * how many charge groups will move but are still part of the current range.
2204  * \todo When converting domdec to use proper classes, all these variables
2205  *       should be private and a method should return the correct count
2206  *       depending on an internal state.
2207  *
2208  * \param[in,out] dd          The domain decomposition struct
2209  * \param[in]     box         The box
2210  * \param[in]     ddbox       The domain decomposition box struct
2211  * \param[in]     zone_start  The start of the zone range to set sizes for
2212  * \param[in]     zone_end    The end of the zone range to set sizes for
2213  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2214  */
2215 static void set_zones_size(gmx_domdec_t *dd,
2216                            matrix box, const gmx_ddbox_t *ddbox,
2217                            int zone_start, int zone_end,
2218                            int numMovedChargeGroupsInHomeZone)
2219 {
2220     gmx_domdec_comm_t  *comm;
2221     gmx_domdec_zones_t *zones;
2222     gmx_bool            bDistMB;
2223     int                 z, d, dim;
2224     real                rcs, rcmbs;
2225     int                 i, j;
2226     real                vol;
2227
2228     comm = dd->comm;
2229
2230     zones = &comm->zones;
2231
2232     /* Do we need to determine extra distances for multi-body bondeds? */
2233     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds &&
2234                isDlbOn(dd->comm) &&
2235                dd->ndim > 1);
2236
2237     for (z = zone_start; z < zone_end; z++)
2238     {
2239         /* Copy cell limits to zone limits.
2240          * Valid for non-DD dims and non-shifted dims.
2241          */
2242         copy_rvec(comm->cell_x0, zones->size[z].x0);
2243         copy_rvec(comm->cell_x1, zones->size[z].x1);
2244     }
2245
2246     for (d = 0; d < dd->ndim; d++)
2247     {
2248         dim = dd->dim[d];
2249
2250         for (z = 0; z < zones->n; z++)
2251         {
2252             /* With a staggered grid we have different sizes
2253              * for non-shifted dimensions.
2254              */
2255             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2256             {
2257                 if (d == 1)
2258                 {
2259                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2260                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2261                 }
2262                 else if (d == 2)
2263                 {
2264                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2265                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2266                 }
2267             }
2268         }
2269
2270         rcs   = comm->systemInfo.cutoff;
2271         rcmbs = comm->cutoff_mbody;
2272         if (ddbox->tric_dir[dim])
2273         {
2274             rcs   /= ddbox->skew_fac[dim];
2275             rcmbs /= ddbox->skew_fac[dim];
2276         }
2277
2278         /* Set the lower limit for the shifted zone dimensions */
2279         for (z = zone_start; z < zone_end; z++)
2280         {
2281             if (zones->shift[z][dim] > 0)
2282             {
2283                 dim = dd->dim[d];
2284                 if (!isDlbOn(dd->comm) || d == 0)
2285                 {
2286                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2287                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2288                 }
2289                 else
2290                 {
2291                     /* Here we take the lower limit of the zone from
2292                      * the lowest domain of the zone below.
2293                      */
2294                     if (z < 4)
2295                     {
2296                         zones->size[z].x0[dim] =
2297                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2298                     }
2299                     else
2300                     {
2301                         if (d == 1)
2302                         {
2303                             zones->size[z].x0[dim] =
2304                                 zones->size[zone_perm[2][z-4]].x0[dim];
2305                         }
2306                         else
2307                         {
2308                             zones->size[z].x0[dim] =
2309                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2310                         }
2311                     }
2312                     /* A temporary limit, is updated below */
2313                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2314
2315                     if (bDistMB)
2316                     {
2317                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2318                         {
2319                             if (zones->shift[zi][dim] == 0)
2320                             {
2321                                 /* This takes the whole zone into account.
2322                                  * With multiple pulses this will lead
2323                                  * to a larger zone then strictly necessary.
2324                                  */
2325                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2326                                                                   zones->size[zi].x1[dim]+rcmbs);
2327                             }
2328                         }
2329                     }
2330                 }
2331             }
2332         }
2333
2334         /* Loop over the i-zones to set the upper limit of each
2335          * j-zone they see.
2336          */
2337         for (const auto &iZone : zones->iZones)
2338         {
2339             const int zi = iZone.iZoneIndex;
2340             if (zones->shift[zi][dim] == 0)
2341             {
2342                 /* We should only use zones up to zone_end */
2343                 const auto &jZoneRangeFull = iZone.jZoneRange;
2344                 if (zone_end <= *jZoneRangeFull.begin())
2345                 {
2346                     continue;
2347                 }
2348                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2349                                                  std::min(*jZoneRangeFull.end(), zone_end));
2350                 for (int jZone : jZoneRange)
2351                 {
2352                     if (zones->shift[jZone][dim] > 0)
2353                     {
2354                         zones->size[jZone].x1[dim] = std::max(zones->size[jZone].x1[dim],
2355                                                               zones->size[zi].x1[dim]+rcs);
2356                     }
2357                 }
2358             }
2359         }
2360     }
2361
2362     for (z = zone_start; z < zone_end; z++)
2363     {
2364         /* Initialization only required to keep the compiler happy */
2365         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2366         int  nc, c;
2367
2368         /* To determine the bounding box for a zone we need to find
2369          * the extreme corners of 4, 2 or 1 corners.
2370          */
2371         nc = 1 << (ddbox->nboundeddim - 1);
2372
2373         for (c = 0; c < nc; c++)
2374         {
2375             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2376             corner[XX] = 0;
2377             if ((c & 1) == 0)
2378             {
2379                 corner[YY] = zones->size[z].x0[YY];
2380             }
2381             else
2382             {
2383                 corner[YY] = zones->size[z].x1[YY];
2384             }
2385             if ((c & 2) == 0)
2386             {
2387                 corner[ZZ] = zones->size[z].x0[ZZ];
2388             }
2389             else
2390             {
2391                 corner[ZZ] = zones->size[z].x1[ZZ];
2392             }
2393             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim &&
2394                 box[ZZ][1 - dd->dim[0]] != 0)
2395             {
2396                 /* With 1D domain decomposition the cg's are not in
2397                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2398                  * Shift the corner of the z-vector back to along the box
2399                  * vector of dimension d, so it will later end up at 0 along d.
2400                  * This can affect the location of this corner along dd->dim[0]
2401                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2402                  */
2403                 int d = 1 - dd->dim[0];
2404
2405                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2406             }
2407             /* Apply the triclinic couplings */
2408             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2409             {
2410                 for (j = XX; j < i; j++)
2411                 {
2412                     corner[j] += corner[i]*box[i][j]/box[i][i];
2413                 }
2414             }
2415             if (c == 0)
2416             {
2417                 copy_rvec(corner, corner_min);
2418                 copy_rvec(corner, corner_max);
2419             }
2420             else
2421             {
2422                 for (i = 0; i < DIM; i++)
2423                 {
2424                     corner_min[i] = std::min(corner_min[i], corner[i]);
2425                     corner_max[i] = std::max(corner_max[i], corner[i]);
2426                 }
2427             }
2428         }
2429         /* Copy the extreme cornes without offset along x */
2430         for (i = 0; i < DIM; i++)
2431         {
2432             zones->size[z].bb_x0[i] = corner_min[i];
2433             zones->size[z].bb_x1[i] = corner_max[i];
2434         }
2435         /* Add the offset along x */
2436         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2437         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2438     }
2439
2440     if (zone_start == 0)
2441     {
2442         vol = 1;
2443         for (dim = 0; dim < DIM; dim++)
2444         {
2445             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2446         }
2447         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2448     }
2449
2450     if (debug)
2451     {
2452         for (z = zone_start; z < zone_end; z++)
2453         {
2454             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2455                     z,
2456                     zones->size[z].x0[XX], zones->size[z].x1[XX],
2457                     zones->size[z].x0[YY], zones->size[z].x1[YY],
2458                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2459             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2460                     z,
2461                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2462                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2463                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2464         }
2465     }
2466 }
2467
2468 /*! \brief Order data in \p dataToSort according to \p sort
2469  *
2470  * Note: both buffers should have at least \p sort.size() elements.
2471  */
2472 template <typename T>
2473 static void
2474 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2475             gmx::ArrayRef<T>                   dataToSort,
2476             gmx::ArrayRef<T>                   sortBuffer)
2477 {
2478     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2479     GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2480
2481     /* Order the data into the temporary buffer */
2482     size_t i = 0;
2483     for (const gmx_cgsort_t &entry : sort)
2484     {
2485         sortBuffer[i++] = dataToSort[entry.ind];
2486     }
2487
2488     /* Copy back to the original array */
2489     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2490               dataToSort.begin());
2491 }
2492
2493 /*! \brief Order data in \p dataToSort according to \p sort
2494  *
2495  * Note: \p vectorToSort should have at least \p sort.size() elements,
2496  *       \p workVector is resized when it is too small.
2497  */
2498 template <typename T>
2499 static void
2500 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2501             gmx::ArrayRef<T>                   vectorToSort,
2502             std::vector<T>                    *workVector)
2503 {
2504     if (gmx::index(workVector->size()) < sort.ssize())
2505     {
2506         workVector->resize(sort.size());
2507     }
2508     orderVector<T>(sort, vectorToSort, *workVector);
2509 }
2510
2511 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2512 static void dd_sort_order_nbnxn(const t_forcerec          *fr,
2513                                 std::vector<gmx_cgsort_t> *sort)
2514 {
2515     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2516
2517     /* Using push_back() instead of this resize results in much slower code */
2518     sort->resize(atomOrder.size());
2519     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2520     size_t                      numSorted = 0;
2521     for (int i : atomOrder)
2522     {
2523         if (i >= 0)
2524         {
2525             /* The values of nsc and ind_gl are not used in this case */
2526             buffer[numSorted++].ind = i;
2527         }
2528     }
2529     sort->resize(numSorted);
2530 }
2531
2532 //! Returns the sorting state for DD.
2533 static void dd_sort_state(gmx_domdec_t *dd, t_forcerec *fr, t_state *state)
2534 {
2535     gmx_domdec_sort_t *sort = dd->comm->sort.get();
2536
2537     dd_sort_order_nbnxn(fr, &sort->sorted);
2538
2539     /* We alloc with the old size, since cgindex is still old */
2540     DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2541
2542     /* Set the new home atom/charge group count */
2543     dd->ncg_home = sort->sorted.size();
2544     if (debug)
2545     {
2546         fprintf(debug, "Set the new home atom count to %d\n",
2547                 dd->ncg_home);
2548     }
2549
2550     /* Reorder the state */
2551     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2552     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2553
2554     if (state->flags & (1 << estX))
2555     {
2556         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2557     }
2558     if (state->flags & (1 << estV))
2559     {
2560         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2561     }
2562     if (state->flags & (1 << estCGP))
2563     {
2564         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2565     }
2566
2567     /* Reorder the global cg index */
2568     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2569     /* Reorder the cginfo */
2570     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2571     /* Set the home atom number */
2572     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2573
2574     /* The atoms are now exactly in grid order, update the grid order */
2575     fr->nbv->setLocalAtomOrder();
2576 }
2577
2578 //! Accumulates load statistics.
2579 static void add_dd_statistics(gmx_domdec_t *dd)
2580 {
2581     gmx_domdec_comm_t *comm = dd->comm;
2582
2583     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2584     {
2585         auto range = static_cast<DDAtomRanges::Type>(i);
2586         comm->sum_nat[i] +=
2587             comm->atomRanges.end(range) - comm->atomRanges.start(range);
2588     }
2589     comm->ndecomp++;
2590 }
2591
2592 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2593 {
2594     gmx_domdec_comm_t *comm = dd->comm;
2595
2596     /* Reset all the statistics and counters for total run counting */
2597     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2598     {
2599         comm->sum_nat[i] = 0;
2600     }
2601     comm->ndecomp   = 0;
2602     comm->nload     = 0;
2603     comm->load_step = 0;
2604     comm->load_sum  = 0;
2605     comm->load_max  = 0;
2606     clear_ivec(comm->load_lim);
2607     comm->load_mdf = 0;
2608     comm->load_pme = 0;
2609 }
2610
2611 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2612 {
2613     gmx_domdec_comm_t *comm      = cr->dd->comm;
2614
2615     const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2616     gmx_sumd(numRanges, comm->sum_nat, cr);
2617
2618     if (fplog == nullptr)
2619     {
2620         return;
2621     }
2622
2623     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2624
2625     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2626     {
2627         auto   range = static_cast<DDAtomRanges::Type>(i);
2628         double av    = comm->sum_nat[i]/comm->ndecomp;
2629         switch (range)
2630         {
2631             case DDAtomRanges::Type::Zones:
2632                 fprintf(fplog,
2633                         " av. #atoms communicated per step for force:  %d x %.1f\n",
2634                         2, av);
2635                 break;
2636             case DDAtomRanges::Type::Vsites:
2637                 if (cr->dd->vsite_comm)
2638                 {
2639                     fprintf(fplog,
2640                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2641                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2642                             av);
2643                 }
2644                 break;
2645             case DDAtomRanges::Type::Constraints:
2646                 if (cr->dd->constraint_comm)
2647                 {
2648                     fprintf(fplog,
2649                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2650                             1 + ir->nLincsIter, av);
2651                 }
2652                 break;
2653             default:
2654                 gmx_incons(" Unknown type for DD statistics");
2655         }
2656     }
2657     fprintf(fplog, "\n");
2658
2659     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2660     {
2661         print_dd_load_av(fplog, cr->dd);
2662     }
2663 }
2664
2665 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2666 void dd_partition_system(FILE                        *fplog,
2667                          const gmx::MDLogger         &mdlog,
2668                          int64_t                      step,
2669                          const t_commrec             *cr,
2670                          gmx_bool                     bMasterState,
2671                          int                          nstglobalcomm,
2672                          t_state                     *state_global,
2673                          const gmx_mtop_t            &top_global,
2674                          const t_inputrec            *ir,
2675                          gmx::ImdSession             *imdSession,
2676                          pull_t                      *pull_work,
2677                          t_state                     *state_local,
2678                          PaddedHostVector<gmx::RVec> *f,
2679                          gmx::MDAtoms                *mdAtoms,
2680                          gmx_localtop_t              *top_local,
2681                          t_forcerec                  *fr,
2682                          gmx_vsite_t                 *vsite,
2683                          gmx::Constraints            *constr,
2684                          t_nrnb                      *nrnb,
2685                          gmx_wallcycle               *wcycle,
2686                          gmx_bool                     bVerbose)
2687 {
2688     gmx_domdec_t      *dd;
2689     gmx_domdec_comm_t *comm;
2690     gmx_ddbox_t        ddbox = {0};
2691     int64_t            step_pcoupl;
2692     rvec               cell_ns_x0, cell_ns_x1;
2693     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2694     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2695     gmx_bool           bRedist;
2696     ivec               np;
2697     real               grid_density;
2698     char               sbuf[22];
2699
2700     wallcycle_start(wcycle, ewcDOMDEC);
2701
2702     dd   = cr->dd;
2703     comm = dd->comm;
2704
2705     // TODO if the update code becomes accessible here, use
2706     // upd->deform for this logic.
2707     bBoxChanged = (bMasterState || inputrecDeform(ir));
2708     if (ir->epc != epcNO)
2709     {
2710         /* With nstpcouple > 1 pressure coupling happens.
2711          * one step after calculating the pressure.
2712          * Box scaling happens at the end of the MD step,
2713          * after the DD partitioning.
2714          * We therefore have to do DLB in the first partitioning
2715          * after an MD step where P-coupling occurred.
2716          * We need to determine the last step in which p-coupling occurred.
2717          * MRS -- need to validate this for vv?
2718          */
2719         int n = ir->nstpcouple;
2720         if (n == 1)
2721         {
2722             step_pcoupl = step - 1;
2723         }
2724         else
2725         {
2726             step_pcoupl = ((step - 1)/n)*n + 1;
2727         }
2728         if (step_pcoupl >= comm->partition_step)
2729         {
2730             bBoxChanged = TRUE;
2731         }
2732     }
2733
2734     bNStGlobalComm = (step % nstglobalcomm == 0);
2735
2736     if (!isDlbOn(comm))
2737     {
2738         bDoDLB = FALSE;
2739     }
2740     else
2741     {
2742         /* Should we do dynamic load balacing this step?
2743          * Since it requires (possibly expensive) global communication,
2744          * we might want to do DLB less frequently.
2745          */
2746         if (bBoxChanged || ir->epc != epcNO)
2747         {
2748             bDoDLB = bBoxChanged;
2749         }
2750         else
2751         {
2752             bDoDLB = bNStGlobalComm;
2753         }
2754     }
2755
2756     /* Check if we have recorded loads on the nodes */
2757     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2758     {
2759         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2760
2761         /* Print load every nstlog, first and last step to the log file */
2762         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
2763                     comm->n_load_collect == 0 ||
2764                     (ir->nsteps >= 0 &&
2765                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
2766
2767         /* Avoid extra communication due to verbose screen output
2768          * when nstglobalcomm is set.
2769          */
2770         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
2771             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2772         {
2773             get_load_distribution(dd, wcycle);
2774             if (DDMASTER(dd))
2775             {
2776                 if (bLogLoad)
2777                 {
2778                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
2779                 }
2780                 if (bVerbose)
2781                 {
2782                     dd_print_load_verbose(dd);
2783                 }
2784             }
2785             comm->n_load_collect++;
2786
2787             if (isDlbOn(comm))
2788             {
2789                 if (DDMASTER(dd))
2790                 {
2791                     /* Add the measured cycles to the running average */
2792                     const float averageFactor        = 0.1F;
2793                     comm->cyclesPerStepDlbExpAverage =
2794                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
2795                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
2796                 }
2797                 if (comm->dlbState == DlbState::onCanTurnOff &&
2798                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2799                 {
2800                     gmx_bool turnOffDlb;
2801                     if (DDMASTER(dd))
2802                     {
2803                         /* If the running averaged cycles with DLB are more
2804                          * than before we turned on DLB, turn off DLB.
2805                          * We will again run and check the cycles without DLB
2806                          * and we can then decide if to turn off DLB forever.
2807                          */
2808                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
2809                                       comm->cyclesPerStepBeforeDLB);
2810                     }
2811                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2812                     if (turnOffDlb)
2813                     {
2814                         /* To turn off DLB, we need to redistribute the atoms */
2815                         dd_collect_state(dd, state_local, state_global);
2816                         bMasterState = TRUE;
2817                         turn_off_dlb(mdlog, dd, step);
2818                     }
2819                 }
2820             }
2821             else if (bCheckWhetherToTurnDlbOn)
2822             {
2823                 gmx_bool turnOffDlbForever = FALSE;
2824                 gmx_bool turnOnDlb         = FALSE;
2825
2826                 /* Since the timings are node dependent, the master decides */
2827                 if (DDMASTER(dd))
2828                 {
2829                     /* If we recently turned off DLB, we want to check if
2830                      * performance is better without DLB. We want to do this
2831                      * ASAP to minimize the chance that external factors
2832                      * slowed down the DLB step are gone here and we
2833                      * incorrectly conclude that DLB was causing the slowdown.
2834                      * So we measure one nstlist block, no running average.
2835                      */
2836                     if (comm->haveTurnedOffDlb &&
2837                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
2838                         comm->cyclesPerStepDlbExpAverage)
2839                     {
2840                         /* After turning off DLB we ran nstlist steps in fewer
2841                          * cycles than with DLB. This likely means that DLB
2842                          * in not benefical, but this could be due to a one
2843                          * time unlucky fluctuation, so we require two such
2844                          * observations in close succession to turn off DLB
2845                          * forever.
2846                          */
2847                         if (comm->dlbSlowerPartitioningCount > 0 &&
2848                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
2849                         {
2850                             turnOffDlbForever = TRUE;
2851                         }
2852                         comm->haveTurnedOffDlb           = false;
2853                         /* Register when we last measured DLB slowdown */
2854                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2855                     }
2856                     else
2857                     {
2858                         /* Here we check if the max PME rank load is more than 0.98
2859                          * the max PP force load. If so, PP DLB will not help,
2860                          * since we are (almost) limited by PME. Furthermore,
2861                          * DLB will cause a significant extra x/f redistribution
2862                          * cost on the PME ranks, which will then surely result
2863                          * in lower total performance.
2864                          */
2865                         if (comm->ddRankSetup.usePmeOnlyRanks &&
2866                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2867                         {
2868                             turnOnDlb = FALSE;
2869                         }
2870                         else
2871                         {
2872                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2873                         }
2874                     }
2875                 }
2876                 struct
2877                 {
2878                     gmx_bool turnOffDlbForever;
2879                     gmx_bool turnOnDlb;
2880                 }
2881                 bools {
2882                     turnOffDlbForever, turnOnDlb
2883                 };
2884                 dd_bcast(dd, sizeof(bools), &bools);
2885                 if (bools.turnOffDlbForever)
2886                 {
2887                     turn_off_dlb_forever(mdlog, dd, step);
2888                 }
2889                 else if (bools.turnOnDlb)
2890                 {
2891                     turn_on_dlb(mdlog, dd, step);
2892                     bDoDLB = TRUE;
2893                 }
2894             }
2895         }
2896         comm->n_load_have++;
2897     }
2898
2899     bRedist = FALSE;
2900     if (bMasterState)
2901     {
2902         /* Clear the old state */
2903         clearDDStateIndices(dd, false);
2904         ncgindex_set = 0;
2905
2906         auto xGlobal = positionsFromStatePointer(state_global);
2907
2908         set_ddbox(*dd, true,
2909                   DDMASTER(dd) ? state_global->box : nullptr,
2910                   true, xGlobal,
2911                   &ddbox);
2912
2913         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
2914
2915         /* Ensure that we have space for the new distribution */
2916         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
2917
2918         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2919
2920         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2921     }
2922     else if (state_local->ddp_count != dd->ddp_count)
2923     {
2924         if (state_local->ddp_count > dd->ddp_count)
2925         {
2926             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
2927         }
2928
2929         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2930         {
2931             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
2932         }
2933
2934         /* Clear the old state */
2935         clearDDStateIndices(dd, false);
2936
2937         /* Restore the atom group indices from state_local */
2938         restoreAtomGroups(dd, state_local);
2939         make_dd_indices(dd, 0);
2940         ncgindex_set = dd->ncg_home;
2941
2942         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2943
2944         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2945
2946         set_ddbox(*dd, bMasterState, state_local->box,
2947                   true, state_local->x, &ddbox);
2948
2949         bRedist = isDlbOn(comm);
2950     }
2951     else
2952     {
2953         /* We have the full state, only redistribute the cgs */
2954
2955         /* Clear the non-home indices */
2956         clearDDStateIndices(dd, true);
2957         ncgindex_set = 0;
2958
2959         /* To avoid global communication, we do not recompute the extent
2960          * of the system for dims without pbc. Therefore we need to copy
2961          * the previously computed values when we do not communicate.
2962          */
2963         if (!bNStGlobalComm)
2964         {
2965             copy_rvec(comm->box0, ddbox.box0    );
2966             copy_rvec(comm->box_size, ddbox.box_size);
2967         }
2968         set_ddbox(*dd, bMasterState, state_local->box,
2969                   bNStGlobalComm, state_local->x, &ddbox);
2970
2971         bBoxChanged = TRUE;
2972         bRedist     = TRUE;
2973     }
2974     /* Copy needed for dim's without pbc when avoiding communication */
2975     copy_rvec(ddbox.box0, comm->box0    );
2976     copy_rvec(ddbox.box_size, comm->box_size);
2977
2978     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB,
2979                       step, wcycle);
2980
2981     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
2982     {
2983         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
2984     }
2985
2986     if (comm->systemInfo.useUpdateGroups)
2987     {
2988         comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
2989                                        state_local->x);
2990     }
2991
2992     /* Check if we should sort the charge groups */
2993     const bool bSortCG = (bMasterState || bRedist);
2994
2995     /* When repartitioning we mark atom groups that will move to neighboring
2996      * DD cells, but we do not move them right away for performance reasons.
2997      * Thus we need to keep track of how many charge groups will move for
2998      * obtaining correct local charge group / atom counts.
2999      */
3000     ncg_moved = 0;
3001     if (bRedist)
3002     {
3003         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3004
3005         ncgindex_set = dd->ncg_home;
3006         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3007                            state_local, f, fr,
3008                            nrnb, &ncg_moved);
3009
3010         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3011
3012         if (comm->systemInfo.useUpdateGroups)
3013         {
3014             comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3015                                            state_local->x);
3016         }
3017
3018         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3019     }
3020
3021     // TODO: Integrate this code in the nbnxm module
3022     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3023                           dd, &ddbox,
3024                           &comm->cell_x0, &comm->cell_x1,
3025                           dd->ncg_home, as_rvec_array(state_local->x.data()),
3026                           cell_ns_x0, cell_ns_x1, &grid_density);
3027
3028     if (bBoxChanged)
3029     {
3030         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3031     }
3032
3033     if (bSortCG)
3034     {
3035         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3036
3037         /* Sort the state on charge group position.
3038          * This enables exact restarts from this step.
3039          * It also improves performance by about 15% with larger numbers
3040          * of atoms per node.
3041          */
3042
3043         /* Fill the ns grid with the home cell,
3044          * so we can sort with the indices.
3045          */
3046         set_zones_ncg_home(dd);
3047
3048         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3049
3050         nbnxn_put_on_grid(fr->nbv.get(), state_local->box,
3051                           0,
3052                           comm->zones.size[0].bb_x0,
3053                           comm->zones.size[0].bb_x1,
3054                           comm->updateGroupsCog.get(),
3055                           { 0, dd->ncg_home },
3056                           comm->zones.dens_zone0,
3057                           fr->cginfo,
3058                           state_local->x,
3059                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3060
3061         if (debug)
3062         {
3063             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3064                     gmx_step_str(step, sbuf), dd->ncg_home);
3065         }
3066         dd_sort_state(dd, fr, state_local);
3067
3068         /* After sorting and compacting we set the correct size */
3069         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3070
3071         /* Rebuild all the indices */
3072         dd->ga2la->clear();
3073         ncgindex_set = 0;
3074
3075         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3076     }
3077     else
3078     {
3079         /* With the group scheme the sorting array is part of the DD state,
3080          * but it just got out of sync, so mark as invalid by emptying it.
3081          */
3082         if (ir->cutoff_scheme == ecutsGROUP)
3083         {
3084             comm->sort->sorted.clear();
3085         }
3086     }
3087
3088     if (comm->systemInfo.useUpdateGroups)
3089     {
3090         /* The update groups cog's are invalid after sorting
3091          * and need to be cleared before the next partitioning anyhow.
3092          */
3093         comm->updateGroupsCog->clear();
3094     }
3095
3096     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3097
3098     /* Setup up the communication and communicate the coordinates */
3099     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3100
3101     /* Set the indices */
3102     make_dd_indices(dd, ncgindex_set);
3103
3104     /* Set the charge group boundaries for neighbor searching */
3105     set_cg_boundaries(&comm->zones);
3106
3107     if (fr->cutoff_scheme == ecutsVERLET)
3108     {
3109         /* When bSortCG=true, we have already set the size for zone 0 */
3110         set_zones_size(dd, state_local->box, &ddbox,
3111                        bSortCG ? 1 : 0, comm->zones.n,
3112                        0);
3113     }
3114
3115     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3116
3117     /*
3118        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3119                  -1,state_local->x.rvec_array(),state_local->box);
3120      */
3121
3122     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3123
3124     /* Extract a local topology from the global topology */
3125     for (int i = 0; i < dd->ndim; i++)
3126     {
3127         np[dd->dim[i]] = comm->cd[i].numPulses();
3128     }
3129     dd_make_local_top(dd, &comm->zones, dd->unitCellInfo.npbcdim, state_local->box,
3130                       comm->cellsize_min, np,
3131                       fr,
3132                       state_local->x.rvec_array(),
3133                       top_global, top_local);
3134
3135     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3136
3137     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3138
3139     /* Set up the special atom communication */
3140     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3141     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3142     {
3143         auto range = static_cast<DDAtomRanges::Type>(i);
3144         switch (range)
3145         {
3146             case DDAtomRanges::Type::Vsites:
3147                 if (vsite && vsite->numInterUpdategroupVsites)
3148                 {
3149                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3150                 }
3151                 break;
3152             case DDAtomRanges::Type::Constraints:
3153                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3154                 {
3155                     /* Only for inter-cg constraints we need special code */
3156                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(),
3157                                                   constr, ir->nProjOrder,
3158                                                   top_local->idef.il);
3159                 }
3160                 break;
3161             default:
3162                 gmx_incons("Unknown special atom type setup");
3163         }
3164         comm->atomRanges.setEnd(range, n);
3165     }
3166
3167     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3168
3169     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3170
3171     /* Make space for the extra coordinates for virtual site
3172      * or constraint communication.
3173      */
3174     state_local->natoms = comm->atomRanges.numAtomsTotal();
3175
3176     dd_resize_state(state_local, f, state_local->natoms);
3177
3178     if (fr->haveDirectVirialContributions)
3179     {
3180         if (vsite && vsite->numInterUpdategroupVsites)
3181         {
3182             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3183         }
3184         else
3185         {
3186             if (EEL_FULL(ir->coulombtype) && dd->haveExclusions)
3187             {
3188                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3189             }
3190             else
3191             {
3192                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3193             }
3194         }
3195     }
3196     else
3197     {
3198         nat_f_novirsum = 0;
3199     }
3200
3201     /* Set the number of atoms required for the force calculation.
3202      * Forces need to be constrained when doing energy
3203      * minimization. For simple simulations we could avoid some
3204      * allocation, zeroing and copying, but this is probably not worth
3205      * the complications and checking.
3206      */
3207     forcerec_set_ranges(fr,
3208                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3209                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3210                         nat_f_novirsum);
3211
3212     /* Update atom data for mdatoms and several algorithms */
3213     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3214                               nullptr, mdAtoms, constr, vsite, nullptr);
3215
3216     auto mdatoms = mdAtoms->mdatoms();
3217     if (!thisRankHasDuty(cr, DUTY_PME))
3218     {
3219         /* Send the charges and/or c6/sigmas to our PME only node */
3220         gmx_pme_send_parameters(cr,
3221                                 fr->ic,
3222                                 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3223                                 mdatoms->chargeA, mdatoms->chargeB,
3224                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3225                                 mdatoms->sigmaA, mdatoms->sigmaB,
3226                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3227     }
3228
3229     if (ir->bPull)
3230     {
3231         /* Update the local pull groups */
3232         dd_make_local_pull_groups(cr, pull_work);
3233     }
3234
3235     if (dd->atomSets != nullptr)
3236     {
3237         /* Update the local atom sets */
3238         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3239     }
3240
3241     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3242     imdSession->dd_make_local_IMD_atoms(dd);
3243
3244     add_dd_statistics(dd);
3245
3246     /* Make sure we only count the cycles for this DD partitioning */
3247     clear_dd_cycle_counts(dd);
3248
3249     /* Because the order of the atoms might have changed since
3250      * the last vsite construction, we need to communicate the constructing
3251      * atom coordinates again (for spreading the forces this MD step).
3252      */
3253     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3254
3255     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3256
3257     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3258     {
3259         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3260         write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3261                      -1, state_local->x.rvec_array(), state_local->box);
3262     }
3263
3264     /* Store the partitioning step */
3265     comm->partition_step = step;
3266
3267     /* Increase the DD partitioning counter */
3268     dd->ddp_count++;
3269     /* The state currently matches this DD partitioning count, store it */
3270     state_local->ddp_count = dd->ddp_count;
3271     if (bMasterState)
3272     {
3273         /* The DD master node knows the complete cg distribution,
3274          * store the count so we can possibly skip the cg info communication.
3275          */
3276         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3277     }
3278
3279     if (comm->ddSettings.DD_debug > 0)
3280     {
3281         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3282         check_index_consistency(dd, top_global.natoms, "after partitioning");
3283     }
3284
3285     wallcycle_stop(wcycle, ewcDOMDEC);
3286 }
3287
3288 /*! \brief Check whether bonded interactions are missing, if appropriate */
3289 void checkNumberOfBondedInteractions(const gmx::MDLogger  &mdlog,
3290                                      t_commrec            *cr,
3291                                      int                   totalNumberOfBondedInteractions,
3292                                      const gmx_mtop_t     *top_global,
3293                                      const gmx_localtop_t *top_local,
3294                                      const rvec           *x,
3295                                      const matrix          box,
3296                                      bool                 *shouldCheckNumberOfBondedInteractions)
3297 {
3298     if (*shouldCheckNumberOfBondedInteractions)
3299     {
3300         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3301         {
3302             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, x, box); // Does not return
3303         }
3304         *shouldCheckNumberOfBondedInteractions = false;
3305     }
3306 }