/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines functions for mdrun to call to make a new
 * domain decomposition, and check it.
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_domdec
 */

#include "gmxpre.h"

#include "partition.h"

#include "config.h"

#include <cassert>
#include <cstdio>

#include <algorithm>

#include "gromacs/domdec/collect.h"
#include "gromacs/domdec/dlb.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
#include "gromacs/domdec/localatomsetmanager.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/imd/imd.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdsetup.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nsgrid.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/stringstream.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/textwriter.h"

#include "box.h"
#include "cellsizes.h"
#include "distribute.h"
#include "domdec_constraints.h"
#include "domdec_internal.h"
#include "domdec_vsite.h"
#include "dump.h"
#include "redistribute.h"
#include "utility.h"

/*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 *
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

//! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
#define DD_PERF_LOSS_WARN    0.05


//! Debug helper printing a DD zone
static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
{
    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d, i, j,
            zone->min0, zone->max1,
            zone->mch0, zone->mch1,
            zone->p1_0, zone->p1_1);
}

/*! \brief Takes the home grid size as input in cell_ns_x0 and cell_ns_x1,
 * computes the extremes over all home and remote zones in the halo,
 * and returns the results in cell_ns_x0 and cell_ns_x1.
 * Note: only used with the group cut-off scheme.
 */
static void dd_move_cellx(gmx_domdec_t      *dd,
                          const gmx_ddbox_t *ddbox,
                          rvec               cell_ns_x0,
                          rvec               cell_ns_x1)
{
    constexpr int      c_ddZoneCommMaxNumZones = 5;
    gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
    gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
    gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
    gmx_domdec_comm_t *comm = dd->comm;

    rvec               extr_s[2];
    rvec               extr_r[2];
    for (int d = 1; d < dd->ndim; d++)
    {
        int           dim = dd->dim[d];
        gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];

        /* Copy the base sizes of the home zone */
        zp.min0    = cell_ns_x0[dim];
        zp.max1    = cell_ns_x1[dim];
        zp.min1    = cell_ns_x1[dim];
        zp.mch0    = cell_ns_x0[dim];
        zp.mch1    = cell_ns_x1[dim];
        zp.p1_0    = cell_ns_x0[dim];
        zp.p1_1    = cell_ns_x1[dim];
        zp.dataSet = 1;
    }

    gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;

    /* Loop backward over the dimensions and aggregate the extremes
     * of the cell sizes.
     */
    for (int d = dd->ndim - 2; d >= 0; d--)
    {
        const int  dim      = dd->dim[d];
        const bool applyPbc = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = cellsizes[d + 1].fracLower;
        extr_s[d][1] = cellsizes[d + 1].fracUpper;
        extr_s[d][2] = cellsizes[d + 1].fracUpper;

        int pos = 0;
        GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (int d1 = d; d1 < dd->ndim-1; d1++)
        {
            gmx_ddzone_t &buf = buf_s[pos];

            /* We invert the order to be able to use the same loop for buf_e */
            buf.min0    = extr_s[d1][1];
            buf.max1    = extr_s[d1][0];
            buf.min1    = extr_s[d1][2];
            buf.mch0    = 0;
            buf.mch1    = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf.p1_0    = comm->cell_x0[dim];
            buf.p1_1    = 0;
            buf.dataSet = 1;
            pos++;
        }

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        int numPulses = comm->cd[d].numPulses();
        int numPulsesMin;
        if (applyPbc)
        {
            /* Take the minimum to avoid double communication */
            numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            numPulsesMin = numPulses;
        }
        for (int pulse = 0; pulse < numPulsesMin; pulse++)
        {
            /* Communicate the extremes forward */
            bool receiveValidData = (applyPbc || dd->ci[dim] > 0);

            int  numElements      = dd->ndim - d - 1;
            ddSendrecv(dd, d, dddirForward,
                       extr_s + d, numElements,
                       extr_r + d, numElements);

            if (receiveValidData)
            {
                for (int d1 = d; d1 < dd->ndim - 1; d1++)
                {
                    extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        const int numElementsInBuffer = pos;
        for (int pulse = 0; pulse < numPulses; pulse++)
        {
            /* Communicate all the zone information backward */
            bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);

            static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");

            int numReals = numElementsInBuffer*c_ddzoneNumReals;
            ddSendrecv(dd, d, dddirBackward,
                       gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
                       gmx::arrayRefFromArray(&buf_r[0].min0, numReals));

            rvec dh = { 0 };
            if (pulse > 0)
            {
                for (int d1 = d + 1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    real c;
                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
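                    /* Annotation: the expression below solves
                     * (dist_d - c*h)^2 + h^2 = cutoff^2 for the height h,
                     * i.e. (1 + c^2) h^2 - 2 c dist_d h + dist_d^2 - cutoff^2 = 0,
                     * taking the larger root h = (c*dist_d + sqrt(det))/(1 + c^2)
                     * with det = (1 + c^2) cutoff^2 - dist_d^2; the required
                     * communication height can then be reduced by dh = cutoff - h.
                     */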
                    real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (int i = 0; i < numElementsInBuffer; i++)
            {
                if (pulse == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (receiveValidData)
                    {
                        buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                    }

                    int d1;
                    if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (receiveValidData && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
                (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
            {
                /* Store the extremes */
                int pos = 0;

                for (int d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (int i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
            else
            {
                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (int i = d; i < 2; i++)
                    {
                        comm->zone_d2[1 - d][i].dataSet = 0;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1].dataSet = 0;
                }
            }
        }
    }

    if (dd->ndim >= 2)
    {
        int dim = dd->dim[1];
        for (int i = 0; i < 2; i++)
        {
            if (comm->zone_d1[i].dataSet != 0)
            {
                if (debug)
                {
                    print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
                }
                cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
                cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
            }
        }
    }
    if (dd->ndim >= 3)
    {
        int dim = dd->dim[2];
        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 2; j++)
            {
                if (comm->zone_d2[i][j].dataSet != 0)
                {
                    if (debug)
                    {
                        print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                    }
                    cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                    cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
                }
            }
        }
    }
    for (int d = 1; d < dd->ndim; d++)
    {
        cellsizes[d].fracLowerMax = extr_s[d-1][0];
        cellsizes[d].fracUpperMin = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
        }
    }
}

//! Sets the charge-group zones to be equal to the home zone.
static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;
    int                 i;

    zones = &dd->comm->zones;

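    /* Annotation: setting every range end to ncg_home below leaves the
     * ranges of all non-home zones (1..n) empty; they are filled later
     * when the halo communication is set up.
     */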
    zones->cg_range[0] = 0;
    for (i = 1; i < zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
    /* zone_ncg1[0] should always be equal to ncg_home */
    dd->comm->zone_ncg1[0] = dd->ncg_home;
}

//! Restore atom groups for the charge groups.
static void restoreAtomGroups(gmx_domdec_t *dd,
                              const int *gcgs_index, const t_state *state)
{
    gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;

    std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
    gmx::RangePartitioning   &atomGrouping           = dd->atomGrouping_;

    globalAtomGroupIndices.resize(atomGroupsState.size());
    atomGrouping.clear();

    /* Copy back the global charge group indices from state
     * and rebuild the local charge group to atom index.
     */
    for (gmx::index i = 0; i < atomGroupsState.size(); i++)
    {
        const int atomGroupGlobal  = atomGroupsState[i];
        const int groupSize        = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
        globalAtomGroupIndices[i]  = atomGroupGlobal;
        atomGrouping.appendBlock(groupSize);
    }

    dd->ncg_home = atomGroupsState.size();
    dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGrouping.fullRange().end());

    set_zones_ncg_home(dd);
}

//! Sets the cginfo structures.
static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
                          t_forcerec *fr, char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;
    int         *cginfo;
    int          cg;

    if (fr != nullptr)
    {
        cginfo_mb = fr->cginfo_mb;
        cginfo    = fr->cginfo;

        for (cg = cg0; cg < cg1; cg++)
        {
            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
        }
    }

    if (bLocalCG != nullptr)
    {
        for (cg = cg0; cg < cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}

//! Makes the mappings between global and local atom indices during DD repartitioning.
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index, int cg_start)
{
    const int                 numZones               = dd->comm->zones.n;
    const int                *zone2cg                = dd->comm->zones.cg_range;
    const int                *zone_ncg1              = dd->comm->zone_ncg1;
    gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
    const gmx_bool            bCGs                   = dd->comm->bCGs;

    std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
    gmx_ga2la_t              &ga2la                  = *dd->ga2la;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    int a = dd->atomGrouping().subRange(cg_start, cg_start).begin();
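    /* Annotation: the empty sub-range starting at group cg_start yields
     * the index of the first atom of that group, i.e. the first local
     * atom index that needs to be (re)generated below.
     */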
    globalAtomIndices.resize(a);
    for (int zone = 0; zone < numZones; zone++)
    {
        int cg0;
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        int cg1    = zone2cg[zone+1];
        int cg1_p1 = cg0 + zone_ncg1[zone];

        for (int cg = cg0; cg < cg1; cg++)
        {
            int zone1 = zone;
            if (cg >= cg1_p1)
            {
                /* Signal that this cg is from more than one pulse away */
                zone1 += numZones;
            }
            int cg_gl = globalAtomGroupIndices[cg];
            if (bCGs)
            {
                for (int a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                {
                    globalAtomIndices.push_back(a_gl);
                    ga2la.insert(a_gl, {a, zone1});
                    a++;
                }
            }
            else
            {
                globalAtomIndices.push_back(cg_gl);
                ga2la.insert(cg_gl, {a, zone1});
                a++;
            }
        }
    }
}

//! Checks the charge-group assignments.
static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                          const char *where)
{
    int nerr = 0;
    if (bLocalCG == nullptr)
    {
        return nerr;
    }
    for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
    {
        if (!bLocalCG[dd->globalAtomGroupIndices[i]])
        {
            fprintf(stderr,
                    "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
            nerr++;
        }
    }
    size_t ngl = 0;
    for (int i = 0; i < ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->globalAtomGroupIndices.size())
    {
        fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
        nerr++;
    }

    return nerr;
}

//! Checks whether global and local atom indices are consistent.
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys, int ncg_sys,
                                    const char *where)
{
    int       nerr = 0;

    const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);

    if (dd->comm->DD_debug > 1)
    {
        std::vector<int> have(natoms_sys);
        for (int a = 0; a < numAtomsInZones; a++)
        {
            int globalAtomIndex = dd->globalAtomIndices[a];
            if (have[globalAtomIndex] > 0)
            {
                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
            }
            else
            {
                have[globalAtomIndex] = a + 1;
            }
        }
    }

    std::vector<int> have(numAtomsInZones);

    int              ngl = 0;
    for (int i = 0; i < natoms_sys; i++)
    {
        if (const auto entry = dd->ga2la->find(i))
        {
            const int a = entry->la;
            if (a >= numAtomsInZones)
            {
                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
                nerr++;
            }
            else
            {
                have[a] = 1;
                if (dd->globalAtomIndices[a] != i)
                {
                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != numAtomsInZones)
    {
        fprintf(stderr,
                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank, where, ngl, numAtomsInZones);
    }
    for (int a = 0; a < numAtomsInZones; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
        }
    }

    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
                  dd->rank, where, nerr);
    }
}

//! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
static void clearDDStateIndices(gmx_domdec_t *dd,
                                int           atomGroupStart,
                                int           atomStart)
{
    gmx_ga2la_t &ga2la = *dd->ga2la;

    if (atomStart == 0)
    {
        /* Clear the whole list without the overhead of searching */
        ga2la.clear();
    }
    else
    {
        const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
        for (int i = 0; i < numAtomsInZones; i++)
        {
            ga2la.erase(dd->globalAtomIndices[i]);
        }
    }

    char *bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
        {
            bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}

bool check_grid_jump(int64_t             step,
                     const gmx_domdec_t *dd,
                     real                cutoff,
                     const gmx_ddbox_t  *ddbox,
                     gmx_bool            bFatal)
{
    gmx_domdec_comm_t *comm    = dd->comm;
    bool               invalid = false;

    for (int d = 1; d < dd->ndim; d++)
    {
        const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
        const int                 dim       = dd->dim[d];
        const real                limit     = grid_jump_limit(comm, cutoff, d);
        real                      bfac      = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
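        /* Annotation: the boundary fractions of the neighboring cells,
         * gathered in dd_move_cellx, may not come closer to our own
         * boundaries than the jump limit, otherwise pair interactions
         * could be missed.
         */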
        if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
            (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
        {
            invalid = true;

            if (bFatal)
            {
                char buf[22];

                /* This error should never be triggered under normal
                 * circumstances, but you never know ...
                 */
                gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
                          gmx_step_str(step, buf),
                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
            }
        }
    }

    return invalid;
}

//! Return the duration of force calculations on this rank.
static float dd_force_load(gmx_domdec_comm_t *comm)
{
    float load;

    if (comm->eFlop)
    {
        load = comm->flop;
        if (comm->eFlop > 1)
        {
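            /* Annotation: with eFlop > 1 a random perturbation of up to
             * (eFlop - 1)*5% is applied to the flop-based load; this can
             * be used to artificially provoke imbalance when testing DLB.
             */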
            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
        }
    }
    else
    {
        load = comm->cycl[ddCyclF];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* Subtract the maximum of the last n cycle counts
             * to get rid of possible high counts due to other sources,
             * for instance system activity, that would otherwise
             * affect the dynamic load balancing.
             */
            load -= comm->cycl_max[ddCyclF];
        }

#if GMX_MPI
        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
        {
            float gpu_wait, gpu_wait_sum;

            gpu_wait = comm->cycl[ddCyclWaitGPU];
            if (comm->cycl_n[ddCyclF] > 1)
            {
                /* We should remove the WaitGPU time of the same MD step
                 * as the one with the maximum F time, since the F time
                 * and the wait time are not independent.
                 * Furthermore, the step for the max F time should be chosen
                 * the same on all ranks that share the same GPU.
                 * But to keep the code simple, we remove the average instead.
                 * The main reason for artificially long times at some steps
                 * is spurious CPU activity or MPI time, so we don't expect
                 * that changes in the GPU wait time matter a lot here.
                 */
                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
            }
            /* Sum the wait times over the ranks that share the same GPU */
            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                          comm->mpi_comm_gpu_shared);
            /* Replace the wait time by the average over the ranks */
            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
        }
#endif
    }

    return load;
}

//! Runs cell size checks and communicates the boundaries.
static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,
                                  rvec cell_ns_x0, rvec cell_ns_x1,
                                  int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                dim_ind, dim;

    comm = dd->comm;

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Without PBC we don't have restrictions on the outer cells */
        if (!(dim >= ddbox->npbcdim &&
              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
            isDlbOn(comm) &&
            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
            comm->cellsize_min[dim])
        {
            char buf[22];
            gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                      gmx_step_str(step, buf), dim2char(dim),
                      comm->cell_x1[dim] - comm->cell_x0[dim],
                      ddbox->skew_fac[dim],
                      dd->comm->cellsize_min[dim],
                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }
    }

    if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
    {
        /* Communicate the boundaries and update cell_ns_x0/1 */
        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
        if (isDlbOn(dd->comm) && dd->ndim > 1)
        {
            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
        }
    }
}

//! Computes and communicates the load distribution across the PP ranks.
static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    domdec_load_t     *load;
    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
    gmx_bool           bSepPME;

    if (debug)
    {
        fprintf(debug, "get_load_distribution start\n");
    }

    wallcycle_start(wcycle, ewcDDCOMMLOAD);

    comm = dd->comm;

    bSepPME = (dd->pme_nodeid >= 0);

    if (dd->ndim == 0 && bSepPME)
    {
        /* Without decomposition, but with PME nodes, we need the load */
        comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
        comm->load[0].pme = comm->cycl[ddCyclPME];
    }

    for (int d = dd->ndim - 1; d >= 0; d--)
    {
        const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
        const int                 dim       = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
        {
            load = &comm->load[d];
            if (isDlbOn(dd->comm))
            {
                cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
            }
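            /* Annotation: sbuf is packed as
             *   [load sum, load max,
             *    with DLB: load for the max/sum choice, relative cell volume,
             *              limit flags (only below the outermost dimension),
             *    with DLB and d > 0: fracLowerMax, fracUpperMin,
             *    with separate PME: PP-during-PME load, PME load]
             * matching the order in which the row master unpacks it below.
             */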
            int pos = 0;
            if (d == dd->ndim-1)
            {
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = cellsizes->fracLowerMax;
                        sbuf[pos++] = cellsizes->fracUpperMin;
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = cellsizes->fracLowerMax;
                        sbuf[pos++] = cellsizes->fracUpperMin;
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are set up such that the root always has rank 0.
             */
#if GMX_MPI
            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
                       load->load, load->nload*sizeof(float), MPI_BYTE,
                       0, comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the master along this row, process this row */
                RowMaster *rowMaster = nullptr;

                if (isDlbOn(comm))
                {
                    rowMaster = cellsizes->rowMaster.get();
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                int pos        = 0;
                for (int i = 0; i < dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max  = std::max(load->max, load->load[pos]);
                    pos++;
                    if (isDlbOn(dd->comm))
                    {
                        if (rowMaster->dlbIsLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum instead of the average load.
                             */
                            load->sum_m = std::max(load->sum_m, load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = std::min(load->cvol_min, load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            load->flags = gmx::roundToInt(load->load[pos++]);
                        }
                        if (d > 0)
                        {
                            rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
                            rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = std::max(load->mdf, load->load[pos]);
                        pos++;
                        load->pme = std::max(load->pme, load->load[pos]);
                        pos++;
                    }
                }
                if (isDlbOn(comm) && rowMaster->dlbIsLimited)
                {
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        comm->nload      += dd_load_count(comm);
        comm->load_step  += comm->cycl[ddCyclStep];
        comm->load_sum   += comm->load[0].sum;
        comm->load_max   += comm->load[0].max;
        if (isDlbOn(comm))
        {
            for (int d = 0; d < dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle, ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug, "get_load_distribution finished\n");
    }
}

/*! \brief Return the fraction of the step time spent in the balanceable
 * force calculation, averaged over the ranks and steps. */
static float dd_force_load_fraction(gmx_domdec_t *dd)
{
    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
    {
        return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}

/*! \brief Return the relative performance loss on the total run time
 * due to the force calculation load imbalance. */
static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
{
    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
    {
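        /* Annotation: (max load * #ranks - summed load) / (step time * #ranks)
         * is the average time ranks spend waiting for the most loaded rank,
         * expressed as a fraction of the total step time.
         */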
        return
            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
            (dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}

//! Print load-balance report e.g. at the end of a run.
static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm = dd->comm;

    /* Only the master rank prints loads and only if we measured loads */
    if (!DDMASTER(dd) || comm->nload == 0)
    {
        return;
    }

    char  buf[STRLEN];
    int   numPpRanks   = dd->nnodes;
    int   numPmeRanks  = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
    int   numRanks     = numPpRanks + numPmeRanks;
    float lossFraction = 0;

    /* Print the average load imbalance and performance loss */
    if (dd->nnodes > 1 && comm->load_sum > 0)
    {
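        /* Annotation: imbalance = maximum load / average load - 1,
         * so 0 means perfectly balanced.
         */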
        float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
        lossFraction    = dd_force_imb_perf_loss(dd);

        std::string msg         = "\nDynamic load balancing report:\n";
        std::string dlbStateStr;

        switch (dd->comm->dlbState)
        {
            case DlbState::offUser:
                dlbStateStr = "DLB was off during the run per user request.";
                break;
            case DlbState::offForever:
                /* Currently this can happen due to performance loss observed, cell size
                 * limitations or incompatibility with other settings observed during
                 * determineInitialDlbState(). */
                dlbStateStr = "DLB got disabled because it was unsuitable to use.";
                break;
            case DlbState::offCanTurnOn:
                dlbStateStr = "DLB was off during the run due to low measured imbalance.";
                break;
            case DlbState::offTemporarilyLocked:
                dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
                break;
            case DlbState::onCanTurnOff:
                dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
                break;
            case DlbState::onUser:
                dlbStateStr = "DLB was permanently on during the run per user request.";
                break;
            default:
                GMX_ASSERT(false, "Undocumented DLB state");
        }

        msg += " " + dlbStateStr + "\n";
        msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
        msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
                                 gmx::roundToInt(dd_force_load_fraction(dd)*100));
        msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
                                 lossFraction*100);
        fprintf(fplog, "%s", msg.c_str());
        fprintf(stderr, "\n%s", msg.c_str());
    }

    /* Print during what percentage of steps the load balancing was limited */
    bool dlbWasLimited = false;
    if (isDlbOn(comm))
    {
        sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
        for (int d = 0; d < dd->ndim; d++)
        {
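            /* Annotation: (200*count + 1)/(2*total) is an integer
             * approximation of the percentage 100*count/total.
             */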
            int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
            sprintf(buf+strlen(buf), " %c %d %%",
                    dim2char(dd->dim[d]), limitPercentage);
            if (limitPercentage >= 50)
            {
                dlbWasLimited = true;
            }
        }
        sprintf(buf + strlen(buf), "\n");
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }

    /* Print the performance loss due to separate PME - PP rank imbalance */
    float lossFractionPme = 0;
    if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
    {
        float pmeForceRatio = comm->load_pme/comm->load_mdf;
        lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
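        /* Annotation: a negative value means the PME ranks were waiting
         * for the PP ranks, a positive value the reverse; the loss is
         * scaled below by the fraction of the total ranks that sat idle.
         */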
        if (lossFractionPme <= 0)
        {
            lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
        }
        else
        {
            lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
        }
        sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
        sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }
    fprintf(fplog, "\n");
    fprintf(stderr, "\n");

    if (lossFraction >= DD_PERF_LOSS_WARN)
    {
        std::string message = gmx::formatString(
                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                    "      in the domain decomposition.\n", lossFraction*100);

        bool hadSuggestion = false;
        if (!isDlbOn(comm))
        {
            message      += "      You might want to use dynamic load balancing (option -dlb.)\n";
            hadSuggestion = true;
        }
        else if (dlbWasLimited)
        {
            message      += "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
            hadSuggestion = true;
        }
        message += gmx::formatString(
                    "      You can %sconsider manually changing the decomposition (option -dd);\n"
                    "      e.g. by using fewer domains along the box dimension in which there is\n"
                    "      considerable inhomogeneity in the simulated system.",
                    hadSuggestion ? "also " : "");

        fprintf(fplog, "%s\n", message.c_str());
        fprintf(stderr, "%s\n", message.c_str());
    }
    if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
    {
        sprintf(buf,
                "NOTE: %.1f %% performance was lost because the PME ranks\n"
                "      had %s work to do than the PP ranks.\n"
                "      You might want to %s the number of PME ranks\n"
                "      or %s the cut-off and the grid spacing.\n",
                std::fabs(lossFractionPme*100),
                (lossFractionPme < 0) ? "less"     : "more",
                (lossFractionPme < 0) ? "decrease" : "increase",
                (lossFractionPme < 0) ? "decrease" : "increase");
        fprintf(fplog, "%s\n", buf);
        fprintf(stderr, "%s\n", buf);
    }
}

//! Return the minimum communication volume.
static float dd_vol_min(gmx_domdec_t *dd)
{
    return dd->comm->load[0].cvol_min*dd->nnodes;
}

//! Return the DD load flags.
static int dd_load_flags(gmx_domdec_t *dd)
{
    return dd->comm->load[0].flags;
}

//! Return the reported load imbalance in force calculations.
static float dd_f_imbal(gmx_domdec_t *dd)
{
    if (dd->comm->load[0].sum > 0)
    {
        return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
    }
    else
    {
        /* Something is wrong in the cycle counting, report no load imbalance */
        return 0.0f;
    }
}

//! Returns DD load balance report.
static std::string
dd_print_load(gmx_domdec_t *dd,
              int64_t       step)
{
    gmx::StringOutputStream stream;
    gmx::TextWriter         log(&stream);

    int                     flags = dd_load_flags(dd);
    if (flags)
    {
        log.writeString("DD  load balancing is limited by minimum cell size in dimension");
        for (int d = 0; d < dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
            }
        }
        log.ensureLineBreak();
    }
    log.writeString("DD  step " + gmx::toString(step));
    if (isDlbOn(dd->comm))
    {
        log.writeStringFormatted("  vol min/aver %5.3f%c",
                                 dd_vol_min(dd), flags ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
    }
    log.ensureLineBreak();
    return stream.toString();
}

//! Prints DD load balance report in mdrun verbose mode.
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (isDlbOn(dd->comm))
    {
        fprintf(stderr, "vol %4.2f%c ",
                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
    }
}

//! Turns on dynamic load balancing if possible and needed.
static void turn_on_dlb(const gmx::MDLogger &mdlog,
                        gmx_domdec_t        *dd,
                        int64_t              step)
{
    gmx_domdec_comm_t *comm         = dd->comm;

    real               cellsize_min = comm->cellsize_min[dd->dim[0]];
    for (int d = 1; d < dd->ndim; d++)
    {
        cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
    }

    /* Turn off DLB if we're too close to the cell size limit. */
    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        GMX_LOG(mdlog.info).appendTextFormatted(
                "step %s Measured %.1f %% performance loss due to load imbalance, "
                "but the minimum cell size is smaller than 1.05 times the cell size limit. "
                "Will no longer try dynamic load balancing.",
                gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);

        comm->dlbState = DlbState::offForever;
        return;
    }

    GMX_LOG(mdlog.info).appendTextFormatted(
            "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
            gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
    comm->dlbState = DlbState::onCanTurnOff;

    /* Store the non-DLB performance, so we can check if DLB actually
     * improves performance.
     */
    GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
    comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];

    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
    for (int d = 0; d < dd->ndim; d++)
    {
        RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();

        if (rowMaster)
        {
            comm->load[d].sum_m = comm->load[d].sum;

            int nc = dd->nc[dd->dim[d]];
            for (int i = 0; i < nc; i++)
            {
                rowMaster->cellFrac[i] = i/static_cast<real>(nc);
                if (d > 0)
                {
                    rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
                    rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
                }
            }
            rowMaster->cellFrac[nc] = 1.0;
        }
    }
}

//! Turns off dynamic load balancing (but leaves it able to turn back on).
static void turn_off_dlb(const gmx::MDLogger &mdlog,
                         gmx_domdec_t        *dd,
                         int64_t              step)
{
    GMX_LOG(mdlog.info).appendText(
            "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
    dd->comm->dlbState                     = DlbState::offCanTurnOn;
    dd->comm->haveTurnedOffDlb             = true;
    dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
}

//! Turns off dynamic load balancing permanently.
static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
                                 gmx_domdec_t        *dd,
                                 int64_t              step)
{
    GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
    GMX_LOG(mdlog.info).appendText(
            "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
    dd->comm->dlbState = DlbState::offForever;
}

void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
{
    gmx_domdec_comm_t *comm;

    comm = cr->dd->comm;

    /* Turn on the DLB limiting (might have been on already) */
    comm->bPMELoadBalDLBLimits = TRUE;

    /* Change the cut-off limit */
    comm->PMELoadBal_max_cutoff = cutoff;

    if (debug)
    {
        fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
                comm->PMELoadBal_max_cutoff);
    }
}

//! Merges charge-group buffers.
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int  *ncg_cell,
                             gmx::ArrayRef<int> index_gl,
                             const int  *recv_i,
                             rvec *cg_cm,    rvec *recv_vr,
                             gmx::ArrayRef<int> cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int               shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    shift = ind->nrecv[ncell];
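    /* Annotation: ind->nrecv[ncell] appears to hold the total number of
     * charge groups received in this pulse, summed over all zones.
     */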
1429     for (cell = ncell-1; cell >= 0; cell--)
1430     {
1431         shift -= ind->nrecv[cell];
1432         if (shift > 0)
1433         {
1434             /* Move the cg's present from previous grid pulses */
1435             cg0                = ncg_cell[ncell+cell];
1436             cg1                = ncg_cell[ncell+cell+1];
1437             cgindex[cg1+shift] = cgindex[cg1];
1438             for (cg = cg1-1; cg >= cg0; cg--)
1439             {
1440                 index_gl[cg+shift] = index_gl[cg];
1441                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
1442                 cgindex[cg+shift] = cgindex[cg];
1443                 cginfo[cg+shift]  = cginfo[cg];
1444             }
1445             /* Correct the already stored send indices for the shift */
1446             for (p = 1; p <= pulse; p++)
1447             {
1448                 ind_p = &cd->ind[p];
1449                 cg0   = 0;
1450                 for (c = 0; c < cell; c++)
1451                 {
1452                     cg0 += ind_p->nsend[c];
1453                 }
1454                 cg1 = cg0 + ind_p->nsend[cell];
1455                 for (cg = cg0; cg < cg1; cg++)
1456                 {
1457                     ind_p->index[cg] += shift;
1458                 }
1459             }
1460         }
1461     }
1462
1463     /* Merge in the communicated buffers */
1464     shift    = 0;
1465     shift_at = 0;
1466     cg0      = 0;
1467     for (cell = 0; cell < ncell; cell++)
1468     {
1469         cg1 = ncg_cell[ncell+cell+1] + shift;
1470         if (shift_at > 0)
1471         {
1472             /* Correct the old cg indices */
1473             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
1474             {
1475                 cgindex[cg+1] += shift_at;
1476             }
1477         }
1478         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1479         {
1480             /* Copy this charge group from the buffer */
1481             index_gl[cg1] = recv_i[cg0];
1482             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
1483             /* Add it to the cgindex */
1484             cg_gl          = index_gl[cg1];
1485             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
1486             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
1487             cgindex[cg1+1] = cgindex[cg1] + nat;
1488             cg0++;
1489             cg1++;
1490             shift_at += nat;
1491         }
1492         shift                 += ind->nrecv[cell];
1493         ncg_cell[ncell+cell+1] = cg1;
1494     }
1495 }
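
/* Worked example of the shift logic above, for ncell = 2 and one earlier
 * pulse (group labels are illustrative). Before merging, the groups received
 * in pulse 0 are stored contiguously per cell:
 *
 *   cell 2: [g0 g1]          cell 3: [g2 g3]
 *
 * When pulse 1 receives one extra group for cell 2, the cell 3 block is
 * first moved up by shift = 1, the already stored send indices are corrected
 * by the same shift, and the new group is then copied into the gap:
 *
 *   cell 2: [g0 g1 gNew]     cell 3: [g2 g3]
 */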
1496
1497 //! Makes a range partitioning for the atom groups within a cell
1498 static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
1499                                int                           nzone,
1500                                int                           atomGroupStart,
1501                                const gmx::RangePartitioning &atomGroups)
1502 {
1503     /* Store the atom block boundaries for easy copying of communication buffers
1504      */
1505     int g = atomGroupStart;
1506     for (int zone = 0; zone < nzone; zone++)
1507     {
1508         for (gmx_domdec_ind_t &ind : cd->ind)
1509         {
1510             const auto range    = atomGroups.subRange(g, g + ind.nrecv[zone]);
1511             ind.cell2at0[zone]  = range.begin();
1512             ind.cell2at1[zone]  = range.end();
1513             g                  += ind.nrecv[zone];
1514         }
1515     }
1516 }
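
/* Illustration (a sketch, not code from this file): after this call the
 * atoms received for zone z in pulse p occupy the half-open range
 * [ind.cell2at0[z], ind.cell2at1[z]), so an out-of-place receive buffer
 * can be unpacked as
 *
 *   const gmx_domdec_ind_t &ind = cd->ind[p];
 *   for (int a = ind.cell2at0[z]; a < ind.cell2at1[z]; a++)
 *   {
 *       copy_rvec(recvBuffer[a - ind.cell2at0[z]], x[a]);
 *   }
 *
 * where recvBuffer and x are hypothetical names.
 */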
1517
1518 //! Returns whether a link is missing.
1519 static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
1520 {
1521     int      i;
1522     gmx_bool bMiss;
1523
1524     bMiss = FALSE;
1525     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
1526     {
1527         if (!bLocalCG[link->a[i]])
1528         {
1529             bMiss = TRUE;
1530         }
1531     }
1532
1533     return bMiss;
1534 }
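
/* In words: a charge group has a missing link when at least one of the
 * charge groups it is connected to through bonded interactions is not
 * present locally; such a group must still be communicated even when it
 * falls outside the plain cut-off criterion.
 */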
1535
1536 //! Domain corners for communication; at most 4 i-zones can see a j-domain
1537 typedef struct {
1538     //! The corners for the non-bonded communication.
1539     real c[DIM][4];
1540     //! Corner for rounding.
1541     real cr0;
1542     //! Corners for rounding.
1543     real cr1[4];
1544     //! Corners for bonded communication.
1545     real bc[DIM];
1546     //! Corner for rounding for bonded communication.
1547     real bcr1;
1548 } dd_corners_t;
1549
1550 //! Determine the corners of the domain(s) we are communicating with.
1551 static void
1552 set_dd_corners(const gmx_domdec_t *dd,
1553                int dim0, int dim1, int dim2,
1554                gmx_bool bDistMB,
1555                dd_corners_t *c)
1556 {
1557     const gmx_domdec_comm_t  *comm;
1558     const gmx_domdec_zones_t *zones;
1559     int i, j;
1560
1561     comm = dd->comm;
1562
1563     zones = &comm->zones;
1564
1565     /* Keep the compiler happy */
1566     c->cr0  = 0;
1567     c->bcr1 = 0;
1568
1569     /* The first dimension is equal for all cells */
1570     c->c[0][0] = comm->cell_x0[dim0];
1571     if (bDistMB)
1572     {
1573         c->bc[0] = c->c[0][0];
1574     }
1575     if (dd->ndim >= 2)
1576     {
1577         dim1 = dd->dim[1];
1578         /* This cell row is only seen from the first row */
1579         c->c[1][0] = comm->cell_x0[dim1];
1580         /* All rows can see this row */
1581         c->c[1][1] = comm->cell_x0[dim1];
1582         if (isDlbOn(dd->comm))
1583         {
1584             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1585             if (bDistMB)
1586             {
1587                 /* For the multi-body distance we need the maximum */
1588                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1589             }
1590         }
1591         /* Set the upper-right corner for rounding */
1592         c->cr0 = comm->cell_x1[dim0];
1593
1594         if (dd->ndim >= 3)
1595         {
1596             dim2 = dd->dim[2];
1597             for (j = 0; j < 4; j++)
1598             {
1599                 c->c[2][j] = comm->cell_x0[dim2];
1600             }
1601             if (isDlbOn(dd->comm))
1602             {
1603                 /* Use the maximum of the i-cells that see a j-cell */
1604                 for (i = 0; i < zones->nizone; i++)
1605                 {
1606                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1607                     {
1608                         if (j >= 4)
1609                         {
1610                             c->c[2][j-4] =
1611                                 std::max(c->c[2][j-4],
1612                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1613                         }
1614                     }
1615                 }
1616                 if (bDistMB)
1617                 {
1618                     /* For the multi-body distance we need the maximum */
1619                     c->bc[2] = comm->cell_x0[dim2];
1620                     for (i = 0; i < 2; i++)
1621                     {
1622                         for (j = 0; j < 2; j++)
1623                         {
1624                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1625                         }
1626                     }
1627                 }
1628             }
1629
1630             /* Set the upper-right corner for rounding */
1631             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1632              * Only cell (0,0,0) can see cell 7 (1,1,1)
1633              */
1634             c->cr1[0] = comm->cell_x1[dim1];
1635             c->cr1[3] = comm->cell_x1[dim1];
1636             if (isDlbOn(dd->comm))
1637             {
1638                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1639                 if (bDistMB)
1640                 {
1641                     /* For the multi-body distance we need the maximum */
1642                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1643                 }
1644             }
1645         }
1646     }
1647 }
1648
1649 /*! \brief Add the atom groups we need to send in this pulse from this
1650  * zone to \p localAtomGroups and \p work. */
1651 static void
1652 get_zone_pulse_cgs(gmx_domdec_t *dd,
1653                    int zonei, int zone,
1654                    int cg0, int cg1,
1655                    gmx::ArrayRef<const int> globalAtomGroupIndices,
1656                    const gmx::RangePartitioning &atomGroups,
1657                    int dim, int dim_ind,
1658                    int dim0, int dim1, int dim2,
1659                    real r_comm2, real r_bcomm2,
1660                    matrix box,
1661                    bool distanceIsTriclinic,
1662                    rvec *normal,
1663                    real skew_fac2_d, real skew_fac_01,
1664                    rvec *v_d, rvec *v_0, rvec *v_1,
1665                    const dd_corners_t *c,
1666                    const rvec sf2_round,
1667                    gmx_bool bDistBonded,
1668                    gmx_bool bBondComm,
1669                    gmx_bool bDist2B,
1670                    gmx_bool bDistMB,
1671                    rvec *cg_cm,
1672                    const int *cginfo,
1673                    std::vector<int>     *localAtomGroups,
1674                    dd_comm_setup_work_t *work)
1675 {
1676     gmx_domdec_comm_t *comm;
1677     gmx_bool           bScrew;
1678     gmx_bool           bDistMB_pulse;
1679     int                cg, i;
1680     real               r2, rb2, r, tric_sh;
1681     rvec               rn, rb;
1682     int                dimd;
1683     int                nsend_z, nat;
1684
1685     comm = dd->comm;
1686
1687     bScrew = (dd->bScrewPBC && dim == XX);
1688
1689     bDistMB_pulse = (bDistMB && bDistBonded);
1690
1691     /* Unpack the work data */
1692     std::vector<int>       &ibuf = work->atomGroupBuffer;
1693     std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1694     nsend_z                      = 0;
1695     nat                          = work->nat;
1696
1697     for (cg = cg0; cg < cg1; cg++)
1698     {
1699         r2  = 0;
1700         rb2 = 0;
1701         if (!distanceIsTriclinic)
1702         {
1703             /* Rectangular direction, easy */
1704             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1705             if (r > 0)
1706             {
1707                 r2 += r*r;
1708             }
1709             if (bDistMB_pulse)
1710             {
1711                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1712                 if (r > 0)
1713                 {
1714                     rb2 += r*r;
1715                 }
1716             }
1717             /* Rounding gives at most a 16% reduction
1718              * in communicated atoms
1719              */
1720             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1721             {
1722                 r = cg_cm[cg][dim0] - c->cr0;
1723                 /* This is the first dimension, so always r >= 0 */
1724                 r2 += r*r;
1725                 if (bDistMB_pulse)
1726                 {
1727                     rb2 += r*r;
1728                 }
1729             }
1730             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1731             {
1732                 r = cg_cm[cg][dim1] - c->cr1[zone];
1733                 if (r > 0)
1734                 {
1735                     r2 += r*r;
1736                 }
1737                 if (bDistMB_pulse)
1738                 {
1739                     r = cg_cm[cg][dim1] - c->bcr1;
1740                     if (r > 0)
1741                     {
1742                         rb2 += r*r;
1743                     }
1744                 }
1745             }
1746         }
1747         else
1748         {
1749             /* Triclinic direction, more complicated */
1750             clear_rvec(rn);
1751             clear_rvec(rb);
1752             /* Rounding, conservative as the skew_fac multiplication
1753              * will slightly underestimate the distance.
1754              */
1755             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1756             {
1757                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1758                 for (i = dim0+1; i < DIM; i++)
1759                 {
1760                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1761                 }
1762                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1763                 if (bDistMB_pulse)
1764                 {
1765                     rb[dim0] = rn[dim0];
1766                     rb2      = r2;
1767                 }
1768                 /* Take care that the cell planes along dim0 might not
1769                  * be orthogonal to those along dim1 and dim2.
1770                  */
1771                 for (i = 1; i <= dim_ind; i++)
1772                 {
1773                     dimd = dd->dim[i];
1774                     if (normal[dim0][dimd] > 0)
1775                     {
1776                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1777                         if (bDistMB_pulse)
1778                         {
1779                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1780                         }
1781                     }
1782                 }
1783             }
1784             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1785             {
1786                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1787                 tric_sh   = 0;
1788                 for (i = dim1+1; i < DIM; i++)
1789                 {
1790                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1791                 }
1792                 rn[dim1] += tric_sh;
1793                 if (rn[dim1] > 0)
1794                 {
1795                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1796                     /* Take care of coupling of the distances
1797                      * to the planes along dim0 and dim1 through dim2.
1798                      */
1799                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1800                     /* Take care that the cell planes along dim1
1801                      * might not be orthogonal to that along dim2.
1802                      */
1803                     if (normal[dim1][dim2] > 0)
1804                     {
1805                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1806                     }
1807                 }
1808                 if (bDistMB_pulse)
1809                 {
1810                     rb[dim1] +=
1811                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1812                     if (rb[dim1] > 0)
1813                     {
1814                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1815                         /* Take care of coupling of the distances
1816                          * to the planes along dim0 and dim1 through dim2.
1817                          */
1818                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1819                         /* Take care that the cell planes along dim1
1820                          * might not be orthogonal to that along dim2.
1821                          */
1822                         if (normal[dim1][dim2] > 0)
1823                         {
1824                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1825                         }
1826                     }
1827                 }
1828             }
1829             /* The distance along the communication direction */
1830             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1831             tric_sh  = 0;
1832             for (i = dim+1; i < DIM; i++)
1833             {
1834                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1835             }
1836             rn[dim] += tric_sh;
1837             if (rn[dim] > 0)
1838             {
1839                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1840                 /* Take care of coupling of the distances
1841                  * to the planes along dim0 and dim1 through dim2.
1842                  */
1843                 if (dim_ind == 1 && zonei == 1)
1844                 {
1845                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1846                 }
1847             }
1848             if (bDistMB_pulse)
1849             {
1850                 clear_rvec(rb);
1851                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1852                 if (rb[dim] > 0)
1853                 {
1854                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1855                     /* Take care of coupling of the distances
1856                      * to the planes along dim0 and dim1 through dim2.
1857                      */
1858                     if (dim_ind == 1 && zonei == 1)
1859                     {
1860                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1861                     }
1862                 }
1863             }
1864         }
1865
1866         if (r2 < r_comm2 ||
1867             (bDistBonded &&
1868              ((bDistMB && rb2 < r_bcomm2) ||
1869               (bDist2B && r2  < r_bcomm2)) &&
1870              (!bBondComm ||
1871               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1872                missing_link(comm->cglink, globalAtomGroupIndices[cg],
1873                             comm->bLocalCG)))))
1874         {
1875             /* Store the local and global atom group indices and position */
1876             localAtomGroups->push_back(cg);
1877             ibuf.push_back(globalAtomGroupIndices[cg]);
1878             nsend_z++;
1879
1880             rvec posPbc;
1881             if (dd->ci[dim] == 0)
1882             {
1883                 /* Correct cg_cm for pbc */
1884                 rvec_add(cg_cm[cg], box[dim], posPbc);
1885                 if (bScrew)
1886                 {
1887                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1888                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1889                 }
1890             }
1891             else
1892             {
1893                 copy_rvec(cg_cm[cg], posPbc);
1894             }
1895             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1896
1897             nat += atomGroups.block(cg).size();
1898         }
1899     }
1900
1901     work->nat        = nat;
1902     work->nsend_zone = nsend_z;
1903 }
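
/* Summary of the selection criterion applied above. In the rectangular
 * case a group at position x is sent to the zone when
 *
 *   r2 = sum_d max(0, x[d] - corner[d])^2 < r_comm2
 *
 * and, for bonded distances on the first pulse, when the analogous rb2
 * against the bonded corners is below r_bcomm2 and, with bBondComm, the
 * group still has a missing link. The triclinic branch applies the same
 * idea with skew factors and plane-coupling corrections.
 */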
1904
1905 //! Clears the thread-local communication setup work data.
1906 static void clearCommSetupData(dd_comm_setup_work_t *work)
1907 {
1908     work->localAtomGroupBuffer.clear();
1909     work->atomGroupBuffer.clear();
1910     work->positionBuffer.clear();
1911     work->nat        = 0;
1912     work->nsend_zone = 0;
1913 }
1914
1915 //! Prepare DD communication.
1916 static void setup_dd_communication(gmx_domdec_t *dd,
1917                                    matrix box, gmx_ddbox_t *ddbox,
1918                                    t_forcerec *fr,
1919                                    t_state *state,
1920                                    PaddedVector<gmx::RVec> *f)
1921 {
1922     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1923     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1924     int                    c, i, cg, cg_gl, nrcg;
1925     int                   *zone_cg_range, pos_cg;
1926     gmx_domdec_comm_t     *comm;
1927     gmx_domdec_zones_t    *zones;
1928     gmx_domdec_comm_dim_t *cd;
1929     cginfo_mb_t           *cginfo_mb;
1930     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1931     dd_corners_t           corners;
1932     rvec                  *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1933     real                   skew_fac2_d, skew_fac_01;
1934     rvec                   sf2_round;
1935
1936     if (debug)
1937     {
1938         fprintf(debug, "Setting up DD communication\n");
1939     }
1940
1941     comm  = dd->comm;
1942
1943     if (comm->dth.empty())
1944     {
1945         /* Initialize the thread data.
1946          * This cannot be done in init_domain_decomposition,
1947          * as the number of threads is determined later.
1948          */
1949         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1950         comm->dth.resize(numThreads);
1951     }
1952
1953     switch (fr->cutoff_scheme)
1954     {
1955         case ecutsGROUP:
1956             cg_cm = fr->cg_cm;
1957             break;
1958         case ecutsVERLET:
1959             cg_cm = state->x.rvec_array();
1960             break;
1961         default:
1962             gmx_incons("unimplemented");
1963     }
1964
1965     bBondComm = comm->bBondComm;
1966
1967     /* Do we need to determine extra distances for multi-body bondeds? */
1968     bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
1969
1970     /* Do we need to determine extra distances for only two-body bondeds? */
1971     bDist2B = (bBondComm && !bDistMB);
1972
1973     const real r_comm2  = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff));
1974     const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff_mbody));
1975
1976     if (debug)
1977     {
1978         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1979     }
1980
1981     zones = &comm->zones;
1982
1983     dim0 = dd->dim[0];
1984     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1985     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1986
1987     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1988
1989     /* Triclinic stuff */
1990     normal      = ddbox->normal;
1991     skew_fac_01 = 0;
1992     if (dd->ndim >= 2)
1993     {
1994         v_0 = ddbox->v[dim0];
1995         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1996         {
1997             /* Determine the coupling coefficient for the distances
1998              * to the cell planes along dim0 and dim1 through dim2.
1999              * This is required for correct rounding.
2000              */
2001             skew_fac_01 =
2002                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
2003             if (debug)
2004             {
2005                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
2006             }
2007         }
2008     }
2009     if (dd->ndim >= 3)
2010     {
2011         v_1 = ddbox->v[dim1];
2012     }
2013
2014     zone_cg_range = zones->cg_range;
2015     cginfo_mb     = fr->cginfo_mb;
2016
2017     zone_cg_range[0]   = 0;
2018     zone_cg_range[1]   = dd->ncg_home;
2019     comm->zone_ncg1[0] = dd->ncg_home;
2020     pos_cg             = dd->ncg_home;
2021
2022     nat_tot = comm->atomRanges.numHomeAtoms();
2023     nzone   = 1;
2024     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
2025     {
2026         dim = dd->dim[dim_ind];
2027         cd  = &comm->cd[dim_ind];
2028
2029         /* Check if we need to compute triclinic distances along this dim */
2030         bool distanceIsTriclinic = false;
2031         for (i = 0; i <= dim_ind; i++)
2032         {
2033             if (ddbox->tric_dir[dd->dim[i]])
2034             {
2035                 distanceIsTriclinic = true;
2036             }
2037         }
2038
2039         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
2040         {
2041             /* No pbc in this dimension, the first node should not communicate. */
2042             nzone_send = 0;
2043         }
2044         else
2045         {
2046             nzone_send = nzone;
2047         }
2048
2049         v_d                = ddbox->v[dim];
2050         skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
2051
2052         cd->receiveInPlace = true;
2053         for (int p = 0; p < cd->numPulses(); p++)
2054         {
2055             /* Only atoms communicated in the first pulse are used
2056              * for multi-body bonded interactions or for bBondComm.
2057              */
2058             bDistBonded = ((bDistMB || bDist2B) && p == 0);
2059
2060             gmx_domdec_ind_t *ind = &cd->ind[p];
2061
2062             /* Thread 0 writes in the global index array */
2063             ind->index.clear();
2064             clearCommSetupData(&comm->dth[0]);
2065
2066             for (zone = 0; zone < nzone_send; zone++)
2067             {
2068                 if (dim_ind > 0 && distanceIsTriclinic)
2069                 {
2070                     /* Determine slightly more optimized skew_fac's
2071                      * for rounding.
2072                      * This reduces the number of communicated atoms
2073                      * by about 10% for 3D DD of rhombic dodecahedra.
2074                      */
2075                     for (dimd = 0; dimd < dim; dimd++)
2076                     {
2077                         sf2_round[dimd] = 1;
2078                         if (ddbox->tric_dir[dimd])
2079                         {
2080                             for (i = dd->dim[dimd]+1; i < DIM; i++)
2081                             {
2082                                 /* If we are shifted in dimension i
2083                                  * and the cell plane is tilted forward
2084                                  * in dimension i, skip this coupling.
2085                                  */
2086                                 if (!(zones->shift[nzone+zone][i] &&
2087                                       ddbox->v[dimd][i][dimd] >= 0))
2088                                 {
2089                                     sf2_round[dimd] +=
2090                                         gmx::square(ddbox->v[dimd][i][dimd]);
2091                                 }
2092                             }
2093                             sf2_round[dimd] = 1/sf2_round[dimd];
2094                         }
2095                     }
2096                 }
2097
2098                 zonei = zone_perm[dim_ind][zone];
2099                 if (p == 0)
2100                 {
2101                     /* Here we permute the zones to obtain a convenient order
2102                      * for neighbor searching
2103                      */
2104                     cg0 = zone_cg_range[zonei];
2105                     cg1 = zone_cg_range[zonei+1];
2106                 }
2107                 else
2108                 {
2109                     /* Look only at the cg's received in the previous grid pulse
2110                      */
2111                     cg1 = zone_cg_range[nzone+zone+1];
2112                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2113                 }
2114
2115                 const int numThreads = static_cast<int>(comm->dth.size());
2116 #pragma omp parallel for num_threads(numThreads) schedule(static)
2117                 for (int th = 0; th < numThreads; th++)
2118                 {
2119                     try
2120                     {
2121                         dd_comm_setup_work_t &work = comm->dth[th];
2122
2123                         /* Retain data accumulated into buffers of thread 0 */
2124                         if (th > 0)
2125                         {
2126                             clearCommSetupData(&work);
2127                         }
2128
2129                         int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
2130                         int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2131
2132                         /* Get the cg's for this pulse in this zone */
2133                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2134                                            dd->globalAtomGroupIndices,
2135                                            dd->atomGrouping(),
2136                                            dim, dim_ind, dim0, dim1, dim2,
2137                                            r_comm2, r_bcomm2,
2138                                            box, distanceIsTriclinic,
2139                                            normal, skew_fac2_d, skew_fac_01,
2140                                            v_d, v_0, v_1, &corners, sf2_round,
2141                                            bDistBonded, bBondComm,
2142                                            bDist2B, bDistMB,
2143                                            cg_cm, fr->cginfo,
2144                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2145                                            &work);
2146                     }
2147                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2148                 } // end of the omp parallel for over threads
2149
2150                 std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
2151                 std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
2152                 ind->nsend[zone]  = comm->dth[0].nsend_zone;
2153                 /* Append data of threads>=1 to the communication buffers */
2154                 for (int th = 1; th < numThreads; th++)
2155                 {
2156                     const dd_comm_setup_work_t &dth = comm->dth[th];
2157
2158                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2159                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2160                     positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2161                     comm->dth[0].nat += dth.nat;
2162                     ind->nsend[zone] += dth.nsend_zone;
2163                 }
2164             }
2165             /* Clear the counts in case we do not have pbc */
2166             for (zone = nzone_send; zone < nzone; zone++)
2167             {
2168                 ind->nsend[zone] = 0;
2169             }
2170             ind->nsend[nzone]     = ind->index.size();
2171             ind->nsend[nzone + 1] = comm->dth[0].nat;
2172             /* Communicate the number of cg's and atoms to receive */
2173             ddSendrecv(dd, dim_ind, dddirBackward,
2174                        ind->nsend, nzone+2,
2175                        ind->nrecv, nzone+2);
2176
2177             if (p > 0)
2178             {
2179                 /* We can receive in place if only the last zone is not empty */
2180                 for (zone = 0; zone < nzone-1; zone++)
2181                 {
2182                     if (ind->nrecv[zone] > 0)
2183                     {
2184                         cd->receiveInPlace = false;
2185                     }
2186                 }
2187             }
2188
2189             int receiveBufferSize = 0;
2190             if (!cd->receiveInPlace)
2191             {
2192                 receiveBufferSize = ind->nrecv[nzone];
2193             }
2194             /* These buffers are actually only needed when not receiving in place */
2195             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2196             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2197
2198             dd_comm_setup_work_t     &work = comm->dth[0];
2199
2200             /* Make space for the global cg indices */
2201             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2202             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2203             /* Communicate the global cg indices */
2204             gmx::ArrayRef<int> integerBufferRef;
2205             if (cd->receiveInPlace)
2206             {
2207                 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2208             }
2209             else
2210             {
2211                 integerBufferRef = globalAtomGroupBuffer.buffer;
2212             }
2213             ddSendrecv<int>(dd, dim_ind, dddirBackward,
2214                             work.atomGroupBuffer, integerBufferRef);
2215
2216             /* Make space for cg_cm */
2217             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2218             if (fr->cutoff_scheme == ecutsGROUP)
2219             {
2220                 cg_cm = fr->cg_cm;
2221             }
2222             else
2223             {
2224                 cg_cm = state->x.rvec_array();
2225             }
2226             /* Communicate cg_cm */
2227             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2228             if (cd->receiveInPlace)
2229             {
2230                 rvecBufferRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cg_cm + pos_cg), ind->nrecv[nzone]);
2231             }
2232             else
2233             {
2234                 rvecBufferRef = rvecBuffer.buffer;
2235             }
2236             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2237                                   work.positionBuffer, rvecBufferRef);
2238
2239             /* Make the charge group index */
2240             if (cd->receiveInPlace)
2241             {
2242                 zone = (p == 0 ? 0 : nzone - 1);
2243                 while (zone < nzone)
2244                 {
2245                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
2246                     {
2247                         cg_gl                              = dd->globalAtomGroupIndices[pos_cg];
2248                         fr->cginfo[pos_cg]                 = ddcginfo(cginfo_mb, cg_gl);
2249                         nrcg                               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
2250                         dd->atomGrouping_.appendBlock(nrcg);
2251                         if (bBondComm)
2252                         {
2253                             /* Update the charge group presence,
2254                              * so we can use it in the next pass of the loop.
2255                              */
2256                             comm->bLocalCG[cg_gl] = TRUE;
2257                         }
2258                         pos_cg++;
2259                     }
2260                     if (p == 0)
2261                     {
2262                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2263                     }
2264                     zone++;
2265                     zone_cg_range[nzone+zone] = pos_cg;
2266                 }
2267             }
2268             else
2269             {
2270                 /* This part of the code is never executed with bBondComm. */
2271                 std::vector<int> &atomGroupsIndex = dd->atomGrouping_.rawIndex();
2272                 atomGroupsIndex.resize(numAtomGroupsNew + 1);
2273
2274                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2275                                  dd->globalAtomGroupIndices, integerBufferRef.data(),
2276                                  cg_cm, as_rvec_array(rvecBufferRef.data()),
2277                                  atomGroupsIndex,
2278                                  fr->cginfo_mb, fr->cginfo);
2279                 pos_cg += ind->nrecv[nzone];
2280             }
2281             nat_tot += ind->nrecv[nzone+1];
2282         }
2283         if (!cd->receiveInPlace)
2284         {
2285             /* Store the atom block for easy copying of communication buffers */
2286             make_cell2at_index(cd, nzone, zone_cg_range[nzone], dd->atomGrouping());
2287         }
2288         nzone += nzone;
2289     }
2290
2291     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2292
2293     if (!bBondComm)
2294     {
2295         /* We don't need to update cginfo, since that was already done above.
2296          * So we pass nullptr for the forcerec.
2297          */
2298         dd_set_cginfo(dd->globalAtomGroupIndices,
2299                       dd->ncg_home, dd->globalAtomGroupIndices.size(),
2300                       nullptr, comm->bLocalCG);
2301     }
2302
2303     if (debug)
2304     {
2305         fprintf(debug, "Finished setting up DD communication, zones:");
2306         for (c = 0; c < zones->n; c++)
2307         {
2308             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2309         }
2310         fprintf(debug, "\n");
2311     }
2312 }
2313
2314 //! Set boundaries for the charge group range.
2315 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2316 {
2317     int c;
2318
2319     for (c = 0; c < zones->nizone; c++)
2320     {
2321         zones->izone[c].cg1  = zones->cg_range[c+1];
2322         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2323         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2324     }
2325 }
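
/* Illustration (indices are schematic): for i-zone c the pair search loops
 * over j charge groups in the half-open range [jcg0, jcg1), which after
 * this call corresponds to the zone range [j0, j1) that zone c sees, e.g.
 *
 *   izone 0: j0 = 0, j1 = nzone  ->  jcg0 = cg_range[0], jcg1 = cg_range[nzone]
 */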
2326
2327 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2328  *
2329  * Also sets the atom density for the home zone when \p zone_start=0.
2330  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2331  * how many charge groups will move but are still part of the current range.
2332  * \todo When converting domdec to use proper classes, all these variables
2333  *       should be private and a method should return the correct count
2334  *       depending on an internal state.
2335  *
2336  * \param[in,out] dd          The domain decomposition struct
2337  * \param[in]     box         The box
2338  * \param[in]     ddbox       The domain decomposition box struct
2339  * \param[in]     zone_start  The start of the zone range to set sizes for
2340  * \param[in]     zone_end    The end of the zone range to set sizes for
2341  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should move but are still present in dd->comm->zones.cg_range
2342  */
2343 static void set_zones_size(gmx_domdec_t *dd,
2344                            matrix box, const gmx_ddbox_t *ddbox,
2345                            int zone_start, int zone_end,
2346                            int numMovedChargeGroupsInHomeZone)
2347 {
2348     gmx_domdec_comm_t  *comm;
2349     gmx_domdec_zones_t *zones;
2350     gmx_bool            bDistMB;
2351     int                 z, zi, d, dim;
2352     real                rcs, rcmbs;
2353     int                 i, j;
2354     real                vol;
2355
2356     comm = dd->comm;
2357
2358     zones = &comm->zones;
2359
2360     /* Do we need to determine extra distances for multi-body bondeds? */
2361     bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
2362
2363     for (z = zone_start; z < zone_end; z++)
2364     {
2365         /* Copy cell limits to zone limits.
2366          * Valid for non-DD dims and non-shifted dims.
2367          */
2368         copy_rvec(comm->cell_x0, zones->size[z].x0);
2369         copy_rvec(comm->cell_x1, zones->size[z].x1);
2370     }
2371
2372     for (d = 0; d < dd->ndim; d++)
2373     {
2374         dim = dd->dim[d];
2375
2376         for (z = 0; z < zones->n; z++)
2377         {
2378             /* With a staggered grid we have different sizes
2379              * for non-shifted dimensions.
2380              */
2381             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2382             {
2383                 if (d == 1)
2384                 {
2385                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2386                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2387                 }
2388                 else if (d == 2)
2389                 {
2390                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2391                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2392                 }
2393             }
2394         }
2395
2396         rcs   = comm->cutoff;
2397         rcmbs = comm->cutoff_mbody;
2398         if (ddbox->tric_dir[dim])
2399         {
2400             rcs   /= ddbox->skew_fac[dim];
2401             rcmbs /= ddbox->skew_fac[dim];
2402         }
2403
2404         /* Set the lower limit for the shifted zone dimensions */
2405         for (z = zone_start; z < zone_end; z++)
2406         {
2407             if (zones->shift[z][dim] > 0)
2408             {
2409                 dim = dd->dim[d];
2410                 if (!isDlbOn(dd->comm) || d == 0)
2411                 {
2412                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2413                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2414                 }
2415                 else
2416                 {
2417                     /* Here we take the lower limit of the zone from
2418                      * the lowest domain of the zone below.
2419                      */
2420                     if (z < 4)
2421                     {
2422                         zones->size[z].x0[dim] =
2423                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2424                     }
2425                     else
2426                     {
2427                         if (d == 1)
2428                         {
2429                             zones->size[z].x0[dim] =
2430                                 zones->size[zone_perm[2][z-4]].x0[dim];
2431                         }
2432                         else
2433                         {
2434                             zones->size[z].x0[dim] =
2435                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2436                         }
2437                     }
2438                     /* A temporary limit; it is updated below */
2439                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2440
2441                     if (bDistMB)
2442                     {
2443                         for (zi = 0; zi < zones->nizone; zi++)
2444                         {
2445                             if (zones->shift[zi][dim] == 0)
2446                             {
2447                                 /* This takes the whole zone into account.
2448                                  * With multiple pulses this will lead
2449                                  * to a larger zone than strictly necessary.
2450                                  */
2451                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2452                                                                   zones->size[zi].x1[dim]+rcmbs);
2453                             }
2454                         }
2455                     }
2456                 }
2457             }
2458         }
2459
2460         /* Loop over the i-zones to set the upper limit of each
2461          * j-zone they see.
2462          */
2463         for (zi = 0; zi < zones->nizone; zi++)
2464         {
2465             if (zones->shift[zi][dim] == 0)
2466             {
2467                 /* We should only use zones up to zone_end */
2468                 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2469                 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2470                 {
2471                     if (zones->shift[z][dim] > 0)
2472                     {
2473                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2474                                                           zones->size[zi].x1[dim]+rcs);
2475                     }
2476                 }
2477             }
2478         }
2479     }
2480
2481     for (z = zone_start; z < zone_end; z++)
2482     {
2483         /* Initialization only required to keep the compiler happy */
2484         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2485         int  nc, c;
2486
2487         /* To determine the bounding box for a zone we need to find
2488          * the extremes over 4, 2 or 1 corner points.
2489          */
2490         nc = 1 << (ddbox->nboundeddim - 1);
2491
2492         for (c = 0; c < nc; c++)
2493         {
2494             /* Set up a zone corner at x=0, ignoring triclinic couplings */
2495             corner[XX] = 0;
2496             if ((c & 1) == 0)
2497             {
2498                 corner[YY] = zones->size[z].x0[YY];
2499             }
2500             else
2501             {
2502                 corner[YY] = zones->size[z].x1[YY];
2503             }
2504             if ((c & 2) == 0)
2505             {
2506                 corner[ZZ] = zones->size[z].x0[ZZ];
2507             }
2508             else
2509             {
2510                 corner[ZZ] = zones->size[z].x1[ZZ];
2511             }
2512             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
2513                 box[ZZ][1 - dd->dim[0]] != 0)
2514             {
2515                 /* With 1D domain decomposition the cg's are not in
2516                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2517                  * Shift the corner of the z-vector back to along the box
2518                  * vector of dimension d, so it will later end up at 0 along d.
2519                  * This can affect the location of this corner along dd->dim[0]
2520                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2521                  */
2522                 int d = 1 - dd->dim[0];
2523
2524                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2525             }
2526             /* Apply the triclinic couplings */
2527             assert(ddbox->npbcdim <= DIM);
2528             for (i = YY; i < ddbox->npbcdim; i++)
2529             {
2530                 for (j = XX; j < i; j++)
2531                 {
2532                     corner[j] += corner[i]*box[i][j]/box[i][i];
2533                 }
2534             }
2535             if (c == 0)
2536             {
2537                 copy_rvec(corner, corner_min);
2538                 copy_rvec(corner, corner_max);
2539             }
2540             else
2541             {
2542                 for (i = 0; i < DIM; i++)
2543                 {
2544                     corner_min[i] = std::min(corner_min[i], corner[i]);
2545                     corner_max[i] = std::max(corner_max[i], corner[i]);
2546                 }
2547             }
2548         }
2549         /* Copy the extreme corners without offset along x */
2550         for (i = 0; i < DIM; i++)
2551         {
2552             zones->size[z].bb_x0[i] = corner_min[i];
2553             zones->size[z].bb_x1[i] = corner_max[i];
2554         }
2555         /* Add the offset along x */
2556         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2557         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2558     }
2559
2560     if (zone_start == 0)
2561     {
2562         vol = 1;
2563         for (dim = 0; dim < DIM; dim++)
2564         {
2565             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2566         }
2567         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2568     }
2569
2570     if (debug)
2571     {
2572         for (z = zone_start; z < zone_end; z++)
2573         {
2574             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2575                     z,
2576                     zones->size[z].x0[XX], zones->size[z].x1[XX],
2577                     zones->size[z].x0[YY], zones->size[z].x1[YY],
2578                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2579             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2580                     z,
2581                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2582                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2583                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2584         }
2585     }
2586 }
2587
2588 //! Comparator for sorting charge groups.
2589 static bool comp_cgsort(const gmx_cgsort_t &a, const gmx_cgsort_t &b)
2590 {
2591
2592     if (a.nsc == b.nsc)
2593     {
2594         return a.ind_gl < b.ind_gl;
2595     }
2596     return a.nsc < b.nsc;
2597 }
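
/* A minimal usage sketch: the primary key is the ns grid cell index, with
 * ties broken by the global group index,
 *
 *   std::sort(cgsort.begin(), cgsort.end(), comp_cgsort);
 *
 * Groups flagged as moved have nsc >= NSGRID_SIGNAL_MOVED_FAC*ncells and
 * therefore sort to the end, which lets dd_sort_order() below simply
 * truncate them off.
 */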
2598
2599 /*! \brief Order data in \p dataToSort according to \p sort
2600  *
2601  * Note: both buffers should have at least \p sort.size() elements.
2602  */
2603 template <typename T>
2604 static void
2605 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2606             gmx::ArrayRef<T>                   dataToSort,
2607             gmx::ArrayRef<T>                   sortBuffer)
2608 {
2609     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2610     GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2611
2612     /* Order the data into the temporary buffer */
2613     size_t i = 0;
2614     for (const gmx_cgsort_t &entry : sort)
2615     {
2616         sortBuffer[i++] = dataToSort[entry.ind];
2617     }
2618
2619     /* Copy back to the original array */
2620     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2621               dataToSort.begin());
2622 }
2623
2624 /*! \brief Order data in \p dataToSort according to \p sort
2625  *
2626  * Note: \p vectorToSort should have at least \p sort.size() elements,
2627  *       \p workVector is resized when it is too small.
2628  */
2629 template <typename T>
2630 static void
2631 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2632             gmx::ArrayRef<T>                   vectorToSort,
2633             std::vector<T>                    *workVector)
2634 {
2635     if (gmx::index(workVector->size()) < sort.size())
2636     {
2637         workVector->resize(sort.size());
2638     }
2639     orderVector<T>(sort, vectorToSort, *workVector);
2640 }
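
/* Usage sketch matching the calls further below: reorder the global group
 * indices according to a computed sort order, letting this overload grow
 * the work buffer on demand:
 *
 *   orderVector<int>(sort->sorted, dd->globalAtomGroupIndices, &sort->intBuffer);
 */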
2641
2642 //! Order vectors of atoms.
2643 static void order_vec_atom(const gmx::RangePartitioning      *atomGroups,
2644                            gmx::ArrayRef<const gmx_cgsort_t>  sort,
2645                            gmx::ArrayRef<gmx::RVec>           v,
2646                            gmx::ArrayRef<gmx::RVec>           buf)
2647 {
2648     if (atomGroups == nullptr)
2649     {
2650         /* Avoid the useless loop over the atoms within a cg */
2651         orderVector(sort, v, buf);
2652
2653         return;
2654     }
2655
2656     /* Order the data */
2657     int a = 0;
2658     for (const gmx_cgsort_t &entry : sort)
2659     {
2660         for (int i : atomGroups->block(entry.ind))
2661         {
2662             copy_rvec(v[i], buf[a]);
2663             a++;
2664         }
2665     }
2666     int atot = a;
2667
2668     /* Copy back to the original array */
2669     for (int a = 0; a < atot; a++)
2670     {
2671         copy_rvec(buf[a], v[a]);
2672     }
2673 }
2674
2675 //! Returns whether a < b
2676 static bool compareCgsort(const gmx_cgsort_t &a,
2677                           const gmx_cgsort_t &b)
2678 {
2679     return (a.nsc < b.nsc ||
2680             (a.nsc == b.nsc && a.ind_gl < b.ind_gl));
2681 }
2682
2683 //! Do sorting of charge groups.
2684 static void orderedSort(gmx::ArrayRef<const gmx_cgsort_t>  stationary,
2685                         gmx::ArrayRef<gmx_cgsort_t>        moved,
2686                         std::vector<gmx_cgsort_t>         *sort1)
2687 {
2688     /* The new indices are not very ordered, so we qsort them */
2689     std::sort(moved.begin(), moved.end(), comp_cgsort);
2690
2691     /* stationary is already ordered, so now we can merge the two arrays */
2692     sort1->resize(stationary.size() + moved.size());
2693     std::merge(stationary.begin(), stationary.end(),
2694                moved.begin(), moved.end(),
2695                sort1->begin(),
2696                compareCgsort);
2697 }
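
/* Worked example (illustrative keys): with stationary = [1 3 5] and
 * moved = [4 2], moved is first sorted to [2 4] and std::merge then yields
 * sort1 = [1 2 3 4 5] in a single linear pass, avoiding the full
 * O(N log N) sort used in the fallback path of dd_sort_order() below.
 */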
2698
2699 /*! \brief Set the sorting order for systems with charge groups, returned in sort->sorted.
2700  *
2701  * The order is according to the global charge group index.
2702  * This adds and removes charge groups that moved between domains.
2703  */
2704 static void dd_sort_order(const gmx_domdec_t *dd,
2705                           const t_forcerec   *fr,
2706                           int                 ncg_home_old,
2707                           gmx_domdec_sort_t  *sort)
2708 {
2709     const int *a          = fr->ns->grid->cell_index;
2710
2711     const int  movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
2712
2713     if (ncg_home_old >= 0)
2714     {
2715         std::vector<gmx_cgsort_t> &stationary = sort->stationary;
2716         std::vector<gmx_cgsort_t> &moved      = sort->moved;
2717
2718         /* The charge groups that remained in the same ns grid cell
2719          * are completely ordered. So we can sort efficiently by sorting only
2720          * the charge groups that moved and then merging them into the stationary list.
2721          * Note: push_back() seems to be slightly slower than direct access.
2722          */
2723         stationary.clear();
2724         moved.clear();
2725         for (int i = 0; i < dd->ncg_home; i++)
2726         {
2727             /* Check if this cg did not move to another node */
2728             if (a[i] < movedValue)
2729             {
2730                 gmx_cgsort_t entry;
2731                 entry.nsc    = a[i];
2732                 entry.ind_gl = dd->globalAtomGroupIndices[i];
2733                 entry.ind    = i;
2734
2735                 if (i >= ncg_home_old || a[i] != sort->sorted[i].nsc)
2736                 {
2737                     /* This cg is new on this node or moved to another ns grid cell */
2738                     moved.push_back(entry);
2739                 }
2740                 else
2741                 {
2742                     /* This cg did not move */
2743                     stationary.push_back(entry);
2744                 }
2745             }
2746         }
2747
2748         if (debug)
2749         {
2750             fprintf(debug, "ordered sort cgs: stationary %zu moved %zu\n",
2751                     stationary.size(), moved.size());
2752         }
2753         /* Sort efficiently */
2754         orderedSort(stationary, moved, &sort->sorted);
2755     }
2756     else
2757     {
2758         std::vector<gmx_cgsort_t> &cgsort   = sort->sorted;
2759         cgsort.clear();
2760         cgsort.reserve(dd->ncg_home);
2761         int                        numCGNew = 0;
2762         for (int i = 0; i < dd->ncg_home; i++)
2763         {
2764             /* Sort on the ns grid cell indices
2765              * and the global topology index
2766              */
2767             gmx_cgsort_t entry;
2768             entry.nsc    = a[i];
2769             entry.ind_gl = dd->globalAtomGroupIndices[i];
2770             entry.ind    = i;
2771             cgsort.push_back(entry);
2772             if (cgsort[i].nsc < movedValue)
2773             {
2774                 numCGNew++;
2775             }
2776         }
2777         if (debug)
2778         {
2779             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, numCGNew);
2780         }
2781         /* Determine the order of the charge groups using qsort */
2782         std::sort(cgsort.begin(), cgsort.end(), comp_cgsort);
2783
2784         /* Remove the charge groups which are no longer at home here */
2785         cgsort.resize(numCGNew);
2786     }
2787 }
2788
2789 //! Determines the sorting order for atoms from the nbnxn grid order and stores it in \p sort
2790 static void dd_sort_order_nbnxn(const t_forcerec          *fr,
2791                                 std::vector<gmx_cgsort_t> *sort)
2792 {
2793     gmx::ArrayRef<const int> atomOrder = nbnxn_get_atomorder(fr->nbv->nbs.get());
2794
2795     /* Using push_back() instead of this resize results in much slower code */
2796     sort->resize(atomOrder.size());
2797     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2798     size_t                      numSorted = 0;
2799     for (int i : atomOrder)
2800     {
2801         if (i >= 0)
2802         {
2803             /* The values of nsc and ind_gl are not used in this case */
2804             buffer[numSorted++].ind = i;
2805         }
2806     }
2807     sort->resize(numSorted);
2808 }
2809
2810 //! Sorts the DD state according to the new atom order.
2811 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
2812                           int ncg_home_old)
2813 {
2814     gmx_domdec_sort_t *sort = dd->comm->sort.get();
2815
2816     switch (fr->cutoff_scheme)
2817     {
2818         case ecutsGROUP:
2819             dd_sort_order(dd, fr, ncg_home_old, sort);
2820             break;
2821         case ecutsVERLET:
2822             dd_sort_order_nbnxn(fr, &sort->sorted);
2823             break;
2824         default:
2825             gmx_incons("unimplemented");
2826     }
2827
2828     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
2829
2830     /* We allocate with the old size, since cgindex is still old */
2831     GMX_ASSERT(atomGrouping.numBlocks() == dd->ncg_home, "atomGroups and dd should be consistent");
2832     DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, atomGrouping.fullRange().end());
2833
2834     const gmx::RangePartitioning *atomGroupsPtr = (dd->comm->bCGs ? &atomGrouping : nullptr);
2835
2836     /* Set the new home atom/charge group count */
2837     dd->ncg_home = sort->sorted.size();
2838     if (debug)
2839     {
2840         fprintf(debug, "Set the new home %s count to %d\n",
2841                 dd->comm->bCGs ? "charge group" : "atom",
2842                 dd->ncg_home);
2843     }
2844
2845     /* Reorder the state */
2846     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2847     GMX_RELEASE_ASSERT(cgsort.size() == dd->ncg_home, "We should sort all the home atom groups");
2848
2849     if (state->flags & (1 << estX))
2850     {
2851         order_vec_atom(atomGroupsPtr, cgsort, state->x, rvecBuffer.buffer);
2852     }
2853     if (state->flags & (1 << estV))
2854     {
2855         order_vec_atom(atomGroupsPtr, cgsort, state->v, rvecBuffer.buffer);
2856     }
2857     if (state->flags & (1 << estCGP))
2858     {
2859         order_vec_atom(atomGroupsPtr, cgsort, state->cg_p, rvecBuffer.buffer);
2860     }
2861
2862     if (fr->cutoff_scheme == ecutsGROUP)
2863     {
2864         /* Reorder cgcm */
2865         gmx::ArrayRef<gmx::RVec> cgcmRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cgcm[0]), cgsort.size());
2866         orderVector(cgsort, cgcmRef, rvecBuffer.buffer);
2867     }
2868
2869     /* Reorder the global cg index */
2870     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2871     /* Reorder the cginfo */
2872     orderVector<int>(cgsort, gmx::arrayRefFromArray(fr->cginfo, cgsort.size()), &sort->intBuffer);
2873     /* Rebuild the local cg index */
2874     if (dd->comm->bCGs)
2875     {
2876         /* We make a new, ordered atomGroups object and assign it to
2877          * the old one. This causes some allocation overhead, but saves
2878          * a copy back of the whole index.
2879          */
2880         gmx::RangePartitioning ordered;
2881         for (const gmx_cgsort_t &entry : cgsort)
2882         {
2883             ordered.appendBlock(atomGrouping.block(entry.ind).size());
2884         }
2885         dd->atomGrouping_ = ordered;
2886     }
2887     else
2888     {
2889         dd->atomGrouping_.setAllBlocksSizeOne(dd->ncg_home);
2890     }
2891     /* Set the home atom number */
2892     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->atomGrouping().fullRange().end());
2893
2894     if (fr->cutoff_scheme == ecutsVERLET)
2895     {
2896         /* The atoms are now exactly in grid order, update the grid order */
2897         nbnxn_set_atomorder(fr->nbv->nbs.get());
2898     }
2899     else
2900     {
2901         /* Copy the sorted ns cell indices back to the ns grid struct */
2902         for (gmx::index i = 0; i < cgsort.size(); i++)
2903         {
2904             fr->ns->grid->cell_index[i] = cgsort[i].nsc;
2905         }
2906         fr->ns->grid->nr = cgsort.size();
2907     }
2908 }
2909
2910 //! Accumulates load statistics.
2911 static void add_dd_statistics(gmx_domdec_t *dd)
2912 {
2913     gmx_domdec_comm_t *comm = dd->comm;
2914
2915     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2916     {
2917         auto range = static_cast<DDAtomRanges::Type>(i);
2918         comm->sum_nat[i] +=
2919             comm->atomRanges.end(range) - comm->atomRanges.start(range);
2920     }
2921     comm->ndecomp++;
2922 }
2923
2924 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2925 {
2926     gmx_domdec_comm_t *comm = dd->comm;
2927
2928     /* Reset all the statistics and counters for total run counting */
2929     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2930     {
2931         comm->sum_nat[i] = 0;
2932     }
2933     comm->ndecomp   = 0;
2934     comm->nload     = 0;
2935     comm->load_step = 0;
2936     comm->load_sum  = 0;
2937     comm->load_max  = 0;
2938     clear_ivec(comm->load_lim);
2939     comm->load_mdf = 0;
2940     comm->load_pme = 0;
2941 }
2942
2943 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2944 {
2945     gmx_domdec_comm_t *comm      = cr->dd->comm;
2946
2947     const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2948     gmx_sumd(numRanges, comm->sum_nat, cr);
2949
2950     if (fplog == nullptr)
2951     {
2952         return;
2953     }
2954
2955     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2956
2957     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2958     {
2959         auto   range = static_cast<DDAtomRanges::Type>(i);
2960         double av    = comm->sum_nat[i]/comm->ndecomp;
2961         switch (range)
2962         {
2963             case DDAtomRanges::Type::Zones:
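                     /* The factor 2 reflects that for these atoms both the
                      * coordinates are sent and the forces are returned.
                      */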
2964                 fprintf(fplog,
2965                         " av. #atoms communicated per step for force:  %d x %.1f\n",
2966                         2, av);
2967                 break;
2968             case DDAtomRanges::Type::Vsites:
2969                 if (cr->dd->vsite_comm)
2970                 {
2971                     fprintf(fplog,
2972                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2973                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2974                             av);
2975                 }
2976                 break;
2977             case DDAtomRanges::Type::Constraints:
2978                 if (cr->dd->constraint_comm)
2979                 {
2980                     fprintf(fplog,
2981                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2982                             1 + ir->nLincsIter, av);
2983                 }
2984                 break;
2985             default:
2986                 gmx_incons(" Unknown type for DD statistics");
2987         }
2988     }
2989     fprintf(fplog, "\n");
2990
2991     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
2992     {
2993         print_dd_load_av(fplog, cr->dd);
2994     }
2995 }
2996
2997 // TODO Remove fplog when group scheme and charge groups are gone
2998 void dd_partition_system(FILE                    *fplog,
2999                          const gmx::MDLogger     &mdlog,
3000                          int64_t                  step,
3001                          const t_commrec         *cr,
3002                          gmx_bool                 bMasterState,
3003                          int                      nstglobalcomm,
3004                          t_state                 *state_global,
3005                          const gmx_mtop_t        &top_global,
3006                          const t_inputrec        *ir,
3007                          t_state                 *state_local,
3008                          PaddedVector<gmx::RVec> *f,
3009                          gmx::MDAtoms            *mdAtoms,
3010                          gmx_localtop_t          *top_local,
3011                          t_forcerec              *fr,
3012                          gmx_vsite_t             *vsite,
3013                          gmx::Constraints        *constr,
3014                          t_nrnb                  *nrnb,
3015                          gmx_wallcycle           *wcycle,
3016                          gmx_bool                 bVerbose)
3017 {
3018     gmx_domdec_t      *dd;
3019     gmx_domdec_comm_t *comm;
3020     gmx_ddbox_t        ddbox = {0};
3021     t_block           *cgs_gl;
3022     int64_t            step_pcoupl;
3023     rvec               cell_ns_x0, cell_ns_x1;
3024     int                ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
3025     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
3026     gmx_bool           bRedist, bResortAll;
3027     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
3028     real               grid_density;
3029     char               sbuf[22];
3030
3031     wallcycle_start(wcycle, ewcDOMDEC);
3032
3033     dd   = cr->dd;
3034     comm = dd->comm;
3035
3036     // TODO if the update code becomes accessible here, use
3037     // upd->deform for this logic.
3038     bBoxChanged = (bMasterState || inputrecDeform(ir));
3039     if (ir->epc != epcNO)
3040     {
3041         /* With nstpcouple > 1, pressure coupling happens
3042          * one step after calculating the pressure.
3043          * Box scaling happens at the end of the MD step,
3044          * after the DD partitioning.
3045          * We therefore have to do DLB in the first partitioning
3046          * after an MD step where P-coupling occurred.
3047          * We need to determine the last step in which p-coupling occurred.
3048          * MRS -- need to validate this for vv?
3049          */
3050         int n = ir->nstpcouple;
3051         if (n == 1)
3052         {
3053             step_pcoupl = step - 1;
3054         }
3055         else
3056         {
3057             step_pcoupl = ((step - 1)/n)*n + 1;
3058         }
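             /* Worked example (illustrative): with n = nstpcouple = 10,
              * steps 21..30 all give step_pcoupl = ((step - 1)/10)*10 + 1 = 21,
              * i.e. the most recent step of the form k*n + 1 at or before step.
              */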
3059         if (step_pcoupl >= comm->partition_step)
3060         {
3061             bBoxChanged = TRUE;
3062         }
3063     }
3064
3065     bNStGlobalComm = (step % nstglobalcomm == 0);
3066
3067     if (!isDlbOn(comm))
3068     {
3069         bDoDLB = FALSE;
3070     }
3071     else
3072     {
3073         /* Should we do dynamic load balancing this step?
3074          * Since it requires (possibly expensive) global communication,
3075          * we might want to do DLB less frequently.
3076          */
3077         if (bBoxChanged || ir->epc != epcNO)
3078         {
3079             bDoDLB = bBoxChanged;
3080         }
3081         else
3082         {
3083             bDoDLB = bNStGlobalComm;
3084         }
3085     }
3086
3087     /* Check if we have recorded loads on the nodes */
3088     if (comm->bRecordLoad && dd_load_count(comm) > 0)
3089     {
3090         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
3091
3092         /* Print the load every nstlog steps, and at the first and last step, to the log file */
3093         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
3094                     comm->n_load_collect == 0 ||
3095                     (ir->nsteps >= 0 &&
3096                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
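             /* The last clause above matches when fewer than nstlist steps
              * remain, i.e. at the final load collection before the run ends.
              */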
3097
3098         /* Avoid extra communication due to verbose screen output
3099          * when nstglobalcomm is set.
3100          */
3101         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
3102             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
3103         {
3104             get_load_distribution(dd, wcycle);
3105             if (DDMASTER(dd))
3106             {
3107                 if (bLogLoad)
3108                 {
3109                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
3110                 }
3111                 if (bVerbose)
3112                 {
3113                     dd_print_load_verbose(dd);
3114                 }
3115             }
3116             comm->n_load_collect++;
3117
3118             if (isDlbOn(comm))
3119             {
3120                 if (DDMASTER(dd))
3121                 {
3122                     /* Add the measured cycles to the running average */
3123                     const float averageFactor        = 0.1f;
3124                     comm->cyclesPerStepDlbExpAverage =
3125                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
3126                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
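                         /* With averageFactor = 0.1 this exponential moving
                          * average effectively averages over roughly the last
                          * 10 measurements.
                          */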
3127                 }
3128                 if (comm->dlbState == DlbState::onCanTurnOff &&
3129                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
3130                 {
3131                     gmx_bool turnOffDlb;
3132                     if (DDMASTER(dd))
3133                     {
3134                         /* If the running averaged cycles with DLB are more
3135                          * than before we turned on DLB, turn off DLB.
3136                          * We will again run and check the cycles without DLB
3137                          * and we can then decide whether to turn off DLB forever.
3138                          */
3139                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
3140                                       comm->cyclesPerStepBeforeDLB);
3141                     }
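                         /* Broadcast the master's decision, so that all ranks
                          * take the same branch below.
                          */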
3142                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
3143                     if (turnOffDlb)
3144                     {
3145                         /* To turn off DLB, we need to redistribute the atoms */
3146                         dd_collect_state(dd, state_local, state_global);
3147                         bMasterState = TRUE;
3148                         turn_off_dlb(mdlog, dd, step);
3149                     }
3150                 }
3151             }
3152             else if (bCheckWhetherToTurnDlbOn)
3153             {
3154                 gmx_bool turnOffDlbForever = FALSE;
3155                 gmx_bool turnOnDlb         = FALSE;
3156
3157                 /* Since the timings are node dependent, the master decides */
3158                 if (DDMASTER(dd))
3159                 {
3160                     /* If we recently turned off DLB, we want to check if
3161                      * performance is better without DLB. We want to do this
3162                      * ASAP, to minimize the chance that the external factors
3163                      * that slowed down the steps with DLB are gone here, making us
3164                      * incorrectly conclude that DLB was causing the slowdown.
3165                      * So we measure one nstlist block, no running average.
3166                      */
3167                     if (comm->haveTurnedOffDlb &&
3168                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
3169                         comm->cyclesPerStepDlbExpAverage)
3170                     {
3171                         /* After turning off DLB we ran nstlist steps in fewer
3172                          * cycles than with DLB. This likely means that DLB
3173                          * is not beneficial, but this could be due to a one-
3174                          * time unlucky fluctuation, so we require two such
3175                          * observations in close succession to turn off DLB
3176                          * forever.
3177                          */
3178                         if (comm->dlbSlowerPartitioningCount > 0 &&
3179                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
3180                         {
3181                             turnOffDlbForever = TRUE;
3182                         }
3183                         comm->haveTurnedOffDlb           = false;
3184                         /* Register when we last measured DLB slowdown */
3185                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
3186                     }
3187                     else
3188                     {
3189                         /* Here we check if the max PME rank load is more than 0.98
3190                          * times the max PP force load. If so, PP DLB will not help,
3191                          * since we are (almost) limited by PME. Furthermore,
3192                          * DLB will cause a significant extra x/f redistribution
3193                          * cost on the PME ranks, which will then surely result
3194                          * in lower total performance.
3195                          */
3196                         if (cr->npmenodes > 0 &&
3197                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
3198                         {
3199                             turnOnDlb = FALSE;
3200                         }
3201                         else
3202                         {
3203                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
3204                         }
3205                     }
3206                 }
3207                 struct
3208                 {
3209                     gmx_bool turnOffDlbForever;
3210                     gmx_bool turnOnDlb;
3211                 }
3212                 bools {
3213                     turnOffDlbForever, turnOnDlb
3214                 };
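                     /* Packing both flags into one struct allows a single
                      * broadcast instead of two separate collective calls.
                      */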
3215                 dd_bcast(dd, sizeof(bools), &bools);
3216                 if (bools.turnOffDlbForever)
3217                 {
3218                     turn_off_dlb_forever(mdlog, dd, step);
3219                 }
3220                 else if (bools.turnOnDlb)
3221                 {
3222                     turn_on_dlb(mdlog, dd, step);
3223                     bDoDLB = TRUE;
3224                 }
3225             }
3226         }
3227         comm->n_load_have++;
3228     }
3229
3230     cgs_gl = &comm->cgs_gl;
3231
3232     bRedist = FALSE;
3233     if (bMasterState)
3234     {
3235         /* Clear the old state */
3236         clearDDStateIndices(dd, 0, 0);
3237         ncgindex_set = 0;
3238
3239         auto xGlobal = positionsFromStatePointer(state_global);
3240
3241         set_ddbox(*dd, true,
3242                   DDMASTER(dd) ? state_global->box : nullptr,
3243                   true, xGlobal,
3244                   &ddbox);
3245
3246         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
3247
3248         dd_make_local_cgs(dd, &top_local->cgs);
3249
3250         /* Ensure that we have space for the new distribution */
3251         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
3252
3253         if (fr->cutoff_scheme == ecutsGROUP)
3254         {
3255             calc_cgcm(fplog, 0, dd->ncg_home,
3256                       &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3257         }
3258
3259         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3260
3261         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3262     }
3263     else if (state_local->ddp_count != dd->ddp_count)
3264     {
3265         if (state_local->ddp_count > dd->ddp_count)
3266         {
3267             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
3268         }
3269
3270         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
3271         {
3272             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
3273         }
3274
3275         /* Clear the old state */
3276         clearDDStateIndices(dd, 0, 0);
3277
3278         /* Restore the atom group indices from state_local */
3279         restoreAtomGroups(dd, cgs_gl->index, state_local);
3280         make_dd_indices(dd, cgs_gl->index, 0);
3281         ncgindex_set = dd->ncg_home;
3282
3283         if (fr->cutoff_scheme == ecutsGROUP)
3284         {
3285             /* Redetermine the cg COMs */
3286             calc_cgcm(fplog, 0, dd->ncg_home,
3287                       &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3288         }
3289
3290         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3291
3292         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3293
3294         set_ddbox(*dd, bMasterState, state_local->box,
3295                   true, state_local->x, &ddbox);
3296
3297         bRedist = isDlbOn(comm);
3298     }
3299     else
3300     {
3301         /* We have the full state, only redistribute the cgs */
3302
3303         /* Clear the non-home indices */
3304         clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
3305         ncgindex_set = 0;
3306
3307         /* Avoid global communication for dims without pbc when -gcom is used */
3308         if (!bNStGlobalComm)
3309         {
3310             copy_rvec(comm->box0, ddbox.box0    );
3311             copy_rvec(comm->box_size, ddbox.box_size);
3312         }
3313         set_ddbox(*dd, bMasterState, state_local->box,
3314                   bNStGlobalComm, state_local->x, &ddbox);
3315
3316         bBoxChanged = TRUE;
3317         bRedist     = TRUE;
3318     }
3319     /* For dims without pbc when -gcom is used */
3320     copy_rvec(ddbox.box0, comm->box0    );
3321     copy_rvec(ddbox.box_size, comm->box_size);
3322
3323     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(*dd), bMasterState, bDoDLB,
3324                       step, wcycle);
3325
3326     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
3327     {
3328         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3329     }
3330
3331     if (comm->useUpdateGroups)
3332     {
3333         comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3334                                        state_local->x);
3335     }
3336
3337     /* Check if we should sort the charge groups */
3338     const bool bSortCG = (bMasterState || bRedist);
3339
3340     ncg_home_old = dd->ncg_home;
3341
3342     /* When repartitioning we mark atom groups that will move to neighboring
3343      * DD cells, but we do not move them right away for performance reasons.
3344      * Thus we need to keep track of how many charge groups will move, to
3345      * obtain correct local charge group / atom counts.
3346      */
3347     ncg_moved = 0;
3348     if (bRedist)
3349     {
3350         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3351
3352         ncgindex_set = dd->ncg_home;
3353         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3354                            state_local, f, fr,
3355                            nrnb, &ncg_moved);
3356
3357         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3358
3359         if (comm->useUpdateGroups)
3360         {
3361             comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3362                                            state_local->x);
3363         }
3364
3365         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3366     }
3367
3368     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3369                           dd, &ddbox,
3370                           &comm->cell_x0, &comm->cell_x1,
3371                           dd->ncg_home, fr->cg_cm,
3372                           cell_ns_x0, cell_ns_x1, &grid_density);
3373
3374     if (bBoxChanged)
3375     {
3376         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3377     }
3378
3379     switch (fr->cutoff_scheme)
3380     {
3381         case ecutsGROUP:
3382             copy_ivec(fr->ns->grid->n, ncells_old);
3383             grid_first(fplog, fr->ns->grid, dd, &ddbox,
3384                        state_local->box, cell_ns_x0, cell_ns_x1,
3385                        fr->rlist, grid_density);
3386             break;
3387         case ecutsVERLET:
3388             nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_old[XX], &ncells_old[YY]);
3389             break;
3390         default:
3391             gmx_incons("unimplemented");
3392     }
3393     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3394     copy_ivec(ddbox.tric_dir, comm->tric_dir);
3395
3396     if (bSortCG)
3397     {
3398         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3399
3400         /* Sort the state on charge group position.
3401          * This enables exact restarts from this step.
3402          * It also improves performance by about 15% with larger numbers
3403          * of atoms per node.
3404          */
3405
3406         /* Fill the ns grid with the home cell,
3407          * so we can sort with the indices.
3408          */
3409         set_zones_ncg_home(dd);
3410
3411         switch (fr->cutoff_scheme)
3412         {
3413             case ecutsVERLET:
3414                 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3415
3416                 nbnxn_put_on_grid(fr->nbv->nbs.get(), fr->ePBC, state_local->box,
3417                                   0,
3418                                   comm->zones.size[0].bb_x0,
3419                                   comm->zones.size[0].bb_x1,
3420                                   comm->updateGroupsCog.get(),
3421                                   0, dd->ncg_home,
3422                                   comm->zones.dens_zone0,
3423                                   fr->cginfo,
3424                                   state_local->x,
3425                                   ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
3426                                   fr->nbv->grp[eintLocal].kernel_type,
3427                                   fr->nbv->nbat);
3428
3429                 nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
3430                 break;
3431             case ecutsGROUP:
3432                 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
3433                           0, dd->ncg_home, fr->cg_cm);
3434
3435                 copy_ivec(fr->ns->grid->n, ncells_new);
3436                 break;
3437             default:
3438                 gmx_incons("unimplemented");
3439         }
3440
3441         bResortAll = bMasterState;
3442
3443         /* Check if we can use the old order and ns grid cell indices
3444          * of the charge groups to sort the charge groups efficiently.
3445          */
3446         if (ncells_new[XX] != ncells_old[XX] ||
3447             ncells_new[YY] != ncells_old[YY] ||
3448             ncells_new[ZZ] != ncells_old[ZZ])
3449         {
3450             bResortAll = TRUE;
3451         }
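             /* With a changed number of grid cells the stored cell indices
              * no longer match the new grid, so a full resort is required.
              */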
3452
3453         if (debug)
3454         {
3455             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3456                     gmx_step_str(step, sbuf), dd->ncg_home);
3457         }
3458         dd_sort_state(dd, fr->cg_cm, fr, state_local,
3459                       bResortAll ? -1 : ncg_home_old);
3460
3461         /* After sorting and compacting we set the correct size */
3462         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3463
3464         /* Rebuild all the indices */
3465         dd->ga2la->clear();
3466         ncgindex_set = 0;
3467
3468         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3469     }
3470
3471     if (comm->useUpdateGroups)
3472     {
3473         /* The update-group COGs are invalid after sorting
3474          * and need to be cleared before the next partitioning anyhow.
3475          */
3476         comm->updateGroupsCog->clear();
3477     }
3478
3479     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3480
3481     /* Set up the communication and communicate the coordinates */
3482     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3483
3484     /* Set the indices */
3485     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
3486
3487     /* Set the charge group boundaries for neighbor searching */
3488     set_cg_boundaries(&comm->zones);
3489
3490     if (fr->cutoff_scheme == ecutsVERLET)
3491     {
3492         /* When bSortCG=true, we have already set the size for zone 0 */
3493         set_zones_size(dd, state_local->box, &ddbox,
3494                        bSortCG ? 1 : 0, comm->zones.n,
3495                        0);
3496     }
3497
3498     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3499
3500     /*
3501        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3502                  -1,state_local->x.rvec_array(),state_local->box);
3503      */
3504
3505     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3506
3507     /* Extract a local topology from the global topology */
3508     for (int i = 0; i < dd->ndim; i++)
3509     {
3510         np[dd->dim[i]] = comm->cd[i].numPulses();
3511     }
3512     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
3513                       comm->cellsize_min, np,
3514                       fr,
3515                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x.rvec_array(),
3516                       vsite, top_global, top_local);
3517
3518     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3519
3520     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3521
3522     /* Set up the special atom communication */
3523     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3524     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3525     {
3526         auto range = static_cast<DDAtomRanges::Type>(i);
3527         switch (range)
3528         {
3529             case DDAtomRanges::Type::Vsites:
3530                 if (vsite && vsite->n_intercg_vsite)
3531                 {
3532                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3533                 }
3534                 break;
3535             case DDAtomRanges::Type::Constraints:
3536                 if (dd->splitConstraints || dd->splitSettles)
3537                 {
3538                     /* Only for inter-cg constraints do we need special code */
3539                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
3540                                                   constr, ir->nProjOrder,
3541                                                   top_local->idef.il);
3542                 }
3543                 break;
3544             default:
3545                 gmx_incons("Unknown special atom type setup");
3546         }
3547         comm->atomRanges.setEnd(range, n);
3548     }
3549
3550     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3551
3552     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3553
3554     /* Make space for the extra coordinates for virtual site
3555      * or constraint communication.
3556      */
3557     state_local->natoms = comm->atomRanges.numAtomsTotal();
3558
3559     dd_resize_state(state_local, f, state_local->natoms);
3560
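         /* Determine for how many atoms we need force entries that should
          * not enter the virial summation; the branches below correspond to
          * vsite spreading, full electrostatics with inter-cg exclusions,
          * and the plain home-atom case.
          */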
3561     if (fr->haveDirectVirialContributions)
3562     {
3563         if (vsite && vsite->n_intercg_vsite)
3564         {
3565             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3566         }
3567         else
3568         {
3569             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3570             {
3571                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3572             }
3573             else
3574             {
3575                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3576             }
3577         }
3578     }
3579     else
3580     {
3581         nat_f_novirsum = 0;
3582     }
3583
3584     /* Set the number of atoms required for the force calculation.
3585      * Forces need to be constrained when doing energy
3586      * minimization. For simple simulations we could avoid some
3587      * allocation, zeroing and copying, but this is probably not worth
3588      * the complications and checking.
3589      */
3590     forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3591                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3592                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3593                         nat_f_novirsum);
3594
3595     /* Update atom data for mdatoms and several algorithms */
3596     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3597                               nullptr, mdAtoms, constr, vsite, nullptr);
3598
3599     auto mdatoms = mdAtoms->mdatoms();
3600     if (!thisRankHasDuty(cr, DUTY_PME))
3601     {
3602         /* Send the charges and/or c6/sigmas to our PME-only rank */
3603         gmx_pme_send_parameters(cr,
3604                                 fr->ic,
3605                                 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3606                                 mdatoms->chargeA, mdatoms->chargeB,
3607                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3608                                 mdatoms->sigmaA, mdatoms->sigmaB,
3609                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3610     }
3611
3612     if (ir->bPull)
3613     {
3614         /* Update the local pull groups */
3615         dd_make_local_pull_groups(cr, ir->pull_work);
3616     }
3617
3618     if (dd->atomSets != nullptr)
3619     {
3620         /* Update the local atom sets */
3621         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3622     }
3623
3624     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3625     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
3626
3627     add_dd_statistics(dd);
3628
3629     /* Make sure we only count the cycles for this DD partitioning */
3630     clear_dd_cycle_counts(dd);
3631
3632     /* Because the order of the atoms might have changed since
3633      * the last vsite construction, we need to communicate the constructing
3634      * atom coordinates again (for spreading the forces this MD step).
3635      */
3636     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3637
3638     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3639
3640     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
3641     {
3642         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3643         write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3644                      -1, state_local->x.rvec_array(), state_local->box);
3645     }
3646
3647     /* Store the partitioning step */
3648     comm->partition_step = step;
3649
3650     /* Increase the DD partitioning counter */
3651     dd->ddp_count++;
3652     /* The state currently matches this DD partitioning count, store it */
3653     state_local->ddp_count = dd->ddp_count;
3654     if (bMasterState)
3655     {
3656         /* The DD master node knows the complete cg distribution,
3657          * store the count so we can possibly skip the cg info communication.
3658          */
3659         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3660     }
3661
3662     if (comm->DD_debug > 0)
3663     {
3664         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3665         check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
3666                                 "after partitioning");
3667     }
3668
3669     wallcycle_stop(wcycle, ewcDOMDEC);
3670 }
3671
3672 /*! \brief Check whether bonded interactions are missing, if appropriate */
3673 void checkNumberOfBondedInteractions(const gmx::MDLogger  &mdlog,
3674                                      t_commrec            *cr,
3675                                      int                   totalNumberOfBondedInteractions,
3676                                      const gmx_mtop_t     *top_global,
3677                                      const gmx_localtop_t *top_local,
3678                                      const t_state        *state,
3679                                      bool                 *shouldCheckNumberOfBondedInteractions)
3680 {
3681     if (*shouldCheckNumberOfBondedInteractions)
3682     {
3683         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3684         {
3685             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
3686         }
3687         *shouldCheckNumberOfBondedInteractions = false;
3688     }
3689 }