Merge branch release-2020 into merge-2020-into-2021
[alexxy/gromacs.git] / src / gromacs / domdec / partition.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief This file defines functions for mdrun to call to make a new
38  * domain decomposition, and check it.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \ingroup module_domdec
42  */
43
44 #include "gmxpre.h"
45
46 #include "partition.h"
47
48 #include "config.h"
49
50 #include <cassert>
51 #include <cstdio>
52
53 #include <algorithm>
54
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme_pp.h"
64 #include "gromacs/gmxlib/network.h"
65 #include "gromacs/gmxlib/nrnb.h"
66 #include "gromacs/imd/imd.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/mdlib/forcerec.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/nsgrid.h"
73 #include "gromacs/mdlib/vsite.h"
74 #include "gromacs/mdtypes/commrec.h"
75 #include "gromacs/mdtypes/forcerec.h"
76 #include "gromacs/mdtypes/inputrec.h"
77 #include "gromacs/mdtypes/md_enums.h"
78 #include "gromacs/mdtypes/mdatom.h"
79 #include "gromacs/mdtypes/nblist.h"
80 #include "gromacs/mdtypes/state.h"
81 #include "gromacs/nbnxm/nbnxm.h"
82 #include "gromacs/pulling/pull.h"
83 #include "gromacs/timing/wallcycle.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/cstringutil.h"
87 #include "gromacs/utility/fatalerror.h"
88 #include "gromacs/utility/logger.h"
89 #include "gromacs/utility/real.h"
90 #include "gromacs/utility/smalloc.h"
91 #include "gromacs/utility/strconvert.h"
92 #include "gromacs/utility/stringstream.h"
93 #include "gromacs/utility/stringutil.h"
94 #include "gromacs/utility/textwriter.h"
95
96 #include "box.h"
97 #include "cellsizes.h"
98 #include "distribute.h"
99 #include "domdec_constraints.h"
100 #include "domdec_internal.h"
101 #include "domdec_vsite.h"
102 #include "dump.h"
103 #include "redistribute.h"
104 #include "utility.h"
105
106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
107  *
108  * There is a bit of overhead with DLB and it's difficult to achieve
109  * a load imbalance of less than 2% with DLB.
110  */
111 #define DD_PERF_LOSS_DLB_ON 0.02
112
113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
114 #define DD_PERF_LOSS_WARN 0.05
115
116
117 //! Debug helper printing a DD zone
118 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
119 {
120     fprintf(fp,
121             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
122             "%6.3f\n",
123             d, i, j, zone->min0, zone->max1, zone->mch0, zone->mch0, zone->p1_0, zone->p1_1);
124 }
125
126 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
127  * takes the extremes over all home and remote zones in the halo
128  * and returns the results in cell_ns_x0 and cell_ns_x1.
129  * Note: only used with the group cut-off scheme.
130  */
131 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
132 {
133     constexpr int      c_ddZoneCommMaxNumZones = 5;
134     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
135     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
136     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
137     gmx_domdec_comm_t* comm = dd->comm;
138
139     rvec extr_s[2];
140     rvec extr_r[2];
141     for (int d = 1; d < dd->ndim; d++)
142     {
143         int           dim = dd->dim[d];
144         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
145
146         /* Copy the base sizes of the home zone */
147         zp.min0    = cell_ns_x0[dim];
148         zp.max1    = cell_ns_x1[dim];
149         zp.min1    = cell_ns_x1[dim];
150         zp.mch0    = cell_ns_x0[dim];
151         zp.mch1    = cell_ns_x1[dim];
152         zp.p1_0    = cell_ns_x0[dim];
153         zp.p1_1    = cell_ns_x1[dim];
154         zp.dataSet = 1;
155     }
156
157     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
158
159     /* Loop backward over the dimensions and aggregate the extremes
160      * of the cell sizes.
161      */
162     for (int d = dd->ndim - 2; d >= 0; d--)
163     {
164         const int  dim      = dd->dim[d];
165         const bool applyPbc = (dim < ddbox->npbcdim);
166
167         /* Use an rvec to store two reals */
168         extr_s[d][0] = cellsizes[d + 1].fracLower;
169         extr_s[d][1] = cellsizes[d + 1].fracUpper;
170         extr_s[d][2] = cellsizes[d + 1].fracUpper;
171
172         int pos = 0;
173         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
174         /* Store the extremes in the backward sending buffer,
175          * so they get updated separately from the forward communication.
176          */
177         for (int d1 = d; d1 < dd->ndim - 1; d1++)
178         {
179             gmx_ddzone_t& buf = buf_s[pos];
180
181             /* We invert the order to be able to use the same loop for buf_e */
182             buf.min0 = extr_s[d1][1];
183             buf.max1 = extr_s[d1][0];
184             buf.min1 = extr_s[d1][2];
185             buf.mch0 = 0;
186             buf.mch1 = 0;
187             /* Store the cell corner of the dimension we communicate along */
188             buf.p1_0    = comm->cell_x0[dim];
189             buf.p1_1    = 0;
190             buf.dataSet = 1;
191             pos++;
192         }
193
194         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
195         pos++;
196
197         if (dd->ndim == 3 && d == 0)
198         {
199             buf_s[pos] = comm->zone_d2[0][1];
200             pos++;
201             buf_s[pos] = comm->zone_d1[0];
202             pos++;
203         }
204
205         /* We only need to communicate the extremes
206          * in the forward direction
207          */
208         int numPulses = comm->cd[d].numPulses();
209         int numPulsesMin;
210         if (applyPbc)
211         {
212             /* Take the minimum to avoid double communication */
213             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
214         }
215         else
216         {
217             /* Without PBC we should really not communicate over
218              * the boundaries, but implementing that complicates
219              * the communication setup and therefore we simply
220              * do all communication, but ignore some data.
221              */
222             numPulsesMin = numPulses;
223         }
224         for (int pulse = 0; pulse < numPulsesMin; pulse++)
225         {
226             /* Communicate the extremes forward */
227             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
228
229             int numElements = dd->ndim - d - 1;
230             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
231
232             if (receiveValidData)
233             {
234                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
235                 {
236                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
237                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
238                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
239                 }
240             }
241         }
242
243         const int numElementsInBuffer = pos;
244         for (int pulse = 0; pulse < numPulses; pulse++)
245         {
246             /* Communicate all the zone information backward */
247             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
248
249             static_assert(
250                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
251                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
252
253             int numReals = numElementsInBuffer * c_ddzoneNumReals;
254             ddSendrecv(dd, d, dddirBackward, gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
255                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
256
257             rvec dh = { 0 };
258             if (pulse > 0)
259             {
260                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
261                 {
262                     /* Determine the decrease of maximum required
263                      * communication height along d1 due to the distance along d,
264                      * this avoids a lot of useless atom communication.
265                      */
266                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
267
268                     real c;
269                     if (ddbox->tric_dir[dim])
270                     {
271                         /* c is the off-diagonal coupling between the cell planes
272                          * along directions d and d1.
273                          */
274                         c = ddbox->v[dim][dd->dim[d1]][dim];
275                     }
276                     else
277                     {
278                         c = 0;
279                     }
280                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
281                     if (det > 0)
282                     {
283                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
284                     }
285                     else
286                     {
287                         /* A negative value signals out of range */
288                         dh[d1] = -1;
289                     }
290                 }
291             }
292
293             /* Accumulate the extremes over all pulses */
294             for (int i = 0; i < numElementsInBuffer; i++)
295             {
296                 if (pulse == 0)
297                 {
298                     buf_e[i] = buf_r[i];
299                 }
300                 else
301                 {
302                     if (receiveValidData)
303                     {
304                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
305                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
306                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
307                     }
308
309                     int d1;
310                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
311                     {
312                         d1 = 1;
313                     }
314                     else
315                     {
316                         d1 = d + 1;
317                     }
318                     if (receiveValidData && dh[d1] >= 0)
319                     {
320                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
321                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
322                     }
323                 }
324                 /* Copy the received buffer to the send buffer,
325                  * to pass the data through with the next pulse.
326                  */
327                 buf_s[i] = buf_r[i];
328             }
329             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
330                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
331             {
332                 /* Store the extremes */
333                 int pos = 0;
334
335                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
336                 {
337                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
338                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
339                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
340                     pos++;
341                 }
342
343                 if (d == 1 || (d == 0 && dd->ndim == 3))
344                 {
345                     for (int i = d; i < 2; i++)
346                     {
347                         comm->zone_d2[1 - d][i] = buf_e[pos];
348                         pos++;
349                     }
350                 }
351                 if (d == 0)
352                 {
353                     comm->zone_d1[1] = buf_e[pos];
354                     pos++;
355                 }
356             }
357             else
358             {
359                 if (d == 1 || (d == 0 && dd->ndim == 3))
360                 {
361                     for (int i = d; i < 2; i++)
362                     {
363                         comm->zone_d2[1 - d][i].dataSet = 0;
364                     }
365                 }
366                 if (d == 0)
367                 {
368                     comm->zone_d1[1].dataSet = 0;
369                 }
370             }
371         }
372     }
373
374     if (dd->ndim >= 2)
375     {
376         int dim = dd->dim[1];
377         for (int i = 0; i < 2; i++)
378         {
379             if (comm->zone_d1[i].dataSet != 0)
380             {
381                 if (debug)
382                 {
383                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
384                 }
385                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
386                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
387             }
388         }
389     }
390     if (dd->ndim >= 3)
391     {
392         int dim = dd->dim[2];
393         for (int i = 0; i < 2; i++)
394         {
395             for (int j = 0; j < 2; j++)
396             {
397                 if (comm->zone_d2[i][j].dataSet != 0)
398                 {
399                     if (debug)
400                     {
401                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
402                     }
403                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
404                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
405                 }
406             }
407         }
408     }
409     for (int d = 1; d < dd->ndim; d++)
410     {
411         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
412         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
413         if (debug)
414         {
415             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", d, cellsizes[d].fracLowerMax,
416                     cellsizes[d].fracUpperMin);
417         }
418     }
419 }
420
421 //! Sets the charge-group zones to be equal to the home zone.
422 static void set_zones_ncg_home(gmx_domdec_t* dd)
423 {
424     gmx_domdec_zones_t* zones;
425     int                 i;
426
427     zones = &dd->comm->zones;
428
429     zones->cg_range[0] = 0;
430     for (i = 1; i < zones->n + 1; i++)
431     {
432         zones->cg_range[i] = dd->ncg_home;
433     }
434     /* zone_ncg1[0] should always be equal to ncg_home */
435     dd->comm->zone_ncg1[0] = dd->ncg_home;
436 }
437
438 //! Restore atom groups for the charge groups.
439 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
440 {
441     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
442
443     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
444
445     globalAtomGroupIndices.resize(atomGroupsState.size());
446
447     /* Copy back the global charge group indices from state
448      * and rebuild the local charge group to atom index.
449      */
450     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
451     {
452         globalAtomGroupIndices[i] = atomGroupsState[i];
453     }
454
455     dd->ncg_home = atomGroupsState.size();
456     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
457
458     set_zones_ncg_home(dd);
459 }
460
461 //! Sets the cginfo structures.
462 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
463 {
464     if (fr != nullptr)
465     {
466         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
467         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
468
469         for (int cg = cg0; cg < cg1; cg++)
470         {
471             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
472         }
473     }
474 }
475
476 //! Makes the mappings between global and local atom indices during DD repartioning.
477 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
478 {
479     const int                numZones               = dd->comm->zones.n;
480     const int*               zone2cg                = dd->comm->zones.cg_range;
481     const int*               zone_ncg1              = dd->comm->zone_ncg1;
482     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
483
484     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
485     gmx_ga2la_t&      ga2la             = *dd->ga2la;
486
487     if (zone2cg[1] != dd->ncg_home)
488     {
489         gmx_incons("dd->ncg_zone is not up to date");
490     }
491
492     /* Make the local to global and global to local atom index */
493     int a = atomStart;
494     globalAtomIndices.resize(a);
495     for (int zone = 0; zone < numZones; zone++)
496     {
497         int cg0;
498         if (zone == 0)
499         {
500             cg0 = atomStart;
501         }
502         else
503         {
504             cg0 = zone2cg[zone];
505         }
506         int cg1    = zone2cg[zone + 1];
507         int cg1_p1 = cg0 + zone_ncg1[zone];
508
509         for (int cg = cg0; cg < cg1; cg++)
510         {
511             int zone1 = zone;
512             if (cg >= cg1_p1)
513             {
514                 /* Signal that this cg is from more than one pulse away */
515                 zone1 += numZones;
516             }
517             int cg_gl = globalAtomGroupIndices[cg];
518             globalAtomIndices.push_back(cg_gl);
519             ga2la.insert(cg_gl, { a, zone1 });
520             a++;
521         }
522     }
523 }
524
525 //! Checks whether global and local atom indices are consistent.
526 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
527 {
528     int nerr = 0;
529
530     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
531
532     if (dd->comm->ddSettings.DD_debug > 1)
533     {
534         std::vector<int> have(natoms_sys);
535         for (int a = 0; a < numAtomsInZones; a++)
536         {
537             int globalAtomIndex = dd->globalAtomIndices[a];
538             if (have[globalAtomIndex] > 0)
539             {
540                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n",
541                         dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a + 1);
542             }
543             else
544             {
545                 have[globalAtomIndex] = a + 1;
546             }
547         }
548     }
549
550     std::vector<int> have(numAtomsInZones);
551
552     int ngl = 0;
553     for (int i = 0; i < natoms_sys; i++)
554     {
555         if (const auto entry = dd->ga2la->find(i))
556         {
557             const int a = entry->la;
558             if (a >= numAtomsInZones)
559             {
560                 fprintf(stderr,
561                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
562                         "nat_tot (%d)\n",
563                         dd->rank, i + 1, a + 1, numAtomsInZones);
564                 nerr++;
565             }
566             else
567             {
568                 have[a] = 1;
569                 if (dd->globalAtomIndices[a] != i)
570                 {
571                     fprintf(stderr,
572                             "DD rank %d: global atom %d marked as local atom %d, which has global "
573                             "atom index %d\n",
574                             dd->rank, i + 1, a + 1, dd->globalAtomIndices[a] + 1);
575                     nerr++;
576                 }
577             }
578             ngl++;
579         }
580     }
581     if (ngl != numAtomsInZones)
582     {
583         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where,
584                 ngl, numAtomsInZones);
585     }
586     for (int a = 0; a < numAtomsInZones; a++)
587     {
588         if (have[a] == 0)
589         {
590             fprintf(stderr, "DD rank %d, %s: local atom %d, global %d has no global index\n",
591                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
592         }
593     }
594
595     if (nerr > 0)
596     {
597         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
598     }
599 }
600
601 //! Clear all DD global state indices
602 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
603 {
604     gmx_ga2la_t& ga2la = *dd->ga2la;
605
606     if (!keepLocalAtomIndices)
607     {
608         /* Clear the whole list without the overhead of searching */
609         ga2la.clear();
610     }
611     else
612     {
613         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
614         for (int i = 0; i < numAtomsInZones; i++)
615         {
616             ga2la.erase(dd->globalAtomIndices[i]);
617         }
618     }
619
620     dd_clear_local_vsite_indices(dd);
621
622     if (dd->constraints)
623     {
624         dd_clear_local_constraint_indices(dd);
625     }
626 }
627
628 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, gmx_bool bFatal)
629 {
630     gmx_domdec_comm_t* comm    = dd->comm;
631     bool               invalid = false;
632
633     for (int d = 1; d < dd->ndim; d++)
634     {
635         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
636         const int                 dim       = dd->dim[d];
637         const real                limit     = grid_jump_limit(comm, cutoff, d);
638         real                      bfac      = ddbox->box_size[dim];
639         if (ddbox->tric_dir[dim])
640         {
641             bfac *= ddbox->skew_fac[dim];
642         }
643         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
644             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
645         {
646             invalid = true;
647
648             if (bFatal)
649             {
650                 char buf[22];
651
652                 /* This error should never be triggered under normal
653                  * circumstances, but you never know ...
654                  */
655                 gmx_fatal(FARGS,
656                           "step %s: The domain decomposition grid has shifted too much in the "
657                           "%c-direction around cell %d %d %d. This should not have happened. "
658                           "Running with fewer ranks might avoid this issue.",
659                           gmx_step_str(step, buf), dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
660             }
661         }
662     }
663
664     return invalid;
665 }
666 //! Return the duration of force calculations on this rank.
667 static float dd_force_load(gmx_domdec_comm_t* comm)
668 {
669     float load;
670
671     if (comm->ddSettings.eFlop)
672     {
673         load = comm->flop;
674         if (comm->ddSettings.eFlop > 1)
675         {
676             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
677         }
678     }
679     else
680     {
681         load = comm->cycl[ddCyclF];
682         if (comm->cycl_n[ddCyclF] > 1)
683         {
684             /* Subtract the maximum of the last n cycle counts
685              * to get rid of possible high counts due to other sources,
686              * for instance system activity, that would otherwise
687              * affect the dynamic load balancing.
688              */
689             load -= comm->cycl_max[ddCyclF];
690         }
691
692 #if GMX_MPI
693         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
694         {
695             float gpu_wait, gpu_wait_sum;
696
697             gpu_wait = comm->cycl[ddCyclWaitGPU];
698             if (comm->cycl_n[ddCyclF] > 1)
699             {
700                 /* We should remove the WaitGPU time of the same MD step
701                  * as the one with the maximum F time, since the F time
702                  * and the wait time are not independent.
703                  * Furthermore, the step for the max F time should be chosen
704                  * the same on all ranks that share the same GPU.
705                  * But to keep the code simple, we remove the average instead.
706                  * The main reason for artificially long times at some steps
707                  * is spurious CPU activity or MPI time, so we don't expect
708                  * that changes in the GPU wait time matter a lot here.
709                  */
710                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
711             }
712             /* Sum the wait times over the ranks that share the same GPU */
713             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
714             /* Replace the wait time by the average over the ranks */
715             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
716         }
717 #endif
718     }
719
720     return load;
721 }
722
723 //! Runs cell size checks and communicates the boundaries.
724 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
725 {
726     gmx_domdec_comm_t* comm;
727     int                dim_ind, dim;
728
729     comm = dd->comm;
730
731     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
732     {
733         dim = dd->dim[dim_ind];
734
735         /* Without PBC we don't have restrictions on the outer cells */
736         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
737             && isDlbOn(comm)
738             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
739         {
740             char buf[22];
741             gmx_fatal(FARGS,
742                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
743                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
744                       "%d %d %d",
745                       gmx_step_str(step, buf), dim2char(dim),
746                       comm->cell_x1[dim] - comm->cell_x0[dim], ddbox->skew_fac[dim],
747                       dd->comm->cellsize_min[dim], dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
748         }
749     }
750
751     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
752     {
753         /* Communicate the boundaries and update cell_ns_x0/1 */
754         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
755         if (isDlbOn(dd->comm) && dd->ndim > 1)
756         {
757             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
758         }
759     }
760 }
761
762 //! Compute and communicate to determine the load distribution across PP ranks.
763 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
764 {
765     gmx_domdec_comm_t* comm;
766     domdec_load_t*     load;
767     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
768     gmx_bool           bSepPME;
769
770     if (debug)
771     {
772         fprintf(debug, "get_load_distribution start\n");
773     }
774
775     wallcycle_start(wcycle, ewcDDCOMMLOAD);
776
777     comm = dd->comm;
778
779     bSepPME = (dd->pme_nodeid >= 0);
780
781     if (dd->ndim == 0 && bSepPME)
782     {
783         /* Without decomposition, but with PME nodes, we need the load */
784         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
785         comm->load[0].pme = comm->cycl[ddCyclPME];
786     }
787
788     for (int d = dd->ndim - 1; d >= 0; d--)
789     {
790         const DDCellsizesWithDlb* cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
791         const int                 dim       = dd->dim[d];
792         /* Check if we participate in the communication in this dimension */
793         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
794         {
795             load = &comm->load[d];
796             if (isDlbOn(dd->comm))
797             {
798                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
799             }
800             int pos = 0;
801             if (d == dd->ndim - 1)
802             {
803                 sbuf[pos++] = dd_force_load(comm);
804                 sbuf[pos++] = sbuf[0];
805                 if (isDlbOn(dd->comm))
806                 {
807                     sbuf[pos++] = sbuf[0];
808                     sbuf[pos++] = cell_frac;
809                     if (d > 0)
810                     {
811                         sbuf[pos++] = cellsizes->fracLowerMax;
812                         sbuf[pos++] = cellsizes->fracUpperMin;
813                     }
814                 }
815                 if (bSepPME)
816                 {
817                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
818                     sbuf[pos++] = comm->cycl[ddCyclPME];
819                 }
820             }
821             else
822             {
823                 sbuf[pos++] = comm->load[d + 1].sum;
824                 sbuf[pos++] = comm->load[d + 1].max;
825                 if (isDlbOn(dd->comm))
826                 {
827                     sbuf[pos++] = comm->load[d + 1].sum_m;
828                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
829                     sbuf[pos++] = comm->load[d + 1].flags;
830                     if (d > 0)
831                     {
832                         sbuf[pos++] = cellsizes->fracLowerMax;
833                         sbuf[pos++] = cellsizes->fracUpperMin;
834                     }
835                 }
836                 if (bSepPME)
837                 {
838                     sbuf[pos++] = comm->load[d + 1].mdf;
839                     sbuf[pos++] = comm->load[d + 1].pme;
840                 }
841             }
842             load->nload = pos;
843             /* Communicate a row in DD direction d.
844              * The communicators are setup such that the root always has rank 0.
845              */
846 #if GMX_MPI
847             MPI_Gather(sbuf, load->nload * sizeof(float), MPI_BYTE, load->load,
848                        load->nload * sizeof(float), MPI_BYTE, 0, comm->mpi_comm_load[d]);
849 #endif
850             if (dd->ci[dim] == dd->master_ci[dim])
851             {
852                 /* We are the master along this row, process this row */
853                 RowMaster* rowMaster = nullptr;
854
855                 if (isDlbOn(comm))
856                 {
857                     rowMaster = cellsizes->rowMaster.get();
858                 }
859                 load->sum      = 0;
860                 load->max      = 0;
861                 load->sum_m    = 0;
862                 load->cvol_min = 1;
863                 load->flags    = 0;
864                 load->mdf      = 0;
865                 load->pme      = 0;
866                 int pos        = 0;
867                 for (int i = 0; i < dd->numCells[dim]; i++)
868                 {
869                     load->sum += load->load[pos++];
870                     load->max = std::max(load->max, load->load[pos]);
871                     pos++;
872                     if (isDlbOn(dd->comm))
873                     {
874                         if (rowMaster->dlbIsLimited)
875                         {
876                             /* This direction could not be load balanced properly,
877                              * therefore we need to use the maximum iso the average load.
878                              */
879                             load->sum_m = std::max(load->sum_m, load->load[pos]);
880                         }
881                         else
882                         {
883                             load->sum_m += load->load[pos];
884                         }
885                         pos++;
886                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
887                         pos++;
888                         if (d < dd->ndim - 1)
889                         {
890                             load->flags = gmx::roundToInt(load->load[pos++]);
891                         }
892                         if (d > 0)
893                         {
894                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
895                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
896                         }
897                     }
898                     if (bSepPME)
899                     {
900                         load->mdf = std::max(load->mdf, load->load[pos]);
901                         pos++;
902                         load->pme = std::max(load->pme, load->load[pos]);
903                         pos++;
904                     }
905                 }
906                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
907                 {
908                     load->sum_m *= dd->numCells[dim];
909                     load->flags |= (1 << d);
910                 }
911             }
912         }
913     }
914
915     if (DDMASTER(dd))
916     {
917         comm->nload += dd_load_count(comm);
918         comm->load_step += comm->cycl[ddCyclStep];
919         comm->load_sum += comm->load[0].sum;
920         comm->load_max += comm->load[0].max;
921         if (isDlbOn(comm))
922         {
923             for (int d = 0; d < dd->ndim; d++)
924             {
925                 if (comm->load[0].flags & (1 << d))
926                 {
927                     comm->load_lim[d]++;
928                 }
929             }
930         }
931         if (bSepPME)
932         {
933             comm->load_mdf += comm->load[0].mdf;
934             comm->load_pme += comm->load[0].pme;
935         }
936     }
937
938     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
939
940     if (debug)
941     {
942         fprintf(debug, "get_load_distribution finished\n");
943     }
944 }
945
946 /*! \brief Return the relative performance loss on the total run time
947  * due to the force calculation load imbalance. */
948 static float dd_force_load_fraction(gmx_domdec_t* dd)
949 {
950     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
951     {
952         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
953     }
954     else
955     {
956         return 0;
957     }
958 }
959
960 /*! \brief Return the relative performance loss on the total run time
961  * due to the force calculation load imbalance. */
962 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
963 {
964     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
965     {
966         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
967     }
968     else
969     {
970         return 0;
971     }
972 }
973
974 //! Print load-balance report e.g. at the end of a run.
975 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
976 {
977     gmx_domdec_comm_t* comm = dd->comm;
978
979     /* Only the master rank prints loads and only if we measured loads */
980     if (!DDMASTER(dd) || comm->nload == 0)
981     {
982         return;
983     }
984
985     char buf[STRLEN];
986     int  numPpRanks  = dd->nnodes;
987     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
988     int  numRanks    = numPpRanks + numPmeRanks;
989     float lossFraction = 0;
990
991     /* Print the average load imbalance and performance loss */
992     if (dd->nnodes > 1 && comm->load_sum > 0)
993     {
994         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
995         lossFraction    = dd_force_imb_perf_loss(dd);
996
997         std::string msg = "\nDynamic load balancing report:\n";
998         std::string dlbStateStr;
999
1000         switch (dd->comm->dlbState)
1001         {
1002             case DlbState::offUser:
1003                 dlbStateStr = "DLB was off during the run per user request.";
1004                 break;
1005             case DlbState::offForever:
1006                 /* Currectly this can happen due to performance loss observed, cell size
1007                  * limitations or incompatibility with other settings observed during
1008                  * determineInitialDlbState(). */
1009                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1010                 break;
1011             case DlbState::offCanTurnOn:
1012                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1013                 break;
1014             case DlbState::offTemporarilyLocked:
1015                 dlbStateStr =
1016                         "DLB was locked at the end of the run due to unfinished PP-PME "
1017                         "balancing.";
1018                 break;
1019             case DlbState::onCanTurnOff:
1020                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1021                 break;
1022             case DlbState::onUser:
1023                 dlbStateStr = "DLB was permanently on during the run per user request.";
1024                 break;
1025             default: GMX_ASSERT(false, "Undocumented DLB state");
1026         }
1027
1028         msg += " " + dlbStateStr + "\n";
1029         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1030         msg += gmx::formatString(
1031                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1032                 "this.\n",
1033                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1034         msg += gmx::formatString(
1035                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1036                 lossFraction * 100);
1037         fprintf(fplog, "%s", msg.c_str());
1038         fprintf(stderr, "\n%s", msg.c_str());
1039     }
1040
1041     /* Print during what percentage of steps the  load balancing was limited */
1042     bool dlbWasLimited = false;
1043     if (isDlbOn(comm))
1044     {
1045         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1046         for (int d = 0; d < dd->ndim; d++)
1047         {
1048             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1049             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1050             if (limitPercentage >= 50)
1051             {
1052                 dlbWasLimited = true;
1053             }
1054         }
1055         sprintf(buf + strlen(buf), "\n");
1056         fprintf(fplog, "%s", buf);
1057         fprintf(stderr, "%s", buf);
1058     }
1059
1060     /* Print the performance loss due to separate PME - PP rank imbalance */
1061     float lossFractionPme = 0;
1062     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1063     {
1064         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1065         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1066         if (lossFractionPme <= 0)
1067         {
1068             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1069         }
1070         else
1071         {
1072             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1073         }
1074         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1075         fprintf(fplog, "%s", buf);
1076         fprintf(stderr, "%s", buf);
1077         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1078                 std::fabs(lossFractionPme) * 100);
1079         fprintf(fplog, "%s", buf);
1080         fprintf(stderr, "%s", buf);
1081     }
1082     fprintf(fplog, "\n");
1083     fprintf(stderr, "\n");
1084
1085     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1086     {
1087         std::string message = gmx::formatString(
1088                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1089                 "      in the domain decomposition.\n",
1090                 lossFraction * 100);
1091
1092         bool hadSuggestion = false;
1093         if (dd->comm->dlbState == DlbState::offUser)
1094         {
1095             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1096             hadSuggestion = true;
1097         }
1098         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1099         {
1100             message +=
1101                     "      Dynamic load balancing was automatically disabled, but it might be "
1102                     "beneficial to manually tuning it on (option -dlb on.)\n";
1103             hadSuggestion = true;
1104         }
1105         else if (dlbWasLimited)
1106         {
1107             message +=
1108                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1109                     "and/or -dds).\n";
1110             hadSuggestion = true;
1111         }
1112         message += gmx::formatString(
1113                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1114                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1115                 "      considerable inhomogeneity in the simulated system.",
1116                 hadSuggestion ? "also " : "");
1117
1118         fprintf(fplog, "%s\n", message.c_str());
1119         fprintf(stderr, "%s\n", message.c_str());
1120     }
1121     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1122     {
1123         sprintf(buf,
1124                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1125                 "      had %s work to do than the PP ranks.\n"
1126                 "      You might want to %s the number of PME ranks\n"
1127                 "      or %s the cut-off and the grid spacing.\n",
1128                 std::fabs(lossFractionPme * 100), (lossFractionPme < 0) ? "less" : "more",
1129                 (lossFractionPme < 0) ? "decrease" : "increase",
1130                 (lossFractionPme < 0) ? "decrease" : "increase");
1131         fprintf(fplog, "%s\n", buf);
1132         fprintf(stderr, "%s\n", buf);
1133     }
1134 }
1135
1136 //! Return the minimum communication volume.
1137 static float dd_vol_min(gmx_domdec_t* dd)
1138 {
1139     return dd->comm->load[0].cvol_min * dd->nnodes;
1140 }
1141
1142 //! Return the DD load flags.
1143 static int dd_load_flags(gmx_domdec_t* dd)
1144 {
1145     return dd->comm->load[0].flags;
1146 }
1147
1148 //! Return the reported load imbalance in force calculations.
1149 static float dd_f_imbal(gmx_domdec_t* dd)
1150 {
1151     if (dd->comm->load[0].sum > 0)
1152     {
1153         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1154     }
1155     else
1156     {
1157         /* Something is wrong in the cycle counting, report no load imbalance */
1158         return 0.0F;
1159     }
1160 }
1161
1162 //! Returns DD load balance report.
1163 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1164 {
1165     gmx::StringOutputStream stream;
1166     gmx::TextWriter         log(&stream);
1167
1168     int flags = dd_load_flags(dd);
1169     if (flags)
1170     {
1171         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1172         for (int d = 0; d < dd->ndim; d++)
1173         {
1174             if (flags & (1 << d))
1175             {
1176                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1177             }
1178         }
1179         log.ensureLineBreak();
1180     }
1181     log.writeString("DD  step " + gmx::toString(step));
1182     if (isDlbOn(dd->comm))
1183     {
1184         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1185     }
1186     if (dd->nnodes > 1)
1187     {
1188         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1189     }
1190     if (dd->comm->cycl_n[ddCyclPME])
1191     {
1192         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1193     }
1194     log.ensureLineBreak();
1195     return stream.toString();
1196 }
1197
1198 //! Prints DD load balance report in mdrun verbose mode.
1199 static void dd_print_load_verbose(gmx_domdec_t* dd)
1200 {
1201     if (isDlbOn(dd->comm))
1202     {
1203         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1204     }
1205     if (dd->nnodes > 1)
1206     {
1207         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1208     }
1209     if (dd->comm->cycl_n[ddCyclPME])
1210     {
1211         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1212     }
1213 }
1214
1215 //! Turns on dynamic load balancing if possible and needed.
1216 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1217 {
1218     gmx_domdec_comm_t* comm = dd->comm;
1219
1220     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1221     for (int d = 1; d < dd->ndim; d++)
1222     {
1223         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1224     }
1225
1226     /* Turn off DLB if we're too close to the cell size limit. */
1227     if (cellsize_min < comm->cellsize_limit * 1.05)
1228     {
1229         GMX_LOG(mdlog.info)
1230                 .appendTextFormatted(
1231                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1232                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1233                         "Will no longer try dynamic load balancing.",
1234                         gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1235
1236         comm->dlbState = DlbState::offForever;
1237         return;
1238     }
1239
1240     GMX_LOG(mdlog.info)
1241             .appendTextFormatted(
1242                     "step %s Turning on dynamic load balancing, because the performance loss due "
1243                     "to load imbalance is %.1f %%.",
1244                     gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1245     comm->dlbState = DlbState::onCanTurnOff;
1246
1247     /* Store the non-DLB performance, so we can check if DLB actually
1248      * improves performance.
1249      */
1250     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1251                        "When we turned on DLB, we should have measured cycles");
1252     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1253
1254     set_dlb_limits(dd);
1255
1256     /* We can set the required cell size info here,
1257      * so we do not need to communicate this.
1258      * The grid is completely uniform.
1259      */
1260     for (int d = 0; d < dd->ndim; d++)
1261     {
1262         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1263
1264         if (rowMaster)
1265         {
1266             comm->load[d].sum_m = comm->load[d].sum;
1267
1268             int nc = dd->numCells[dd->dim[d]];
1269             for (int i = 0; i < nc; i++)
1270             {
1271                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1272                 if (d > 0)
1273                 {
1274                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1275                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1276                 }
1277             }
1278             rowMaster->cellFrac[nc] = 1.0;
1279         }
1280     }
1281 }
1282
1283 //! Turns off dynamic load balancing (but leave it able to turn back on).
1284 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1285 {
1286     GMX_LOG(mdlog.info)
1287             .appendText(
1288                     "step " + gmx::toString(step)
1289                     + " Turning off dynamic load balancing, because it is degrading performance.");
1290     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1291     dd->comm->haveTurnedOffDlb             = true;
1292     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1293 }
1294
1295 //! Turns off dynamic load balancing permanently.
1296 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1297 {
1298     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1299                        "Can only turn off DLB forever when it was in the can-turn-on state");
1300     GMX_LOG(mdlog.info)
1301             .appendText(
1302                     "step " + gmx::toString(step)
1303                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1304     dd->comm->dlbState = DlbState::offForever;
1305 }
1306
1307 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1308 {
1309     gmx_domdec_comm_t* comm;
1310
1311     comm = cr->dd->comm;
1312
1313     /* Turn on the DLB limiting (might have been on already) */
1314     comm->bPMELoadBalDLBLimits = TRUE;
1315
1316     /* Change the cut-off limit */
1317     comm->PMELoadBal_max_cutoff = cutoff;
1318
1319     if (debug)
1320     {
1321         fprintf(debug,
1322                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1323                 "continue to fit\n",
1324                 comm->PMELoadBal_max_cutoff);
1325     }
1326 }
1327
1328 //! Merge atom buffers.
1329 static void merge_cg_buffers(int                            ncell,
1330                              gmx_domdec_comm_dim_t*         cd,
1331                              int                            pulse,
1332                              int*                           ncg_cell,
1333                              gmx::ArrayRef<int>             index_gl,
1334                              const int*                     recv_i,
1335                              gmx::ArrayRef<gmx::RVec>       x,
1336                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1337                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1338                              gmx::ArrayRef<int>             cginfo)
1339 {
1340     gmx_domdec_ind_t *ind, *ind_p;
1341     int               p, cell, c, cg, cg0, cg1, cg_gl;
1342     int               shift;
1343
1344     ind = &cd->ind[pulse];
1345
1346     /* First correct the already stored data */
1347     shift = ind->nrecv[ncell];
1348     for (cell = ncell - 1; cell >= 0; cell--)
1349     {
1350         shift -= ind->nrecv[cell];
1351         if (shift > 0)
1352         {
1353             /* Move the cg's present from previous grid pulses */
1354             cg0 = ncg_cell[ncell + cell];
1355             cg1 = ncg_cell[ncell + cell + 1];
1356             for (cg = cg1 - 1; cg >= cg0; cg--)
1357             {
1358                 index_gl[cg + shift] = index_gl[cg];
1359                 x[cg + shift]        = x[cg];
1360                 cginfo[cg + shift]   = cginfo[cg];
1361             }
1362             /* Correct the already stored send indices for the shift */
1363             for (p = 1; p <= pulse; p++)
1364             {
1365                 ind_p = &cd->ind[p];
1366                 cg0   = 0;
1367                 for (c = 0; c < cell; c++)
1368                 {
1369                     cg0 += ind_p->nsend[c];
1370                 }
1371                 cg1 = cg0 + ind_p->nsend[cell];
1372                 for (cg = cg0; cg < cg1; cg++)
1373                 {
1374                     ind_p->index[cg] += shift;
1375                 }
1376             }
1377         }
1378     }
1379
1380     /* Merge in the communicated buffers */
1381     shift = 0;
1382     cg0   = 0;
1383     for (cell = 0; cell < ncell; cell++)
1384     {
1385         cg1 = ncg_cell[ncell + cell + 1] + shift;
1386         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1387         {
1388             /* Copy this atom from the buffer */
1389             index_gl[cg1] = recv_i[cg0];
1390             x[cg1]        = recv_vr[cg0];
1391             /* Copy information */
1392             cg_gl       = index_gl[cg1];
1393             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1394             cg0++;
1395             cg1++;
1396         }
1397         shift += ind->nrecv[cell];
1398         ncg_cell[ncell + cell + 1] = cg1;
1399     }
1400 }
1401
1402 //! Makes a range partitioning for the atom groups wthin a cell
1403 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1404 {
1405     /* Store the atom block boundaries for easy copying of communication buffers
1406      */
1407     int g = atomGroupStart;
1408     for (int zone = 0; zone < nzone; zone++)
1409     {
1410         for (gmx_domdec_ind_t& ind : cd->ind)
1411         {
1412             ind.cell2at0[zone] = g;
1413             g += ind.nrecv[zone];
1414             ind.cell2at1[zone] = g;
1415         }
1416     }
1417 }
1418
1419 //! Returns whether a link is missing.
1420 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1421 {
1422     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1423     {
1424         if (!ga2la.findHome(link.a[i]))
1425         {
1426             return true;
1427         }
1428     }
1429
1430     return false;
1431 }
1432
1433 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1434 typedef struct
1435 {
1436     //! The corners for the non-bonded communication.
1437     real c[DIM][4];
1438     //! Corner for rounding.
1439     real cr0;
1440     //! Corners for rounding.
1441     real cr1[4];
1442     //! Corners for bounded communication.
1443     real bc[DIM];
1444     //! Corner for rounding for bonded communication.
1445     real bcr1;
1446 } dd_corners_t;
1447
1448 //! Determine the corners of the domain(s) we are communicating with.
1449 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1450 {
1451     const gmx_domdec_comm_t*  comm;
1452     const gmx_domdec_zones_t* zones;
1453
1454     comm = dd->comm;
1455
1456     zones = &comm->zones;
1457
1458     /* Keep the compiler happy */
1459     c->cr0  = 0;
1460     c->bcr1 = 0;
1461
1462     /* The first dimension is equal for all cells */
1463     c->c[0][0] = comm->cell_x0[dim0];
1464     if (bDistMB)
1465     {
1466         c->bc[0] = c->c[0][0];
1467     }
1468     if (dd->ndim >= 2)
1469     {
1470         dim1 = dd->dim[1];
1471         /* This cell row is only seen from the first row */
1472         c->c[1][0] = comm->cell_x0[dim1];
1473         /* All rows can see this row */
1474         c->c[1][1] = comm->cell_x0[dim1];
1475         if (isDlbOn(dd->comm))
1476         {
1477             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1478             if (bDistMB)
1479             {
1480                 /* For the multi-body distance we need the maximum */
1481                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1482             }
1483         }
1484         /* Set the upper-right corner for rounding */
1485         c->cr0 = comm->cell_x1[dim0];
1486
1487         if (dd->ndim >= 3)
1488         {
1489             dim2 = dd->dim[2];
1490             for (int j = 0; j < 4; j++)
1491             {
1492                 c->c[2][j] = comm->cell_x0[dim2];
1493             }
1494             if (isDlbOn(dd->comm))
1495             {
1496                 /* Use the maximum of the i-cells that see a j-cell */
1497                 for (const auto& iZone : zones->iZones)
1498                 {
1499                     const int iZoneIndex = iZone.iZoneIndex;
1500                     for (int jZone : iZone.jZoneRange)
1501                     {
1502                         if (jZone >= 4)
1503                         {
1504                             c->c[2][jZone - 4] = std::max(
1505                                     c->c[2][jZone - 4],
1506                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1507                                             .mch0);
1508                         }
1509                     }
1510                 }
1511                 if (bDistMB)
1512                 {
1513                     /* For the multi-body distance we need the maximum */
1514                     c->bc[2] = comm->cell_x0[dim2];
1515                     for (int i = 0; i < 2; i++)
1516                     {
1517                         for (int j = 0; j < 2; j++)
1518                         {
1519                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1520                         }
1521                     }
1522                 }
1523             }
1524
1525             /* Set the upper-right corner for rounding */
1526             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1527              * Only cell (0,0,0) can see cell 7 (1,1,1)
1528              */
1529             c->cr1[0] = comm->cell_x1[dim1];
1530             c->cr1[3] = comm->cell_x1[dim1];
1531             if (isDlbOn(dd->comm))
1532             {
1533                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1534                 if (bDistMB)
1535                 {
1536                     /* For the multi-body distance we need the maximum */
1537                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1538                 }
1539             }
1540         }
1541     }
1542 }
1543
1544 /*! \brief Add the atom groups we need to send in this pulse from this
1545  * zone to \p localAtomGroups and \p work. */
1546 static void get_zone_pulse_cgs(gmx_domdec_t*            dd,
1547                                int                      zonei,
1548                                int                      zone,
1549                                int                      cg0,
1550                                int                      cg1,
1551                                gmx::ArrayRef<const int> globalAtomGroupIndices,
1552                                int                      dim,
1553                                int                      dim_ind,
1554                                int                      dim0,
1555                                int                      dim1,
1556                                int                      dim2,
1557                                real                     r_comm2,
1558                                real                     r_bcomm2,
1559                                matrix                   box,
1560                                bool                     distanceIsTriclinic,
1561                                rvec*                    normal,
1562                                real                     skew_fac2_d,
1563                                real                     skew_fac_01,
1564                                rvec*                    v_d,
1565                                rvec*                    v_0,
1566                                rvec*                    v_1,
1567                                const dd_corners_t*      c,
1568                                const rvec               sf2_round,
1569                                gmx_bool                 bDistBonded,
1570                                gmx_bool                 bBondComm,
1571                                gmx_bool                 bDist2B,
1572                                gmx_bool                 bDistMB,
1573                                rvec*                    cg_cm,
1574                                gmx::ArrayRef<const int> cginfo,
1575                                std::vector<int>*        localAtomGroups,
1576                                dd_comm_setup_work_t*    work)
1577 {
1578     gmx_domdec_comm_t* comm;
1579     gmx_bool           bScrew;
1580     gmx_bool           bDistMB_pulse;
1581     int                cg, i;
1582     real               r2, rb2, r, tric_sh;
1583     rvec               rn, rb;
1584     int                dimd;
1585     int                nsend_z, nat;
1586
1587     comm = dd->comm;
1588
1589     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1590
1591     bDistMB_pulse = (bDistMB && bDistBonded);
1592
1593     /* Unpack the work data */
1594     std::vector<int>&       ibuf = work->atomGroupBuffer;
1595     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1596     nsend_z                      = 0;
1597     nat                          = work->nat;
1598
1599     for (cg = cg0; cg < cg1; cg++)
1600     {
1601         r2  = 0;
1602         rb2 = 0;
1603         if (!distanceIsTriclinic)
1604         {
1605             /* Rectangular direction, easy */
1606             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1607             if (r > 0)
1608             {
1609                 r2 += r * r;
1610             }
1611             if (bDistMB_pulse)
1612             {
1613                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1614                 if (r > 0)
1615                 {
1616                     rb2 += r * r;
1617                 }
1618             }
1619             /* Rounding gives at most a 16% reduction
1620              * in communicated atoms
1621              */
1622             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1623             {
1624                 r = cg_cm[cg][dim0] - c->cr0;
1625                 /* This is the first dimension, so always r >= 0 */
1626                 r2 += r * r;
1627                 if (bDistMB_pulse)
1628                 {
1629                     rb2 += r * r;
1630                 }
1631             }
1632             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1633             {
1634                 r = cg_cm[cg][dim1] - c->cr1[zone];
1635                 if (r > 0)
1636                 {
1637                     r2 += r * r;
1638                 }
1639                 if (bDistMB_pulse)
1640                 {
1641                     r = cg_cm[cg][dim1] - c->bcr1;
1642                     if (r > 0)
1643                     {
1644                         rb2 += r * r;
1645                     }
1646                 }
1647             }
1648         }
1649         else
1650         {
1651             /* Triclinic direction, more complicated */
1652             clear_rvec(rn);
1653             clear_rvec(rb);
1654             /* Rounding, conservative as the skew_fac multiplication
1655              * will slightly underestimate the distance.
1656              */
1657             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1658             {
1659                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1660                 for (i = dim0 + 1; i < DIM; i++)
1661                 {
1662                     rn[dim0] -= cg_cm[cg][i] * v_0[i][dim0];
1663                 }
1664                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1665                 if (bDistMB_pulse)
1666                 {
1667                     rb[dim0] = rn[dim0];
1668                     rb2      = r2;
1669                 }
1670                 /* Take care that the cell planes along dim0 might not
1671                  * be orthogonal to those along dim1 and dim2.
1672                  */
1673                 for (i = 1; i <= dim_ind; i++)
1674                 {
1675                     dimd = dd->dim[i];
1676                     if (normal[dim0][dimd] > 0)
1677                     {
1678                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1679                         if (bDistMB_pulse)
1680                         {
1681                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1682                         }
1683                     }
1684                 }
1685             }
1686             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1687             {
1688                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1689                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1690                 tric_sh = 0;
1691                 for (i = dim1 + 1; i < DIM; i++)
1692                 {
1693                     tric_sh -= cg_cm[cg][i] * v_1[i][dim1];
1694                 }
1695                 rn[dim1] += tric_sh;
1696                 if (rn[dim1] > 0)
1697                 {
1698                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1699                     /* Take care of coupling of the distances
1700                      * to the planes along dim0 and dim1 through dim2.
1701                      */
1702                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1703                     /* Take care that the cell planes along dim1
1704                      * might not be orthogonal to that along dim2.
1705                      */
1706                     if (normal[dim1][dim2] > 0)
1707                     {
1708                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1709                     }
1710                 }
1711                 if (bDistMB_pulse)
1712                 {
1713                     rb[dim1] += cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1714                     if (rb[dim1] > 0)
1715                     {
1716                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1717                         /* Take care of coupling of the distances
1718                          * to the planes along dim0 and dim1 through dim2.
1719                          */
1720                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1721                         /* Take care that the cell planes along dim1
1722                          * might not be orthogonal to that along dim2.
1723                          */
1724                         if (normal[dim1][dim2] > 0)
1725                         {
1726                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1727                         }
1728                     }
1729                 }
1730             }
1731             /* The distance along the communication direction */
1732             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1733             tric_sh = 0;
1734             for (i = dim + 1; i < DIM; i++)
1735             {
1736                 tric_sh -= cg_cm[cg][i] * v_d[i][dim];
1737             }
1738             rn[dim] += tric_sh;
1739             if (rn[dim] > 0)
1740             {
1741                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1742                 /* Take care of coupling of the distances
1743                  * to the planes along dim0 and dim1 through dim2.
1744                  */
1745                 if (dim_ind == 1 && zonei == 1)
1746                 {
1747                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1748                 }
1749             }
1750             if (bDistMB_pulse)
1751             {
1752                 clear_rvec(rb);
1753                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1754                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1755                 if (rb[dim] > 0)
1756                 {
1757                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1758                     /* Take care of coupling of the distances
1759                      * to the planes along dim0 and dim1 through dim2.
1760                      */
1761                     if (dim_ind == 1 && zonei == 1)
1762                     {
1763                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1764                     }
1765                 }
1766             }
1767         }
1768
1769         if (r2 < r_comm2
1770             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1771                 && (!bBondComm
1772                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1773                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1774         {
1775             /* Store the local and global atom group indices and position */
1776             localAtomGroups->push_back(cg);
1777             ibuf.push_back(globalAtomGroupIndices[cg]);
1778             nsend_z++;
1779
1780             rvec posPbc;
1781             if (dd->ci[dim] == 0)
1782             {
1783                 /* Correct cg_cm for pbc */
1784                 rvec_add(cg_cm[cg], box[dim], posPbc);
1785                 if (bScrew)
1786                 {
1787                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1788                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1789                 }
1790             }
1791             else
1792             {
1793                 copy_rvec(cg_cm[cg], posPbc);
1794             }
1795             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1796
1797             nat += 1;
1798         }
1799     }
1800
1801     work->nat        = nat;
1802     work->nsend_zone = nsend_z;
1803 }
1804
1805 //! Clear data.
1806 static void clearCommSetupData(dd_comm_setup_work_t* work)
1807 {
1808     work->localAtomGroupBuffer.clear();
1809     work->atomGroupBuffer.clear();
1810     work->positionBuffer.clear();
1811     work->nat        = 0;
1812     work->nsend_zone = 0;
1813 }
1814
1815 //! Prepare DD communication.
1816 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1817 {
1818     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1819     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1820     int                    c;
1821     int *                  zone_cg_range, pos_cg;
1822     gmx_domdec_comm_t*     comm;
1823     gmx_domdec_zones_t*    zones;
1824     gmx_domdec_comm_dim_t* cd;
1825     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1826     dd_corners_t           corners;
1827     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1828     real                   skew_fac2_d, skew_fac_01;
1829     rvec                   sf2_round;
1830
1831     if (debug)
1832     {
1833         fprintf(debug, "Setting up DD communication\n");
1834     }
1835
1836     comm = dd->comm;
1837
1838     if (comm->dth.empty())
1839     {
1840         /* Initialize the thread data.
1841          * This can not be done in init_domain_decomposition,
1842          * as the numbers of threads is determined later.
1843          */
1844         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1845         comm->dth.resize(numThreads);
1846     }
1847
1848     bBondComm = comm->systemInfo.filterBondedCommunication;
1849
1850     /* Do we need to determine extra distances for multi-body bondeds? */
1851     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1852
1853     /* Do we need to determine extra distances for only two-body bondeds? */
1854     bDist2B = (bBondComm && !bDistMB);
1855
1856     const real r_comm2 =
1857             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1858     const real r_bcomm2 =
1859             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1860
1861     if (debug)
1862     {
1863         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1864     }
1865
1866     zones = &comm->zones;
1867
1868     dim0 = dd->dim[0];
1869     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1870     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1871
1872     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1873
1874     /* Triclinic stuff */
1875     normal      = ddbox->normal;
1876     skew_fac_01 = 0;
1877     if (dd->ndim >= 2)
1878     {
1879         v_0 = ddbox->v[dim0];
1880         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1881         {
1882             /* Determine the coupling coefficient for the distances
1883              * to the cell planes along dim0 and dim1 through dim2.
1884              * This is required for correct rounding.
1885              */
1886             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1887             if (debug)
1888             {
1889                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1890             }
1891         }
1892     }
1893     if (dd->ndim >= 3)
1894     {
1895         v_1 = ddbox->v[dim1];
1896     }
1897
1898     zone_cg_range                        = zones->cg_range;
1899     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1900
1901     zone_cg_range[0]   = 0;
1902     zone_cg_range[1]   = dd->ncg_home;
1903     comm->zone_ncg1[0] = dd->ncg_home;
1904     pos_cg             = dd->ncg_home;
1905
1906     nat_tot = comm->atomRanges.numHomeAtoms();
1907     nzone   = 1;
1908     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1909     {
1910         dim = dd->dim[dim_ind];
1911         cd  = &comm->cd[dim_ind];
1912
1913         /* Check if we need to compute triclinic distances along this dim */
1914         bool distanceIsTriclinic = false;
1915         for (int i = 0; i <= dim_ind; i++)
1916         {
1917             if (ddbox->tric_dir[dd->dim[i]])
1918             {
1919                 distanceIsTriclinic = true;
1920             }
1921         }
1922
1923         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1924         {
1925             /* No pbc in this dimension, the first node should not comm. */
1926             nzone_send = 0;
1927         }
1928         else
1929         {
1930             nzone_send = nzone;
1931         }
1932
1933         v_d         = ddbox->v[dim];
1934         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1935
1936         cd->receiveInPlace = true;
1937         for (int p = 0; p < cd->numPulses(); p++)
1938         {
1939             /* Only atoms communicated in the first pulse are used
1940              * for multi-body bonded interactions or for bBondComm.
1941              */
1942             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1943
1944             gmx_domdec_ind_t* ind = &cd->ind[p];
1945
1946             /* Thread 0 writes in the global index array */
1947             ind->index.clear();
1948             clearCommSetupData(&comm->dth[0]);
1949
1950             for (zone = 0; zone < nzone_send; zone++)
1951             {
1952                 if (dim_ind > 0 && distanceIsTriclinic)
1953                 {
1954                     /* Determine slightly more optimized skew_fac's
1955                      * for rounding.
1956                      * This reduces the number of communicated atoms
1957                      * by about 10% for 3D DD of rhombic dodecahedra.
1958                      */
1959                     for (dimd = 0; dimd < dim; dimd++)
1960                     {
1961                         sf2_round[dimd] = 1;
1962                         if (ddbox->tric_dir[dimd])
1963                         {
1964                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1965                             {
1966                                 /* If we are shifted in dimension i
1967                                  * and the cell plane is tilted forward
1968                                  * in dimension i, skip this coupling.
1969                                  */
1970                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1971                                 {
1972                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1973                                 }
1974                             }
1975                             sf2_round[dimd] = 1 / sf2_round[dimd];
1976                         }
1977                     }
1978                 }
1979
1980                 zonei = zone_perm[dim_ind][zone];
1981                 if (p == 0)
1982                 {
1983                     /* Here we permutate the zones to obtain a convenient order
1984                      * for neighbor searching
1985                      */
1986                     cg0 = zone_cg_range[zonei];
1987                     cg1 = zone_cg_range[zonei + 1];
1988                 }
1989                 else
1990                 {
1991                     /* Look only at the cg's received in the previous grid pulse
1992                      */
1993                     cg1 = zone_cg_range[nzone + zone + 1];
1994                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
1995                 }
1996
1997                 const int numThreads = gmx::ssize(comm->dth);
1998 #pragma omp parallel for num_threads(numThreads) schedule(static)
1999                 for (int th = 0; th < numThreads; th++)
2000                 {
2001                     try
2002                     {
2003                         dd_comm_setup_work_t& work = comm->dth[th];
2004
2005                         /* Retain data accumulated into buffers of thread 0 */
2006                         if (th > 0)
2007                         {
2008                             clearCommSetupData(&work);
2009                         }
2010
2011                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2012                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2013
2014                         /* Get the cg's for this pulse in this zone */
2015                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, dd->globalAtomGroupIndices,
2016                                            dim, dim_ind, dim0, dim1, dim2, r_comm2, r_bcomm2, box,
2017                                            distanceIsTriclinic, normal, skew_fac2_d, skew_fac_01,
2018                                            v_d, v_0, v_1, &corners, sf2_round, bDistBonded, bBondComm,
2019                                            bDist2B, bDistMB, state->x.rvec_array(), fr->cginfo,
2020                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer, &work);
2021                     }
2022                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2023                 } // END
2024
2025                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2026                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2027                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2028                 /* Append data of threads>=1 to the communication buffers */
2029                 for (int th = 1; th < numThreads; th++)
2030                 {
2031                     const dd_comm_setup_work_t& dth = comm->dth[th];
2032
2033                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(),
2034                                       dth.localAtomGroupBuffer.end());
2035                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(),
2036                                       dth.atomGroupBuffer.end());
2037                     positions.insert(positions.end(), dth.positionBuffer.begin(),
2038                                      dth.positionBuffer.end());
2039                     comm->dth[0].nat += dth.nat;
2040                     ind->nsend[zone] += dth.nsend_zone;
2041                 }
2042             }
2043             /* Clear the counts in case we do not have pbc */
2044             for (zone = nzone_send; zone < nzone; zone++)
2045             {
2046                 ind->nsend[zone] = 0;
2047             }
2048             ind->nsend[nzone]     = ind->index.size();
2049             ind->nsend[nzone + 1] = comm->dth[0].nat;
2050             /* Communicate the number of cg's and atoms to receive */
2051             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2052
2053             if (p > 0)
2054             {
2055                 /* We can receive in place if only the last zone is not empty */
2056                 for (zone = 0; zone < nzone - 1; zone++)
2057                 {
2058                     if (ind->nrecv[zone] > 0)
2059                     {
2060                         cd->receiveInPlace = false;
2061                     }
2062                 }
2063             }
2064
2065             int receiveBufferSize = 0;
2066             if (!cd->receiveInPlace)
2067             {
2068                 receiveBufferSize = ind->nrecv[nzone];
2069             }
2070             /* These buffer are actually only needed with in-place */
2071             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2072             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2073
2074             dd_comm_setup_work_t& work = comm->dth[0];
2075
2076             /* Make space for the global cg indices */
2077             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2078             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2079             /* Communicate the global cg indices */
2080             gmx::ArrayRef<int> integerBufferRef;
2081             if (cd->receiveInPlace)
2082             {
2083                 integerBufferRef = gmx::arrayRefFromArray(
2084                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2085             }
2086             else
2087             {
2088                 integerBufferRef = globalAtomGroupBuffer.buffer;
2089             }
2090             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2091
2092             /* Make space for cg_cm */
2093             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2094
2095             /* Communicate the coordinates */
2096             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2097             if (cd->receiveInPlace)
2098             {
2099                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2100             }
2101             else
2102             {
2103                 rvecBufferRef = rvecBuffer.buffer;
2104             }
2105             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2106
2107             /* Make the charge group index */
2108             if (cd->receiveInPlace)
2109             {
2110                 zone = (p == 0 ? 0 : nzone - 1);
2111                 while (zone < nzone)
2112                 {
2113                     for (int i = 0; i < ind->nrecv[zone]; i++)
2114                     {
2115                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2116                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2117                         pos_cg++;
2118                     }
2119                     if (p == 0)
2120                     {
2121                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2122                     }
2123                     zone++;
2124                     zone_cg_range[nzone + zone] = pos_cg;
2125                 }
2126             }
2127             else
2128             {
2129                 /* This part of the code is never executed with bBondComm. */
2130                 merge_cg_buffers(nzone, cd, p, zone_cg_range, dd->globalAtomGroupIndices,
2131                                  integerBufferRef.data(), state->x, rvecBufferRef, fr->cginfo_mb,
2132                                  fr->cginfo);
2133                 pos_cg += ind->nrecv[nzone];
2134             }
2135             nat_tot += ind->nrecv[nzone + 1];
2136         }
2137         if (!cd->receiveInPlace)
2138         {
2139             /* Store the atom block for easy copying of communication buffers */
2140             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2141         }
2142         nzone += nzone;
2143     }
2144
2145     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2146
2147     if (!bBondComm)
2148     {
2149         /* We don't need to update cginfo, since that was alrady done above.
2150          * So we pass NULL for the forcerec.
2151          */
2152         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2153     }
2154
2155     if (debug)
2156     {
2157         fprintf(debug, "Finished setting up DD communication, zones:");
2158         for (c = 0; c < zones->n; c++)
2159         {
2160             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2161         }
2162         fprintf(debug, "\n");
2163     }
2164 }
2165
2166 //! Set boundaries for the charge group range.
2167 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2168 {
2169     for (auto& iZone : zones->iZones)
2170     {
2171         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2172         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2173                                            zones->cg_range[iZone.jZoneRange.end()]);
2174     }
2175 }
2176
2177 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2178  *
2179  * Also sets the atom density for the home zone when \p zone_start=0.
2180  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2181  * how many charge groups will move but are still part of the current range.
2182  * \todo When converting domdec to use proper classes, all these variables
2183  *       should be private and a method should return the correct count
2184  *       depending on an internal state.
2185  *
2186  * \param[in,out] dd          The domain decomposition struct
2187  * \param[in]     box         The box
2188  * \param[in]     ddbox       The domain decomposition box struct
2189  * \param[in]     zone_start  The start of the zone range to set sizes for
2190  * \param[in]     zone_end    The end of the zone range to set sizes for
2191  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2192  */
2193 static void set_zones_size(gmx_domdec_t*      dd,
2194                            matrix             box,
2195                            const gmx_ddbox_t* ddbox,
2196                            int                zone_start,
2197                            int                zone_end,
2198                            int                numMovedChargeGroupsInHomeZone)
2199 {
2200     gmx_domdec_comm_t*  comm;
2201     gmx_domdec_zones_t* zones;
2202     gmx_bool            bDistMB;
2203     int                 z, d, dim;
2204     real                rcs, rcmbs;
2205     int                 i, j;
2206     real                vol;
2207
2208     comm = dd->comm;
2209
2210     zones = &comm->zones;
2211
2212     /* Do we need to determine extra distances for multi-body bondeds? */
2213     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2214
2215     for (z = zone_start; z < zone_end; z++)
2216     {
2217         /* Copy cell limits to zone limits.
2218          * Valid for non-DD dims and non-shifted dims.
2219          */
2220         copy_rvec(comm->cell_x0, zones->size[z].x0);
2221         copy_rvec(comm->cell_x1, zones->size[z].x1);
2222     }
2223
2224     for (d = 0; d < dd->ndim; d++)
2225     {
2226         dim = dd->dim[d];
2227
2228         for (z = 0; z < zones->n; z++)
2229         {
2230             /* With a staggered grid we have different sizes
2231              * for non-shifted dimensions.
2232              */
2233             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2234             {
2235                 if (d == 1)
2236                 {
2237                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2238                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2239                 }
2240                 else if (d == 2)
2241                 {
2242                     zones->size[z].x0[dim] =
2243                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2244                                     .min0;
2245                     zones->size[z].x1[dim] =
2246                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2247                                     .max1;
2248                 }
2249             }
2250         }
2251
2252         rcs   = comm->systemInfo.cutoff;
2253         rcmbs = comm->cutoff_mbody;
2254         if (ddbox->tric_dir[dim])
2255         {
2256             rcs /= ddbox->skew_fac[dim];
2257             rcmbs /= ddbox->skew_fac[dim];
2258         }
2259
2260         /* Set the lower limit for the shifted zone dimensions */
2261         for (z = zone_start; z < zone_end; z++)
2262         {
2263             if (zones->shift[z][dim] > 0)
2264             {
2265                 dim = dd->dim[d];
2266                 if (!isDlbOn(dd->comm) || d == 0)
2267                 {
2268                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2269                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2270                 }
2271                 else
2272                 {
2273                     /* Here we take the lower limit of the zone from
2274                      * the lowest domain of the zone below.
2275                      */
2276                     if (z < 4)
2277                     {
2278                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2279                     }
2280                     else
2281                     {
2282                         if (d == 1)
2283                         {
2284                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2285                         }
2286                         else
2287                         {
2288                             zones->size[z].x0[dim] =
2289                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2290                                             .min1;
2291                         }
2292                     }
2293                     /* A temporary limit, is updated below */
2294                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2295
2296                     if (bDistMB)
2297                     {
2298                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2299                         {
2300                             if (zones->shift[zi][dim] == 0)
2301                             {
2302                                 /* This takes the whole zone into account.
2303                                  * With multiple pulses this will lead
2304                                  * to a larger zone then strictly necessary.
2305                                  */
2306                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2307                                                                   zones->size[zi].x1[dim] + rcmbs);
2308                             }
2309                         }
2310                     }
2311                 }
2312             }
2313         }
2314
2315         /* Loop over the i-zones to set the upper limit of each
2316          * j-zone they see.
2317          */
2318         for (const auto& iZone : zones->iZones)
2319         {
2320             const int zi = iZone.iZoneIndex;
2321             if (zones->shift[zi][dim] == 0)
2322             {
2323                 /* We should only use zones up to zone_end */
2324                 const auto& jZoneRangeFull = iZone.jZoneRange;
2325                 if (zone_end <= *jZoneRangeFull.begin())
2326                 {
2327                     continue;
2328                 }
2329                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2330                                                  std::min(*jZoneRangeFull.end(), zone_end));
2331                 for (int jZone : jZoneRange)
2332                 {
2333                     if (zones->shift[jZone][dim] > 0)
2334                     {
2335                         zones->size[jZone].x1[dim] =
2336                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2337                     }
2338                 }
2339             }
2340         }
2341     }
2342
2343     for (z = zone_start; z < zone_end; z++)
2344     {
2345         /* Initialization only required to keep the compiler happy */
2346         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2347         int  nc, c;
2348
2349         /* To determine the bounding box for a zone we need to find
2350          * the extreme corners of 4, 2 or 1 corners.
2351          */
2352         nc = 1 << (ddbox->nboundeddim - 1);
2353
2354         for (c = 0; c < nc; c++)
2355         {
2356             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2357             corner[XX] = 0;
2358             if ((c & 1) == 0)
2359             {
2360                 corner[YY] = zones->size[z].x0[YY];
2361             }
2362             else
2363             {
2364                 corner[YY] = zones->size[z].x1[YY];
2365             }
2366             if ((c & 2) == 0)
2367             {
2368                 corner[ZZ] = zones->size[z].x0[ZZ];
2369             }
2370             else
2371             {
2372                 corner[ZZ] = zones->size[z].x1[ZZ];
2373             }
2374             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2375                 && box[ZZ][1 - dd->dim[0]] != 0)
2376             {
2377                 /* With 1D domain decomposition the cg's are not in
2378                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2379                  * Shift the corner of the z-vector back to along the box
2380                  * vector of dimension d, so it will later end up at 0 along d.
2381                  * This can affect the location of this corner along dd->dim[0]
2382                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2383                  */
2384                 int d = 1 - dd->dim[0];
2385
2386                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2387             }
2388             /* Apply the triclinic couplings */
2389             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2390             {
2391                 for (j = XX; j < i; j++)
2392                 {
2393                     corner[j] += corner[i] * box[i][j] / box[i][i];
2394                 }
2395             }
2396             if (c == 0)
2397             {
2398                 copy_rvec(corner, corner_min);
2399                 copy_rvec(corner, corner_max);
2400             }
2401             else
2402             {
2403                 for (i = 0; i < DIM; i++)
2404                 {
2405                     corner_min[i] = std::min(corner_min[i], corner[i]);
2406                     corner_max[i] = std::max(corner_max[i], corner[i]);
2407                 }
2408             }
2409         }
2410         /* Copy the extreme cornes without offset along x */
2411         for (i = 0; i < DIM; i++)
2412         {
2413             zones->size[z].bb_x0[i] = corner_min[i];
2414             zones->size[z].bb_x1[i] = corner_max[i];
2415         }
2416         /* Add the offset along x */
2417         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2418         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2419     }
2420
2421     if (zone_start == 0)
2422     {
2423         vol = 1;
2424         for (dim = 0; dim < DIM; dim++)
2425         {
2426             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2427         }
2428         zones->dens_zone0 =
2429                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2430     }
2431
2432     if (debug)
2433     {
2434         for (z = zone_start; z < zone_end; z++)
2435         {
2436             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2437                     zones->size[z].x0[XX], zones->size[z].x1[XX], zones->size[z].x0[YY],
2438                     zones->size[z].x1[YY], zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2439             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2440                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], zones->size[z].bb_x0[YY],
2441                     zones->size[z].bb_x1[YY], zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2442         }
2443     }
2444 }
2445
2446 /*! \brief Order data in \p dataToSort according to \p sort
2447  *
2448  * Note: both buffers should have at least \p sort.size() elements.
2449  */
2450 template<typename T>
2451 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2452                         gmx::ArrayRef<T>                  dataToSort,
2453                         gmx::ArrayRef<T>                  sortBuffer)
2454 {
2455     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2456     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2457                "The sorting buffer needs to be sufficiently large");
2458
2459     /* Order the data into the temporary buffer */
2460     size_t i = 0;
2461     for (const gmx_cgsort_t& entry : sort)
2462     {
2463         sortBuffer[i++] = dataToSort[entry.ind];
2464     }
2465
2466     /* Copy back to the original array */
2467     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2468 }
2469
2470 /*! \brief Order data in \p dataToSort according to \p sort
2471  *
2472  * Note: \p vectorToSort should have at least \p sort.size() elements,
2473  *       \p workVector is resized when it is too small.
2474  */
2475 template<typename T>
2476 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2477                         gmx::ArrayRef<T>                  vectorToSort,
2478                         std::vector<T>*                   workVector)
2479 {
2480     if (gmx::index(workVector->size()) < sort.ssize())
2481     {
2482         workVector->resize(sort.size());
2483     }
2484     orderVector<T>(sort, vectorToSort, *workVector);
2485 }
2486
2487 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2488 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2489 {
2490     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2491
2492     /* Using push_back() instead of this resize results in much slower code */
2493     sort->resize(atomOrder.size());
2494     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2495     size_t                      numSorted = 0;
2496     for (int i : atomOrder)
2497     {
2498         if (i >= 0)
2499         {
2500             /* The values of nsc and ind_gl are not used in this case */
2501             buffer[numSorted++].ind = i;
2502         }
2503     }
2504     sort->resize(numSorted);
2505 }
2506
2507 //! Returns the sorting state for DD.
2508 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2509 {
2510     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2511
2512     dd_sort_order_nbnxn(fr, &sort->sorted);
2513
2514     /* We alloc with the old size, since cgindex is still old */
2515     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2516
2517     /* Set the new home atom/charge group count */
2518     dd->ncg_home = sort->sorted.size();
2519     if (debug)
2520     {
2521         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2522     }
2523
2524     /* Reorder the state */
2525     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2526     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2527
2528     if (state->flags & (1 << estX))
2529     {
2530         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2531     }
2532     if (state->flags & (1 << estV))
2533     {
2534         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2535     }
2536     if (state->flags & (1 << estCGP))
2537     {
2538         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2539     }
2540
2541     /* Reorder the global cg index */
2542     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2543     /* Reorder the cginfo */
2544     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2545     /* Set the home atom number */
2546     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2547
2548     /* The atoms are now exactly in grid order, update the grid order */
2549     fr->nbv->setLocalAtomOrder();
2550 }
2551
2552 //! Accumulates load statistics.
2553 static void add_dd_statistics(gmx_domdec_t* dd)
2554 {
2555     gmx_domdec_comm_t* comm = dd->comm;
2556
2557     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2558     {
2559         auto range = static_cast<DDAtomRanges::Type>(i);
2560         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2561     }
2562     comm->ndecomp++;
2563 }
2564
2565 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2566 {
2567     gmx_domdec_comm_t* comm = dd->comm;
2568
2569     /* Reset all the statistics and counters for total run counting */
2570     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2571     {
2572         comm->sum_nat[i] = 0;
2573     }
2574     comm->ndecomp   = 0;
2575     comm->nload     = 0;
2576     comm->load_step = 0;
2577     comm->load_sum  = 0;
2578     comm->load_max  = 0;
2579     clear_ivec(comm->load_lim);
2580     comm->load_mdf = 0;
2581     comm->load_pme = 0;
2582 }
2583
2584 void print_dd_statistics(const t_commrec* cr, const t_inputrec* ir, FILE* fplog)
2585 {
2586     gmx_domdec_comm_t* comm = cr->dd->comm;
2587
2588     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2589     gmx_sumd(numRanges, comm->sum_nat, cr);
2590
2591     if (fplog == nullptr)
2592     {
2593         return;
2594     }
2595
2596     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2597
2598     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2599     {
2600         auto   range = static_cast<DDAtomRanges::Type>(i);
2601         double av    = comm->sum_nat[i] / comm->ndecomp;
2602         switch (range)
2603         {
2604             case DDAtomRanges::Type::Zones:
2605                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2606                 break;
2607             case DDAtomRanges::Type::Vsites:
2608                 if (cr->dd->vsite_comm)
2609                 {
2610                     fprintf(fplog, " av. #atoms communicated per step for vsites: %d x %.1f\n",
2611                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2, av);
2612                 }
2613                 break;
2614             case DDAtomRanges::Type::Constraints:
2615                 if (cr->dd->constraint_comm)
2616                 {
2617                     fprintf(fplog, " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2618                             1 + ir->nLincsIter, av);
2619                 }
2620                 break;
2621             default: gmx_incons(" Unknown type for DD statistics");
2622         }
2623     }
2624     fprintf(fplog, "\n");
2625
2626     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2627     {
2628         print_dd_load_av(fplog, cr->dd);
2629     }
2630 }
2631
2632 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2633 void dd_partition_system(FILE*                     fplog,
2634                          const gmx::MDLogger&      mdlog,
2635                          int64_t                   step,
2636                          const t_commrec*          cr,
2637                          gmx_bool                  bMasterState,
2638                          int                       nstglobalcomm,
2639                          t_state*                  state_global,
2640                          const gmx_mtop_t&         top_global,
2641                          const t_inputrec*         ir,
2642                          gmx::ImdSession*          imdSession,
2643                          pull_t*                   pull_work,
2644                          t_state*                  state_local,
2645                          gmx::ForceBuffers*        f,
2646                          gmx::MDAtoms*             mdAtoms,
2647                          gmx_localtop_t*           top_local,
2648                          t_forcerec*               fr,
2649                          gmx::VirtualSitesHandler* vsite,
2650                          gmx::Constraints*         constr,
2651                          t_nrnb*                   nrnb,
2652                          gmx_wallcycle*            wcycle,
2653                          gmx_bool                  bVerbose)
2654 {
2655     gmx_domdec_t*      dd;
2656     gmx_domdec_comm_t* comm;
2657     gmx_ddbox_t        ddbox = { 0 };
2658     int64_t            step_pcoupl;
2659     rvec               cell_ns_x0, cell_ns_x1;
2660     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2661     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2662     gmx_bool           bRedist;
2663     ivec               np;
2664     real               grid_density;
2665     char               sbuf[22];
2666
2667     wallcycle_start(wcycle, ewcDOMDEC);
2668
2669     dd   = cr->dd;
2670     comm = dd->comm;
2671
2672     // TODO if the update code becomes accessible here, use
2673     // upd->deform for this logic.
2674     bBoxChanged = (bMasterState || inputrecDeform(ir));
2675     if (ir->epc != epcNO)
2676     {
2677         /* With nstpcouple > 1 pressure coupling happens.
2678          * one step after calculating the pressure.
2679          * Box scaling happens at the end of the MD step,
2680          * after the DD partitioning.
2681          * We therefore have to do DLB in the first partitioning
2682          * after an MD step where P-coupling occurred.
2683          * We need to determine the last step in which p-coupling occurred.
2684          * MRS -- need to validate this for vv?
2685          */
2686         int n = ir->nstpcouple;
2687         if (n == 1)
2688         {
2689             step_pcoupl = step - 1;
2690         }
2691         else
2692         {
2693             step_pcoupl = ((step - 1) / n) * n + 1;
2694         }
2695         if (step_pcoupl >= comm->partition_step)
2696         {
2697             bBoxChanged = TRUE;
2698         }
2699     }
2700
2701     bNStGlobalComm = (step % nstglobalcomm == 0);
2702
2703     if (!isDlbOn(comm))
2704     {
2705         bDoDLB = FALSE;
2706     }
2707     else
2708     {
2709         /* Should we do dynamic load balacing this step?
2710          * Since it requires (possibly expensive) global communication,
2711          * we might want to do DLB less frequently.
2712          */
2713         if (bBoxChanged || ir->epc != epcNO)
2714         {
2715             bDoDLB = bBoxChanged;
2716         }
2717         else
2718         {
2719             bDoDLB = bNStGlobalComm;
2720         }
2721     }
2722
2723     /* Check if we have recorded loads on the nodes */
2724     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2725     {
2726         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2727
2728         /* Print load every nstlog, first and last step to the log file */
2729         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || comm->n_load_collect == 0
2730                     || (ir->nsteps >= 0 && (step + ir->nstlist > ir->init_step + ir->nsteps)));
2731
2732         /* Avoid extra communication due to verbose screen output
2733          * when nstglobalcomm is set.
2734          */
2735         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2736             || (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2737         {
2738             get_load_distribution(dd, wcycle);
2739             if (DDMASTER(dd))
2740             {
2741                 if (bLogLoad)
2742                 {
2743                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2744                 }
2745                 if (bVerbose)
2746                 {
2747                     dd_print_load_verbose(dd);
2748                 }
2749             }
2750             comm->n_load_collect++;
2751
2752             if (isDlbOn(comm))
2753             {
2754                 if (DDMASTER(dd))
2755                 {
2756                     /* Add the measured cycles to the running average */
2757                     const float averageFactor = 0.1F;
2758                     comm->cyclesPerStepDlbExpAverage =
2759                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2760                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2761                 }
2762                 if (comm->dlbState == DlbState::onCanTurnOff
2763                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2764                 {
2765                     gmx_bool turnOffDlb;
2766                     if (DDMASTER(dd))
2767                     {
2768                         /* If the running averaged cycles with DLB are more
2769                          * than before we turned on DLB, turn off DLB.
2770                          * We will again run and check the cycles without DLB
2771                          * and we can then decide if to turn off DLB forever.
2772                          */
2773                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2774                     }
2775                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2776                     if (turnOffDlb)
2777                     {
2778                         /* To turn off DLB, we need to redistribute the atoms */
2779                         dd_collect_state(dd, state_local, state_global);
2780                         bMasterState = TRUE;
2781                         turn_off_dlb(mdlog, dd, step);
2782                     }
2783                 }
2784             }
2785             else if (bCheckWhetherToTurnDlbOn)
2786             {
2787                 gmx_bool turnOffDlbForever = FALSE;
2788                 gmx_bool turnOnDlb         = FALSE;
2789
2790                 /* Since the timings are node dependent, the master decides */
2791                 if (DDMASTER(dd))
2792                 {
2793                     /* If we recently turned off DLB, we want to check if
2794                      * performance is better without DLB. We want to do this
2795                      * ASAP to minimize the chance that external factors
2796                      * slowed down the DLB step are gone here and we
2797                      * incorrectly conclude that DLB was causing the slowdown.
2798                      * So we measure one nstlist block, no running average.
2799                      */
2800                     if (comm->haveTurnedOffDlb
2801                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2802                     {
2803                         /* After turning off DLB we ran nstlist steps in fewer
2804                          * cycles than with DLB. This likely means that DLB
2805                          * in not benefical, but this could be due to a one
2806                          * time unlucky fluctuation, so we require two such
2807                          * observations in close succession to turn off DLB
2808                          * forever.
2809                          */
2810                         if (comm->dlbSlowerPartitioningCount > 0
2811                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2812                         {
2813                             turnOffDlbForever = TRUE;
2814                         }
2815                         comm->haveTurnedOffDlb = false;
2816                         /* Register when we last measured DLB slowdown */
2817                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2818                     }
2819                     else
2820                     {
2821                         /* Here we check if the max PME rank load is more than 0.98
2822                          * the max PP force load. If so, PP DLB will not help,
2823                          * since we are (almost) limited by PME. Furthermore,
2824                          * DLB will cause a significant extra x/f redistribution
2825                          * cost on the PME ranks, which will then surely result
2826                          * in lower total performance.
2827                          */
2828                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2829                         {
2830                             turnOnDlb = FALSE;
2831                         }
2832                         else
2833                         {
2834                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2835                         }
2836                     }
2837                 }
2838                 struct
2839                 {
2840                     gmx_bool turnOffDlbForever;
2841                     gmx_bool turnOnDlb;
2842                 } bools{ turnOffDlbForever, turnOnDlb };
2843                 dd_bcast(dd, sizeof(bools), &bools);
2844                 if (bools.turnOffDlbForever)
2845                 {
2846                     turn_off_dlb_forever(mdlog, dd, step);
2847                 }
2848                 else if (bools.turnOnDlb)
2849                 {
2850                     turn_on_dlb(mdlog, dd, step);
2851                     bDoDLB = TRUE;
2852                 }
2853             }
2854         }
2855         comm->n_load_have++;
2856     }
2857
2858     bRedist = FALSE;
2859     if (bMasterState)
2860     {
2861         /* Clear the old state */
2862         clearDDStateIndices(dd, false);
2863         ncgindex_set = 0;
2864
2865         auto xGlobal = positionsFromStatePointer(state_global);
2866
2867         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2868
2869         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2870
2871         /* Ensure that we have space for the new distribution */
2872         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2873
2874         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2875
2876         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2877     }
2878     else if (state_local->ddp_count != dd->ddp_count)
2879     {
2880         if (state_local->ddp_count > dd->ddp_count)
2881         {
2882             gmx_fatal(FARGS,
2883                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2884                       ")",
2885                       state_local->ddp_count, dd->ddp_count);
2886         }
2887
2888         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2889         {
2890             gmx_fatal(FARGS,
2891                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2892                       "state_local->ddp_count (%d)",
2893                       state_local->ddp_count_cg_gl, state_local->ddp_count);
2894         }
2895
2896         /* Clear the old state */
2897         clearDDStateIndices(dd, false);
2898
2899         /* Restore the atom group indices from state_local */
2900         restoreAtomGroups(dd, state_local);
2901         make_dd_indices(dd, 0);
2902         ncgindex_set = dd->ncg_home;
2903
2904         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2905
2906         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2907
2908         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
2909
2910         bRedist = isDlbOn(comm);
2911     }
2912     else
2913     {
2914         /* We have the full state, only redistribute the cgs */
2915
2916         /* Clear the non-home indices */
2917         clearDDStateIndices(dd, true);
2918         ncgindex_set = 0;
2919
2920         /* To avoid global communication, we do not recompute the extent
2921          * of the system for dims without pbc. Therefore we need to copy
2922          * the previously computed values when we do not communicate.
2923          */
2924         if (!bNStGlobalComm)
2925         {
2926             copy_rvec(comm->box0, ddbox.box0);
2927             copy_rvec(comm->box_size, ddbox.box_size);
2928         }
2929         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
2930
2931         bBoxChanged = TRUE;
2932         bRedist     = TRUE;
2933     }
2934     /* Copy needed for dim's without pbc when avoiding communication */
2935     copy_rvec(ddbox.box0, comm->box0);
2936     copy_rvec(ddbox.box_size, comm->box_size);
2937
2938     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
2939
2940     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
2941     {
2942         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
2943     }
2944
2945     if (comm->systemInfo.useUpdateGroups)
2946     {
2947         comm->updateGroupsCog->addCogs(
2948                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
2949     }
2950
2951     /* Check if we should sort the charge groups */
2952     const bool bSortCG = (bMasterState || bRedist);
2953
2954     /* When repartitioning we mark atom groups that will move to neighboring
2955      * DD cells, but we do not move them right away for performance reasons.
2956      * Thus we need to keep track of how many charge groups will move for
2957      * obtaining correct local charge group / atom counts.
2958      */
2959     ncg_moved = 0;
2960     if (bRedist)
2961     {
2962         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
2963
2964         ncgindex_set = dd->ncg_home;
2965         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
2966
2967         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
2968
2969         if (comm->systemInfo.useUpdateGroups)
2970         {
2971             comm->updateGroupsCog->addCogs(
2972                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
2973                     state_local->x);
2974         }
2975
2976         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
2977     }
2978
2979     // TODO: Integrate this code in the nbnxm module
2980     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, dd, &ddbox, &comm->cell_x0,
2981                           &comm->cell_x1, dd->ncg_home, as_rvec_array(state_local->x.data()),
2982                           cell_ns_x0, cell_ns_x1, &grid_density);
2983
2984     if (bBoxChanged)
2985     {
2986         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
2987     }
2988
2989     if (bSortCG)
2990     {
2991         wallcycle_sub_start(wcycle, ewcsDD_GRID);
2992
2993         /* Sort the state on charge group position.
2994          * This enables exact restarts from this step.
2995          * It also improves performance by about 15% with larger numbers
2996          * of atoms per node.
2997          */
2998
2999         /* Fill the ns grid with the home cell,
3000          * so we can sort with the indices.
3001          */
3002         set_zones_ncg_home(dd);
3003
3004         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3005
3006         nbnxn_put_on_grid(fr->nbv.get(), state_local->box, 0, comm->zones.size[0].bb_x0,
3007                           comm->zones.size[0].bb_x1, comm->updateGroupsCog.get(),
3008                           { 0, dd->ncg_home }, comm->zones.dens_zone0, fr->cginfo, state_local->x,
3009                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3010
3011         if (debug)
3012         {
3013             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf),
3014                     dd->ncg_home);
3015         }
3016         dd_sort_state(dd, fr, state_local);
3017
3018         /* After sorting and compacting we set the correct size */
3019         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3020
3021         /* Rebuild all the indices */
3022         dd->ga2la->clear();
3023         ncgindex_set = 0;
3024
3025         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3026     }
3027     else
3028     {
3029         /* With the group scheme the sorting array is part of the DD state,
3030          * but it just got out of sync, so mark as invalid by emptying it.
3031          */
3032         if (ir->cutoff_scheme == ecutsGROUP)
3033         {
3034             comm->sort->sorted.clear();
3035         }
3036     }
3037
3038     if (comm->systemInfo.useUpdateGroups)
3039     {
3040         /* The update groups cog's are invalid after sorting
3041          * and need to be cleared before the next partitioning anyhow.
3042          */
3043         comm->updateGroupsCog->clear();
3044     }
3045
3046     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3047
3048     /* Set the induces for the home atoms */
3049     set_zones_ncg_home(dd);
3050     make_dd_indices(dd, ncgindex_set);
3051
3052     /* Setup up the communication and communicate the coordinates */
3053     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3054
3055     /* Set the indices for the halo atoms */
3056     make_dd_indices(dd, dd->ncg_home);
3057
3058     /* Set the charge group boundaries for neighbor searching */
3059     set_cg_boundaries(&comm->zones);
3060
3061     /* When bSortCG=true, we have already set the size for zone 0 */
3062     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3063
3064     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3065
3066     /*
3067        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3068                  -1,state_local->x.rvec_array(),state_local->box);
3069      */
3070
3071     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3072
3073     /* Extract a local topology from the global topology */
3074     for (int i = 0; i < dd->ndim; i++)
3075     {
3076         np[dd->dim[i]] = comm->cd[i].numPulses();
3077     }
3078     dd_make_local_top(dd, &comm->zones, dd->unitCellInfo.npbcdim, state_local->box,
3079                       comm->cellsize_min, np, fr, state_local->x.rvec_array(), top_global, top_local);
3080
3081     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3082
3083     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3084
3085     /* Set up the special atom communication */
3086     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3087     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3088          i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3089     {
3090         auto range = static_cast<DDAtomRanges::Type>(i);
3091         switch (range)
3092         {
3093             case DDAtomRanges::Type::Vsites:
3094                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3095                 {
3096                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3097                 }
3098                 break;
3099             case DDAtomRanges::Type::Constraints:
3100                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3101                 {
3102                     /* Only for inter-cg constraints we need special code */
3103                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(), constr,
3104                                                   ir->nProjOrder, top_local->idef.il);
3105                 }
3106                 break;
3107             default: gmx_incons("Unknown special atom type setup");
3108         }
3109         comm->atomRanges.setEnd(range, n);
3110     }
3111
3112     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3113
3114     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3115
3116     /* Make space for the extra coordinates for virtual site
3117      * or constraint communication.
3118      */
3119     state_local->natoms = comm->atomRanges.numAtomsTotal();
3120
3121     state_change_natoms(state_local, state_local->natoms);
3122
3123     if (vsite && vsite->numInterUpdategroupVirtualSites())
3124     {
3125         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3126     }
3127     else
3128     {
3129         if (EEL_FULL(ir->coulombtype) && dd->haveExclusions)
3130         {
3131             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3132         }
3133         else
3134         {
3135             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3136         }
3137     }
3138
3139     /* Set the number of atoms required for the force calculation.
3140      * Forces need to be constrained when doing energy
3141      * minimization. For simple simulations we could avoid some
3142      * allocation, zeroing and copying, but this is probably not worth
3143      * the complications and checking.
3144      */
3145     forcerec_set_ranges(fr, comm->atomRanges.end(DDAtomRanges::Type::Zones),
3146                         comm->atomRanges.end(DDAtomRanges::Type::Constraints), nat_f_novirsum);
3147
3148     /* Update atom data for mdatoms and several algorithms */
3149     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3150
3151     auto mdatoms = mdAtoms->mdatoms();
3152     if (!thisRankHasDuty(cr, DUTY_PME))
3153     {
3154         /* Send the charges and/or c6/sigmas to our PME only node */
3155         gmx_pme_send_parameters(cr, fr->ic, mdatoms->nChargePerturbed != 0,
3156                                 mdatoms->nTypePerturbed != 0, mdatoms->chargeA, mdatoms->chargeB,
3157                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B, mdatoms->sigmaA,
3158                                 mdatoms->sigmaB, dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3159     }
3160
3161     if (dd->atomSets != nullptr)
3162     {
3163         /* Update the local atom sets */
3164         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3165     }
3166
3167     // The pull group construction can need the atom sets updated above
3168     if (ir->bPull)
3169     {
3170         /* Update the local pull groups */
3171         dd_make_local_pull_groups(cr, pull_work);
3172     }
3173
3174     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3175     imdSession->dd_make_local_IMD_atoms(dd);
3176
3177     add_dd_statistics(dd);
3178
3179     /* Make sure we only count the cycles for this DD partitioning */
3180     clear_dd_cycle_counts(dd);
3181
3182     /* Because the order of the atoms might have changed since
3183      * the last vsite construction, we need to communicate the constructing
3184      * atom coordinates again (for spreading the forces this MD step).
3185      */
3186     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3187
3188     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3189
3190     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3191     {
3192         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3193         write_dd_pdb("dd_dump", step, "dump", &top_global, cr, -1, state_local->x.rvec_array(),
3194                      state_local->box);
3195     }
3196
3197     /* Store the partitioning step */
3198     comm->partition_step = step;
3199
3200     /* Increase the DD partitioning counter */
3201     dd->ddp_count++;
3202     /* The state currently matches this DD partitioning count, store it */
3203     state_local->ddp_count = dd->ddp_count;
3204     if (bMasterState)
3205     {
3206         /* The DD master node knows the complete cg distribution,
3207          * store the count so we can possibly skip the cg info communication.
3208          */
3209         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3210     }
3211
3212     if (comm->ddSettings.DD_debug > 0)
3213     {
3214         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3215         check_index_consistency(dd, top_global.natoms, "after partitioning");
3216     }
3217
3218     wallcycle_stop(wcycle, ewcDOMDEC);
3219 }
3220
3221 /*! \brief Check whether bonded interactions are missing, if appropriate */
3222 void checkNumberOfBondedInteractions(const gmx::MDLogger&           mdlog,
3223                                      t_commrec*                     cr,
3224                                      int                            totalNumberOfBondedInteractions,
3225                                      const gmx_mtop_t*              top_global,
3226                                      const gmx_localtop_t*          top_local,
3227                                      gmx::ArrayRef<const gmx::RVec> x,
3228                                      const matrix                   box,
3229                                      bool* shouldCheckNumberOfBondedInteractions)
3230 {
3231     if (*shouldCheckNumberOfBondedInteractions)
3232     {
3233         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3234         {
3235             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
3236                                           top_local, x, box); // Does not return
3237         }
3238         *shouldCheckNumberOfBondedInteractions = false;
3239     }
3240 }