7bc033884e7914163806f288e7778698f76837cb
[alexxy/gromacs.git] / src / gromacs / domdec / partition.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief This file defines functions for mdrun to call to make a new
38  * domain decomposition, and check it.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \ingroup module_domdec
42  */
43
44 #include "gmxpre.h"
45
46 #include "partition.h"
47
48 #include "config.h"
49
50 #include <cassert>
51 #include <cstdio>
52
53 #include <algorithm>
54
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme_pp.h"
64 #include "gromacs/gmxlib/network.h"
65 #include "gromacs/gmxlib/nrnb.h"
66 #include "gromacs/imd/imd.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/mdlib/forcerec.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/nsgrid.h"
73 #include "gromacs/mdlib/vsite.h"
74 #include "gromacs/mdtypes/commrec.h"
75 #include "gromacs/mdtypes/forcerec.h"
76 #include "gromacs/mdtypes/inputrec.h"
77 #include "gromacs/mdtypes/md_enums.h"
78 #include "gromacs/mdtypes/mdatom.h"
79 #include "gromacs/mdtypes/nblist.h"
80 #include "gromacs/mdtypes/state.h"
81 #include "gromacs/nbnxm/nbnxm.h"
82 #include "gromacs/pulling/pull.h"
83 #include "gromacs/timing/wallcycle.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/cstringutil.h"
87 #include "gromacs/utility/fatalerror.h"
88 #include "gromacs/utility/logger.h"
89 #include "gromacs/utility/real.h"
90 #include "gromacs/utility/smalloc.h"
91 #include "gromacs/utility/strconvert.h"
92 #include "gromacs/utility/stringstream.h"
93 #include "gromacs/utility/stringutil.h"
94 #include "gromacs/utility/textwriter.h"
95
96 #include "box.h"
97 #include "cellsizes.h"
98 #include "distribute.h"
99 #include "domdec_constraints.h"
100 #include "domdec_internal.h"
101 #include "domdec_vsite.h"
102 #include "dump.h"
103 #include "redistribute.h"
104 #include "utility.h"
105
106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
107  *
108  * There is a bit of overhead with DLB and it's difficult to achieve
109  * a load imbalance of less than 2% with DLB.
110  */
111 #define DD_PERF_LOSS_DLB_ON 0.02
112
113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
114 #define DD_PERF_LOSS_WARN 0.05
115
116
117 //! Debug helper printing a DD zone
118 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
119 {
120     fprintf(fp,
121             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
122             "%6.3f\n",
123             d,
124             i,
125             j,
126             zone->min0,
127             zone->max1,
128             zone->mch0,
129             zone->mch0,
130             zone->p1_0,
131             zone->p1_1);
132 }
133
134 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
135  * takes the extremes over all home and remote zones in the halo
136  * and returns the results in cell_ns_x0 and cell_ns_x1.
137  * Note: only used with the group cut-off scheme.
138  */
139 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
140 {
141     constexpr int      c_ddZoneCommMaxNumZones = 5;
142     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
143     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
144     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
145     gmx_domdec_comm_t* comm = dd->comm;
146
147     rvec extr_s[2];
148     rvec extr_r[2];
149     for (int d = 1; d < dd->ndim; d++)
150     {
151         int           dim = dd->dim[d];
152         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
153
154         /* Copy the base sizes of the home zone */
155         zp.min0    = cell_ns_x0[dim];
156         zp.max1    = cell_ns_x1[dim];
157         zp.min1    = cell_ns_x1[dim];
158         zp.mch0    = cell_ns_x0[dim];
159         zp.mch1    = cell_ns_x1[dim];
160         zp.p1_0    = cell_ns_x0[dim];
161         zp.p1_1    = cell_ns_x1[dim];
162         zp.dataSet = 1;
163     }
164
165     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
166
167     /* Loop backward over the dimensions and aggregate the extremes
168      * of the cell sizes.
169      */
170     for (int d = dd->ndim - 2; d >= 0; d--)
171     {
172         const int  dim      = dd->dim[d];
173         const bool applyPbc = (dim < ddbox->npbcdim);
174
175         /* Use an rvec to store two reals */
176         extr_s[d][0] = cellsizes[d + 1].fracLower;
177         extr_s[d][1] = cellsizes[d + 1].fracUpper;
178         extr_s[d][2] = cellsizes[d + 1].fracUpper;
179
180         int pos = 0;
181         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
182         /* Store the extremes in the backward sending buffer,
183          * so they get updated separately from the forward communication.
184          */
185         for (int d1 = d; d1 < dd->ndim - 1; d1++)
186         {
187             gmx_ddzone_t& buf = buf_s[pos];
188
189             /* We invert the order to be able to use the same loop for buf_e */
190             buf.min0 = extr_s[d1][1];
191             buf.max1 = extr_s[d1][0];
192             buf.min1 = extr_s[d1][2];
193             buf.mch0 = 0;
194             buf.mch1 = 0;
195             /* Store the cell corner of the dimension we communicate along */
196             buf.p1_0    = comm->cell_x0[dim];
197             buf.p1_1    = 0;
198             buf.dataSet = 1;
199             pos++;
200         }
201
202         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
203         pos++;
204
205         if (dd->ndim == 3 && d == 0)
206         {
207             buf_s[pos] = comm->zone_d2[0][1];
208             pos++;
209             buf_s[pos] = comm->zone_d1[0];
210             pos++;
211         }
212
213         /* We only need to communicate the extremes
214          * in the forward direction
215          */
216         int numPulses = comm->cd[d].numPulses();
217         int numPulsesMin;
218         if (applyPbc)
219         {
220             /* Take the minimum to avoid double communication */
221             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
222         }
223         else
224         {
225             /* Without PBC we should really not communicate over
226              * the boundaries, but implementing that complicates
227              * the communication setup and therefore we simply
228              * do all communication, but ignore some data.
229              */
230             numPulsesMin = numPulses;
231         }
232         for (int pulse = 0; pulse < numPulsesMin; pulse++)
233         {
234             /* Communicate the extremes forward */
235             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
236
237             int numElements = dd->ndim - d - 1;
238             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
239
240             if (receiveValidData)
241             {
242                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
243                 {
244                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
245                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
246                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
247                 }
248             }
249         }
250
251         const int numElementsInBuffer = pos;
252         for (int pulse = 0; pulse < numPulses; pulse++)
253         {
254             /* Communicate all the zone information backward */
255             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
256
257             static_assert(
258                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
259                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
260
261             int numReals = numElementsInBuffer * c_ddzoneNumReals;
262             ddSendrecv(dd,
263                        d,
264                        dddirBackward,
265                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
266                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
267
268             rvec dh = { 0 };
269             if (pulse > 0)
270             {
271                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
272                 {
273                     /* Determine the decrease of maximum required
274                      * communication height along d1 due to the distance along d,
275                      * this avoids a lot of useless atom communication.
276                      */
277                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
278
279                     real c;
280                     if (ddbox->tric_dir[dim])
281                     {
282                         /* c is the off-diagonal coupling between the cell planes
283                          * along directions d and d1.
284                          */
285                         c = ddbox->v[dim][dd->dim[d1]][dim];
286                     }
287                     else
288                     {
289                         c = 0;
290                     }
291                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
292                     if (det > 0)
293                     {
294                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
295                     }
296                     else
297                     {
298                         /* A negative value signals out of range */
299                         dh[d1] = -1;
300                     }
301                 }
302             }
303
304             /* Accumulate the extremes over all pulses */
305             for (int i = 0; i < numElementsInBuffer; i++)
306             {
307                 if (pulse == 0)
308                 {
309                     buf_e[i] = buf_r[i];
310                 }
311                 else
312                 {
313                     if (receiveValidData)
314                     {
315                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
316                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
317                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
318                     }
319
320                     int d1;
321                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
322                     {
323                         d1 = 1;
324                     }
325                     else
326                     {
327                         d1 = d + 1;
328                     }
329                     if (receiveValidData && dh[d1] >= 0)
330                     {
331                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
332                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
333                     }
334                 }
335                 /* Copy the received buffer to the send buffer,
336                  * to pass the data through with the next pulse.
337                  */
338                 buf_s[i] = buf_r[i];
339             }
340             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
341                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
342             {
343                 /* Store the extremes */
344                 int pos = 0;
345
346                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
347                 {
348                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
349                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
350                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
351                     pos++;
352                 }
353
354                 if (d == 1 || (d == 0 && dd->ndim == 3))
355                 {
356                     for (int i = d; i < 2; i++)
357                     {
358                         comm->zone_d2[1 - d][i] = buf_e[pos];
359                         pos++;
360                     }
361                 }
362                 if (d == 0)
363                 {
364                     comm->zone_d1[1] = buf_e[pos];
365                     pos++;
366                 }
367             }
368             else
369             {
370                 if (d == 1 || (d == 0 && dd->ndim == 3))
371                 {
372                     for (int i = d; i < 2; i++)
373                     {
374                         comm->zone_d2[1 - d][i].dataSet = 0;
375                     }
376                 }
377                 if (d == 0)
378                 {
379                     comm->zone_d1[1].dataSet = 0;
380                 }
381             }
382         }
383     }
384
385     if (dd->ndim >= 2)
386     {
387         int dim = dd->dim[1];
388         for (int i = 0; i < 2; i++)
389         {
390             if (comm->zone_d1[i].dataSet != 0)
391             {
392                 if (debug)
393                 {
394                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
395                 }
396                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
397                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
398             }
399         }
400     }
401     if (dd->ndim >= 3)
402     {
403         int dim = dd->dim[2];
404         for (int i = 0; i < 2; i++)
405         {
406             for (int j = 0; j < 2; j++)
407             {
408                 if (comm->zone_d2[i][j].dataSet != 0)
409                 {
410                     if (debug)
411                     {
412                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
413                     }
414                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
415                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
416                 }
417             }
418         }
419     }
420     for (int d = 1; d < dd->ndim; d++)
421     {
422         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
423         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
424         if (debug)
425         {
426             fprintf(debug,
427                     "Cell fraction d %d, max0 %f, min1 %f\n",
428                     d,
429                     cellsizes[d].fracLowerMax,
430                     cellsizes[d].fracUpperMin);
431         }
432     }
433 }
434
435 //! Sets the charge-group zones to be equal to the home zone.
436 static void set_zones_ncg_home(gmx_domdec_t* dd)
437 {
438     gmx_domdec_zones_t* zones;
439     int                 i;
440
441     zones = &dd->comm->zones;
442
443     zones->cg_range[0] = 0;
444     for (i = 1; i < zones->n + 1; i++)
445     {
446         zones->cg_range[i] = dd->ncg_home;
447     }
448     /* zone_ncg1[0] should always be equal to ncg_home */
449     dd->comm->zone_ncg1[0] = dd->ncg_home;
450 }
451
452 //! Restore atom groups for the charge groups.
453 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
454 {
455     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
456
457     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
458
459     globalAtomGroupIndices.resize(atomGroupsState.size());
460
461     /* Copy back the global charge group indices from state
462      * and rebuild the local charge group to atom index.
463      */
464     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
465     {
466         globalAtomGroupIndices[i] = atomGroupsState[i];
467     }
468
469     dd->ncg_home = atomGroupsState.size();
470     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
471
472     set_zones_ncg_home(dd);
473 }
474
475 //! Sets the cginfo structures.
476 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
477 {
478     if (fr != nullptr)
479     {
480         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
481         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
482
483         for (int cg = cg0; cg < cg1; cg++)
484         {
485             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
486         }
487     }
488 }
489
490 //! Makes the mappings between global and local atom indices during DD repartioning.
491 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
492 {
493     const int                numZones               = dd->comm->zones.n;
494     const int*               zone2cg                = dd->comm->zones.cg_range.data();
495     const int*               zone_ncg1              = dd->comm->zone_ncg1;
496     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
497
498     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
499     gmx_ga2la_t&      ga2la             = *dd->ga2la;
500
501     if (zone2cg[1] != dd->ncg_home)
502     {
503         gmx_incons("dd->ncg_zone is not up to date");
504     }
505
506     /* Make the local to global and global to local atom index */
507     int a = atomStart;
508     globalAtomIndices.resize(a);
509     for (int zone = 0; zone < numZones; zone++)
510     {
511         int cg0;
512         if (zone == 0)
513         {
514             cg0 = atomStart;
515         }
516         else
517         {
518             cg0 = zone2cg[zone];
519         }
520         int cg1    = zone2cg[zone + 1];
521         int cg1_p1 = cg0 + zone_ncg1[zone];
522
523         for (int cg = cg0; cg < cg1; cg++)
524         {
525             int zone1 = zone;
526             if (cg >= cg1_p1)
527             {
528                 /* Signal that this cg is from more than one pulse away */
529                 zone1 += numZones;
530             }
531             int cg_gl = globalAtomGroupIndices[cg];
532             globalAtomIndices.push_back(cg_gl);
533             ga2la.insert(cg_gl, { a, zone1 });
534             a++;
535         }
536     }
537 }
538
539 //! Checks whether global and local atom indices are consistent.
540 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
541 {
542     int nerr = 0;
543
544     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
545
546     if (dd->comm->ddSettings.DD_debug > 1)
547     {
548         std::vector<int> have(natoms_sys);
549         for (int a = 0; a < numAtomsInZones; a++)
550         {
551             int globalAtomIndex = dd->globalAtomIndices[a];
552             if (have[globalAtomIndex] > 0)
553             {
554                 fprintf(stderr,
555                         "DD rank %d: global atom %d occurs twice: index %d and %d\n",
556                         dd->rank,
557                         globalAtomIndex + 1,
558                         have[globalAtomIndex],
559                         a + 1);
560             }
561             else
562             {
563                 have[globalAtomIndex] = a + 1;
564             }
565         }
566     }
567
568     std::vector<int> have(numAtomsInZones);
569
570     int ngl = 0;
571     for (int i = 0; i < natoms_sys; i++)
572     {
573         if (const auto entry = dd->ga2la->find(i))
574         {
575             const int a = entry->la;
576             if (a >= numAtomsInZones)
577             {
578                 fprintf(stderr,
579                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
580                         "nat_tot (%d)\n",
581                         dd->rank,
582                         i + 1,
583                         a + 1,
584                         numAtomsInZones);
585                 nerr++;
586             }
587             else
588             {
589                 have[a] = 1;
590                 if (dd->globalAtomIndices[a] != i)
591                 {
592                     fprintf(stderr,
593                             "DD rank %d: global atom %d marked as local atom %d, which has global "
594                             "atom index %d\n",
595                             dd->rank,
596                             i + 1,
597                             a + 1,
598                             dd->globalAtomIndices[a] + 1);
599                     nerr++;
600                 }
601             }
602             ngl++;
603         }
604     }
605     if (ngl != numAtomsInZones)
606     {
607         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
608     }
609     for (int a = 0; a < numAtomsInZones; a++)
610     {
611         if (have[a] == 0)
612         {
613             fprintf(stderr,
614                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
615                     dd->rank,
616                     where,
617                     a + 1,
618                     dd->globalAtomIndices[a] + 1);
619         }
620     }
621
622     if (nerr > 0)
623     {
624         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
625     }
626 }
627
628 //! Clear all DD global state indices
629 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
630 {
631     gmx_ga2la_t& ga2la = *dd->ga2la;
632
633     if (!keepLocalAtomIndices)
634     {
635         /* Clear the whole list without the overhead of searching */
636         ga2la.clear();
637     }
638     else
639     {
640         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
641         for (int i = 0; i < numAtomsInZones; i++)
642         {
643             ga2la.erase(dd->globalAtomIndices[i]);
644         }
645     }
646
647     dd_clear_local_vsite_indices(dd);
648
649     if (dd->constraints)
650     {
651         dd_clear_local_constraint_indices(dd);
652     }
653 }
654
655 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, gmx_bool bFatal)
656 {
657     gmx_domdec_comm_t* comm    = dd->comm;
658     bool               invalid = false;
659
660     for (int d = 1; d < dd->ndim; d++)
661     {
662         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
663         const int                 dim       = dd->dim[d];
664         const real                limit     = grid_jump_limit(comm, cutoff, d);
665         real                      bfac      = ddbox->box_size[dim];
666         if (ddbox->tric_dir[dim])
667         {
668             bfac *= ddbox->skew_fac[dim];
669         }
670         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
671             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
672         {
673             invalid = true;
674
675             if (bFatal)
676             {
677                 char buf[22];
678
679                 /* This error should never be triggered under normal
680                  * circumstances, but you never know ...
681                  */
682                 gmx_fatal(FARGS,
683                           "step %s: The domain decomposition grid has shifted too much in the "
684                           "%c-direction around cell %d %d %d. This should not have happened. "
685                           "Running with fewer ranks might avoid this issue.",
686                           gmx_step_str(step, buf),
687                           dim2char(dim),
688                           dd->ci[XX],
689                           dd->ci[YY],
690                           dd->ci[ZZ]);
691             }
692         }
693     }
694
695     return invalid;
696 }
697 //! Return the duration of force calculations on this rank.
698 static float dd_force_load(gmx_domdec_comm_t* comm)
699 {
700     float load;
701
702     if (comm->ddSettings.eFlop)
703     {
704         load = comm->flop;
705         if (comm->ddSettings.eFlop > 1)
706         {
707             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
708         }
709     }
710     else
711     {
712         load = comm->cycl[ddCyclF];
713         if (comm->cycl_n[ddCyclF] > 1)
714         {
715             /* Subtract the maximum of the last n cycle counts
716              * to get rid of possible high counts due to other sources,
717              * for instance system activity, that would otherwise
718              * affect the dynamic load balancing.
719              */
720             load -= comm->cycl_max[ddCyclF];
721         }
722
723 #if GMX_MPI
724         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
725         {
726             float gpu_wait, gpu_wait_sum;
727
728             gpu_wait = comm->cycl[ddCyclWaitGPU];
729             if (comm->cycl_n[ddCyclF] > 1)
730             {
731                 /* We should remove the WaitGPU time of the same MD step
732                  * as the one with the maximum F time, since the F time
733                  * and the wait time are not independent.
734                  * Furthermore, the step for the max F time should be chosen
735                  * the same on all ranks that share the same GPU.
736                  * But to keep the code simple, we remove the average instead.
737                  * The main reason for artificially long times at some steps
738                  * is spurious CPU activity or MPI time, so we don't expect
739                  * that changes in the GPU wait time matter a lot here.
740                  */
741                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
742             }
743             /* Sum the wait times over the ranks that share the same GPU */
744             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
745             /* Replace the wait time by the average over the ranks */
746             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
747         }
748 #endif
749     }
750
751     return load;
752 }
753
754 //! Runs cell size checks and communicates the boundaries.
755 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
756 {
757     gmx_domdec_comm_t* comm;
758     int                dim_ind, dim;
759
760     comm = dd->comm;
761
762     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
763     {
764         dim = dd->dim[dim_ind];
765
766         /* Without PBC we don't have restrictions on the outer cells */
767         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
768             && isDlbOn(comm)
769             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
770         {
771             char buf[22];
772             gmx_fatal(FARGS,
773                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
774                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
775                       "%d %d %d",
776                       gmx_step_str(step, buf),
777                       dim2char(dim),
778                       comm->cell_x1[dim] - comm->cell_x0[dim],
779                       ddbox->skew_fac[dim],
780                       dd->comm->cellsize_min[dim],
781                       dd->ci[XX],
782                       dd->ci[YY],
783                       dd->ci[ZZ]);
784         }
785     }
786
787     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
788     {
789         /* Communicate the boundaries and update cell_ns_x0/1 */
790         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
791         if (isDlbOn(dd->comm) && dd->ndim > 1)
792         {
793             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
794         }
795     }
796 }
797
798 //! Compute and communicate to determine the load distribution across PP ranks.
799 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
800 {
801     gmx_domdec_comm_t* comm;
802     domdec_load_t*     load;
803     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
804     gmx_bool           bSepPME;
805
806     if (debug)
807     {
808         fprintf(debug, "get_load_distribution start\n");
809     }
810
811     wallcycle_start(wcycle, ewcDDCOMMLOAD);
812
813     comm = dd->comm;
814
815     bSepPME = (dd->pme_nodeid >= 0);
816
817     if (dd->ndim == 0 && bSepPME)
818     {
819         /* Without decomposition, but with PME nodes, we need the load */
820         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
821         comm->load[0].pme = comm->cycl[ddCyclPME];
822     }
823
824     for (int d = dd->ndim - 1; d >= 0; d--)
825     {
826         const DDCellsizesWithDlb* cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
827         const int                 dim       = dd->dim[d];
828         /* Check if we participate in the communication in this dimension */
829         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
830         {
831             load = &comm->load[d];
832             if (isDlbOn(dd->comm))
833             {
834                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
835             }
836             int pos = 0;
837             if (d == dd->ndim - 1)
838             {
839                 sbuf[pos++] = dd_force_load(comm);
840                 sbuf[pos++] = sbuf[0];
841                 if (isDlbOn(dd->comm))
842                 {
843                     sbuf[pos++] = sbuf[0];
844                     sbuf[pos++] = cell_frac;
845                     if (d > 0)
846                     {
847                         sbuf[pos++] = cellsizes->fracLowerMax;
848                         sbuf[pos++] = cellsizes->fracUpperMin;
849                     }
850                 }
851                 if (bSepPME)
852                 {
853                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
854                     sbuf[pos++] = comm->cycl[ddCyclPME];
855                 }
856             }
857             else
858             {
859                 sbuf[pos++] = comm->load[d + 1].sum;
860                 sbuf[pos++] = comm->load[d + 1].max;
861                 if (isDlbOn(dd->comm))
862                 {
863                     sbuf[pos++] = comm->load[d + 1].sum_m;
864                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
865                     sbuf[pos++] = comm->load[d + 1].flags;
866                     if (d > 0)
867                     {
868                         sbuf[pos++] = cellsizes->fracLowerMax;
869                         sbuf[pos++] = cellsizes->fracUpperMin;
870                     }
871                 }
872                 if (bSepPME)
873                 {
874                     sbuf[pos++] = comm->load[d + 1].mdf;
875                     sbuf[pos++] = comm->load[d + 1].pme;
876                 }
877             }
878             load->nload = pos;
879             /* Communicate a row in DD direction d.
880              * The communicators are setup such that the root always has rank 0.
881              */
882 #if GMX_MPI
883             MPI_Gather(sbuf,
884                        load->nload * sizeof(float),
885                        MPI_BYTE,
886                        load->load,
887                        load->nload * sizeof(float),
888                        MPI_BYTE,
889                        0,
890                        comm->mpi_comm_load[d]);
891 #endif
892             if (dd->ci[dim] == dd->master_ci[dim])
893             {
894                 /* We are the master along this row, process this row */
895                 RowMaster* rowMaster = nullptr;
896
897                 if (isDlbOn(comm))
898                 {
899                     rowMaster = cellsizes->rowMaster.get();
900                 }
901                 load->sum      = 0;
902                 load->max      = 0;
903                 load->sum_m    = 0;
904                 load->cvol_min = 1;
905                 load->flags    = 0;
906                 load->mdf      = 0;
907                 load->pme      = 0;
908                 int pos        = 0;
909                 for (int i = 0; i < dd->numCells[dim]; i++)
910                 {
911                     load->sum += load->load[pos++];
912                     load->max = std::max(load->max, load->load[pos]);
913                     pos++;
914                     if (isDlbOn(dd->comm))
915                     {
916                         if (rowMaster->dlbIsLimited)
917                         {
918                             /* This direction could not be load balanced properly,
919                              * therefore we need to use the maximum iso the average load.
920                              */
921                             load->sum_m = std::max(load->sum_m, load->load[pos]);
922                         }
923                         else
924                         {
925                             load->sum_m += load->load[pos];
926                         }
927                         pos++;
928                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
929                         pos++;
930                         if (d < dd->ndim - 1)
931                         {
932                             load->flags = gmx::roundToInt(load->load[pos++]);
933                         }
934                         if (d > 0)
935                         {
936                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
937                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
938                         }
939                     }
940                     if (bSepPME)
941                     {
942                         load->mdf = std::max(load->mdf, load->load[pos]);
943                         pos++;
944                         load->pme = std::max(load->pme, load->load[pos]);
945                         pos++;
946                     }
947                 }
948                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
949                 {
950                     load->sum_m *= dd->numCells[dim];
951                     load->flags |= (1 << d);
952                 }
953             }
954         }
955     }
956
957     if (DDMASTER(dd))
958     {
959         comm->nload += dd_load_count(comm);
960         comm->load_step += comm->cycl[ddCyclStep];
961         comm->load_sum += comm->load[0].sum;
962         comm->load_max += comm->load[0].max;
963         if (isDlbOn(comm))
964         {
965             for (int d = 0; d < dd->ndim; d++)
966             {
967                 if (comm->load[0].flags & (1 << d))
968                 {
969                     comm->load_lim[d]++;
970                 }
971             }
972         }
973         if (bSepPME)
974         {
975             comm->load_mdf += comm->load[0].mdf;
976             comm->load_pme += comm->load[0].pme;
977         }
978     }
979
980     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
981
982     if (debug)
983     {
984         fprintf(debug, "get_load_distribution finished\n");
985     }
986 }
987
988 /*! \brief Return the relative performance loss on the total run time
989  * due to the force calculation load imbalance. */
990 static float dd_force_load_fraction(gmx_domdec_t* dd)
991 {
992     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
993     {
994         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
995     }
996     else
997     {
998         return 0;
999     }
1000 }
1001
1002 /*! \brief Return the relative performance loss on the total run time
1003  * due to the force calculation load imbalance. */
1004 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
1005 {
1006     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1007     {
1008         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
1009     }
1010     else
1011     {
1012         return 0;
1013     }
1014 }
1015
1016 //! Print load-balance report e.g. at the end of a run.
1017 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
1018 {
1019     gmx_domdec_comm_t* comm = dd->comm;
1020
1021     /* Only the master rank prints loads and only if we measured loads */
1022     if (!DDMASTER(dd) || comm->nload == 0)
1023     {
1024         return;
1025     }
1026
1027     char buf[STRLEN];
1028     int  numPpRanks  = dd->nnodes;
1029     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
1030     int  numRanks    = numPpRanks + numPmeRanks;
1031     float lossFraction = 0;
1032
1033     /* Print the average load imbalance and performance loss */
1034     if (dd->nnodes > 1 && comm->load_sum > 0)
1035     {
1036         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
1037         lossFraction    = dd_force_imb_perf_loss(dd);
1038
1039         std::string msg = "\nDynamic load balancing report:\n";
1040         std::string dlbStateStr;
1041
1042         switch (dd->comm->dlbState)
1043         {
1044             case DlbState::offUser:
1045                 dlbStateStr = "DLB was off during the run per user request.";
1046                 break;
1047             case DlbState::offForever:
1048                 /* Currectly this can happen due to performance loss observed, cell size
1049                  * limitations or incompatibility with other settings observed during
1050                  * determineInitialDlbState(). */
1051                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1052                 break;
1053             case DlbState::offCanTurnOn:
1054                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1055                 break;
1056             case DlbState::offTemporarilyLocked:
1057                 dlbStateStr =
1058                         "DLB was locked at the end of the run due to unfinished PP-PME "
1059                         "balancing.";
1060                 break;
1061             case DlbState::onCanTurnOff:
1062                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1063                 break;
1064             case DlbState::onUser:
1065                 dlbStateStr = "DLB was permanently on during the run per user request.";
1066                 break;
1067             default: GMX_ASSERT(false, "Undocumented DLB state");
1068         }
1069
1070         msg += " " + dlbStateStr + "\n";
1071         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1072         msg += gmx::formatString(
1073                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1074                 "this.\n",
1075                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1076         msg += gmx::formatString(
1077                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1078                 lossFraction * 100);
1079         fprintf(fplog, "%s", msg.c_str());
1080         fprintf(stderr, "\n%s", msg.c_str());
1081     }
1082
1083     /* Print during what percentage of steps the  load balancing was limited */
1084     bool dlbWasLimited = false;
1085     if (isDlbOn(comm))
1086     {
1087         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1088         for (int d = 0; d < dd->ndim; d++)
1089         {
1090             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1091             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1092             if (limitPercentage >= 50)
1093             {
1094                 dlbWasLimited = true;
1095             }
1096         }
1097         sprintf(buf + strlen(buf), "\n");
1098         fprintf(fplog, "%s", buf);
1099         fprintf(stderr, "%s", buf);
1100     }
1101
1102     /* Print the performance loss due to separate PME - PP rank imbalance */
1103     float lossFractionPme = 0;
1104     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1105     {
1106         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1107         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1108         if (lossFractionPme <= 0)
1109         {
1110             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1111         }
1112         else
1113         {
1114             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1115         }
1116         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1117         fprintf(fplog, "%s", buf);
1118         fprintf(stderr, "%s", buf);
1119         sprintf(buf,
1120                 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1121                 std::fabs(lossFractionPme) * 100);
1122         fprintf(fplog, "%s", buf);
1123         fprintf(stderr, "%s", buf);
1124     }
1125     fprintf(fplog, "\n");
1126     fprintf(stderr, "\n");
1127
1128     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1129     {
1130         std::string message = gmx::formatString(
1131                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1132                 "      in the domain decomposition.\n",
1133                 lossFraction * 100);
1134
1135         bool hadSuggestion = false;
1136         if (dd->comm->dlbState == DlbState::offUser)
1137         {
1138             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1139             hadSuggestion = true;
1140         }
1141         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1142         {
1143             message +=
1144                     "      Dynamic load balancing was automatically disabled, but it might be "
1145                     "beneficial to manually tuning it on (option -dlb on.)\n";
1146             hadSuggestion = true;
1147         }
1148         else if (dlbWasLimited)
1149         {
1150             message +=
1151                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1152                     "and/or -dds).\n";
1153             hadSuggestion = true;
1154         }
1155         message += gmx::formatString(
1156                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1157                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1158                 "      considerable inhomogeneity in the simulated system.",
1159                 hadSuggestion ? "also " : "");
1160
1161         fprintf(fplog, "%s\n", message.c_str());
1162         fprintf(stderr, "%s\n", message.c_str());
1163     }
1164     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1165     {
1166         sprintf(buf,
1167                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1168                 "      had %s work to do than the PP ranks.\n"
1169                 "      You might want to %s the number of PME ranks\n"
1170                 "      or %s the cut-off and the grid spacing.\n",
1171                 std::fabs(lossFractionPme * 100),
1172                 (lossFractionPme < 0) ? "less" : "more",
1173                 (lossFractionPme < 0) ? "decrease" : "increase",
1174                 (lossFractionPme < 0) ? "decrease" : "increase");
1175         fprintf(fplog, "%s\n", buf);
1176         fprintf(stderr, "%s\n", buf);
1177     }
1178 }
1179
1180 //! Return the minimum communication volume.
1181 static float dd_vol_min(gmx_domdec_t* dd)
1182 {
1183     return dd->comm->load[0].cvol_min * dd->nnodes;
1184 }
1185
1186 //! Return the DD load flags.
1187 static int dd_load_flags(gmx_domdec_t* dd)
1188 {
1189     return dd->comm->load[0].flags;
1190 }
1191
1192 //! Return the reported load imbalance in force calculations.
1193 static float dd_f_imbal(gmx_domdec_t* dd)
1194 {
1195     if (dd->comm->load[0].sum > 0)
1196     {
1197         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1198     }
1199     else
1200     {
1201         /* Something is wrong in the cycle counting, report no load imbalance */
1202         return 0.0F;
1203     }
1204 }
1205
1206 //! Returns DD load balance report.
1207 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1208 {
1209     gmx::StringOutputStream stream;
1210     gmx::TextWriter         log(&stream);
1211
1212     int flags = dd_load_flags(dd);
1213     if (flags)
1214     {
1215         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1216         for (int d = 0; d < dd->ndim; d++)
1217         {
1218             if (flags & (1 << d))
1219             {
1220                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1221             }
1222         }
1223         log.ensureLineBreak();
1224     }
1225     log.writeString("DD  step " + gmx::toString(step));
1226     if (isDlbOn(dd->comm))
1227     {
1228         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1229     }
1230     if (dd->nnodes > 1)
1231     {
1232         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1233     }
1234     if (dd->comm->cycl_n[ddCyclPME])
1235     {
1236         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1237     }
1238     log.ensureLineBreak();
1239     return stream.toString();
1240 }
1241
1242 //! Prints DD load balance report in mdrun verbose mode.
1243 static void dd_print_load_verbose(gmx_domdec_t* dd)
1244 {
1245     if (isDlbOn(dd->comm))
1246     {
1247         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1248     }
1249     if (dd->nnodes > 1)
1250     {
1251         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1252     }
1253     if (dd->comm->cycl_n[ddCyclPME])
1254     {
1255         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1256     }
1257 }
1258
1259 //! Turns on dynamic load balancing if possible and needed.
1260 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1261 {
1262     gmx_domdec_comm_t* comm = dd->comm;
1263
1264     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1265     for (int d = 1; d < dd->ndim; d++)
1266     {
1267         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1268     }
1269
1270     /* Turn off DLB if we're too close to the cell size limit. */
1271     if (cellsize_min < comm->cellsize_limit * 1.05)
1272     {
1273         GMX_LOG(mdlog.info)
1274                 .appendTextFormatted(
1275                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1276                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1277                         "Will no longer try dynamic load balancing.",
1278                         gmx::toString(step).c_str(),
1279                         dd_force_imb_perf_loss(dd) * 100);
1280
1281         comm->dlbState = DlbState::offForever;
1282         return;
1283     }
1284
1285     GMX_LOG(mdlog.info)
1286             .appendTextFormatted(
1287                     "step %s Turning on dynamic load balancing, because the performance loss due "
1288                     "to load imbalance is %.1f %%.",
1289                     gmx::toString(step).c_str(),
1290                     dd_force_imb_perf_loss(dd) * 100);
1291     comm->dlbState = DlbState::onCanTurnOff;
1292
1293     /* Store the non-DLB performance, so we can check if DLB actually
1294      * improves performance.
1295      */
1296     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1297                        "When we turned on DLB, we should have measured cycles");
1298     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1299
1300     set_dlb_limits(dd);
1301
1302     /* We can set the required cell size info here,
1303      * so we do not need to communicate this.
1304      * The grid is completely uniform.
1305      */
1306     for (int d = 0; d < dd->ndim; d++)
1307     {
1308         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1309
1310         if (rowMaster)
1311         {
1312             comm->load[d].sum_m = comm->load[d].sum;
1313
1314             int nc = dd->numCells[dd->dim[d]];
1315             for (int i = 0; i < nc; i++)
1316             {
1317                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1318                 if (d > 0)
1319                 {
1320                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1321                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1322                 }
1323             }
1324             rowMaster->cellFrac[nc] = 1.0;
1325         }
1326     }
1327 }
1328
1329 //! Turns off dynamic load balancing (but leave it able to turn back on).
1330 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1331 {
1332     GMX_LOG(mdlog.info)
1333             .appendText(
1334                     "step " + gmx::toString(step)
1335                     + " Turning off dynamic load balancing, because it is degrading performance.");
1336     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1337     dd->comm->haveTurnedOffDlb             = true;
1338     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1339 }
1340
1341 //! Turns off dynamic load balancing permanently.
1342 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1343 {
1344     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1345                        "Can only turn off DLB forever when it was in the can-turn-on state");
1346     GMX_LOG(mdlog.info)
1347             .appendText(
1348                     "step " + gmx::toString(step)
1349                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1350     dd->comm->dlbState = DlbState::offForever;
1351 }
1352
1353 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1354 {
1355     gmx_domdec_comm_t* comm;
1356
1357     comm = cr->dd->comm;
1358
1359     /* Turn on the DLB limiting (might have been on already) */
1360     comm->bPMELoadBalDLBLimits = TRUE;
1361
1362     /* Change the cut-off limit */
1363     comm->PMELoadBal_max_cutoff = cutoff;
1364
1365     if (debug)
1366     {
1367         fprintf(debug,
1368                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1369                 "continue to fit\n",
1370                 comm->PMELoadBal_max_cutoff);
1371     }
1372 }
1373
1374 //! Merge atom buffers.
1375 static void merge_cg_buffers(int                            ncell,
1376                              gmx_domdec_comm_dim_t*         cd,
1377                              int                            pulse,
1378                              int*                           ncg_cell,
1379                              gmx::ArrayRef<int>             index_gl,
1380                              const int*                     recv_i,
1381                              gmx::ArrayRef<gmx::RVec>       x,
1382                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1383                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1384                              gmx::ArrayRef<int>             cginfo)
1385 {
1386     gmx_domdec_ind_t *ind, *ind_p;
1387     int               p, cell, c, cg, cg0, cg1, cg_gl;
1388     int               shift;
1389
1390     ind = &cd->ind[pulse];
1391
1392     /* First correct the already stored data */
1393     shift = ind->nrecv[ncell];
1394     for (cell = ncell - 1; cell >= 0; cell--)
1395     {
1396         shift -= ind->nrecv[cell];
1397         if (shift > 0)
1398         {
1399             /* Move the cg's present from previous grid pulses */
1400             cg0 = ncg_cell[ncell + cell];
1401             cg1 = ncg_cell[ncell + cell + 1];
1402             for (cg = cg1 - 1; cg >= cg0; cg--)
1403             {
1404                 index_gl[cg + shift] = index_gl[cg];
1405                 x[cg + shift]        = x[cg];
1406                 cginfo[cg + shift]   = cginfo[cg];
1407             }
1408             /* Correct the already stored send indices for the shift */
1409             for (p = 1; p <= pulse; p++)
1410             {
1411                 ind_p = &cd->ind[p];
1412                 cg0   = 0;
1413                 for (c = 0; c < cell; c++)
1414                 {
1415                     cg0 += ind_p->nsend[c];
1416                 }
1417                 cg1 = cg0 + ind_p->nsend[cell];
1418                 for (cg = cg0; cg < cg1; cg++)
1419                 {
1420                     ind_p->index[cg] += shift;
1421                 }
1422             }
1423         }
1424     }
1425
1426     /* Merge in the communicated buffers */
1427     shift = 0;
1428     cg0   = 0;
1429     for (cell = 0; cell < ncell; cell++)
1430     {
1431         cg1 = ncg_cell[ncell + cell + 1] + shift;
1432         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1433         {
1434             /* Copy this atom from the buffer */
1435             index_gl[cg1] = recv_i[cg0];
1436             x[cg1]        = recv_vr[cg0];
1437             /* Copy information */
1438             cg_gl       = index_gl[cg1];
1439             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1440             cg0++;
1441             cg1++;
1442         }
1443         shift += ind->nrecv[cell];
1444         ncg_cell[ncell + cell + 1] = cg1;
1445     }
1446 }
1447
1448 //! Makes a range partitioning for the atom groups wthin a cell
1449 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1450 {
1451     /* Store the atom block boundaries for easy copying of communication buffers
1452      */
1453     int g = atomGroupStart;
1454     for (int zone = 0; zone < nzone; zone++)
1455     {
1456         for (gmx_domdec_ind_t& ind : cd->ind)
1457         {
1458             ind.cell2at0[zone] = g;
1459             g += ind.nrecv[zone];
1460             ind.cell2at1[zone] = g;
1461         }
1462     }
1463 }
1464
1465 //! Returns whether a link is missing.
1466 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1467 {
1468     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1469     {
1470         if (!ga2la.findHome(link.a[i]))
1471         {
1472             return true;
1473         }
1474     }
1475
1476     return false;
1477 }
1478
1479 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1480 typedef struct
1481 {
1482     //! The corners for the non-bonded communication.
1483     real c[DIM][4];
1484     //! Corner for rounding.
1485     real cr0;
1486     //! Corners for rounding.
1487     real cr1[4];
1488     //! Corners for bounded communication.
1489     real bc[DIM];
1490     //! Corner for rounding for bonded communication.
1491     real bcr1;
1492 } dd_corners_t;
1493
1494 //! Determine the corners of the domain(s) we are communicating with.
1495 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1496 {
1497     const gmx_domdec_comm_t*  comm;
1498     const gmx_domdec_zones_t* zones;
1499
1500     comm = dd->comm;
1501
1502     zones = &comm->zones;
1503
1504     /* Keep the compiler happy */
1505     c->cr0  = 0;
1506     c->bcr1 = 0;
1507
1508     /* The first dimension is equal for all cells */
1509     c->c[0][0] = comm->cell_x0[dim0];
1510     if (bDistMB)
1511     {
1512         c->bc[0] = c->c[0][0];
1513     }
1514     if (dd->ndim >= 2)
1515     {
1516         dim1 = dd->dim[1];
1517         /* This cell row is only seen from the first row */
1518         c->c[1][0] = comm->cell_x0[dim1];
1519         /* All rows can see this row */
1520         c->c[1][1] = comm->cell_x0[dim1];
1521         if (isDlbOn(dd->comm))
1522         {
1523             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1524             if (bDistMB)
1525             {
1526                 /* For the multi-body distance we need the maximum */
1527                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1528             }
1529         }
1530         /* Set the upper-right corner for rounding */
1531         c->cr0 = comm->cell_x1[dim0];
1532
1533         if (dd->ndim >= 3)
1534         {
1535             dim2 = dd->dim[2];
1536             for (int j = 0; j < 4; j++)
1537             {
1538                 c->c[2][j] = comm->cell_x0[dim2];
1539             }
1540             if (isDlbOn(dd->comm))
1541             {
1542                 /* Use the maximum of the i-cells that see a j-cell */
1543                 for (const auto& iZone : zones->iZones)
1544                 {
1545                     const int iZoneIndex = iZone.iZoneIndex;
1546                     for (int jZone : iZone.jZoneRange)
1547                     {
1548                         if (jZone >= 4)
1549                         {
1550                             c->c[2][jZone - 4] = std::max(
1551                                     c->c[2][jZone - 4],
1552                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1553                                             .mch0);
1554                         }
1555                     }
1556                 }
1557                 if (bDistMB)
1558                 {
1559                     /* For the multi-body distance we need the maximum */
1560                     c->bc[2] = comm->cell_x0[dim2];
1561                     for (int i = 0; i < 2; i++)
1562                     {
1563                         for (int j = 0; j < 2; j++)
1564                         {
1565                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1566                         }
1567                     }
1568                 }
1569             }
1570
1571             /* Set the upper-right corner for rounding */
1572             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1573              * Only cell (0,0,0) can see cell 7 (1,1,1)
1574              */
1575             c->cr1[0] = comm->cell_x1[dim1];
1576             c->cr1[3] = comm->cell_x1[dim1];
1577             if (isDlbOn(dd->comm))
1578             {
1579                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1580                 if (bDistMB)
1581                 {
1582                     /* For the multi-body distance we need the maximum */
1583                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1584                 }
1585             }
1586         }
1587     }
1588 }
1589
1590 /*! \brief Add the atom groups we need to send in this pulse from this
1591  * zone to \p localAtomGroups and \p work. */
1592 static void get_zone_pulse_cgs(gmx_domdec_t*            dd,
1593                                int                      zonei,
1594                                int                      zone,
1595                                int                      cg0,
1596                                int                      cg1,
1597                                gmx::ArrayRef<const int> globalAtomGroupIndices,
1598                                int                      dim,
1599                                int                      dim_ind,
1600                                int                      dim0,
1601                                int                      dim1,
1602                                int                      dim2,
1603                                real                     r_comm2,
1604                                real                     r_bcomm2,
1605                                matrix                   box,
1606                                bool                     distanceIsTriclinic,
1607                                rvec*                    normal,
1608                                real                     skew_fac2_d,
1609                                real                     skew_fac_01,
1610                                rvec*                    v_d,
1611                                rvec*                    v_0,
1612                                rvec*                    v_1,
1613                                const dd_corners_t*      c,
1614                                const rvec               sf2_round,
1615                                gmx_bool                 bDistBonded,
1616                                gmx_bool                 bBondComm,
1617                                gmx_bool                 bDist2B,
1618                                gmx_bool                 bDistMB,
1619                                rvec*                    cg_cm,
1620                                gmx::ArrayRef<const int> cginfo,
1621                                std::vector<int>*        localAtomGroups,
1622                                dd_comm_setup_work_t*    work)
1623 {
1624     gmx_domdec_comm_t* comm;
1625     gmx_bool           bScrew;
1626     gmx_bool           bDistMB_pulse;
1627     int                cg, i;
1628     real               r2, rb2, r, tric_sh;
1629     rvec               rn, rb;
1630     int                dimd;
1631     int                nsend_z, nat;
1632
1633     comm = dd->comm;
1634
1635     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1636
1637     bDistMB_pulse = (bDistMB && bDistBonded);
1638
1639     /* Unpack the work data */
1640     std::vector<int>&       ibuf = work->atomGroupBuffer;
1641     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1642     nsend_z                      = 0;
1643     nat                          = work->nat;
1644
1645     for (cg = cg0; cg < cg1; cg++)
1646     {
1647         r2  = 0;
1648         rb2 = 0;
1649         if (!distanceIsTriclinic)
1650         {
1651             /* Rectangular direction, easy */
1652             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1653             if (r > 0)
1654             {
1655                 r2 += r * r;
1656             }
1657             if (bDistMB_pulse)
1658             {
1659                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1660                 if (r > 0)
1661                 {
1662                     rb2 += r * r;
1663                 }
1664             }
1665             /* Rounding gives at most a 16% reduction
1666              * in communicated atoms
1667              */
1668             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1669             {
1670                 r = cg_cm[cg][dim0] - c->cr0;
1671                 /* This is the first dimension, so always r >= 0 */
1672                 r2 += r * r;
1673                 if (bDistMB_pulse)
1674                 {
1675                     rb2 += r * r;
1676                 }
1677             }
1678             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1679             {
1680                 r = cg_cm[cg][dim1] - c->cr1[zone];
1681                 if (r > 0)
1682                 {
1683                     r2 += r * r;
1684                 }
1685                 if (bDistMB_pulse)
1686                 {
1687                     r = cg_cm[cg][dim1] - c->bcr1;
1688                     if (r > 0)
1689                     {
1690                         rb2 += r * r;
1691                     }
1692                 }
1693             }
1694         }
1695         else
1696         {
1697             /* Triclinic direction, more complicated */
1698             clear_rvec(rn);
1699             clear_rvec(rb);
1700             /* Rounding, conservative as the skew_fac multiplication
1701              * will slightly underestimate the distance.
1702              */
1703             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1704             {
1705                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1706                 for (i = dim0 + 1; i < DIM; i++)
1707                 {
1708                     rn[dim0] -= cg_cm[cg][i] * v_0[i][dim0];
1709                 }
1710                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1711                 if (bDistMB_pulse)
1712                 {
1713                     rb[dim0] = rn[dim0];
1714                     rb2      = r2;
1715                 }
1716                 /* Take care that the cell planes along dim0 might not
1717                  * be orthogonal to those along dim1 and dim2.
1718                  */
1719                 for (i = 1; i <= dim_ind; i++)
1720                 {
1721                     dimd = dd->dim[i];
1722                     if (normal[dim0][dimd] > 0)
1723                     {
1724                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1725                         if (bDistMB_pulse)
1726                         {
1727                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1728                         }
1729                     }
1730                 }
1731             }
1732             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1733             {
1734                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1735                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1736                 tric_sh = 0;
1737                 for (i = dim1 + 1; i < DIM; i++)
1738                 {
1739                     tric_sh -= cg_cm[cg][i] * v_1[i][dim1];
1740                 }
1741                 rn[dim1] += tric_sh;
1742                 if (rn[dim1] > 0)
1743                 {
1744                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1745                     /* Take care of coupling of the distances
1746                      * to the planes along dim0 and dim1 through dim2.
1747                      */
1748                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1749                     /* Take care that the cell planes along dim1
1750                      * might not be orthogonal to that along dim2.
1751                      */
1752                     if (normal[dim1][dim2] > 0)
1753                     {
1754                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1755                     }
1756                 }
1757                 if (bDistMB_pulse)
1758                 {
1759                     rb[dim1] += cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1760                     if (rb[dim1] > 0)
1761                     {
1762                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1763                         /* Take care of coupling of the distances
1764                          * to the planes along dim0 and dim1 through dim2.
1765                          */
1766                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1767                         /* Take care that the cell planes along dim1
1768                          * might not be orthogonal to that along dim2.
1769                          */
1770                         if (normal[dim1][dim2] > 0)
1771                         {
1772                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1773                         }
1774                     }
1775                 }
1776             }
1777             /* The distance along the communication direction */
1778             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1779             tric_sh = 0;
1780             for (i = dim + 1; i < DIM; i++)
1781             {
1782                 tric_sh -= cg_cm[cg][i] * v_d[i][dim];
1783             }
1784             rn[dim] += tric_sh;
1785             if (rn[dim] > 0)
1786             {
1787                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1788                 /* Take care of coupling of the distances
1789                  * to the planes along dim0 and dim1 through dim2.
1790                  */
1791                 if (dim_ind == 1 && zonei == 1)
1792                 {
1793                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1794                 }
1795             }
1796             if (bDistMB_pulse)
1797             {
1798                 clear_rvec(rb);
1799                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1800                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1801                 if (rb[dim] > 0)
1802                 {
1803                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1804                     /* Take care of coupling of the distances
1805                      * to the planes along dim0 and dim1 through dim2.
1806                      */
1807                     if (dim_ind == 1 && zonei == 1)
1808                     {
1809                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1810                     }
1811                 }
1812             }
1813         }
1814
1815         if (r2 < r_comm2
1816             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1817                 && (!bBondComm
1818                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1819                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1820         {
1821             /* Store the local and global atom group indices and position */
1822             localAtomGroups->push_back(cg);
1823             ibuf.push_back(globalAtomGroupIndices[cg]);
1824             nsend_z++;
1825
1826             rvec posPbc;
1827             if (dd->ci[dim] == 0)
1828             {
1829                 /* Correct cg_cm for pbc */
1830                 rvec_add(cg_cm[cg], box[dim], posPbc);
1831                 if (bScrew)
1832                 {
1833                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1834                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1835                 }
1836             }
1837             else
1838             {
1839                 copy_rvec(cg_cm[cg], posPbc);
1840             }
1841             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1842
1843             nat += 1;
1844         }
1845     }
1846
1847     work->nat        = nat;
1848     work->nsend_zone = nsend_z;
1849 }
1850
1851 //! Clear data.
1852 static void clearCommSetupData(dd_comm_setup_work_t* work)
1853 {
1854     work->localAtomGroupBuffer.clear();
1855     work->atomGroupBuffer.clear();
1856     work->positionBuffer.clear();
1857     work->nat        = 0;
1858     work->nsend_zone = 0;
1859 }
1860
1861 //! Prepare DD communication.
1862 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1863 {
1864     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1865     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1866     int                    c;
1867     int *                  zone_cg_range, pos_cg;
1868     gmx_domdec_comm_t*     comm;
1869     gmx_domdec_zones_t*    zones;
1870     gmx_domdec_comm_dim_t* cd;
1871     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1872     dd_corners_t           corners;
1873     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1874     real                   skew_fac2_d, skew_fac_01;
1875     rvec                   sf2_round;
1876
1877     if (debug)
1878     {
1879         fprintf(debug, "Setting up DD communication\n");
1880     }
1881
1882     comm = dd->comm;
1883
1884     if (comm->dth.empty())
1885     {
1886         /* Initialize the thread data.
1887          * This can not be done in init_domain_decomposition,
1888          * as the numbers of threads is determined later.
1889          */
1890         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1891         comm->dth.resize(numThreads);
1892     }
1893
1894     bBondComm = comm->systemInfo.filterBondedCommunication;
1895
1896     /* Do we need to determine extra distances for multi-body bondeds? */
1897     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1898
1899     /* Do we need to determine extra distances for only two-body bondeds? */
1900     bDist2B = (bBondComm && !bDistMB);
1901
1902     const real r_comm2 =
1903             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1904     const real r_bcomm2 =
1905             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1906
1907     if (debug)
1908     {
1909         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1910     }
1911
1912     zones = &comm->zones;
1913
1914     dim0 = dd->dim[0];
1915     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1916     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1917
1918     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1919
1920     /* Triclinic stuff */
1921     normal      = ddbox->normal;
1922     skew_fac_01 = 0;
1923     if (dd->ndim >= 2)
1924     {
1925         v_0 = ddbox->v[dim0];
1926         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1927         {
1928             /* Determine the coupling coefficient for the distances
1929              * to the cell planes along dim0 and dim1 through dim2.
1930              * This is required for correct rounding.
1931              */
1932             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1933             if (debug)
1934             {
1935                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1936             }
1937         }
1938     }
1939     if (dd->ndim >= 3)
1940     {
1941         v_1 = ddbox->v[dim1];
1942     }
1943
1944     zone_cg_range                        = zones->cg_range.data();
1945     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1946
1947     zone_cg_range[0]   = 0;
1948     zone_cg_range[1]   = dd->ncg_home;
1949     comm->zone_ncg1[0] = dd->ncg_home;
1950     pos_cg             = dd->ncg_home;
1951
1952     nat_tot = comm->atomRanges.numHomeAtoms();
1953     nzone   = 1;
1954     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1955     {
1956         dim = dd->dim[dim_ind];
1957         cd  = &comm->cd[dim_ind];
1958
1959         /* Check if we need to compute triclinic distances along this dim */
1960         bool distanceIsTriclinic = false;
1961         for (int i = 0; i <= dim_ind; i++)
1962         {
1963             if (ddbox->tric_dir[dd->dim[i]])
1964             {
1965                 distanceIsTriclinic = true;
1966             }
1967         }
1968
1969         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1970         {
1971             /* No pbc in this dimension, the first node should not comm. */
1972             nzone_send = 0;
1973         }
1974         else
1975         {
1976             nzone_send = nzone;
1977         }
1978
1979         v_d         = ddbox->v[dim];
1980         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1981
1982         cd->receiveInPlace = true;
1983         for (int p = 0; p < cd->numPulses(); p++)
1984         {
1985             /* Only atoms communicated in the first pulse are used
1986              * for multi-body bonded interactions or for bBondComm.
1987              */
1988             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1989
1990             gmx_domdec_ind_t* ind = &cd->ind[p];
1991
1992             /* Thread 0 writes in the global index array */
1993             ind->index.clear();
1994             clearCommSetupData(&comm->dth[0]);
1995
1996             for (zone = 0; zone < nzone_send; zone++)
1997             {
1998                 if (dim_ind > 0 && distanceIsTriclinic)
1999                 {
2000                     /* Determine slightly more optimized skew_fac's
2001                      * for rounding.
2002                      * This reduces the number of communicated atoms
2003                      * by about 10% for 3D DD of rhombic dodecahedra.
2004                      */
2005                     for (dimd = 0; dimd < dim; dimd++)
2006                     {
2007                         sf2_round[dimd] = 1;
2008                         if (ddbox->tric_dir[dimd])
2009                         {
2010                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
2011                             {
2012                                 /* If we are shifted in dimension i
2013                                  * and the cell plane is tilted forward
2014                                  * in dimension i, skip this coupling.
2015                                  */
2016                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
2017                                 {
2018                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
2019                                 }
2020                             }
2021                             sf2_round[dimd] = 1 / sf2_round[dimd];
2022                         }
2023                     }
2024                 }
2025
2026                 zonei = zone_perm[dim_ind][zone];
2027                 if (p == 0)
2028                 {
2029                     /* Here we permutate the zones to obtain a convenient order
2030                      * for neighbor searching
2031                      */
2032                     cg0 = zone_cg_range[zonei];
2033                     cg1 = zone_cg_range[zonei + 1];
2034                 }
2035                 else
2036                 {
2037                     /* Look only at the cg's received in the previous grid pulse
2038                      */
2039                     cg1 = zone_cg_range[nzone + zone + 1];
2040                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
2041                 }
2042
2043                 const int numThreads = gmx::ssize(comm->dth);
2044 #pragma omp parallel for num_threads(numThreads) schedule(static)
2045                 for (int th = 0; th < numThreads; th++)
2046                 {
2047                     try
2048                     {
2049                         dd_comm_setup_work_t& work = comm->dth[th];
2050
2051                         /* Retain data accumulated into buffers of thread 0 */
2052                         if (th > 0)
2053                         {
2054                             clearCommSetupData(&work);
2055                         }
2056
2057                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2058                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2059
2060                         /* Get the cg's for this pulse in this zone */
2061                         get_zone_pulse_cgs(dd,
2062                                            zonei,
2063                                            zone,
2064                                            cg0_th,
2065                                            cg1_th,
2066                                            dd->globalAtomGroupIndices,
2067                                            dim,
2068                                            dim_ind,
2069                                            dim0,
2070                                            dim1,
2071                                            dim2,
2072                                            r_comm2,
2073                                            r_bcomm2,
2074                                            box,
2075                                            distanceIsTriclinic,
2076                                            normal,
2077                                            skew_fac2_d,
2078                                            skew_fac_01,
2079                                            v_d,
2080                                            v_0,
2081                                            v_1,
2082                                            &corners,
2083                                            sf2_round,
2084                                            bDistBonded,
2085                                            bBondComm,
2086                                            bDist2B,
2087                                            bDistMB,
2088                                            state->x.rvec_array(),
2089                                            fr->cginfo,
2090                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2091                                            &work);
2092                     }
2093                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2094                 } // END
2095
2096                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2097                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2098                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2099                 /* Append data of threads>=1 to the communication buffers */
2100                 for (int th = 1; th < numThreads; th++)
2101                 {
2102                     const dd_comm_setup_work_t& dth = comm->dth[th];
2103
2104                     ind->index.insert(ind->index.end(),
2105                                       dth.localAtomGroupBuffer.begin(),
2106                                       dth.localAtomGroupBuffer.end());
2107                     atomGroups.insert(
2108                             atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2109                     positions.insert(
2110                             positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2111                     comm->dth[0].nat += dth.nat;
2112                     ind->nsend[zone] += dth.nsend_zone;
2113                 }
2114             }
2115             /* Clear the counts in case we do not have pbc */
2116             for (zone = nzone_send; zone < nzone; zone++)
2117             {
2118                 ind->nsend[zone] = 0;
2119             }
2120             ind->nsend[nzone]     = ind->index.size();
2121             ind->nsend[nzone + 1] = comm->dth[0].nat;
2122             /* Communicate the number of cg's and atoms to receive */
2123             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2124
2125             if (p > 0)
2126             {
2127                 /* We can receive in place if only the last zone is not empty */
2128                 for (zone = 0; zone < nzone - 1; zone++)
2129                 {
2130                     if (ind->nrecv[zone] > 0)
2131                     {
2132                         cd->receiveInPlace = false;
2133                     }
2134                 }
2135             }
2136
2137             int receiveBufferSize = 0;
2138             if (!cd->receiveInPlace)
2139             {
2140                 receiveBufferSize = ind->nrecv[nzone];
2141             }
2142             /* These buffer are actually only needed with in-place */
2143             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2144             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2145
2146             dd_comm_setup_work_t& work = comm->dth[0];
2147
2148             /* Make space for the global cg indices */
2149             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2150             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2151             /* Communicate the global cg indices */
2152             gmx::ArrayRef<int> integerBufferRef;
2153             if (cd->receiveInPlace)
2154             {
2155                 integerBufferRef = gmx::arrayRefFromArray(
2156                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2157             }
2158             else
2159             {
2160                 integerBufferRef = globalAtomGroupBuffer.buffer;
2161             }
2162             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2163
2164             /* Make space for cg_cm */
2165             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2166
2167             /* Communicate the coordinates */
2168             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2169             if (cd->receiveInPlace)
2170             {
2171                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2172             }
2173             else
2174             {
2175                 rvecBufferRef = rvecBuffer.buffer;
2176             }
2177             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2178
2179             /* Make the charge group index */
2180             if (cd->receiveInPlace)
2181             {
2182                 zone = (p == 0 ? 0 : nzone - 1);
2183                 while (zone < nzone)
2184                 {
2185                     for (int i = 0; i < ind->nrecv[zone]; i++)
2186                     {
2187                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2188                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2189                         pos_cg++;
2190                     }
2191                     if (p == 0)
2192                     {
2193                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2194                     }
2195                     zone++;
2196                     zone_cg_range[nzone + zone] = pos_cg;
2197                 }
2198             }
2199             else
2200             {
2201                 /* This part of the code is never executed with bBondComm. */
2202                 merge_cg_buffers(nzone,
2203                                  cd,
2204                                  p,
2205                                  zone_cg_range,
2206                                  dd->globalAtomGroupIndices,
2207                                  integerBufferRef.data(),
2208                                  state->x,
2209                                  rvecBufferRef,
2210                                  fr->cginfo_mb,
2211                                  fr->cginfo);
2212                 pos_cg += ind->nrecv[nzone];
2213             }
2214             nat_tot += ind->nrecv[nzone + 1];
2215         }
2216         if (!cd->receiveInPlace)
2217         {
2218             /* Store the atom block for easy copying of communication buffers */
2219             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2220         }
2221         nzone += nzone;
2222     }
2223
2224     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2225
2226     if (!bBondComm)
2227     {
2228         /* We don't need to update cginfo, since that was alrady done above.
2229          * So we pass NULL for the forcerec.
2230          */
2231         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2232     }
2233
2234     if (debug)
2235     {
2236         fprintf(debug, "Finished setting up DD communication, zones:");
2237         for (c = 0; c < zones->n; c++)
2238         {
2239             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2240         }
2241         fprintf(debug, "\n");
2242     }
2243 }
2244
2245 //! Set boundaries for the charge group range.
2246 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2247 {
2248     for (auto& iZone : zones->iZones)
2249     {
2250         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2251         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2252                                            zones->cg_range[iZone.jZoneRange.end()]);
2253     }
2254 }
2255
2256 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2257  *
2258  * Also sets the atom density for the home zone when \p zone_start=0.
2259  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2260  * how many charge groups will move but are still part of the current range.
2261  * \todo When converting domdec to use proper classes, all these variables
2262  *       should be private and a method should return the correct count
2263  *       depending on an internal state.
2264  *
2265  * \param[in,out] dd          The domain decomposition struct
2266  * \param[in]     box         The box
2267  * \param[in]     ddbox       The domain decomposition box struct
2268  * \param[in]     zone_start  The start of the zone range to set sizes for
2269  * \param[in]     zone_end    The end of the zone range to set sizes for
2270  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2271  */
2272 static void set_zones_size(gmx_domdec_t*      dd,
2273                            matrix             box,
2274                            const gmx_ddbox_t* ddbox,
2275                            int                zone_start,
2276                            int                zone_end,
2277                            int                numMovedChargeGroupsInHomeZone)
2278 {
2279     gmx_domdec_comm_t*  comm;
2280     gmx_domdec_zones_t* zones;
2281     gmx_bool            bDistMB;
2282     int                 z, d, dim;
2283     real                rcs, rcmbs;
2284     int                 i, j;
2285     real                vol;
2286
2287     comm = dd->comm;
2288
2289     zones = &comm->zones;
2290
2291     /* Do we need to determine extra distances for multi-body bondeds? */
2292     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2293
2294     for (z = zone_start; z < zone_end; z++)
2295     {
2296         /* Copy cell limits to zone limits.
2297          * Valid for non-DD dims and non-shifted dims.
2298          */
2299         copy_rvec(comm->cell_x0, zones->size[z].x0);
2300         copy_rvec(comm->cell_x1, zones->size[z].x1);
2301     }
2302
2303     for (d = 0; d < dd->ndim; d++)
2304     {
2305         dim = dd->dim[d];
2306
2307         for (z = 0; z < zones->n; z++)
2308         {
2309             /* With a staggered grid we have different sizes
2310              * for non-shifted dimensions.
2311              */
2312             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2313             {
2314                 if (d == 1)
2315                 {
2316                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2317                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2318                 }
2319                 else if (d == 2)
2320                 {
2321                     zones->size[z].x0[dim] =
2322                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2323                                     .min0;
2324                     zones->size[z].x1[dim] =
2325                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2326                                     .max1;
2327                 }
2328             }
2329         }
2330
2331         rcs   = comm->systemInfo.cutoff;
2332         rcmbs = comm->cutoff_mbody;
2333         if (ddbox->tric_dir[dim])
2334         {
2335             rcs /= ddbox->skew_fac[dim];
2336             rcmbs /= ddbox->skew_fac[dim];
2337         }
2338
2339         /* Set the lower limit for the shifted zone dimensions */
2340         for (z = zone_start; z < zone_end; z++)
2341         {
2342             if (zones->shift[z][dim] > 0)
2343             {
2344                 dim = dd->dim[d];
2345                 if (!isDlbOn(dd->comm) || d == 0)
2346                 {
2347                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2348                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2349                 }
2350                 else
2351                 {
2352                     /* Here we take the lower limit of the zone from
2353                      * the lowest domain of the zone below.
2354                      */
2355                     if (z < 4)
2356                     {
2357                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2358                     }
2359                     else
2360                     {
2361                         if (d == 1)
2362                         {
2363                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2364                         }
2365                         else
2366                         {
2367                             zones->size[z].x0[dim] =
2368                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2369                                             .min1;
2370                         }
2371                     }
2372                     /* A temporary limit, is updated below */
2373                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2374
2375                     if (bDistMB)
2376                     {
2377                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2378                         {
2379                             if (zones->shift[zi][dim] == 0)
2380                             {
2381                                 /* This takes the whole zone into account.
2382                                  * With multiple pulses this will lead
2383                                  * to a larger zone then strictly necessary.
2384                                  */
2385                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2386                                                                   zones->size[zi].x1[dim] + rcmbs);
2387                             }
2388                         }
2389                     }
2390                 }
2391             }
2392         }
2393
2394         /* Loop over the i-zones to set the upper limit of each
2395          * j-zone they see.
2396          */
2397         for (const auto& iZone : zones->iZones)
2398         {
2399             const int zi = iZone.iZoneIndex;
2400             if (zones->shift[zi][dim] == 0)
2401             {
2402                 /* We should only use zones up to zone_end */
2403                 const auto& jZoneRangeFull = iZone.jZoneRange;
2404                 if (zone_end <= *jZoneRangeFull.begin())
2405                 {
2406                     continue;
2407                 }
2408                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2409                                                  std::min(*jZoneRangeFull.end(), zone_end));
2410                 for (int jZone : jZoneRange)
2411                 {
2412                     if (zones->shift[jZone][dim] > 0)
2413                     {
2414                         zones->size[jZone].x1[dim] =
2415                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2416                     }
2417                 }
2418             }
2419         }
2420     }
2421
2422     for (z = zone_start; z < zone_end; z++)
2423     {
2424         /* Initialization only required to keep the compiler happy */
2425         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2426         int  nc, c;
2427
2428         /* To determine the bounding box for a zone we need to find
2429          * the extreme corners of 4, 2 or 1 corners.
2430          */
2431         nc = 1 << (ddbox->nboundeddim - 1);
2432
2433         for (c = 0; c < nc; c++)
2434         {
2435             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2436             corner[XX] = 0;
2437             if ((c & 1) == 0)
2438             {
2439                 corner[YY] = zones->size[z].x0[YY];
2440             }
2441             else
2442             {
2443                 corner[YY] = zones->size[z].x1[YY];
2444             }
2445             if ((c & 2) == 0)
2446             {
2447                 corner[ZZ] = zones->size[z].x0[ZZ];
2448             }
2449             else
2450             {
2451                 corner[ZZ] = zones->size[z].x1[ZZ];
2452             }
2453             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2454                 && box[ZZ][1 - dd->dim[0]] != 0)
2455             {
2456                 /* With 1D domain decomposition the cg's are not in
2457                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2458                  * Shift the corner of the z-vector back to along the box
2459                  * vector of dimension d, so it will later end up at 0 along d.
2460                  * This can affect the location of this corner along dd->dim[0]
2461                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2462                  */
2463                 int d = 1 - dd->dim[0];
2464
2465                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2466             }
2467             /* Apply the triclinic couplings */
2468             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2469             {
2470                 for (j = XX; j < i; j++)
2471                 {
2472                     corner[j] += corner[i] * box[i][j] / box[i][i];
2473                 }
2474             }
2475             if (c == 0)
2476             {
2477                 copy_rvec(corner, corner_min);
2478                 copy_rvec(corner, corner_max);
2479             }
2480             else
2481             {
2482                 for (i = 0; i < DIM; i++)
2483                 {
2484                     corner_min[i] = std::min(corner_min[i], corner[i]);
2485                     corner_max[i] = std::max(corner_max[i], corner[i]);
2486                 }
2487             }
2488         }
2489         /* Copy the extreme cornes without offset along x */
2490         for (i = 0; i < DIM; i++)
2491         {
2492             zones->size[z].bb_x0[i] = corner_min[i];
2493             zones->size[z].bb_x1[i] = corner_max[i];
2494         }
2495         /* Add the offset along x */
2496         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2497         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2498     }
2499
2500     if (zone_start == 0)
2501     {
2502         vol = 1;
2503         for (dim = 0; dim < DIM; dim++)
2504         {
2505             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2506         }
2507         zones->dens_zone0 =
2508                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2509     }
2510
2511     if (debug)
2512     {
2513         for (z = zone_start; z < zone_end; z++)
2514         {
2515             fprintf(debug,
2516                     "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2517                     z,
2518                     zones->size[z].x0[XX],
2519                     zones->size[z].x1[XX],
2520                     zones->size[z].x0[YY],
2521                     zones->size[z].x1[YY],
2522                     zones->size[z].x0[ZZ],
2523                     zones->size[z].x1[ZZ]);
2524             fprintf(debug,
2525                     "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2526                     z,
2527                     zones->size[z].bb_x0[XX],
2528                     zones->size[z].bb_x1[XX],
2529                     zones->size[z].bb_x0[YY],
2530                     zones->size[z].bb_x1[YY],
2531                     zones->size[z].bb_x0[ZZ],
2532                     zones->size[z].bb_x1[ZZ]);
2533         }
2534     }
2535 }
2536
2537 /*! \brief Order data in \p dataToSort according to \p sort
2538  *
2539  * Note: both buffers should have at least \p sort.size() elements.
2540  */
2541 template<typename T>
2542 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2543                         gmx::ArrayRef<T>                  dataToSort,
2544                         gmx::ArrayRef<T>                  sortBuffer)
2545 {
2546     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2547     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2548                "The sorting buffer needs to be sufficiently large");
2549
2550     /* Order the data into the temporary buffer */
2551     size_t i = 0;
2552     for (const gmx_cgsort_t& entry : sort)
2553     {
2554         sortBuffer[i++] = dataToSort[entry.ind];
2555     }
2556
2557     /* Copy back to the original array */
2558     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2559 }
2560
2561 /*! \brief Order data in \p dataToSort according to \p sort
2562  *
2563  * Note: \p vectorToSort should have at least \p sort.size() elements,
2564  *       \p workVector is resized when it is too small.
2565  */
2566 template<typename T>
2567 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2568                         gmx::ArrayRef<T>                  vectorToSort,
2569                         std::vector<T>*                   workVector)
2570 {
2571     if (gmx::index(workVector->size()) < sort.ssize())
2572     {
2573         workVector->resize(sort.size());
2574     }
2575     orderVector<T>(sort, vectorToSort, *workVector);
2576 }
2577
2578 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2579 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2580 {
2581     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2582
2583     /* Using push_back() instead of this resize results in much slower code */
2584     sort->resize(atomOrder.size());
2585     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2586     size_t                      numSorted = 0;
2587     for (int i : atomOrder)
2588     {
2589         if (i >= 0)
2590         {
2591             /* The values of nsc and ind_gl are not used in this case */
2592             buffer[numSorted++].ind = i;
2593         }
2594     }
2595     sort->resize(numSorted);
2596 }
2597
2598 //! Returns the sorting state for DD.
2599 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2600 {
2601     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2602
2603     dd_sort_order_nbnxn(fr, &sort->sorted);
2604
2605     /* We alloc with the old size, since cgindex is still old */
2606     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2607
2608     /* Set the new home atom/charge group count */
2609     dd->ncg_home = sort->sorted.size();
2610     if (debug)
2611     {
2612         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2613     }
2614
2615     /* Reorder the state */
2616     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2617     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2618
2619     if (state->flags & (1 << estX))
2620     {
2621         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2622     }
2623     if (state->flags & (1 << estV))
2624     {
2625         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2626     }
2627     if (state->flags & (1 << estCGP))
2628     {
2629         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2630     }
2631
2632     /* Reorder the global cg index */
2633     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2634     /* Reorder the cginfo */
2635     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2636     /* Set the home atom number */
2637     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2638
2639     /* The atoms are now exactly in grid order, update the grid order */
2640     fr->nbv->setLocalAtomOrder();
2641 }
2642
2643 //! Accumulates load statistics.
2644 static void add_dd_statistics(gmx_domdec_t* dd)
2645 {
2646     gmx_domdec_comm_t* comm = dd->comm;
2647
2648     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2649     {
2650         auto range = static_cast<DDAtomRanges::Type>(i);
2651         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2652     }
2653     comm->ndecomp++;
2654 }
2655
2656 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2657 {
2658     gmx_domdec_comm_t* comm = dd->comm;
2659
2660     /* Reset all the statistics and counters for total run counting */
2661     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2662     {
2663         comm->sum_nat[i] = 0;
2664     }
2665     comm->ndecomp   = 0;
2666     comm->nload     = 0;
2667     comm->load_step = 0;
2668     comm->load_sum  = 0;
2669     comm->load_max  = 0;
2670     clear_ivec(comm->load_lim);
2671     comm->load_mdf = 0;
2672     comm->load_pme = 0;
2673 }
2674
2675 void print_dd_statistics(const t_commrec* cr, const t_inputrec* ir, FILE* fplog)
2676 {
2677     gmx_domdec_comm_t* comm = cr->dd->comm;
2678
2679     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2680     gmx_sumd(numRanges, comm->sum_nat, cr);
2681
2682     if (fplog == nullptr)
2683     {
2684         return;
2685     }
2686
2687     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2688
2689     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2690     {
2691         auto   range = static_cast<DDAtomRanges::Type>(i);
2692         double av    = comm->sum_nat[i] / comm->ndecomp;
2693         switch (range)
2694         {
2695             case DDAtomRanges::Type::Zones:
2696                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2697                 break;
2698             case DDAtomRanges::Type::Vsites:
2699                 if (cr->dd->vsite_comm)
2700                 {
2701                     fprintf(fplog,
2702                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2703                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2704                             av);
2705                 }
2706                 break;
2707             case DDAtomRanges::Type::Constraints:
2708                 if (cr->dd->constraint_comm)
2709                 {
2710                     fprintf(fplog,
2711                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2712                             1 + ir->nLincsIter,
2713                             av);
2714                 }
2715                 break;
2716             default: gmx_incons(" Unknown type for DD statistics");
2717         }
2718     }
2719     fprintf(fplog, "\n");
2720
2721     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2722     {
2723         print_dd_load_av(fplog, cr->dd);
2724     }
2725 }
2726
2727 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2728 void dd_partition_system(FILE*                     fplog,
2729                          const gmx::MDLogger&      mdlog,
2730                          int64_t                   step,
2731                          const t_commrec*          cr,
2732                          gmx_bool                  bMasterState,
2733                          int                       nstglobalcomm,
2734                          t_state*                  state_global,
2735                          const gmx_mtop_t&         top_global,
2736                          const t_inputrec*         ir,
2737                          gmx::ImdSession*          imdSession,
2738                          pull_t*                   pull_work,
2739                          t_state*                  state_local,
2740                          gmx::ForceBuffers*        f,
2741                          gmx::MDAtoms*             mdAtoms,
2742                          gmx_localtop_t*           top_local,
2743                          t_forcerec*               fr,
2744                          gmx::VirtualSitesHandler* vsite,
2745                          gmx::Constraints*         constr,
2746                          t_nrnb*                   nrnb,
2747                          gmx_wallcycle*            wcycle,
2748                          gmx_bool                  bVerbose)
2749 {
2750     gmx_domdec_t*      dd;
2751     gmx_domdec_comm_t* comm;
2752     gmx_ddbox_t        ddbox = { 0 };
2753     int64_t            step_pcoupl;
2754     rvec               cell_ns_x0, cell_ns_x1;
2755     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2756     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2757     gmx_bool           bRedist;
2758     ivec               np;
2759     real               grid_density;
2760     char               sbuf[22];
2761
2762     wallcycle_start(wcycle, ewcDOMDEC);
2763
2764     dd   = cr->dd;
2765     comm = dd->comm;
2766
2767     // TODO if the update code becomes accessible here, use
2768     // upd->deform for this logic.
2769     bBoxChanged = (bMasterState || inputrecDeform(ir));
2770     if (ir->epc != PressureCoupling::No)
2771     {
2772         /* With nstpcouple > 1 pressure coupling happens.
2773          * one step after calculating the pressure.
2774          * Box scaling happens at the end of the MD step,
2775          * after the DD partitioning.
2776          * We therefore have to do DLB in the first partitioning
2777          * after an MD step where P-coupling occurred.
2778          * We need to determine the last step in which p-coupling occurred.
2779          * MRS -- need to validate this for vv?
2780          */
2781         int n = ir->nstpcouple;
2782         if (n == 1)
2783         {
2784             step_pcoupl = step - 1;
2785         }
2786         else
2787         {
2788             step_pcoupl = ((step - 1) / n) * n + 1;
2789         }
2790         if (step_pcoupl >= comm->partition_step)
2791         {
2792             bBoxChanged = TRUE;
2793         }
2794     }
2795
2796     bNStGlobalComm = (step % nstglobalcomm == 0);
2797
2798     if (!isDlbOn(comm))
2799     {
2800         bDoDLB = FALSE;
2801     }
2802     else
2803     {
2804         /* Should we do dynamic load balacing this step?
2805          * Since it requires (possibly expensive) global communication,
2806          * we might want to do DLB less frequently.
2807          */
2808         if (bBoxChanged || ir->epc != PressureCoupling::No)
2809         {
2810             bDoDLB = bBoxChanged;
2811         }
2812         else
2813         {
2814             bDoDLB = bNStGlobalComm;
2815         }
2816     }
2817
2818     /* Check if we have recorded loads on the nodes */
2819     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2820     {
2821         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2822
2823         /* Print load every nstlog, first and last step to the log file */
2824         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || comm->n_load_collect == 0
2825                     || (ir->nsteps >= 0 && (step + ir->nstlist > ir->init_step + ir->nsteps)));
2826
2827         /* Avoid extra communication due to verbose screen output
2828          * when nstglobalcomm is set.
2829          */
2830         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2831             || (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2832         {
2833             get_load_distribution(dd, wcycle);
2834             if (DDMASTER(dd))
2835             {
2836                 if (bLogLoad)
2837                 {
2838                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2839                 }
2840                 if (bVerbose)
2841                 {
2842                     dd_print_load_verbose(dd);
2843                 }
2844             }
2845             comm->n_load_collect++;
2846
2847             if (isDlbOn(comm))
2848             {
2849                 if (DDMASTER(dd))
2850                 {
2851                     /* Add the measured cycles to the running average */
2852                     const float averageFactor = 0.1F;
2853                     comm->cyclesPerStepDlbExpAverage =
2854                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2855                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2856                 }
2857                 if (comm->dlbState == DlbState::onCanTurnOff
2858                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2859                 {
2860                     gmx_bool turnOffDlb;
2861                     if (DDMASTER(dd))
2862                     {
2863                         /* If the running averaged cycles with DLB are more
2864                          * than before we turned on DLB, turn off DLB.
2865                          * We will again run and check the cycles without DLB
2866                          * and we can then decide if to turn off DLB forever.
2867                          */
2868                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2869                     }
2870                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2871                     if (turnOffDlb)
2872                     {
2873                         /* To turn off DLB, we need to redistribute the atoms */
2874                         dd_collect_state(dd, state_local, state_global);
2875                         bMasterState = TRUE;
2876                         turn_off_dlb(mdlog, dd, step);
2877                     }
2878                 }
2879             }
2880             else if (bCheckWhetherToTurnDlbOn)
2881             {
2882                 gmx_bool turnOffDlbForever = FALSE;
2883                 gmx_bool turnOnDlb         = FALSE;
2884
2885                 /* Since the timings are node dependent, the master decides */
2886                 if (DDMASTER(dd))
2887                 {
2888                     /* If we recently turned off DLB, we want to check if
2889                      * performance is better without DLB. We want to do this
2890                      * ASAP to minimize the chance that external factors
2891                      * slowed down the DLB step are gone here and we
2892                      * incorrectly conclude that DLB was causing the slowdown.
2893                      * So we measure one nstlist block, no running average.
2894                      */
2895                     if (comm->haveTurnedOffDlb
2896                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2897                     {
2898                         /* After turning off DLB we ran nstlist steps in fewer
2899                          * cycles than with DLB. This likely means that DLB
2900                          * in not benefical, but this could be due to a one
2901                          * time unlucky fluctuation, so we require two such
2902                          * observations in close succession to turn off DLB
2903                          * forever.
2904                          */
2905                         if (comm->dlbSlowerPartitioningCount > 0
2906                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2907                         {
2908                             turnOffDlbForever = TRUE;
2909                         }
2910                         comm->haveTurnedOffDlb = false;
2911                         /* Register when we last measured DLB slowdown */
2912                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2913                     }
2914                     else
2915                     {
2916                         /* Here we check if the max PME rank load is more than 0.98
2917                          * the max PP force load. If so, PP DLB will not help,
2918                          * since we are (almost) limited by PME. Furthermore,
2919                          * DLB will cause a significant extra x/f redistribution
2920                          * cost on the PME ranks, which will then surely result
2921                          * in lower total performance.
2922                          */
2923                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2924                         {
2925                             turnOnDlb = FALSE;
2926                         }
2927                         else
2928                         {
2929                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2930                         }
2931                     }
2932                 }
2933                 struct
2934                 {
2935                     gmx_bool turnOffDlbForever;
2936                     gmx_bool turnOnDlb;
2937                 } bools{ turnOffDlbForever, turnOnDlb };
2938                 dd_bcast(dd, sizeof(bools), &bools);
2939                 if (bools.turnOffDlbForever)
2940                 {
2941                     turn_off_dlb_forever(mdlog, dd, step);
2942                 }
2943                 else if (bools.turnOnDlb)
2944                 {
2945                     turn_on_dlb(mdlog, dd, step);
2946                     bDoDLB = TRUE;
2947                 }
2948             }
2949         }
2950         comm->n_load_have++;
2951     }
2952
2953     bRedist = FALSE;
2954     if (bMasterState)
2955     {
2956         /* Clear the old state */
2957         clearDDStateIndices(dd, false);
2958         ncgindex_set = 0;
2959
2960         auto xGlobal = positionsFromStatePointer(state_global);
2961
2962         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2963
2964         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2965
2966         /* Ensure that we have space for the new distribution */
2967         dd_resize_atominfo_and_state(fr, state_local, dd->ncg_home);
2968
2969         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2970
2971         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2972     }
2973     else if (state_local->ddp_count != dd->ddp_count)
2974     {
2975         if (state_local->ddp_count > dd->ddp_count)
2976         {
2977             gmx_fatal(FARGS,
2978                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2979                       ")",
2980                       state_local->ddp_count,
2981                       dd->ddp_count);
2982         }
2983
2984         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2985         {
2986             gmx_fatal(FARGS,
2987                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2988                       "state_local->ddp_count (%d)",
2989                       state_local->ddp_count_cg_gl,
2990                       state_local->ddp_count);
2991         }
2992
2993         /* Clear the old state */
2994         clearDDStateIndices(dd, false);
2995
2996         /* Restore the atom group indices from state_local */
2997         restoreAtomGroups(dd, state_local);
2998         make_dd_indices(dd, 0);
2999         ncgindex_set = dd->ncg_home;
3000
3001         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3002
3003         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
3004
3005         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3006
3007         bRedist = isDlbOn(comm);
3008     }
3009     else
3010     {
3011         /* We have the full state, only redistribute the cgs */
3012
3013         /* Clear the non-home indices */
3014         clearDDStateIndices(dd, true);
3015         ncgindex_set = 0;
3016
3017         /* To avoid global communication, we do not recompute the extent
3018          * of the system for dims without pbc. Therefore we need to copy
3019          * the previously computed values when we do not communicate.
3020          */
3021         if (!bNStGlobalComm)
3022         {
3023             copy_rvec(comm->box0, ddbox.box0);
3024             copy_rvec(comm->box_size, ddbox.box_size);
3025         }
3026         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3027
3028         bBoxChanged = TRUE;
3029         bRedist     = TRUE;
3030     }
3031     /* Copy needed for dim's without pbc when avoiding communication */
3032     copy_rvec(ddbox.box0, comm->box0);
3033     copy_rvec(ddbox.box_size, comm->box_size);
3034
3035     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3036
3037     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3038     {
3039         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3040     }
3041
3042     if (comm->systemInfo.useUpdateGroups)
3043     {
3044         comm->updateGroupsCog->addCogs(
3045                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
3046     }
3047
3048     /* Check if we should sort the charge groups */
3049     const bool bSortCG = (bMasterState || bRedist);
3050
3051     /* When repartitioning we mark atom groups that will move to neighboring
3052      * DD cells, but we do not move them right away for performance reasons.
3053      * Thus we need to keep track of how many charge groups will move for
3054      * obtaining correct local charge group / atom counts.
3055      */
3056     ncg_moved = 0;
3057     if (bRedist)
3058     {
3059         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3060
3061         ncgindex_set = dd->ncg_home;
3062         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3063
3064         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3065
3066         if (comm->systemInfo.useUpdateGroups)
3067         {
3068             comm->updateGroupsCog->addCogs(
3069                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3070                     state_local->x);
3071         }
3072
3073         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3074     }
3075
3076     // TODO: Integrate this code in the nbnxm module
3077     get_nsgrid_boundaries(ddbox.nboundeddim,
3078                           state_local->box,
3079                           dd,
3080                           &ddbox,
3081                           &comm->cell_x0,
3082                           &comm->cell_x1,
3083                           dd->ncg_home,
3084                           as_rvec_array(state_local->x.data()),
3085                           cell_ns_x0,
3086                           cell_ns_x1,
3087                           &grid_density);
3088
3089     if (bBoxChanged)
3090     {
3091         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3092     }
3093
3094     if (bSortCG)
3095     {
3096         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3097
3098         /* Sort the state on charge group position.
3099          * This enables exact restarts from this step.
3100          * It also improves performance by about 15% with larger numbers
3101          * of atoms per node.
3102          */
3103
3104         /* Fill the ns grid with the home cell,
3105          * so we can sort with the indices.
3106          */
3107         set_zones_ncg_home(dd);
3108
3109         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3110
3111         nbnxn_put_on_grid(fr->nbv.get(),
3112                           state_local->box,
3113                           0,
3114                           comm->zones.size[0].bb_x0,
3115                           comm->zones.size[0].bb_x1,
3116                           comm->updateGroupsCog.get(),
3117                           { 0, dd->ncg_home },
3118                           comm->zones.dens_zone0,
3119                           fr->cginfo,
3120                           state_local->x,
3121                           ncg_moved,
3122                           bRedist ? comm->movedBuffer.data() : nullptr);
3123
3124         if (debug)
3125         {
3126             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->ncg_home);
3127         }
3128         dd_sort_state(dd, fr, state_local);
3129
3130         /* After sorting and compacting we set the correct size */
3131         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3132
3133         /* Rebuild all the indices */
3134         dd->ga2la->clear();
3135         ncgindex_set = 0;
3136
3137         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3138     }
3139     else
3140     {
3141         /* With the group scheme the sorting array is part of the DD state,
3142          * but it just got out of sync, so mark as invalid by emptying it.
3143          */
3144         if (ir->cutoff_scheme == ecutsGROUP)
3145         {
3146             comm->sort->sorted.clear();
3147         }
3148     }
3149
3150     if (comm->systemInfo.useUpdateGroups)
3151     {
3152         /* The update groups cog's are invalid after sorting
3153          * and need to be cleared before the next partitioning anyhow.
3154          */
3155         comm->updateGroupsCog->clear();
3156     }
3157
3158     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3159
3160     /* Set the induces for the home atoms */
3161     set_zones_ncg_home(dd);
3162     make_dd_indices(dd, ncgindex_set);
3163
3164     /* Setup up the communication and communicate the coordinates */
3165     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3166
3167     /* Set the indices for the halo atoms */
3168     make_dd_indices(dd, dd->ncg_home);
3169
3170     /* Set the charge group boundaries for neighbor searching */
3171     set_cg_boundaries(&comm->zones);
3172
3173     /* When bSortCG=true, we have already set the size for zone 0 */
3174     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3175
3176     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3177
3178     /*
3179        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3180                  -1,state_local->x.rvec_array(),state_local->box);
3181      */
3182
3183     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3184
3185     /* Extract a local topology from the global topology */
3186     for (int i = 0; i < dd->ndim; i++)
3187     {
3188         np[dd->dim[i]] = comm->cd[i].numPulses();
3189     }
3190     dd_make_local_top(dd,
3191                       &comm->zones,
3192                       dd->unitCellInfo.npbcdim,
3193                       state_local->box,
3194                       comm->cellsize_min,
3195                       np,
3196                       fr,
3197                       state_local->x.rvec_array(),
3198                       top_global,
3199                       top_local);
3200
3201     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3202
3203     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3204
3205     /* Set up the special atom communication */
3206     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3207     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3208          i < static_cast<int>(DDAtomRanges::Type::Number);
3209          i++)
3210     {
3211         auto range = static_cast<DDAtomRanges::Type>(i);
3212         switch (range)
3213         {
3214             case DDAtomRanges::Type::Vsites:
3215                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3216                 {
3217                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3218                 }
3219                 break;
3220             case DDAtomRanges::Type::Constraints:
3221                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3222                 {
3223                     /* Only for inter-cg constraints we need special code */
3224                     n = dd_make_local_constraints(dd,
3225                                                   n,
3226                                                   &top_global,
3227                                                   fr->cginfo.data(),
3228                                                   constr,
3229                                                   ir->nProjOrder,
3230                                                   top_local->idef.il);
3231                 }
3232                 break;
3233             default: gmx_incons("Unknown special atom type setup");
3234         }
3235         comm->atomRanges.setEnd(range, n);
3236     }
3237
3238     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3239
3240     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3241
3242     /* Make space for the extra coordinates for virtual site
3243      * or constraint communication.
3244      */
3245     state_local->natoms = comm->atomRanges.numAtomsTotal();
3246
3247     state_change_natoms(state_local, state_local->natoms);
3248
3249     if (vsite && vsite->numInterUpdategroupVirtualSites())
3250     {
3251         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3252     }
3253     else
3254     {
3255         if (EEL_FULL(ir->coulombtype) && dd->haveExclusions)
3256         {
3257             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3258         }
3259         else
3260         {
3261             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3262         }
3263     }
3264
3265     /* Set the number of atoms required for the force calculation.
3266      * Forces need to be constrained when doing energy
3267      * minimization. For simple simulations we could avoid some
3268      * allocation, zeroing and copying, but this is probably not worth
3269      * the complications and checking.
3270      */
3271     forcerec_set_ranges(fr,
3272                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3273                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3274                         nat_f_novirsum);
3275
3276     /* Update atom data for mdatoms and several algorithms */
3277     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3278
3279     auto mdatoms = mdAtoms->mdatoms();
3280     if (!thisRankHasDuty(cr, DUTY_PME))
3281     {
3282         /* Send the charges and/or c6/sigmas to our PME only node */
3283         gmx_pme_send_parameters(cr,
3284                                 fr->ic,
3285                                 mdatoms->nChargePerturbed != 0,
3286                                 mdatoms->nTypePerturbed != 0,
3287                                 mdatoms->chargeA,
3288                                 mdatoms->chargeB,
3289                                 mdatoms->sqrt_c6A,
3290                                 mdatoms->sqrt_c6B,
3291                                 mdatoms->sigmaA,
3292                                 mdatoms->sigmaB,
3293                                 dd_pme_maxshift_x(dd),
3294                                 dd_pme_maxshift_y(dd));
3295     }
3296
3297     if (dd->atomSets != nullptr)
3298     {
3299         /* Update the local atom sets */
3300         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3301     }
3302
3303     // The pull group construction can need the atom sets updated above
3304     if (ir->bPull)
3305     {
3306         /* Update the local pull groups */
3307         dd_make_local_pull_groups(cr, pull_work);
3308     }
3309
3310     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3311     imdSession->dd_make_local_IMD_atoms(dd);
3312
3313     add_dd_statistics(dd);
3314
3315     /* Make sure we only count the cycles for this DD partitioning */
3316     clear_dd_cycle_counts(dd);
3317
3318     /* Because the order of the atoms might have changed since
3319      * the last vsite construction, we need to communicate the constructing
3320      * atom coordinates again (for spreading the forces this MD step).
3321      */
3322     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3323
3324     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3325
3326     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3327     {
3328         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3329         write_dd_pdb("dd_dump",
3330                      step,
3331                      "dump",
3332                      &top_global,
3333                      cr,
3334                      -1,
3335                      state_local->x.rvec_array(),
3336                      state_local->box);
3337     }
3338
3339     /* Store the partitioning step */
3340     comm->partition_step = step;
3341
3342     /* Increase the DD partitioning counter */
3343     dd->ddp_count++;
3344     /* The state currently matches this DD partitioning count, store it */
3345     state_local->ddp_count = dd->ddp_count;
3346     if (bMasterState)
3347     {
3348         /* The DD master node knows the complete cg distribution,
3349          * store the count so we can possibly skip the cg info communication.
3350          */
3351         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3352     }
3353
3354     if (comm->ddSettings.DD_debug > 0)
3355     {
3356         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3357         check_index_consistency(dd, top_global.natoms, "after partitioning");
3358     }
3359
3360     wallcycle_stop(wcycle, ewcDOMDEC);
3361 }
3362
3363 /*! \brief Check whether bonded interactions are missing, if appropriate */
3364 void checkNumberOfBondedInteractions(const gmx::MDLogger&           mdlog,
3365                                      t_commrec*                     cr,
3366                                      int                            totalNumberOfBondedInteractions,
3367                                      const gmx_mtop_t*              top_global,
3368                                      const gmx_localtop_t*          top_local,
3369                                      gmx::ArrayRef<const gmx::RVec> x,
3370                                      const matrix                   box,
3371                                      bool* shouldCheckNumberOfBondedInteractions)
3372 {
3373     if (*shouldCheckNumberOfBondedInteractions)
3374     {
3375         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3376         {
3377             dd_print_missing_interactions(
3378                     mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, x, box); // Does not return
3379         }
3380         *shouldCheckNumberOfBondedInteractions = false;
3381     }
3382 }