eccf1475ee33ba39f17a0a8f3292868a103b21d0
[alexxy/gromacs.git] / src / gromacs / domdec / partition.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief This file defines functions for mdrun to call to make a new
38  * domain decomposition, and check it.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \ingroup module_domdec
42  */
43
44 #include "gmxpre.h"
45
46 #include "gromacs/utility/arrayref.h"
47 #include "partition.h"
48
49 #include "config.h"
50
51 #include <cassert>
52 #include <cstdio>
53
54 #include <algorithm>
55
56 #include "gromacs/domdec/collect.h"
57 #include "gromacs/domdec/dlb.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/localtopology.h"
63 #include "gromacs/domdec/localtopologychecker.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/nsgrid.h"
66 #include "gromacs/ewald/pme_pp.h"
67 #include "gromacs/gmxlib/network.h"
68 #include "gromacs/gmxlib/nrnb.h"
69 #include "gromacs/imd/imd.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/mdlib/forcerec.h"
73 #include "gromacs/mdlib/gmx_omp_nthreads.h"
74 #include "gromacs/mdlib/mdatoms.h"
75 #include "gromacs/mdlib/vsite.h"
76 #include "gromacs/mdtypes/commrec.h"
77 #include "gromacs/mdtypes/forcerec.h"
78 #include "gromacs/mdtypes/inputrec.h"
79 #include "gromacs/mdtypes/md_enums.h"
80 #include "gromacs/mdtypes/mdatom.h"
81 #include "gromacs/mdtypes/nblist.h"
82 #include "gromacs/mdtypes/state.h"
83 #include "gromacs/nbnxm/nbnxm.h"
84 #include "gromacs/pulling/pull.h"
85 #include "gromacs/timing/wallcycle.h"
86 #include "gromacs/topology/mtop_util.h"
87 #include "gromacs/topology/topology.h"
88 #include "gromacs/utility/cstringutil.h"
89 #include "gromacs/utility/fatalerror.h"
90 #include "gromacs/utility/logger.h"
91 #include "gromacs/utility/real.h"
92 #include "gromacs/utility/smalloc.h"
93 #include "gromacs/utility/strconvert.h"
94 #include "gromacs/utility/stringstream.h"
95 #include "gromacs/utility/stringutil.h"
96 #include "gromacs/utility/textwriter.h"
97
98 #include "box.h"
99 #include "cellsizes.h"
100 #include "distribute.h"
101 #include "domdec_constraints.h"
102 #include "domdec_internal.h"
103 #include "domdec_vsite.h"
104 #include "dump.h"
105 #include "redistribute.h"
106 #include "utility.h"
107
108 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
109  *
110  * There is a bit of overhead with DLB and it's difficult to achieve
111  * a load imbalance of less than 2% with DLB.
112  */
113 #define DD_PERF_LOSS_DLB_ON 0.02
114
115 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
116 #define DD_PERF_LOSS_WARN 0.05
117
118
119 //! Debug helper printing a DD zone
120 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
121 {
122     fprintf(fp,
123             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
124             "%6.3f\n",
125             d,
126             i,
127             j,
128             zone->min0,
129             zone->max1,
130             zone->mch0,
131             zone->mch0,
132             zone->p1_0,
133             zone->p1_1);
134 }
135
136 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
137  * takes the extremes over all home and remote zones in the halo
138  * and returns the results in cell_ns_x0 and cell_ns_x1.
139  * Note: only used with the group cut-off scheme.
140  */
141 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
142 {
143     constexpr int      c_ddZoneCommMaxNumZones = 5;
144     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
145     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
146     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
147     gmx_domdec_comm_t* comm = dd->comm;
148
149     rvec extr_s[2];
150     rvec extr_r[2];
151     for (int d = 1; d < dd->ndim; d++)
152     {
153         int           dim = dd->dim[d];
154         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
155
156         /* Copy the base sizes of the home zone */
157         zp.min0    = cell_ns_x0[dim];
158         zp.max1    = cell_ns_x1[dim];
159         zp.min1    = cell_ns_x1[dim];
160         zp.mch0    = cell_ns_x0[dim];
161         zp.mch1    = cell_ns_x1[dim];
162         zp.p1_0    = cell_ns_x0[dim];
163         zp.p1_1    = cell_ns_x1[dim];
164         zp.dataSet = 1;
165     }
166
167     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
168
169     /* Loop backward over the dimensions and aggregate the extremes
170      * of the cell sizes.
171      */
172     for (int d = dd->ndim - 2; d >= 0; d--)
173     {
174         const int  dim      = dd->dim[d];
175         const bool applyPbc = (dim < ddbox->npbcdim);
176
177         /* Use an rvec to store two reals */
178         extr_s[d][0] = cellsizes[d + 1].fracLower;
179         extr_s[d][1] = cellsizes[d + 1].fracUpper;
180         extr_s[d][2] = cellsizes[d + 1].fracUpper;
181
182         int pos = 0;
183         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
184         /* Store the extremes in the backward sending buffer,
185          * so they get updated separately from the forward communication.
186          */
187         for (int d1 = d; d1 < dd->ndim - 1; d1++)
188         {
189             gmx_ddzone_t& buf = buf_s[pos];
190
191             /* We invert the order to be able to use the same loop for buf_e */
192             buf.min0 = extr_s[d1][1];
193             buf.max1 = extr_s[d1][0];
194             buf.min1 = extr_s[d1][2];
195             buf.mch0 = 0;
196             buf.mch1 = 0;
197             /* Store the cell corner of the dimension we communicate along */
198             buf.p1_0    = comm->cell_x0[dim];
199             buf.p1_1    = 0;
200             buf.dataSet = 1;
201             pos++;
202         }
203
204         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
205         pos++;
206
207         if (dd->ndim == 3 && d == 0)
208         {
209             buf_s[pos] = comm->zone_d2[0][1];
210             pos++;
211             buf_s[pos] = comm->zone_d1[0];
212             pos++;
213         }
214
215         /* We only need to communicate the extremes
216          * in the forward direction
217          */
218         int numPulses = comm->cd[d].numPulses();
219         int numPulsesMin;
220         if (applyPbc)
221         {
222             /* Take the minimum to avoid double communication */
223             numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
224         }
225         else
226         {
227             /* Without PBC we should really not communicate over
228              * the boundaries, but implementing that complicates
229              * the communication setup and therefore we simply
230              * do all communication, but ignore some data.
231              */
232             numPulsesMin = numPulses;
233         }
234         for (int pulse = 0; pulse < numPulsesMin; pulse++)
235         {
236             /* Communicate the extremes forward */
237             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
238
239             int numElements = dd->ndim - d - 1;
240             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
241
242             if (receiveValidData)
243             {
244                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
245                 {
246                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
247                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
248                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
249                 }
250             }
251         }
252
253         const int numElementsInBuffer = pos;
254         for (int pulse = 0; pulse < numPulses; pulse++)
255         {
256             /* Communicate all the zone information backward */
257             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
258
259             static_assert(
260                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
261                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
262
263             int numReals = numElementsInBuffer * c_ddzoneNumReals;
264             ddSendrecv(dd,
265                        d,
266                        dddirBackward,
267                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
268                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
269
270             rvec dh = { 0 };
271             if (pulse > 0)
272             {
273                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
274                 {
275                     /* Determine the decrease of maximum required
276                      * communication height along d1 due to the distance along d,
277                      * this avoids a lot of useless atom communication.
278                      */
279                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
280
281                     real c;
282                     if (ddbox->tric_dir[dim])
283                     {
284                         /* c is the off-diagonal coupling between the cell planes
285                          * along directions d and d1.
286                          */
287                         c = ddbox->v[dim][dd->dim[d1]][dim];
288                     }
289                     else
290                     {
291                         c = 0;
292                     }
293                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
294                     if (det > 0)
295                     {
296                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
297                     }
298                     else
299                     {
300                         /* A negative value signals out of range */
301                         dh[d1] = -1;
302                     }
303                 }
304             }
305
306             /* Accumulate the extremes over all pulses */
307             for (int i = 0; i < numElementsInBuffer; i++)
308             {
309                 if (pulse == 0)
310                 {
311                     buf_e[i] = buf_r[i];
312                 }
313                 else
314                 {
315                     if (receiveValidData)
316                     {
317                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
318                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
319                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
320                     }
321
322                     int d1;
323                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
324                     {
325                         d1 = 1;
326                     }
327                     else
328                     {
329                         d1 = d + 1;
330                     }
331                     if (receiveValidData && dh[d1] >= 0)
332                     {
333                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
334                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
335                     }
336                 }
337                 /* Copy the received buffer to the send buffer,
338                  * to pass the data through with the next pulse.
339                  */
340                 buf_s[i] = buf_r[i];
341             }
342             if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
343                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
344             {
345                 /* Store the extremes */
346                 int pos = 0;
347
348                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
349                 {
350                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
351                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
352                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
353                     pos++;
354                 }
355
356                 if (d == 1 || (d == 0 && dd->ndim == 3))
357                 {
358                     for (int i = d; i < 2; i++)
359                     {
360                         comm->zone_d2[1 - d][i] = buf_e[pos];
361                         pos++;
362                     }
363                 }
364                 if (d == 0)
365                 {
366                     comm->zone_d1[1] = buf_e[pos];
367                     pos++;
368                 }
369             }
370             else
371             {
372                 if (d == 1 || (d == 0 && dd->ndim == 3))
373                 {
374                     for (int i = d; i < 2; i++)
375                     {
376                         comm->zone_d2[1 - d][i].dataSet = 0;
377                     }
378                 }
379                 if (d == 0)
380                 {
381                     comm->zone_d1[1].dataSet = 0;
382                 }
383             }
384         }
385     }
386
387     if (dd->ndim >= 2)
388     {
389         int dim = dd->dim[1];
390         for (int i = 0; i < 2; i++)
391         {
392             if (comm->zone_d1[i].dataSet != 0)
393             {
394                 if (debug)
395                 {
396                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
397                 }
398                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
399                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
400             }
401         }
402     }
403     if (dd->ndim >= 3)
404     {
405         int dim = dd->dim[2];
406         for (int i = 0; i < 2; i++)
407         {
408             for (int j = 0; j < 2; j++)
409             {
410                 if (comm->zone_d2[i][j].dataSet != 0)
411                 {
412                     if (debug)
413                     {
414                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
415                     }
416                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
417                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
418                 }
419             }
420         }
421     }
422     for (int d = 1; d < dd->ndim; d++)
423     {
424         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
425         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
426         if (debug)
427         {
428             fprintf(debug,
429                     "Cell fraction d %d, max0 %f, min1 %f\n",
430                     d,
431                     cellsizes[d].fracLowerMax,
432                     cellsizes[d].fracUpperMin);
433         }
434     }
435 }
436
437 //! Sets the charge-group zones to be equal to the home zone.
438 static void set_zones_numHomeAtoms(gmx_domdec_t* dd)
439 {
440     gmx_domdec_zones_t* zones;
441     int                 i;
442
443     zones = &dd->comm->zones;
444
445     zones->cg_range[0] = 0;
446     for (i = 1; i < zones->n + 1; i++)
447     {
448         zones->cg_range[i] = dd->numHomeAtoms;
449     }
450     /* zone_ncg1[0] should always be equal to numHomeAtoms */
451     dd->comm->zone_ncg1[0] = dd->numHomeAtoms;
452 }
453
454 //! Restore atom groups for the charge groups.
455 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
456 {
457     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
458
459     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
460
461     globalAtomGroupIndices.resize(atomGroupsState.size());
462
463     /* Copy back the global charge group indices from state
464      * and rebuild the local charge group to atom index.
465      */
466     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
467     {
468         globalAtomGroupIndices[i] = atomGroupsState[i];
469     }
470
471     dd->numHomeAtoms = atomGroupsState.size();
472     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
473
474     set_zones_numHomeAtoms(dd);
475 }
476
477 //! Sets the atom info structures.
478 static void dd_set_atominfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
479 {
480     if (fr != nullptr)
481     {
482         gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock =
483                 fr->atomInfoForEachMoleculeBlock;
484         gmx::ArrayRef<int64_t> atomInfo = fr->atomInfo;
485
486         for (int cg = cg0; cg < cg1; cg++)
487         {
488             atomInfo[cg] = ddGetAtomInfo(atomInfoForEachMoleculeBlock, index_gl[cg]);
489         }
490     }
491 }
492
493 //! Makes the mappings between global and local atom indices during DD repartioning.
494 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
495 {
496     const int                numZones               = dd->comm->zones.n;
497     gmx::ArrayRef<const int> zone2cg                = dd->comm->zones.cg_range;
498     gmx::ArrayRef<const int> zone_ncg1              = dd->comm->zone_ncg1;
499     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
500
501     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
502     gmx_ga2la_t&      ga2la             = *dd->ga2la;
503
504     if (zone2cg[1] != dd->numHomeAtoms)
505     {
506         gmx_incons("dd->ncg_zone is not up to date");
507     }
508
509     /* Make the local to global and global to local atom index */
510     int a = atomStart;
511     globalAtomIndices.resize(a);
512     for (int zone = 0; zone < numZones; zone++)
513     {
514         int cg0;
515         if (zone == 0)
516         {
517             cg0 = atomStart;
518         }
519         else
520         {
521             cg0 = zone2cg[zone];
522         }
523         int cg1    = zone2cg[zone + 1];
524         int cg1_p1 = cg0 + zone_ncg1[zone];
525
526         for (int cg = cg0; cg < cg1; cg++)
527         {
528             int zone1 = zone;
529             if (cg >= cg1_p1)
530             {
531                 /* Signal that this cg is from more than one pulse away */
532                 zone1 += numZones;
533             }
534             int cg_gl = globalAtomGroupIndices[cg];
535             globalAtomIndices.push_back(cg_gl);
536             ga2la.insert(cg_gl, { a, zone1 });
537             a++;
538         }
539     }
540 }
541
542 //! Checks whether global and local atom indices are consistent.
543 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
544 {
545     int nerr = 0;
546
547     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
548
549     if (dd->comm->ddSettings.DD_debug > 1)
550     {
551         std::vector<int> have(natoms_sys);
552         for (int a = 0; a < numAtomsInZones; a++)
553         {
554             int globalAtomIndex = dd->globalAtomIndices[a];
555             if (have[globalAtomIndex] > 0)
556             {
557                 fprintf(stderr,
558                         "DD rank %d: global atom %d occurs twice: index %d and %d\n",
559                         dd->rank,
560                         globalAtomIndex + 1,
561                         have[globalAtomIndex],
562                         a + 1);
563             }
564             else
565             {
566                 have[globalAtomIndex] = a + 1;
567             }
568         }
569     }
570
571     std::vector<int> have(numAtomsInZones);
572
573     int ngl = 0;
574     for (int i = 0; i < natoms_sys; i++)
575     {
576         if (const auto* entry = dd->ga2la->find(i))
577         {
578             const int a = entry->la;
579             if (a >= numAtomsInZones)
580             {
581                 fprintf(stderr,
582                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
583                         "nat_tot (%d)\n",
584                         dd->rank,
585                         i + 1,
586                         a + 1,
587                         numAtomsInZones);
588                 nerr++;
589             }
590             else
591             {
592                 have[a] = 1;
593                 if (dd->globalAtomIndices[a] != i)
594                 {
595                     fprintf(stderr,
596                             "DD rank %d: global atom %d marked as local atom %d, which has global "
597                             "atom index %d\n",
598                             dd->rank,
599                             i + 1,
600                             a + 1,
601                             dd->globalAtomIndices[a] + 1);
602                     nerr++;
603                 }
604             }
605             ngl++;
606         }
607     }
608     if (ngl != numAtomsInZones)
609     {
610         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
611     }
612     for (int a = 0; a < numAtomsInZones; a++)
613     {
614         if (have[a] == 0)
615         {
616             fprintf(stderr,
617                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
618                     dd->rank,
619                     where,
620                     a + 1,
621                     dd->globalAtomIndices[a] + 1);
622         }
623     }
624
625     if (nerr > 0)
626     {
627         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
628     }
629 }
630
631 //! Clear all DD global state indices
632 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
633 {
634     gmx_ga2la_t& ga2la = *dd->ga2la;
635
636     if (!keepLocalAtomIndices)
637     {
638         /* Clear the whole list without the overhead of searching */
639         ga2la.clear(true);
640     }
641     else
642     {
643         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
644         for (int i = 0; i < numAtomsInZones; i++)
645         {
646             ga2la.erase(dd->globalAtomIndices[i]);
647         }
648     }
649
650     dd_clear_local_vsite_indices(dd);
651
652     if (dd->constraints)
653     {
654         dd_clear_local_constraint_indices(dd);
655     }
656 }
657
658 //! Return the duration of force calculations on this rank.
659 static float dd_force_load(gmx_domdec_comm_t* comm)
660 {
661     float load;
662
663     if (comm->ddSettings.eFlop)
664     {
665         load = comm->flop;
666         if (comm->ddSettings.eFlop > 1)
667         {
668             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
669         }
670     }
671     else
672     {
673         load = comm->cycl[ddCyclF];
674         if (comm->cycl_n[ddCyclF] > 1)
675         {
676             /* Subtract the maximum of the last n cycle counts
677              * to get rid of possible high counts due to other sources,
678              * for instance system activity, that would otherwise
679              * affect the dynamic load balancing.
680              */
681             load -= comm->cycl_max[ddCyclF];
682         }
683
684 #if GMX_MPI
685         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
686         {
687             float gpu_wait, gpu_wait_sum;
688
689             gpu_wait = comm->cycl[ddCyclWaitGPU];
690             if (comm->cycl_n[ddCyclF] > 1)
691             {
692                 /* We should remove the WaitGPU time of the same MD step
693                  * as the one with the maximum F time, since the F time
694                  * and the wait time are not independent.
695                  * Furthermore, the step for the max F time should be chosen
696                  * the same on all ranks that share the same GPU.
697                  * But to keep the code simple, we remove the average instead.
698                  * The main reason for artificially long times at some steps
699                  * is spurious CPU activity or MPI time, so we don't expect
700                  * that changes in the GPU wait time matter a lot here.
701                  */
702                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
703             }
704             /* Sum the wait times over the ranks that share the same GPU */
705             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
706             /* Replace the wait time by the average over the ranks */
707             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
708         }
709 #endif
710     }
711
712     return load;
713 }
714
715 //! Runs cell size checks and communicates the boundaries.
716 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
717 {
718     gmx_domdec_comm_t* comm;
719     int                dim_ind, dim;
720
721     comm = dd->comm;
722
723     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
724     {
725         dim = dd->dim[dim_ind];
726
727         /* Without PBC we don't have restrictions on the outer cells */
728         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
729             && isDlbOn(comm)
730             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
731         {
732             char buf[22];
733             gmx_fatal(FARGS,
734                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
735                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
736                       "%d %d %d",
737                       gmx_step_str(step, buf),
738                       dim2char(dim),
739                       comm->cell_x1[dim] - comm->cell_x0[dim],
740                       ddbox->skew_fac[dim],
741                       dd->comm->cellsize_min[dim],
742                       dd->ci[XX],
743                       dd->ci[YY],
744                       dd->ci[ZZ]);
745         }
746     }
747
748     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
749     {
750         /* Communicate the boundaries and update cell_ns_x0/1 */
751         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
752         if (isDlbOn(dd->comm) && dd->ndim > 1)
753         {
754             gmx::check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
755         }
756     }
757 }
758
759 //! Compute and communicate to determine the load distribution across PP ranks.
760 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
761 {
762     gmx_domdec_comm_t* comm;
763     domdec_load_t*     load;
764     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
765     gmx_bool           bSepPME;
766
767     if (debug)
768     {
769         fprintf(debug, "get_load_distribution start\n");
770     }
771
772     wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
773
774     comm = dd->comm;
775
776     bSepPME = (dd->pme_nodeid >= 0);
777
778     if (dd->ndim == 0 && bSepPME)
779     {
780         /* Without decomposition, but with PME nodes, we need the load */
781         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
782         comm->load[0].pme = comm->cycl[ddCyclPME];
783     }
784
785     // Either we have DLB off, or we have it on and the array is large enough
786     GMX_ASSERT(!isDlbOn(dd->comm) || static_cast<int>(dd->comm->cellsizesWithDlb.size()) == dd->ndim,
787                "DLB cell sizes data not set up properly ");
788     for (int d = dd->ndim - 1; d >= 0; d--)
789     {
790         const int dim = dd->dim[d];
791         /* Check if we participate in the communication in this dimension */
792         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
793         {
794             load = &comm->load[d];
795             if (isDlbOn(dd->comm))
796             {
797                 cell_frac = comm->cellsizesWithDlb[d].fracUpper - comm->cellsizesWithDlb[d].fracLower;
798             }
799             int pos = 0;
800             if (d == dd->ndim - 1)
801             {
802                 sbuf[pos++] = dd_force_load(comm);
803                 sbuf[pos++] = sbuf[0];
804                 if (isDlbOn(dd->comm))
805                 {
806                     sbuf[pos++] = sbuf[0];
807                     sbuf[pos++] = cell_frac;
808                     if (d > 0)
809                     {
810                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
811                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
812                     }
813                 }
814                 if (bSepPME)
815                 {
816                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
817                     sbuf[pos++] = comm->cycl[ddCyclPME];
818                 }
819             }
820             else
821             {
822                 sbuf[pos++] = comm->load[d + 1].sum;
823                 sbuf[pos++] = comm->load[d + 1].max;
824                 if (isDlbOn(dd->comm))
825                 {
826                     sbuf[pos++] = comm->load[d + 1].sum_m;
827                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
828                     sbuf[pos++] = comm->load[d + 1].flags;
829                     if (d > 0)
830                     {
831                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
832                         sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
833                     }
834                 }
835                 if (bSepPME)
836                 {
837                     sbuf[pos++] = comm->load[d + 1].mdf;
838                     sbuf[pos++] = comm->load[d + 1].pme;
839                 }
840             }
841             load->nload = pos;
842             /* Communicate a row in DD direction d.
843              * The communicators are setup such that the root always has rank 0.
844              */
845 #if GMX_MPI
846             MPI_Gather(sbuf,
847                        load->nload * sizeof(float),
848                        MPI_BYTE,
849                        load->load,
850                        load->nload * sizeof(float),
851                        MPI_BYTE,
852                        0,
853                        comm->mpi_comm_load[d]);
854 #endif
855             if (dd->ci[dim] == dd->master_ci[dim])
856             {
857                 /* We are the master along this row, process this row */
858                 RowMaster* rowMaster = nullptr;
859
860                 if (isDlbOn(comm))
861                 {
862                     rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
863                 }
864                 load->sum      = 0;
865                 load->max      = 0;
866                 load->sum_m    = 0;
867                 load->cvol_min = 1;
868                 load->flags    = 0;
869                 load->mdf      = 0;
870                 load->pme      = 0;
871                 int pos        = 0;
872                 for (int i = 0; i < dd->numCells[dim]; i++)
873                 {
874                     load->sum += load->load[pos++];
875                     load->max = std::max(load->max, load->load[pos]);
876                     pos++;
877                     if (isDlbOn(dd->comm))
878                     {
879                         if (rowMaster->dlbIsLimited)
880                         {
881                             /* This direction could not be load balanced properly,
882                              * therefore we need to use the maximum iso the average load.
883                              */
884                             load->sum_m = std::max(load->sum_m, load->load[pos]);
885                         }
886                         else
887                         {
888                             load->sum_m += load->load[pos];
889                         }
890                         pos++;
891                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
892                         pos++;
893                         if (d < dd->ndim - 1)
894                         {
895                             load->flags = gmx::roundToInt(load->load[pos++]);
896                         }
897                         if (d > 0)
898                         {
899                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
900                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
901                         }
902                     }
903                     if (bSepPME)
904                     {
905                         load->mdf = std::max(load->mdf, load->load[pos]);
906                         pos++;
907                         load->pme = std::max(load->pme, load->load[pos]);
908                         pos++;
909                     }
910                 }
911                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
912                 {
913                     load->sum_m *= dd->numCells[dim];
914                     load->flags |= (1 << d);
915                 }
916             }
917         }
918     }
919
920     if (DDMASTER(dd))
921     {
922         comm->nload += dd_load_count(comm);
923         comm->load_step += comm->cycl[ddCyclStep];
924         comm->load_sum += comm->load[0].sum;
925         comm->load_max += comm->load[0].max;
926         if (isDlbOn(comm))
927         {
928             for (int d = 0; d < dd->ndim; d++)
929             {
930                 if (comm->load[0].flags & (1 << d))
931                 {
932                     comm->load_lim[d]++;
933                 }
934             }
935         }
936         if (bSepPME)
937         {
938             comm->load_mdf += comm->load[0].mdf;
939             comm->load_pme += comm->load[0].pme;
940         }
941     }
942
943     wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
944
945     if (debug)
946     {
947         fprintf(debug, "get_load_distribution finished\n");
948     }
949 }
950
951 /*! \brief Return the relative performance loss on the total run time
952  * due to the force calculation load imbalance. */
953 static float dd_force_load_fraction(gmx_domdec_t* dd)
954 {
955     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
956     {
957         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
958     }
959     else
960     {
961         return 0;
962     }
963 }
964
965 /*! \brief Return the relative performance loss on the total run time
966  * due to the force calculation load imbalance. */
967 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
968 {
969     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
970     {
971         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
972     }
973     else
974     {
975         return 0;
976     }
977 }
978
979 //! Print load-balance report e.g. at the end of a run.
980 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
981 {
982     gmx_domdec_comm_t* comm = dd->comm;
983
984     /* Only the master rank prints loads and only if we measured loads */
985     if (!DDMASTER(dd) || comm->nload == 0)
986     {
987         return;
988     }
989
990     char buf[STRLEN];
991     int  numPpRanks  = dd->nnodes;
992     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
993     int  numRanks    = numPpRanks + numPmeRanks;
994     float lossFraction = 0;
995
996     /* Print the average load imbalance and performance loss */
997     if (dd->nnodes > 1 && comm->load_sum > 0)
998     {
999         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
1000         lossFraction    = dd_force_imb_perf_loss(dd);
1001
1002         std::string msg = "\nDynamic load balancing report:\n";
1003         std::string dlbStateStr;
1004
1005         switch (dd->comm->dlbState)
1006         {
1007             case DlbState::offUser:
1008                 dlbStateStr = "DLB was off during the run per user request.";
1009                 break;
1010             case DlbState::offForever:
1011                 /* Currectly this can happen due to performance loss observed, cell size
1012                  * limitations or incompatibility with other settings observed during
1013                  * determineInitialDlbState(). */
1014                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1015                 break;
1016             case DlbState::offCanTurnOn:
1017                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1018                 break;
1019             case DlbState::offTemporarilyLocked:
1020                 dlbStateStr =
1021                         "DLB was locked at the end of the run due to unfinished PP-PME "
1022                         "balancing.";
1023                 break;
1024             case DlbState::onCanTurnOff:
1025                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1026                 break;
1027             case DlbState::onUser:
1028                 dlbStateStr = "DLB was permanently on during the run per user request.";
1029                 break;
1030             default: GMX_ASSERT(false, "Undocumented DLB state");
1031         }
1032
1033         msg += " " + dlbStateStr + "\n";
1034         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1035         msg += gmx::formatString(
1036                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1037                 "this.\n",
1038                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1039         msg += gmx::formatString(
1040                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1041                 lossFraction * 100);
1042         fprintf(fplog, "%s", msg.c_str());
1043         fprintf(stderr, "\n%s", msg.c_str());
1044     }
1045
1046     /* Print during what percentage of steps the  load balancing was limited */
1047     bool dlbWasLimited = false;
1048     if (isDlbOn(comm))
1049     {
1050         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1051         for (int d = 0; d < dd->ndim; d++)
1052         {
1053             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1054             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1055             if (limitPercentage >= 50)
1056             {
1057                 dlbWasLimited = true;
1058             }
1059         }
1060         sprintf(buf + strlen(buf), "\n");
1061         fprintf(fplog, "%s", buf);
1062         fprintf(stderr, "%s", buf);
1063     }
1064
1065     /* Print the performance loss due to separate PME - PP rank imbalance */
1066     float lossFractionPme = 0;
1067     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1068     {
1069         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1070         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1071         if (lossFractionPme <= 0)
1072         {
1073             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1074         }
1075         else
1076         {
1077             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1078         }
1079         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1080         fprintf(fplog, "%s", buf);
1081         fprintf(stderr, "%s", buf);
1082         sprintf(buf,
1083                 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1084                 std::fabs(lossFractionPme) * 100);
1085         fprintf(fplog, "%s", buf);
1086         fprintf(stderr, "%s", buf);
1087     }
1088     fprintf(fplog, "\n");
1089     fprintf(stderr, "\n");
1090
1091     if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1092     {
1093         std::string message = gmx::formatString(
1094                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1095                 "      in the domain decomposition.\n",
1096                 lossFraction * 100);
1097
1098         bool hadSuggestion = false;
1099         if (dd->comm->dlbState == DlbState::offUser)
1100         {
1101             message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1102             hadSuggestion = true;
1103         }
1104         else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1105         {
1106             message +=
1107                     "      Dynamic load balancing was automatically disabled, but it might be "
1108                     "beneficial to manually tuning it on (option -dlb on.)\n";
1109             hadSuggestion = true;
1110         }
1111         else if (dlbWasLimited)
1112         {
1113             message +=
1114                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1115                     "and/or -dds).\n";
1116             hadSuggestion = true;
1117         }
1118         message += gmx::formatString(
1119                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1120                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1121                 "      considerable inhomogeneity in the simulated system.",
1122                 hadSuggestion ? "also " : "");
1123
1124         fprintf(fplog, "%s\n", message.c_str());
1125         fprintf(stderr, "%s\n", message.c_str());
1126     }
1127     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1128     {
1129         sprintf(buf,
1130                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1131                 "      had %s work to do than the PP ranks.\n"
1132                 "      You might want to %s the number of PME ranks\n"
1133                 "      or %s the cut-off and the grid spacing.\n",
1134                 std::fabs(lossFractionPme * 100),
1135                 (lossFractionPme < 0) ? "less" : "more",
1136                 (lossFractionPme < 0) ? "decrease" : "increase",
1137                 (lossFractionPme < 0) ? "decrease" : "increase");
1138         fprintf(fplog, "%s\n", buf);
1139         fprintf(stderr, "%s\n", buf);
1140     }
1141 }
1142
1143 //! Return the minimum communication volume.
1144 static float dd_vol_min(gmx_domdec_t* dd)
1145 {
1146     return dd->comm->load[0].cvol_min * dd->nnodes;
1147 }
1148
1149 //! Return the DD load flags.
1150 static int dd_load_flags(gmx_domdec_t* dd)
1151 {
1152     return dd->comm->load[0].flags;
1153 }
1154
1155 //! Return the reported load imbalance in force calculations.
1156 static float dd_f_imbal(gmx_domdec_t* dd)
1157 {
1158     if (dd->comm->load[0].sum > 0)
1159     {
1160         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1161     }
1162     else
1163     {
1164         /* Something is wrong in the cycle counting, report no load imbalance */
1165         return 0.0F;
1166     }
1167 }
1168
1169 //! Returns DD load balance report.
1170 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1171 {
1172     gmx::StringOutputStream stream;
1173     gmx::TextWriter         log(&stream);
1174
1175     int flags = dd_load_flags(dd);
1176     if (flags)
1177     {
1178         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1179         for (int d = 0; d < dd->ndim; d++)
1180         {
1181             if (flags & (1 << d))
1182             {
1183                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1184             }
1185         }
1186         log.ensureLineBreak();
1187     }
1188     log.writeString("DD  step " + gmx::toString(step));
1189     if (isDlbOn(dd->comm))
1190     {
1191         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1192     }
1193     if (dd->nnodes > 1)
1194     {
1195         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1196     }
1197     if (dd->comm->cycl_n[ddCyclPME])
1198     {
1199         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1200     }
1201     log.ensureLineBreak();
1202     return stream.toString();
1203 }
1204
1205 //! Prints DD load balance report in mdrun verbose mode.
1206 static void dd_print_load_verbose(gmx_domdec_t* dd)
1207 {
1208     if (isDlbOn(dd->comm))
1209     {
1210         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1211     }
1212     if (dd->nnodes > 1)
1213     {
1214         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1215     }
1216     if (dd->comm->cycl_n[ddCyclPME])
1217     {
1218         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1219     }
1220 }
1221
1222 //! Turns on dynamic load balancing if possible and needed.
1223 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1224 {
1225     gmx_domdec_comm_t* comm = dd->comm;
1226
1227     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1228     for (int d = 1; d < dd->ndim; d++)
1229     {
1230         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1231     }
1232
1233     /* Turn off DLB if we're too close to the cell size limit. */
1234     if (cellsize_min < comm->cellsize_limit * 1.05)
1235     {
1236         GMX_LOG(mdlog.info)
1237                 .appendTextFormatted(
1238                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1239                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1240                         "Will no longer try dynamic load balancing.",
1241                         gmx::toString(step).c_str(),
1242                         dd_force_imb_perf_loss(dd) * 100);
1243
1244         comm->dlbState = DlbState::offForever;
1245         return;
1246     }
1247
1248     GMX_LOG(mdlog.info)
1249             .appendTextFormatted(
1250                     "step %s Turning on dynamic load balancing, because the performance loss due "
1251                     "to load imbalance is %.1f %%.",
1252                     gmx::toString(step).c_str(),
1253                     dd_force_imb_perf_loss(dd) * 100);
1254     comm->dlbState = DlbState::onCanTurnOff;
1255
1256     /* Store the non-DLB performance, so we can check if DLB actually
1257      * improves performance.
1258      */
1259     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1260                        "When we turned on DLB, we should have measured cycles");
1261     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1262
1263     set_dlb_limits(dd);
1264
1265     /* We can set the required cell size info here,
1266      * so we do not need to communicate this.
1267      * The grid is completely uniform.
1268      */
1269     for (int d = 0; d < dd->ndim; d++)
1270     {
1271         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1272
1273         if (rowMaster)
1274         {
1275             comm->load[d].sum_m = comm->load[d].sum;
1276
1277             int nc = dd->numCells[dd->dim[d]];
1278             for (int i = 0; i < nc; i++)
1279             {
1280                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1281                 if (d > 0)
1282                 {
1283                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1284                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1285                 }
1286             }
1287             rowMaster->cellFrac[nc] = 1.0;
1288         }
1289     }
1290 }
1291
1292 //! Turns off dynamic load balancing (but leave it able to turn back on).
1293 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1294 {
1295     GMX_LOG(mdlog.info)
1296             .appendText(
1297                     "step " + gmx::toString(step)
1298                     + " Turning off dynamic load balancing, because it is degrading performance.");
1299     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1300     dd->comm->haveTurnedOffDlb             = true;
1301     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1302 }
1303
1304 //! Turns off dynamic load balancing permanently.
1305 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1306 {
1307     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1308                        "Can only turn off DLB forever when it was in the can-turn-on state");
1309     GMX_LOG(mdlog.info)
1310             .appendText(
1311                     "step " + gmx::toString(step)
1312                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1313     dd->comm->dlbState = DlbState::offForever;
1314 }
1315
1316 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1317 {
1318     gmx_domdec_comm_t* comm;
1319
1320     comm = cr->dd->comm;
1321
1322     /* Turn on the DLB limiting (might have been on already) */
1323     comm->bPMELoadBalDLBLimits = TRUE;
1324
1325     /* Change the cut-off limit */
1326     comm->PMELoadBal_max_cutoff = cutoff;
1327
1328     if (debug)
1329     {
1330         fprintf(debug,
1331                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1332                 "continue to fit\n",
1333                 comm->PMELoadBal_max_cutoff);
1334     }
1335 }
1336
1337 //! Merge atom buffers.
1338 static void merge_cg_buffers(int                                             ncell,
1339                              gmx_domdec_comm_dim_t*                          cd,
1340                              int                                             pulse,
1341                              int*                                            ncg_cell,
1342                              gmx::ArrayRef<int>                              index_gl,
1343                              const int*                                      recv_i,
1344                              gmx::ArrayRef<gmx::RVec>                        x,
1345                              gmx::ArrayRef<const gmx::RVec>                  recv_vr,
1346                              gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock,
1347                              gmx::ArrayRef<int64_t>                          atomInfo)
1348 {
1349     gmx_domdec_ind_t *ind, *ind_p;
1350     int               p, cell, c, cg, cg0, cg1, cg_gl;
1351     int               shift;
1352
1353     ind = &cd->ind[pulse];
1354
1355     /* First correct the already stored data */
1356     shift = ind->nrecv[ncell];
1357     for (cell = ncell - 1; cell >= 0; cell--)
1358     {
1359         shift -= ind->nrecv[cell];
1360         if (shift > 0)
1361         {
1362             /* Move the cg's present from previous grid pulses */
1363             cg0 = ncg_cell[ncell + cell];
1364             cg1 = ncg_cell[ncell + cell + 1];
1365             for (cg = cg1 - 1; cg >= cg0; cg--)
1366             {
1367                 index_gl[cg + shift] = index_gl[cg];
1368                 x[cg + shift]        = x[cg];
1369                 atomInfo[cg + shift] = atomInfo[cg];
1370             }
1371             /* Correct the already stored send indices for the shift */
1372             for (p = 1; p <= pulse; p++)
1373             {
1374                 ind_p = &cd->ind[p];
1375                 cg0   = 0;
1376                 for (c = 0; c < cell; c++)
1377                 {
1378                     cg0 += ind_p->nsend[c];
1379                 }
1380                 cg1 = cg0 + ind_p->nsend[cell];
1381                 for (cg = cg0; cg < cg1; cg++)
1382                 {
1383                     ind_p->index[cg] += shift;
1384                 }
1385             }
1386         }
1387     }
1388
1389     /* Merge in the communicated buffers */
1390     shift = 0;
1391     cg0   = 0;
1392     for (cell = 0; cell < ncell; cell++)
1393     {
1394         cg1 = ncg_cell[ncell + cell + 1] + shift;
1395         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1396         {
1397             /* Copy this atom from the buffer */
1398             index_gl[cg1] = recv_i[cg0];
1399             x[cg1]        = recv_vr[cg0];
1400             /* Copy information */
1401             cg_gl         = index_gl[cg1];
1402             atomInfo[cg1] = ddGetAtomInfo(atomInfoForEachMoleculeBlock, cg_gl);
1403             cg0++;
1404             cg1++;
1405         }
1406         shift += ind->nrecv[cell];
1407         ncg_cell[ncell + cell + 1] = cg1;
1408     }
1409 }
1410
1411 //! Makes a range partitioning for the atom groups wthin a cell
1412 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1413 {
1414     /* Store the atom block boundaries for easy copying of communication buffers
1415      */
1416     int g = atomGroupStart;
1417     for (int zone = 0; zone < nzone; zone++)
1418     {
1419         for (gmx_domdec_ind_t& ind : cd->ind)
1420         {
1421             ind.cell2at0[zone] = g;
1422             g += ind.nrecv[zone];
1423             ind.cell2at1[zone] = g;
1424         }
1425     }
1426 }
1427
1428 //! Returns whether a link is missing.
1429 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1430 {
1431     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1432     {
1433         if (!ga2la.findHome(link.a[i]))
1434         {
1435             return true;
1436         }
1437     }
1438
1439     return false;
1440 }
1441
1442 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1443 typedef struct
1444 {
1445     //! The corners for the non-bonded communication.
1446     real c[DIM][4];
1447     //! Corner for rounding.
1448     real cr0;
1449     //! Corners for rounding.
1450     real cr1[4];
1451     //! Corners for bounded communication.
1452     real bc[DIM];
1453     //! Corner for rounding for bonded communication.
1454     real bcr1;
1455 } dd_corners_t;
1456
1457 //! Determine the corners of the domain(s) we are communicating with.
1458 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1459 {
1460     const gmx_domdec_comm_t*  comm;
1461     const gmx_domdec_zones_t* zones;
1462
1463     comm = dd->comm;
1464
1465     zones = &comm->zones;
1466
1467     /* Keep the compiler happy */
1468     c->cr0  = 0;
1469     c->bcr1 = 0;
1470
1471     /* The first dimension is equal for all cells */
1472     c->c[0][0] = comm->cell_x0[dim0];
1473     if (bDistMB)
1474     {
1475         c->bc[0] = c->c[0][0];
1476     }
1477     if (dd->ndim >= 2)
1478     {
1479         dim1 = dd->dim[1];
1480         /* This cell row is only seen from the first row */
1481         c->c[1][0] = comm->cell_x0[dim1];
1482         /* All rows can see this row */
1483         c->c[1][1] = comm->cell_x0[dim1];
1484         if (isDlbOn(dd->comm))
1485         {
1486             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1487             if (bDistMB)
1488             {
1489                 /* For the multi-body distance we need the maximum */
1490                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1491             }
1492         }
1493         /* Set the upper-right corner for rounding */
1494         c->cr0 = comm->cell_x1[dim0];
1495
1496         if (dd->ndim >= 3)
1497         {
1498             dim2 = dd->dim[2];
1499             for (int j = 0; j < 4; j++)
1500             {
1501                 c->c[2][j] = comm->cell_x0[dim2];
1502             }
1503             if (isDlbOn(dd->comm))
1504             {
1505                 /* Use the maximum of the i-cells that see a j-cell */
1506                 for (const auto& iZone : zones->iZones)
1507                 {
1508                     const int iZoneIndex = iZone.iZoneIndex;
1509                     for (int jZone : iZone.jZoneRange)
1510                     {
1511                         if (jZone >= 4)
1512                         {
1513                             c->c[2][jZone - 4] = std::max(
1514                                     c->c[2][jZone - 4],
1515                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1516                                             .mch0);
1517                         }
1518                     }
1519                 }
1520                 if (bDistMB)
1521                 {
1522                     /* For the multi-body distance we need the maximum */
1523                     c->bc[2] = comm->cell_x0[dim2];
1524                     for (int i = 0; i < 2; i++)
1525                     {
1526                         for (int j = 0; j < 2; j++)
1527                         {
1528                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1529                         }
1530                     }
1531                 }
1532             }
1533
1534             /* Set the upper-right corner for rounding */
1535             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1536              * Only cell (0,0,0) can see cell 7 (1,1,1)
1537              */
1538             c->cr1[0] = comm->cell_x1[dim1];
1539             c->cr1[3] = comm->cell_x1[dim1];
1540             if (isDlbOn(dd->comm))
1541             {
1542                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1543                 if (bDistMB)
1544                 {
1545                     /* For the multi-body distance we need the maximum */
1546                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1547                 }
1548             }
1549         }
1550     }
1551 }
1552
1553 /*! \brief Add the atom groups and coordinates we need to send in this
1554  * pulse from this zone to \p localAtomGroups and \p work. */
1555 static void get_zone_pulse_groups(gmx_domdec_t*                  dd,
1556                                   int                            zonei,
1557                                   int                            zone,
1558                                   int                            cg0,
1559                                   int                            cg1,
1560                                   gmx::ArrayRef<const int>       globalAtomGroupIndices,
1561                                   int                            dim,
1562                                   int                            dim_ind,
1563                                   int                            dim0,
1564                                   int                            dim1,
1565                                   int                            dim2,
1566                                   real                           r_comm2,
1567                                   real                           r_bcomm2,
1568                                   matrix                         box,
1569                                   bool                           distanceIsTriclinic,
1570                                   rvec*                          normal,
1571                                   real                           skew_fac2_d,
1572                                   real                           skew_fac_01,
1573                                   rvec*                          v_d,
1574                                   rvec*                          v_0,
1575                                   rvec*                          v_1,
1576                                   const dd_corners_t*            c,
1577                                   const rvec                     sf2_round,
1578                                   gmx_bool                       bDistBonded,
1579                                   gmx_bool                       bBondComm,
1580                                   gmx_bool                       bDist2B,
1581                                   gmx_bool                       bDistMB,
1582                                   gmx::ArrayRef<const gmx::RVec> coordinates,
1583                                   gmx::ArrayRef<const int64_t>   atomInfo,
1584                                   std::vector<int>*              localAtomGroups,
1585                                   dd_comm_setup_work_t*          work)
1586 {
1587     gmx_domdec_comm_t* comm;
1588     gmx_bool           bScrew;
1589     gmx_bool           bDistMB_pulse;
1590     int                cg, i;
1591     real               r2, rb2, r, tric_sh;
1592     rvec               rn, rb;
1593     int                dimd;
1594     int                nsend_z, nat;
1595
1596     comm = dd->comm;
1597
1598     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1599
1600     bDistMB_pulse = (bDistMB && bDistBonded);
1601
1602     /* Unpack the work data */
1603     std::vector<int>&       ibuf = work->atomGroupBuffer;
1604     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1605     nsend_z                      = 0;
1606     nat                          = work->nat;
1607
1608     for (cg = cg0; cg < cg1; cg++)
1609     {
1610         r2  = 0;
1611         rb2 = 0;
1612         if (!distanceIsTriclinic)
1613         {
1614             /* Rectangular direction, easy */
1615             r = coordinates[cg][dim] - c->c[dim_ind][zone];
1616             if (r > 0)
1617             {
1618                 r2 += r * r;
1619             }
1620             if (bDistMB_pulse)
1621             {
1622                 r = coordinates[cg][dim] - c->bc[dim_ind];
1623                 if (r > 0)
1624                 {
1625                     rb2 += r * r;
1626                 }
1627             }
1628             /* Rounding gives at most a 16% reduction
1629              * in communicated atoms
1630              */
1631             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1632             {
1633                 r = coordinates[cg][dim0] - c->cr0;
1634                 /* This is the first dimension, so always r >= 0 */
1635                 r2 += r * r;
1636                 if (bDistMB_pulse)
1637                 {
1638                     rb2 += r * r;
1639                 }
1640             }
1641             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1642             {
1643                 r = coordinates[cg][dim1] - c->cr1[zone];
1644                 if (r > 0)
1645                 {
1646                     r2 += r * r;
1647                 }
1648                 if (bDistMB_pulse)
1649                 {
1650                     r = coordinates[cg][dim1] - c->bcr1;
1651                     if (r > 0)
1652                     {
1653                         rb2 += r * r;
1654                     }
1655                 }
1656             }
1657         }
1658         else
1659         {
1660             /* Triclinic direction, more complicated */
1661             clear_rvec(rn);
1662             clear_rvec(rb);
1663             /* Rounding, conservative as the skew_fac multiplication
1664              * will slightly underestimate the distance.
1665              */
1666             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1667             {
1668                 rn[dim0] = coordinates[cg][dim0] - c->cr0;
1669                 for (i = dim0 + 1; i < DIM; i++)
1670                 {
1671                     rn[dim0] -= coordinates[cg][i] * v_0[i][dim0];
1672                 }
1673                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1674                 if (bDistMB_pulse)
1675                 {
1676                     rb[dim0] = rn[dim0];
1677                     rb2      = r2;
1678                 }
1679                 /* Take care that the cell planes along dim0 might not
1680                  * be orthogonal to those along dim1 and dim2.
1681                  */
1682                 for (i = 1; i <= dim_ind; i++)
1683                 {
1684                     dimd = dd->dim[i];
1685                     if (normal[dim0][dimd] > 0)
1686                     {
1687                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1688                         if (bDistMB_pulse)
1689                         {
1690                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1691                         }
1692                     }
1693                 }
1694             }
1695             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1696             {
1697                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1698                 rn[dim1] += coordinates[cg][dim1] - c->cr1[zone];
1699                 tric_sh = 0;
1700                 for (i = dim1 + 1; i < DIM; i++)
1701                 {
1702                     tric_sh -= coordinates[cg][i] * v_1[i][dim1];
1703                 }
1704                 rn[dim1] += tric_sh;
1705                 if (rn[dim1] > 0)
1706                 {
1707                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1708                     /* Take care of coupling of the distances
1709                      * to the planes along dim0 and dim1 through dim2.
1710                      */
1711                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1712                     /* Take care that the cell planes along dim1
1713                      * might not be orthogonal to that along dim2.
1714                      */
1715                     if (normal[dim1][dim2] > 0)
1716                     {
1717                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1718                     }
1719                 }
1720                 if (bDistMB_pulse)
1721                 {
1722                     rb[dim1] += coordinates[cg][dim1] - c->bcr1 + tric_sh;
1723                     if (rb[dim1] > 0)
1724                     {
1725                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1726                         /* Take care of coupling of the distances
1727                          * to the planes along dim0 and dim1 through dim2.
1728                          */
1729                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1730                         /* Take care that the cell planes along dim1
1731                          * might not be orthogonal to that along dim2.
1732                          */
1733                         if (normal[dim1][dim2] > 0)
1734                         {
1735                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1736                         }
1737                     }
1738                 }
1739             }
1740             /* The distance along the communication direction */
1741             rn[dim] += coordinates[cg][dim] - c->c[dim_ind][zone];
1742             tric_sh = 0;
1743             for (i = dim + 1; i < DIM; i++)
1744             {
1745                 tric_sh -= coordinates[cg][i] * v_d[i][dim];
1746             }
1747             rn[dim] += tric_sh;
1748             if (rn[dim] > 0)
1749             {
1750                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1751                 /* Take care of coupling of the distances
1752                  * to the planes along dim0 and dim1 through dim2.
1753                  */
1754                 if (dim_ind == 1 && zonei == 1)
1755                 {
1756                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1757                 }
1758             }
1759             if (bDistMB_pulse)
1760             {
1761                 clear_rvec(rb);
1762                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1763                 rb[dim] += coordinates[cg][dim] - c->bc[dim_ind] + tric_sh;
1764                 if (rb[dim] > 0)
1765                 {
1766                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1767                     /* Take care of coupling of the distances
1768                      * to the planes along dim0 and dim1 through dim2.
1769                      */
1770                     if (dim_ind == 1 && zonei == 1)
1771                     {
1772                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1773                     }
1774                 }
1775             }
1776         }
1777
1778         if (r2 < r_comm2
1779             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1780                 && (!bBondComm
1781                     || ((atomInfo[cg] & gmx::sc_atomInfo_BondCommunication)
1782                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1783         {
1784             /* Store the local and global atom group indices and position */
1785             localAtomGroups->push_back(cg);
1786             ibuf.push_back(globalAtomGroupIndices[cg]);
1787             nsend_z++;
1788
1789             rvec posPbc;
1790             if (dd->ci[dim] == 0)
1791             {
1792                 /* Correct coordinates for pbc */
1793                 rvec_add(coordinates[cg], box[dim], posPbc);
1794                 if (bScrew)
1795                 {
1796                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1797                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1798                 }
1799             }
1800             else
1801             {
1802                 copy_rvec(coordinates[cg], posPbc);
1803             }
1804             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1805
1806             nat += 1;
1807         }
1808     }
1809
1810     work->nat        = nat;
1811     work->nsend_zone = nsend_z;
1812 }
1813
1814 //! Clear data.
1815 static void clearCommSetupData(dd_comm_setup_work_t* work)
1816 {
1817     work->localAtomGroupBuffer.clear();
1818     work->atomGroupBuffer.clear();
1819     work->positionBuffer.clear();
1820     work->nat        = 0;
1821     work->nsend_zone = 0;
1822 }
1823
1824 //! Prepare DD communication.
1825 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1826 {
1827     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1828     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1829     int                    c;
1830     int *                  zone_cg_range, pos_cg;
1831     gmx_domdec_comm_t*     comm;
1832     gmx_domdec_zones_t*    zones;
1833     gmx_domdec_comm_dim_t* cd;
1834     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1835     dd_corners_t           corners;
1836     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1837     real                   skew_fac2_d, skew_fac_01;
1838     rvec                   sf2_round;
1839
1840     if (debug)
1841     {
1842         fprintf(debug, "Setting up DD communication\n");
1843     }
1844
1845     comm = dd->comm;
1846
1847     if (comm->dth.empty())
1848     {
1849         /* Initialize the thread data.
1850          * This can not be done in init_domain_decomposition,
1851          * as the numbers of threads is determined later.
1852          */
1853         int numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Domdec);
1854         comm->dth.resize(numThreads);
1855     }
1856
1857     bBondComm = comm->systemInfo.filterBondedCommunication;
1858
1859     /* Do we need to determine extra distances for multi-body bondeds? */
1860     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1861
1862     /* Do we need to determine extra distances for only two-body bondeds? */
1863     bDist2B = (bBondComm && !bDistMB);
1864
1865     const real r_comm2 =
1866             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1867     const real r_bcomm2 =
1868             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1869
1870     if (debug)
1871     {
1872         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1873     }
1874
1875     zones = &comm->zones;
1876
1877     dim0 = dd->dim[0];
1878     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1879     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1880
1881     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1882
1883     /* Triclinic stuff */
1884     normal      = ddbox->normal;
1885     skew_fac_01 = 0;
1886     if (dd->ndim >= 2)
1887     {
1888         v_0 = ddbox->v[dim0];
1889         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1890         {
1891             /* Determine the coupling coefficient for the distances
1892              * to the cell planes along dim0 and dim1 through dim2.
1893              * This is required for correct rounding.
1894              */
1895             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1896             if (debug)
1897             {
1898                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1899             }
1900         }
1901     }
1902     if (dd->ndim >= 3)
1903     {
1904         v_1 = ddbox->v[dim1];
1905     }
1906
1907     zone_cg_range = zones->cg_range.data();
1908     gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock =
1909             fr->atomInfoForEachMoleculeBlock;
1910
1911     zone_cg_range[0]   = 0;
1912     zone_cg_range[1]   = dd->numHomeAtoms;
1913     comm->zone_ncg1[0] = dd->numHomeAtoms;
1914     pos_cg             = dd->numHomeAtoms;
1915
1916     nat_tot = comm->atomRanges.numHomeAtoms();
1917     nzone   = 1;
1918     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1919     {
1920         dim = dd->dim[dim_ind];
1921         cd  = &comm->cd[dim_ind];
1922
1923         /* Check if we need to compute triclinic distances along this dim */
1924         bool distanceIsTriclinic = false;
1925         for (int i = 0; i <= dim_ind; i++)
1926         {
1927             if (ddbox->tric_dir[dd->dim[i]])
1928             {
1929                 distanceIsTriclinic = true;
1930             }
1931         }
1932
1933         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1934         {
1935             /* No pbc in this dimension, the first node should not comm. */
1936             nzone_send = 0;
1937         }
1938         else
1939         {
1940             nzone_send = nzone;
1941         }
1942
1943         v_d         = ddbox->v[dim];
1944         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1945
1946         cd->receiveInPlace = true;
1947         for (int p = 0; p < cd->numPulses(); p++)
1948         {
1949             /* Only atoms communicated in the first pulse are used
1950              * for multi-body bonded interactions or for bBondComm.
1951              */
1952             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1953
1954             gmx_domdec_ind_t* ind = &cd->ind[p];
1955
1956             /* Thread 0 writes in the global index array */
1957             ind->index.clear();
1958             clearCommSetupData(&comm->dth[0]);
1959
1960             for (zone = 0; zone < nzone_send; zone++)
1961             {
1962                 if (dim_ind > 0 && distanceIsTriclinic)
1963                 {
1964                     /* Determine slightly more optimized skew_fac's
1965                      * for rounding.
1966                      * This reduces the number of communicated atoms
1967                      * by about 10% for 3D DD of rhombic dodecahedra.
1968                      */
1969                     for (dimd = 0; dimd < dim; dimd++)
1970                     {
1971                         sf2_round[dimd] = 1;
1972                         if (ddbox->tric_dir[dimd])
1973                         {
1974                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1975                             {
1976                                 /* If we are shifted in dimension i
1977                                  * and the cell plane is tilted forward
1978                                  * in dimension i, skip this coupling.
1979                                  */
1980                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1981                                 {
1982                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1983                                 }
1984                             }
1985                             sf2_round[dimd] = 1 / sf2_round[dimd];
1986                         }
1987                     }
1988                 }
1989
1990                 zonei = zone_perm[dim_ind][zone];
1991                 if (p == 0)
1992                 {
1993                     /* Here we permutate the zones to obtain a convenient order
1994                      * for neighbor searching
1995                      */
1996                     cg0 = zone_cg_range[zonei];
1997                     cg1 = zone_cg_range[zonei + 1];
1998                 }
1999                 else
2000                 {
2001                     /* Look only at the cg's received in the previous grid pulse
2002                      */
2003                     cg1 = zone_cg_range[nzone + zone + 1];
2004                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
2005                 }
2006
2007                 const int numThreads = gmx::ssize(comm->dth);
2008 #pragma omp parallel for num_threads(numThreads) schedule(static)
2009                 for (int th = 0; th < numThreads; th++)
2010                 {
2011                     try
2012                     {
2013                         dd_comm_setup_work_t& work = comm->dth[th];
2014
2015                         /* Retain data accumulated into buffers of thread 0 */
2016                         if (th > 0)
2017                         {
2018                             clearCommSetupData(&work);
2019                         }
2020
2021                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2022                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2023
2024                         /* Get the atom groups and coordinates for this pulse in this zone */
2025                         get_zone_pulse_groups(dd,
2026                                               zonei,
2027                                               zone,
2028                                               cg0_th,
2029                                               cg1_th,
2030                                               dd->globalAtomGroupIndices,
2031                                               dim,
2032                                               dim_ind,
2033                                               dim0,
2034                                               dim1,
2035                                               dim2,
2036                                               r_comm2,
2037                                               r_bcomm2,
2038                                               box,
2039                                               distanceIsTriclinic,
2040                                               normal,
2041                                               skew_fac2_d,
2042                                               skew_fac_01,
2043                                               v_d,
2044                                               v_0,
2045                                               v_1,
2046                                               &corners,
2047                                               sf2_round,
2048                                               bDistBonded,
2049                                               bBondComm,
2050                                               bDist2B,
2051                                               bDistMB,
2052                                               state->x,
2053                                               fr->atomInfo,
2054                                               th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2055                                               &work);
2056                     }
2057                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2058                 } // END
2059
2060                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2061                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2062                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2063                 /* Append data of threads>=1 to the communication buffers */
2064                 for (int th = 1; th < numThreads; th++)
2065                 {
2066                     const dd_comm_setup_work_t& dth = comm->dth[th];
2067
2068                     ind->index.insert(ind->index.end(),
2069                                       dth.localAtomGroupBuffer.begin(),
2070                                       dth.localAtomGroupBuffer.end());
2071                     atomGroups.insert(
2072                             atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2073                     positions.insert(
2074                             positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2075                     comm->dth[0].nat += dth.nat;
2076                     ind->nsend[zone] += dth.nsend_zone;
2077                 }
2078             }
2079             /* Clear the counts in case we do not have pbc */
2080             for (zone = nzone_send; zone < nzone; zone++)
2081             {
2082                 ind->nsend[zone] = 0;
2083             }
2084             ind->nsend[nzone]     = ind->index.size();
2085             ind->nsend[nzone + 1] = comm->dth[0].nat;
2086             /* Communicate the number of cg's and atoms to receive */
2087             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2088
2089             if (p > 0)
2090             {
2091                 /* We can receive in place if only the last zone is not empty */
2092                 for (zone = 0; zone < nzone - 1; zone++)
2093                 {
2094                     if (ind->nrecv[zone] > 0)
2095                     {
2096                         cd->receiveInPlace = false;
2097                     }
2098                 }
2099             }
2100
2101             int receiveBufferSize = 0;
2102             if (!cd->receiveInPlace)
2103             {
2104                 receiveBufferSize = ind->nrecv[nzone];
2105             }
2106             /* These buffer are actually only needed with in-place */
2107             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2108             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2109
2110             dd_comm_setup_work_t& work = comm->dth[0];
2111
2112             /* Make space for the global cg indices */
2113             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2114             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2115             /* Communicate the global cg indices */
2116             gmx::ArrayRef<int> integerBufferRef;
2117             if (cd->receiveInPlace)
2118             {
2119                 integerBufferRef = gmx::arrayRefFromArray(
2120                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2121             }
2122             else
2123             {
2124                 integerBufferRef = globalAtomGroupBuffer.buffer;
2125             }
2126             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2127
2128             /* Make space for atominfo */
2129             dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2130
2131             /* Communicate the coordinates */
2132             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2133             if (cd->receiveInPlace)
2134             {
2135                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2136             }
2137             else
2138             {
2139                 rvecBufferRef = rvecBuffer.buffer;
2140             }
2141             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2142
2143             /* Make the charge group index */
2144             if (cd->receiveInPlace)
2145             {
2146                 zone = (p == 0 ? 0 : nzone - 1);
2147                 while (zone < nzone)
2148                 {
2149                     for (int i = 0; i < ind->nrecv[zone]; i++)
2150                     {
2151                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2152                         fr->atomInfo[pos_cg] =
2153                                 ddGetAtomInfo(atomInfoForEachMoleculeBlock, globalAtomIndex);
2154                         pos_cg++;
2155                     }
2156                     if (p == 0)
2157                     {
2158                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2159                     }
2160                     zone++;
2161                     zone_cg_range[nzone + zone] = pos_cg;
2162                 }
2163             }
2164             else
2165             {
2166                 /* This part of the code is never executed with bBondComm. */
2167                 merge_cg_buffers(nzone,
2168                                  cd,
2169                                  p,
2170                                  zone_cg_range,
2171                                  dd->globalAtomGroupIndices,
2172                                  integerBufferRef.data(),
2173                                  state->x,
2174                                  rvecBufferRef,
2175                                  fr->atomInfoForEachMoleculeBlock,
2176                                  fr->atomInfo);
2177                 pos_cg += ind->nrecv[nzone];
2178             }
2179             nat_tot += ind->nrecv[nzone + 1];
2180         }
2181         if (!cd->receiveInPlace)
2182         {
2183             /* Store the atom block for easy copying of communication buffers */
2184             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2185         }
2186         nzone += nzone;
2187     }
2188
2189     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2190
2191     if (!bBondComm)
2192     {
2193         /* We don't need to update atominfo, since that was already done above.
2194          * So we pass NULL for the forcerec.
2195          */
2196         dd_set_atominfo(
2197                 dd->globalAtomGroupIndices, dd->numHomeAtoms, dd->globalAtomGroupIndices.size(), nullptr);
2198     }
2199
2200     if (debug)
2201     {
2202         fprintf(debug, "Finished setting up DD communication, zones:");
2203         for (c = 0; c < zones->n; c++)
2204         {
2205             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2206         }
2207         fprintf(debug, "\n");
2208     }
2209 }
2210
2211 //! Set boundaries for the charge group range.
2212 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2213 {
2214     for (auto& iZone : zones->iZones)
2215     {
2216         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2217         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2218                                            zones->cg_range[iZone.jZoneRange.end()]);
2219     }
2220 }
2221
2222 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2223  *
2224  * Also sets the atom density for the home zone when \p zone_start=0.
2225  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2226  * how many charge groups will move but are still part of the current range.
2227  * \todo When converting domdec to use proper classes, all these variables
2228  *       should be private and a method should return the correct count
2229  *       depending on an internal state.
2230  *
2231  * \param[in,out] dd          The domain decomposition struct
2232  * \param[in]     box         The box
2233  * \param[in]     ddbox       The domain decomposition box struct
2234  * \param[in]     zone_start  The start of the zone range to set sizes for
2235  * \param[in]     zone_end    The end of the zone range to set sizes for
2236  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2237  */
2238 static void set_zones_size(gmx_domdec_t*      dd,
2239                            matrix             box,
2240                            const gmx_ddbox_t* ddbox,
2241                            int                zone_start,
2242                            int                zone_end,
2243                            int                numMovedChargeGroupsInHomeZone)
2244 {
2245     gmx_domdec_comm_t*  comm;
2246     gmx_domdec_zones_t* zones;
2247     gmx_bool            bDistMB;
2248     int                 z, d, dim;
2249     real                rcs, rcmbs;
2250     int                 i, j;
2251     real                vol;
2252
2253     comm = dd->comm;
2254
2255     zones = &comm->zones;
2256
2257     /* Do we need to determine extra distances for multi-body bondeds? */
2258     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2259
2260     for (z = zone_start; z < zone_end; z++)
2261     {
2262         /* Copy cell limits to zone limits.
2263          * Valid for non-DD dims and non-shifted dims.
2264          */
2265         copy_rvec(comm->cell_x0, zones->size[z].x0);
2266         copy_rvec(comm->cell_x1, zones->size[z].x1);
2267     }
2268
2269     for (d = 0; d < dd->ndim; d++)
2270     {
2271         dim = dd->dim[d];
2272
2273         for (z = 0; z < zones->n; z++)
2274         {
2275             /* With a staggered grid we have different sizes
2276              * for non-shifted dimensions.
2277              */
2278             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2279             {
2280                 if (d == 1)
2281                 {
2282                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2283                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2284                 }
2285                 else if (d == 2)
2286                 {
2287                     zones->size[z].x0[dim] =
2288                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2289                                     .min0;
2290                     zones->size[z].x1[dim] =
2291                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2292                                     .max1;
2293                 }
2294             }
2295         }
2296
2297         rcs   = comm->systemInfo.cutoff;
2298         rcmbs = comm->cutoff_mbody;
2299         if (ddbox->tric_dir[dim])
2300         {
2301             rcs /= ddbox->skew_fac[dim];
2302             rcmbs /= ddbox->skew_fac[dim];
2303         }
2304
2305         /* Set the lower limit for the shifted zone dimensions */
2306         for (z = zone_start; z < zone_end; z++)
2307         {
2308             if (zones->shift[z][dim] > 0)
2309             {
2310                 dim = dd->dim[d];
2311                 if (!isDlbOn(dd->comm) || d == 0)
2312                 {
2313                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2314                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2315                 }
2316                 else
2317                 {
2318                     /* Here we take the lower limit of the zone from
2319                      * the lowest domain of the zone below.
2320                      */
2321                     if (z < 4)
2322                     {
2323                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2324                     }
2325                     else
2326                     {
2327                         if (d == 1)
2328                         {
2329                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2330                         }
2331                         else
2332                         {
2333                             zones->size[z].x0[dim] =
2334                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2335                                             .min1;
2336                         }
2337                     }
2338                     /* A temporary limit, is updated below */
2339                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2340
2341                     if (bDistMB)
2342                     {
2343                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2344                         {
2345                             if (zones->shift[zi][dim] == 0)
2346                             {
2347                                 /* This takes the whole zone into account.
2348                                  * With multiple pulses this will lead
2349                                  * to a larger zone then strictly necessary.
2350                                  */
2351                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2352                                                                   zones->size[zi].x1[dim] + rcmbs);
2353                             }
2354                         }
2355                     }
2356                 }
2357             }
2358         }
2359
2360         /* Loop over the i-zones to set the upper limit of each
2361          * j-zone they see.
2362          */
2363         for (const auto& iZone : zones->iZones)
2364         {
2365             const int zi = iZone.iZoneIndex;
2366             if (zones->shift[zi][dim] == 0)
2367             {
2368                 /* We should only use zones up to zone_end */
2369                 const auto& jZoneRangeFull = iZone.jZoneRange;
2370                 if (zone_end <= *jZoneRangeFull.begin())
2371                 {
2372                     continue;
2373                 }
2374                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2375                                                  std::min(*jZoneRangeFull.end(), zone_end));
2376                 for (int jZone : jZoneRange)
2377                 {
2378                     if (zones->shift[jZone][dim] > 0)
2379                     {
2380                         zones->size[jZone].x1[dim] =
2381                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2382                     }
2383                 }
2384             }
2385         }
2386     }
2387
2388     for (z = zone_start; z < zone_end; z++)
2389     {
2390         /* Initialization only required to keep the compiler happy */
2391         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2392         int  nc, c;
2393
2394         /* To determine the bounding box for a zone we need to find
2395          * the extreme corners of 4, 2 or 1 corners.
2396          */
2397         nc = 1 << (ddbox->nboundeddim - 1);
2398
2399         for (c = 0; c < nc; c++)
2400         {
2401             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2402             corner[XX] = 0;
2403             if ((c & 1) == 0)
2404             {
2405                 corner[YY] = zones->size[z].x0[YY];
2406             }
2407             else
2408             {
2409                 corner[YY] = zones->size[z].x1[YY];
2410             }
2411             if ((c & 2) == 0)
2412             {
2413                 corner[ZZ] = zones->size[z].x0[ZZ];
2414             }
2415             else
2416             {
2417                 corner[ZZ] = zones->size[z].x1[ZZ];
2418             }
2419             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2420                 && box[ZZ][1 - dd->dim[0]] != 0)
2421             {
2422                 /* With 1D domain decomposition the cg's are not in
2423                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2424                  * Shift the corner of the z-vector back to along the box
2425                  * vector of dimension d, so it will later end up at 0 along d.
2426                  * This can affect the location of this corner along dd->dim[0]
2427                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2428                  */
2429                 int d = 1 - dd->dim[0];
2430
2431                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2432             }
2433             /* Apply the triclinic couplings */
2434             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2435             {
2436                 for (j = XX; j < i; j++)
2437                 {
2438                     corner[j] += corner[i] * box[i][j] / box[i][i];
2439                 }
2440             }
2441             if (c == 0)
2442             {
2443                 copy_rvec(corner, corner_min);
2444                 copy_rvec(corner, corner_max);
2445             }
2446             else
2447             {
2448                 for (i = 0; i < DIM; i++)
2449                 {
2450                     corner_min[i] = std::min(corner_min[i], corner[i]);
2451                     corner_max[i] = std::max(corner_max[i], corner[i]);
2452                 }
2453             }
2454         }
2455         /* Copy the extreme cornes without offset along x */
2456         for (i = 0; i < DIM; i++)
2457         {
2458             zones->size[z].bb_x0[i] = corner_min[i];
2459             zones->size[z].bb_x1[i] = corner_max[i];
2460         }
2461         /* Add the offset along x */
2462         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2463         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2464     }
2465
2466     if (zone_start == 0)
2467     {
2468         vol = 1;
2469         for (dim = 0; dim < DIM; dim++)
2470         {
2471             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2472         }
2473         zones->dens_zone0 =
2474                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2475     }
2476
2477     if (debug)
2478     {
2479         for (z = zone_start; z < zone_end; z++)
2480         {
2481             fprintf(debug,
2482                     "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2483                     z,
2484                     zones->size[z].x0[XX],
2485                     zones->size[z].x1[XX],
2486                     zones->size[z].x0[YY],
2487                     zones->size[z].x1[YY],
2488                     zones->size[z].x0[ZZ],
2489                     zones->size[z].x1[ZZ]);
2490             fprintf(debug,
2491                     "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2492                     z,
2493                     zones->size[z].bb_x0[XX],
2494                     zones->size[z].bb_x1[XX],
2495                     zones->size[z].bb_x0[YY],
2496                     zones->size[z].bb_x1[YY],
2497                     zones->size[z].bb_x0[ZZ],
2498                     zones->size[z].bb_x1[ZZ]);
2499         }
2500     }
2501 }
2502
2503 /*! \brief Order data in \p dataToSort according to \p sort
2504  *
2505  * Note: both buffers should have at least \p sort.size() elements.
2506  */
2507 template<typename T>
2508 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2509                         gmx::ArrayRef<T>                  dataToSort,
2510                         gmx::ArrayRef<T>                  sortBuffer)
2511 {
2512     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2513     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2514                "The sorting buffer needs to be sufficiently large");
2515
2516     /* Order the data into the temporary buffer */
2517     size_t i = 0;
2518     for (const gmx_cgsort_t& entry : sort)
2519     {
2520         sortBuffer[i++] = dataToSort[entry.ind];
2521     }
2522
2523     /* Copy back to the original array */
2524     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2525 }
2526
2527 /*! \brief Order data in \p dataToSort according to \p sort
2528  *
2529  * Note: \p vectorToSort should have at least \p sort.size() elements,
2530  *       \p workVector is resized when it is too small.
2531  */
2532 template<typename T>
2533 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2534                         gmx::ArrayRef<T>                  vectorToSort,
2535                         std::vector<T>*                   workVector)
2536 {
2537     if (gmx::index(workVector->size()) < sort.ssize())
2538     {
2539         workVector->resize(sort.size());
2540     }
2541     orderVector<T>(sort, vectorToSort, *workVector);
2542 }
2543
2544 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2545 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2546 {
2547     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2548
2549     /* Using push_back() instead of this resize results in much slower code */
2550     sort->resize(atomOrder.size());
2551     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2552     size_t                      numSorted = 0;
2553     for (int i : atomOrder)
2554     {
2555         if (i >= 0)
2556         {
2557             /* The values of nsc and ind_gl are not used in this case */
2558             buffer[numSorted++].ind = i;
2559         }
2560     }
2561     sort->resize(numSorted);
2562 }
2563
2564 //! Returns the sorting state for DD.
2565 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2566 {
2567     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2568
2569     dd_sort_order_nbnxn(fr, &sort->sorted);
2570
2571     /* We alloc with the old size, since cgindex is still old */
2572     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->numHomeAtoms);
2573
2574     /* Set the new home atom/charge group count */
2575     dd->numHomeAtoms = sort->sorted.size();
2576     if (debug)
2577     {
2578         fprintf(debug, "Set the new home atom count to %d\n", dd->numHomeAtoms);
2579     }
2580
2581     /* Reorder the state */
2582     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2583     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->numHomeAtoms,
2584                        "We should sort all the home atom groups");
2585
2586     if (state->flags & enumValueToBitMask(StateEntry::X))
2587     {
2588         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2589     }
2590     if (state->flags & enumValueToBitMask(StateEntry::V))
2591     {
2592         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2593     }
2594     if (state->flags & enumValueToBitMask(StateEntry::Cgp))
2595     {
2596         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2597     }
2598
2599     /* Reorder the global cg index */
2600     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2601     /* Reorder the atom info */
2602     orderVector<int64_t>(cgsort, fr->atomInfo, &sort->int64Buffer);
2603     /* Set the home atom number */
2604     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->numHomeAtoms);
2605
2606     /* The atoms are now exactly in grid order, update the grid order */
2607     fr->nbv->setLocalAtomOrder();
2608 }
2609
2610 //! Accumulates load statistics.
2611 static void add_dd_statistics(gmx_domdec_t* dd)
2612 {
2613     gmx_domdec_comm_t* comm = dd->comm;
2614
2615     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2616     {
2617         auto range = static_cast<DDAtomRanges::Type>(i);
2618         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2619     }
2620     comm->ndecomp++;
2621 }
2622
2623 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2624 {
2625     gmx_domdec_comm_t* comm = dd->comm;
2626
2627     /* Reset all the statistics and counters for total run counting */
2628     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2629     {
2630         comm->sum_nat[i] = 0;
2631     }
2632     comm->ndecomp   = 0;
2633     comm->nload     = 0;
2634     comm->load_step = 0;
2635     comm->load_sum  = 0;
2636     comm->load_max  = 0;
2637     clear_ivec(comm->load_lim);
2638     comm->load_mdf = 0;
2639     comm->load_pme = 0;
2640 }
2641
2642 namespace gmx
2643 {
2644
2645 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, bool bFatal)
2646 {
2647     gmx_domdec_comm_t* comm    = dd->comm;
2648     bool               invalid = false;
2649
2650     for (int d = 1; d < dd->ndim; d++)
2651     {
2652         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
2653         const int                 dim       = dd->dim[d];
2654         const real                limit     = grid_jump_limit(comm, cutoff, d);
2655         real                      bfac      = ddbox->box_size[dim];
2656         if (ddbox->tric_dir[dim])
2657         {
2658             bfac *= ddbox->skew_fac[dim];
2659         }
2660         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
2661             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
2662         {
2663             invalid = true;
2664
2665             if (bFatal)
2666             {
2667                 char buf[22];
2668
2669                 /* This error should never be triggered under normal
2670                  * circumstances, but you never know ...
2671                  */
2672                 gmx_fatal(FARGS,
2673                           "step %s: The domain decomposition grid has shifted too much in the "
2674                           "%c-direction around cell %d %d %d. This should not have happened. "
2675                           "Running with fewer ranks might avoid this issue.",
2676                           gmx_step_str(step, buf),
2677                           dim2char(dim),
2678                           dd->ci[XX],
2679                           dd->ci[YY],
2680                           dd->ci[ZZ]);
2681             }
2682         }
2683     }
2684
2685     return invalid;
2686 }
2687
2688 void print_dd_statistics(const t_commrec* cr, const t_inputrec& inputrec, FILE* fplog)
2689 {
2690     gmx_domdec_comm_t* comm = cr->dd->comm;
2691
2692     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2693     gmx_sumd(numRanges, comm->sum_nat, cr);
2694
2695     if (fplog == nullptr)
2696     {
2697         return;
2698     }
2699
2700     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2701
2702     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2703     {
2704         auto   range = static_cast<DDAtomRanges::Type>(i);
2705         double av    = comm->sum_nat[i] / comm->ndecomp;
2706         switch (range)
2707         {
2708             case DDAtomRanges::Type::Zones:
2709                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2710                 break;
2711             case DDAtomRanges::Type::Vsites:
2712                 if (cr->dd->vsite_comm)
2713                 {
2714                     fprintf(fplog,
2715                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2716                             (EEL_PME(inputrec.coulombtype)
2717                              || inputrec.coulombtype == CoulombInteractionType::Ewald)
2718                                     ? 3
2719                                     : 2,
2720                             av);
2721                 }
2722                 break;
2723             case DDAtomRanges::Type::Constraints:
2724                 if (cr->dd->constraint_comm)
2725                 {
2726                     fprintf(fplog,
2727                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2728                             1 + inputrec.nLincsIter,
2729                             av);
2730                 }
2731                 break;
2732             default: gmx_incons(" Unknown type for DD statistics");
2733         }
2734     }
2735     fprintf(fplog, "\n");
2736
2737     if (comm->ddSettings.recordLoad && EI_DYNAMICS(inputrec.eI))
2738     {
2739         print_dd_load_av(fplog, cr->dd);
2740     }
2741 }
2742
2743 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2744 void dd_partition_system(FILE*                     fplog,
2745                          const gmx::MDLogger&      mdlog,
2746                          int64_t                   step,
2747                          const t_commrec*          cr,
2748                          bool                      bMasterState,
2749                          int                       nstglobalcomm,
2750                          t_state*                  state_global,
2751                          const gmx_mtop_t&         top_global,
2752                          const t_inputrec&         inputrec,
2753                          gmx::ImdSession*          imdSession,
2754                          pull_t*                   pull_work,
2755                          t_state*                  state_local,
2756                          gmx::ForceBuffers*        f,
2757                          gmx::MDAtoms*             mdAtoms,
2758                          gmx_localtop_t*           top_local,
2759                          t_forcerec*               fr,
2760                          gmx::VirtualSitesHandler* vsite,
2761                          gmx::Constraints*         constr,
2762                          t_nrnb*                   nrnb,
2763                          gmx_wallcycle*            wcycle,
2764                          bool                      bVerbose)
2765 {
2766     gmx_ddbox_t ddbox = { 0 };
2767     int         ncgindex_set;
2768     char        sbuf[22];
2769
2770     wallcycle_start(wcycle, WallCycleCounter::Domdec);
2771
2772     gmx_domdec_t*      dd   = cr->dd;
2773     gmx_domdec_comm_t* comm = dd->comm;
2774
2775     // TODO if the update code becomes accessible here, use
2776     // upd->deform for this logic.
2777     bool bBoxChanged = (bMasterState || inputrecDeform(&inputrec));
2778     if (inputrec.epc != PressureCoupling::No)
2779     {
2780         /* With nstpcouple > 1 pressure coupling happens.
2781          * one step after calculating the pressure.
2782          * Box scaling happens at the end of the MD step,
2783          * after the DD partitioning.
2784          * We therefore have to do DLB in the first partitioning
2785          * after an MD step where P-coupling occurred.
2786          * We need to determine the last step in which p-coupling occurred.
2787          * MRS -- need to validate this for vv?
2788          */
2789         int     n = inputrec.nstpcouple;
2790         int64_t step_pcoupl;
2791         if (n == 1)
2792         {
2793             step_pcoupl = step - 1;
2794         }
2795         else
2796         {
2797             step_pcoupl = ((step - 1) / n) * n + 1;
2798         }
2799         if (step_pcoupl >= comm->partition_step)
2800         {
2801             bBoxChanged = true;
2802         }
2803     }
2804
2805     bool bNStGlobalComm = (step % nstglobalcomm == 0);
2806     bool bDoDLB;
2807     if (!isDlbOn(comm))
2808     {
2809         bDoDLB = false;
2810     }
2811     else
2812     {
2813         /* Should we do dynamic load balacing this step?
2814          * Since it requires (possibly expensive) global communication,
2815          * we might want to do DLB less frequently.
2816          */
2817         if (bBoxChanged || inputrec.epc != PressureCoupling::No)
2818         {
2819             bDoDLB = bBoxChanged;
2820         }
2821         else
2822         {
2823             bDoDLB = bNStGlobalComm;
2824         }
2825     }
2826
2827     /* Check if we have recorded loads on the nodes */
2828     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2829     {
2830         bool bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2831
2832         /* Print load every nstlog, first and last step to the log file */
2833         bool bLogLoad = ((inputrec.nstlog > 0 && step % inputrec.nstlog == 0) || comm->n_load_collect == 0
2834                          || (inputrec.nsteps >= 0
2835                              && (step + inputrec.nstlist > inputrec.init_step + inputrec.nsteps)));
2836
2837         /* Avoid extra communication due to verbose screen output
2838          * when nstglobalcomm is set.
2839          */
2840         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2841             || (bVerbose && (inputrec.nstlist == 0 || nstglobalcomm <= inputrec.nstlist)))
2842         {
2843             get_load_distribution(dd, wcycle);
2844             if (DDMASTER(dd))
2845             {
2846                 if (bLogLoad)
2847                 {
2848                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2849                 }
2850                 if (bVerbose)
2851                 {
2852                     dd_print_load_verbose(dd);
2853                 }
2854             }
2855             comm->n_load_collect++;
2856
2857             if (isDlbOn(comm))
2858             {
2859                 if (DDMASTER(dd))
2860                 {
2861                     /* Add the measured cycles to the running average */
2862                     const float averageFactor = 0.1F;
2863                     comm->cyclesPerStepDlbExpAverage =
2864                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2865                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2866                 }
2867                 if (comm->dlbState == DlbState::onCanTurnOff
2868                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2869                 {
2870                     bool turnOffDlb;
2871                     if (DDMASTER(dd))
2872                     {
2873                         /* If the running averaged cycles with DLB are more
2874                          * than before we turned on DLB, turn off DLB.
2875                          * We will again run and check the cycles without DLB
2876                          * and we can then decide if to turn off DLB forever.
2877                          */
2878                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2879                     }
2880                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2881                     if (turnOffDlb)
2882                     {
2883                         /* To turn off DLB, we need to redistribute the atoms */
2884                         dd_collect_state(dd, state_local, state_global);
2885                         bMasterState = true;
2886                         turn_off_dlb(mdlog, dd, step);
2887                     }
2888                 }
2889             }
2890             else if (bCheckWhetherToTurnDlbOn)
2891             {
2892                 bool turnOffDlbForever = false;
2893                 bool turnOnDlb         = false;
2894
2895                 /* Since the timings are node dependent, the master decides */
2896                 if (DDMASTER(dd))
2897                 {
2898                     /* If we recently turned off DLB, we want to check if
2899                      * performance is better without DLB. We want to do this
2900                      * ASAP to minimize the chance that external factors
2901                      * slowed down the DLB step are gone here and we
2902                      * incorrectly conclude that DLB was causing the slowdown.
2903                      * So we measure one nstlist block, no running average.
2904                      */
2905                     if (comm->haveTurnedOffDlb
2906                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2907                     {
2908                         /* After turning off DLB we ran nstlist steps in fewer
2909                          * cycles than with DLB. This likely means that DLB
2910                          * in not benefical, but this could be due to a one
2911                          * time unlucky fluctuation, so we require two such
2912                          * observations in close succession to turn off DLB
2913                          * forever.
2914                          */
2915                         if (comm->dlbSlowerPartitioningCount > 0
2916                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2917                         {
2918                             turnOffDlbForever = true;
2919                         }
2920                         comm->haveTurnedOffDlb = false;
2921                         /* Register when we last measured DLB slowdown */
2922                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2923                     }
2924                     else
2925                     {
2926                         /* Here we check if the max PME rank load is more than 0.98
2927                          * the max PP force load. If so, PP DLB will not help,
2928                          * since we are (almost) limited by PME. Furthermore,
2929                          * DLB will cause a significant extra x/f redistribution
2930                          * cost on the PME ranks, which will then surely result
2931                          * in lower total performance.
2932                          */
2933                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2934                         {
2935                             turnOnDlb = false;
2936                         }
2937                         else
2938                         {
2939                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2940                         }
2941                     }
2942                 }
2943                 struct
2944                 {
2945                     bool turnOffDlbForever;
2946                     bool turnOnDlb;
2947                 } bools{ turnOffDlbForever, turnOnDlb };
2948                 dd_bcast(dd, sizeof(bools), &bools);
2949                 if (bools.turnOffDlbForever)
2950                 {
2951                     turn_off_dlb_forever(mdlog, dd, step);
2952                 }
2953                 else if (bools.turnOnDlb)
2954                 {
2955                     turn_on_dlb(mdlog, dd, step);
2956                     bDoDLB = true;
2957                 }
2958             }
2959         }
2960         comm->n_load_have++;
2961     }
2962
2963     bool bRedist = false;
2964     if (bMasterState)
2965     {
2966         /* Clear the old state */
2967         clearDDStateIndices(dd, false);
2968         ncgindex_set = 0;
2969
2970         auto xGlobal = positionsFromStatePointer(state_global);
2971
2972         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2973
2974         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2975
2976         /* Ensure that we have space for the new distribution */
2977         dd_resize_atominfo_and_state(fr, state_local, dd->numHomeAtoms);
2978
2979         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2980
2981         dd_set_atominfo(dd->globalAtomGroupIndices, 0, dd->numHomeAtoms, fr);
2982     }
2983     else if (state_local->ddp_count != dd->ddp_count)
2984     {
2985         if (state_local->ddp_count > dd->ddp_count)
2986         {
2987             gmx_fatal(FARGS,
2988                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2989                       ")",
2990                       state_local->ddp_count,
2991                       dd->ddp_count);
2992         }
2993
2994         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2995         {
2996             gmx_fatal(FARGS,
2997                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2998                       "state_local->ddp_count (%d)",
2999                       state_local->ddp_count_cg_gl,
3000                       state_local->ddp_count);
3001         }
3002
3003         /* Clear the old state */
3004         clearDDStateIndices(dd, false);
3005
3006         /* Restore the atom group indices from state_local */
3007         restoreAtomGroups(dd, state_local);
3008         make_dd_indices(dd, 0);
3009         ncgindex_set = dd->numHomeAtoms;
3010
3011         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3012
3013         dd_set_atominfo(dd->globalAtomGroupIndices, 0, dd->numHomeAtoms, fr);
3014
3015         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3016
3017         bRedist = isDlbOn(comm);
3018     }
3019     else
3020     {
3021         /* We have the full state, only redistribute the cgs */
3022
3023         /* Clear the non-home indices */
3024         clearDDStateIndices(dd, true);
3025         ncgindex_set = 0;
3026
3027         /* To avoid global communication, we do not recompute the extent
3028          * of the system for dims without pbc. Therefore we need to copy
3029          * the previously computed values when we do not communicate.
3030          */
3031         if (!bNStGlobalComm)
3032         {
3033             copy_rvec(comm->box0, ddbox.box0);
3034             copy_rvec(comm->box_size, ddbox.box_size);
3035         }
3036         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3037
3038         bBoxChanged = true;
3039         bRedist     = true;
3040     }
3041     /* Copy needed for dim's without pbc when avoiding communication */
3042     copy_rvec(ddbox.box0, comm->box0);
3043     copy_rvec(ddbox.box_size, comm->box_size);
3044
3045     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3046
3047     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3048     {
3049         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3050     }
3051
3052     if (comm->systemInfo.useUpdateGroups)
3053     {
3054         comm->updateGroupsCog->addCogs(
3055                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->numHomeAtoms),
3056                 state_local->x);
3057     }
3058
3059     /* Check if we should sort the charge groups */
3060     const bool bSortCG = (bMasterState || bRedist);
3061
3062     /* When repartitioning we mark atom groups that will move to neighboring
3063      * DD cells, but we do not move them right away for performance reasons.
3064      * Thus we need to keep track of how many charge groups will move for
3065      * obtaining correct local charge group / atom counts.
3066      */
3067     int ncg_moved = 0;
3068     if (bRedist)
3069     {
3070         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
3071
3072         ncgindex_set = dd->numHomeAtoms;
3073         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3074
3075         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3076
3077         if (comm->systemInfo.useUpdateGroups)
3078         {
3079             comm->updateGroupsCog->addCogs(
3080                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->numHomeAtoms),
3081                     state_local->x);
3082         }
3083
3084         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
3085     }
3086
3087     RVec cell_ns_x0, cell_ns_x1;
3088     get_nsgrid_boundaries(ddbox.nboundeddim,
3089                           state_local->box,
3090                           dd,
3091                           &ddbox,
3092                           &comm->cell_x0,
3093                           &comm->cell_x1,
3094                           dd->numHomeAtoms,
3095                           as_rvec_array(state_local->x.data()),
3096                           cell_ns_x0,
3097                           cell_ns_x1);
3098
3099     if (bBoxChanged)
3100     {
3101         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3102     }
3103
3104     if (bSortCG)
3105     {
3106         wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
3107
3108         /* Sort the state on charge group position.
3109          * This enables exact restarts from this step.
3110          * It also improves performance by about 15% with larger numbers
3111          * of atoms per node.
3112          */
3113
3114         /* Fill the ns grid with the home cell,
3115          * so we can sort with the indices.
3116          */
3117         set_zones_numHomeAtoms(dd);
3118
3119         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3120
3121         nbnxn_put_on_grid(fr->nbv.get(),
3122                           state_local->box,
3123                           0,
3124                           comm->zones.size[0].bb_x0,
3125                           comm->zones.size[0].bb_x1,
3126                           comm->updateGroupsCog.get(),
3127                           { 0, dd->numHomeAtoms },
3128                           comm->zones.dens_zone0,
3129                           fr->atomInfo,
3130                           state_local->x,
3131                           ncg_moved,
3132                           bRedist ? comm->movedBuffer.data() : nullptr);
3133
3134         if (debug)
3135         {
3136             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->numHomeAtoms);
3137         }
3138         dd_sort_state(dd, fr, state_local);
3139
3140         /* After sorting and compacting we set the correct size */
3141         state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3142
3143         /* Rebuild all the indices */
3144         dd->ga2la->clear(false);
3145         ncgindex_set = 0;
3146
3147         wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
3148     }
3149     else
3150     {
3151         /* With the group scheme the sorting array is part of the DD state,
3152          * but it just got out of sync, so mark as invalid by emptying it.
3153          */
3154         if (inputrec.cutoff_scheme == CutoffScheme::Group)
3155         {
3156             comm->sort->sorted.clear();
3157         }
3158     }
3159
3160     if (comm->systemInfo.useUpdateGroups)
3161     {
3162         /* The update groups cog's are invalid after sorting
3163          * and need to be cleared before the next partitioning anyhow.
3164          */
3165         comm->updateGroupsCog->clear();
3166     }
3167
3168     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
3169
3170     /* Set the induces for the home atoms */
3171     set_zones_numHomeAtoms(dd);
3172     make_dd_indices(dd, ncgindex_set);
3173
3174     /* Setup up the communication and communicate the coordinates */
3175     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3176
3177     /* Set the indices for the halo atoms */
3178     make_dd_indices(dd, dd->numHomeAtoms);
3179
3180     /* Set the charge group boundaries for neighbor searching */
3181     set_cg_boundaries(&comm->zones);
3182
3183     /* When bSortCG=true, we have already set the size for zone 0 */
3184     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3185
3186     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
3187
3188     /*
3189        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3190                  -1,state_local->x.rvec_array(),state_local->box);
3191      */
3192
3193     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
3194
3195     /* Extract a local topology from the global topology */
3196     IVec numPulses;
3197     for (int i = 0; i < dd->ndim; i++)
3198     {
3199         numPulses[dd->dim[i]] = comm->cd[i].numPulses();
3200     }
3201     int numBondedInteractionsToReduce = dd_make_local_top(dd,
3202                                                           &comm->zones,
3203                                                           dd->unitCellInfo.npbcdim,
3204                                                           state_local->box,
3205                                                           comm->cellsize_min,
3206                                                           numPulses,
3207                                                           fr,
3208                                                           state_local->x,
3209                                                           top_global,
3210                                                           fr->atomInfo,
3211                                                           top_local);
3212     dd->localTopologyChecker->scheduleCheckOfLocalTopology(numBondedInteractionsToReduce);
3213
3214     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
3215
3216     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
3217
3218     /* Set up the special atom communication */
3219     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3220     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3221          i < static_cast<int>(DDAtomRanges::Type::Number);
3222          i++)
3223     {
3224         auto range = static_cast<DDAtomRanges::Type>(i);
3225         switch (range)
3226         {
3227             case DDAtomRanges::Type::Vsites:
3228                 if (vsite && vsite->numInterUpdategroupVirtualSites())
3229                 {
3230                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3231                 }
3232                 break;
3233             case DDAtomRanges::Type::Constraints:
3234                 if (dd->comm->systemInfo.mayHaveSplitConstraints || dd->comm->systemInfo.mayHaveSplitSettles)
3235                 {
3236                     /* Only for inter-cg constraints we need special code */
3237                     n = dd_make_local_constraints(dd,
3238                                                   n,
3239                                                   top_global,
3240                                                   fr->atomInfo,
3241                                                   constr,
3242                                                   inputrec.nProjOrder,
3243                                                   top_local->idef.il);
3244                 }
3245                 break;
3246             default: gmx_incons("Unknown special atom type setup");
3247         }
3248         comm->atomRanges.setEnd(range, n);
3249     }
3250
3251     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
3252
3253     wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
3254
3255     /* Make space for the extra coordinates for virtual site
3256      * or constraint communication.
3257      */
3258     state_local->natoms = comm->atomRanges.numAtomsTotal();
3259
3260     state_change_natoms(state_local, state_local->natoms);
3261
3262     int nat_f_novirsum;
3263     if (vsite && vsite->numInterUpdategroupVirtualSites())
3264     {
3265         nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3266     }
3267     else
3268     {
3269         if (EEL_FULL(inputrec.coulombtype) && dd->haveExclusions)
3270         {
3271             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3272         }
3273         else
3274         {
3275             nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3276         }
3277     }
3278
3279     /* Set the number of atoms required for the force calculation.
3280      * Forces need to be constrained when doing energy
3281      * minimization. For simple simulations we could avoid some
3282      * allocation, zeroing and copying, but this is probably not worth
3283      * the complications and checking.
3284      */
3285     forcerec_set_ranges(fr,
3286                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3287                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3288                         nat_f_novirsum);
3289
3290     /* Update atom data for mdatoms and several algorithms */
3291     mdAlgorithmsSetupAtomData(cr, inputrec, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3292
3293     auto* mdatoms = mdAtoms->mdatoms();
3294     if (!thisRankHasDuty(cr, DUTY_PME))
3295     {
3296         /* Send the charges and/or c6/sigmas to our PME only node */
3297         gmx_pme_send_parameters(
3298                 cr,
3299                 *fr->ic,
3300                 mdatoms->nChargePerturbed != 0,
3301                 mdatoms->nTypePerturbed != 0,
3302                 mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
3303                                  : gmx::ArrayRef<real>{},
3304                 mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
3305                                  : gmx::ArrayRef<real>{},
3306                 mdatoms->sqrt_c6A ? gmx::arrayRefFromArray(mdatoms->sqrt_c6A, mdatoms->nr)
3307                                   : gmx::ArrayRef<real>{},
3308                 mdatoms->sqrt_c6B ? gmx::arrayRefFromArray(mdatoms->sqrt_c6B, mdatoms->nr)
3309                                   : gmx::ArrayRef<real>{},
3310                 mdatoms->sigmaA ? gmx::arrayRefFromArray(mdatoms->sigmaA, mdatoms->nr)
3311                                 : gmx::ArrayRef<real>{},
3312                 mdatoms->sigmaB ? gmx::arrayRefFromArray(mdatoms->sigmaB, mdatoms->nr)
3313                                 : gmx::ArrayRef<real>{},
3314                 dd_pme_maxshift_x(*dd),
3315                 dd_pme_maxshift_y(*dd));
3316     }
3317
3318     if (dd->atomSets != nullptr)
3319     {
3320         /* Update the local atom sets */
3321         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3322     }
3323
3324     // The pull group construction can need the atom sets updated above
3325     if (inputrec.bPull)
3326     {
3327         /* Update the local pull groups */
3328         dd_make_local_pull_groups(cr, pull_work);
3329     }
3330
3331     /* Update the local atoms to be communicated via the IMD protocol if bIMD is true. */
3332     imdSession->dd_make_local_IMD_atoms(dd);
3333
3334     add_dd_statistics(dd);
3335
3336     /* Make sure we only count the cycles for this DD partitioning */
3337     clear_dd_cycle_counts(dd);
3338
3339     /* Because the order of the atoms might have changed since
3340      * the last vsite construction, we need to communicate the constructing
3341      * atom coordinates again (for spreading the forces this MD step).
3342      */
3343     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3344
3345     wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
3346
3347     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3348     {
3349         dd_move_x(dd, state_local->box, state_local->x, nullptr);
3350         write_dd_pdb("dd_dump",
3351                      step,
3352                      "dump",
3353                      top_global,
3354                      cr,
3355                      -1,
3356                      state_local->x.rvec_array(),
3357                      state_local->box);
3358     }
3359
3360     /* Store the partitioning step */
3361     comm->partition_step = step;
3362
3363     /* Increase the DD partitioning counter */
3364     dd->ddp_count++;
3365     /* The state currently matches this DD partitioning count, store it */
3366     state_local->ddp_count = dd->ddp_count;
3367     if (bMasterState)
3368     {
3369         /* The DD master node knows the complete cg distribution,
3370          * store the count so we can possibly skip the cg info communication.
3371          */
3372         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3373     }
3374
3375     if (comm->ddSettings.DD_debug > 0)
3376     {
3377         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3378         check_index_consistency(dd, top_global.natoms, "after partitioning");
3379     }
3380
3381     wallcycle_stop(wcycle, WallCycleCounter::Domdec);
3382 }
3383
3384 } // namespace gmx