Merge origin/release-2020 into master
[alexxy/gromacs.git] / src / gromacs / domdec / partition.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief This file defines functions for mdrun to call to make a new
38  * domain decomposition, and check it.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \ingroup module_domdec
42  */
43
44 #include "gmxpre.h"
45
46 #include "partition.h"
47
48 #include "config.h"
49
50 #include <cassert>
51 #include <cstdio>
52
53 #include <algorithm>
54
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme.h"
64 #include "gromacs/gmxlib/network.h"
65 #include "gromacs/gmxlib/nrnb.h"
66 #include "gromacs/imd/imd.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/mdlib/forcerec.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/nsgrid.h"
73 #include "gromacs/mdlib/vsite.h"
74 #include "gromacs/mdtypes/commrec.h"
75 #include "gromacs/mdtypes/forcerec.h"
76 #include "gromacs/mdtypes/inputrec.h"
77 #include "gromacs/mdtypes/md_enums.h"
78 #include "gromacs/mdtypes/nblist.h"
79 #include "gromacs/mdtypes/state.h"
80 #include "gromacs/nbnxm/nbnxm.h"
81 #include "gromacs/pulling/pull.h"
82 #include "gromacs/timing/wallcycle.h"
83 #include "gromacs/topology/mtop_util.h"
84 #include "gromacs/topology/topology.h"
85 #include "gromacs/utility/cstringutil.h"
86 #include "gromacs/utility/fatalerror.h"
87 #include "gromacs/utility/logger.h"
88 #include "gromacs/utility/real.h"
89 #include "gromacs/utility/smalloc.h"
90 #include "gromacs/utility/strconvert.h"
91 #include "gromacs/utility/stringstream.h"
92 #include "gromacs/utility/stringutil.h"
93 #include "gromacs/utility/textwriter.h"
94
95 #include "box.h"
96 #include "cellsizes.h"
97 #include "distribute.h"
98 #include "domdec_constraints.h"
99 #include "domdec_internal.h"
100 #include "domdec_vsite.h"
101 #include "dump.h"
102 #include "redistribute.h"
103 #include "utility.h"
104
105 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
106  *
107  * There is a bit of overhead with DLB and it's difficult to achieve
108  * a load imbalance of less than 2% with DLB.
109  */
110 #define DD_PERF_LOSS_DLB_ON 0.02
111
112 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
113 #define DD_PERF_LOSS_WARN 0.05
114
115
116 //! Debug helper printing a DD zone
117 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
118 {
119     fprintf(fp,
120             "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
121             "%6.3f\n",
122             d, i, j, zone->min0, zone->max1, zone->mch0, zone->mch0, zone->p1_0, zone->p1_1);
123 }
124
125 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
126  * takes the extremes over all home and remote zones in the halo
127  * and returns the results in cell_ns_x0 and cell_ns_x1.
128  * Note: only used with the group cut-off scheme.
129  */
130 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
131 {
132     constexpr int      c_ddZoneCommMaxNumZones = 5;
133     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
134     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
135     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
136     gmx_domdec_comm_t* comm = dd->comm;
137
138     rvec extr_s[2];
139     rvec extr_r[2];
140     for (int d = 1; d < dd->ndim; d++)
141     {
142         int           dim = dd->dim[d];
143         gmx_ddzone_t& zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
144
145         /* Copy the base sizes of the home zone */
146         zp.min0    = cell_ns_x0[dim];
147         zp.max1    = cell_ns_x1[dim];
148         zp.min1    = cell_ns_x1[dim];
149         zp.mch0    = cell_ns_x0[dim];
150         zp.mch1    = cell_ns_x1[dim];
151         zp.p1_0    = cell_ns_x0[dim];
152         zp.p1_1    = cell_ns_x1[dim];
153         zp.dataSet = 1;
154     }
155
156     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
157
158     /* Loop backward over the dimensions and aggregate the extremes
159      * of the cell sizes.
160      */
161     for (int d = dd->ndim - 2; d >= 0; d--)
162     {
163         const int  dim      = dd->dim[d];
164         const bool applyPbc = (dim < ddbox->npbcdim);
165
166         /* Use an rvec to store two reals */
167         extr_s[d][0] = cellsizes[d + 1].fracLower;
168         extr_s[d][1] = cellsizes[d + 1].fracUpper;
169         extr_s[d][2] = cellsizes[d + 1].fracUpper;
170
171         int pos = 0;
172         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
173         /* Store the extremes in the backward sending buffer,
174          * so they get updated separately from the forward communication.
175          */
176         for (int d1 = d; d1 < dd->ndim - 1; d1++)
177         {
178             gmx_ddzone_t& buf = buf_s[pos];
179
180             /* We invert the order to be able to use the same loop for buf_e */
181             buf.min0 = extr_s[d1][1];
182             buf.max1 = extr_s[d1][0];
183             buf.min1 = extr_s[d1][2];
184             buf.mch0 = 0;
185             buf.mch1 = 0;
186             /* Store the cell corner of the dimension we communicate along */
187             buf.p1_0    = comm->cell_x0[dim];
188             buf.p1_1    = 0;
189             buf.dataSet = 1;
190             pos++;
191         }
192
193         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
194         pos++;
195
196         if (dd->ndim == 3 && d == 0)
197         {
198             buf_s[pos] = comm->zone_d2[0][1];
199             pos++;
200             buf_s[pos] = comm->zone_d1[0];
201             pos++;
202         }
203
204         /* We only need to communicate the extremes
205          * in the forward direction
206          */
207         int numPulses = comm->cd[d].numPulses();
208         int numPulsesMin;
209         if (applyPbc)
210         {
211             /* Take the minimum to avoid double communication */
212             numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
213         }
214         else
215         {
216             /* Without PBC we should really not communicate over
217              * the boundaries, but implementing that complicates
218              * the communication setup and therefore we simply
219              * do all communication, but ignore some data.
220              */
221             numPulsesMin = numPulses;
222         }
223         for (int pulse = 0; pulse < numPulsesMin; pulse++)
224         {
225             /* Communicate the extremes forward */
226             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
227
228             int numElements = dd->ndim - d - 1;
229             ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
230
231             if (receiveValidData)
232             {
233                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
234                 {
235                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
236                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
237                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
238                 }
239             }
240         }
241
242         const int numElementsInBuffer = pos;
243         for (int pulse = 0; pulse < numPulses; pulse++)
244         {
245             /* Communicate all the zone information backward */
246             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
247
248             static_assert(
249                     sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
250                     "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
251
252             int numReals = numElementsInBuffer * c_ddzoneNumReals;
253             ddSendrecv(dd, d, dddirBackward, gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
254                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
255
256             rvec dh = { 0 };
257             if (pulse > 0)
258             {
259                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
260                 {
261                     /* Determine the decrease of maximum required
262                      * communication height along d1 due to the distance along d,
263                      * this avoids a lot of useless atom communication.
264                      */
265                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
266
267                     real c;
268                     if (ddbox->tric_dir[dim])
269                     {
270                         /* c is the off-diagonal coupling between the cell planes
271                          * along directions d and d1.
272                          */
273                         c = ddbox->v[dim][dd->dim[d1]][dim];
274                     }
275                     else
276                     {
277                         c = 0;
278                     }
279                     real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
280                     if (det > 0)
281                     {
282                         dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
283                     }
284                     else
285                     {
286                         /* A negative value signals out of range */
287                         dh[d1] = -1;
288                     }
289                 }
290             }
291
292             /* Accumulate the extremes over all pulses */
293             for (int i = 0; i < numElementsInBuffer; i++)
294             {
295                 if (pulse == 0)
296                 {
297                     buf_e[i] = buf_r[i];
298                 }
299                 else
300                 {
301                     if (receiveValidData)
302                     {
303                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
304                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
305                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
306                     }
307
308                     int d1;
309                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
310                     {
311                         d1 = 1;
312                     }
313                     else
314                     {
315                         d1 = d + 1;
316                     }
317                     if (receiveValidData && dh[d1] >= 0)
318                     {
319                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
320                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
321                     }
322                 }
323                 /* Copy the received buffer to the send buffer,
324                  * to pass the data through with the next pulse.
325                  */
326                 buf_s[i] = buf_r[i];
327             }
328             if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1)
329                 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
330             {
331                 /* Store the extremes */
332                 int pos = 0;
333
334                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
335                 {
336                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
337                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
338                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
339                     pos++;
340                 }
341
342                 if (d == 1 || (d == 0 && dd->ndim == 3))
343                 {
344                     for (int i = d; i < 2; i++)
345                     {
346                         comm->zone_d2[1 - d][i] = buf_e[pos];
347                         pos++;
348                     }
349                 }
350                 if (d == 0)
351                 {
352                     comm->zone_d1[1] = buf_e[pos];
353                     pos++;
354                 }
355             }
356             else
357             {
358                 if (d == 1 || (d == 0 && dd->ndim == 3))
359                 {
360                     for (int i = d; i < 2; i++)
361                     {
362                         comm->zone_d2[1 - d][i].dataSet = 0;
363                     }
364                 }
365                 if (d == 0)
366                 {
367                     comm->zone_d1[1].dataSet = 0;
368                 }
369             }
370         }
371     }
372
373     if (dd->ndim >= 2)
374     {
375         int dim = dd->dim[1];
376         for (int i = 0; i < 2; i++)
377         {
378             if (comm->zone_d1[i].dataSet != 0)
379             {
380                 if (debug)
381                 {
382                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
383                 }
384                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
385                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
386             }
387         }
388     }
389     if (dd->ndim >= 3)
390     {
391         int dim = dd->dim[2];
392         for (int i = 0; i < 2; i++)
393         {
394             for (int j = 0; j < 2; j++)
395             {
396                 if (comm->zone_d2[i][j].dataSet != 0)
397                 {
398                     if (debug)
399                     {
400                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
401                     }
402                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
403                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
404                 }
405             }
406         }
407     }
408     for (int d = 1; d < dd->ndim; d++)
409     {
410         cellsizes[d].fracLowerMax = extr_s[d - 1][0];
411         cellsizes[d].fracUpperMin = extr_s[d - 1][1];
412         if (debug)
413         {
414             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n", d, cellsizes[d].fracLowerMax,
415                     cellsizes[d].fracUpperMin);
416         }
417     }
418 }
419
420 //! Sets the charge-group zones to be equal to the home zone.
421 static void set_zones_ncg_home(gmx_domdec_t* dd)
422 {
423     gmx_domdec_zones_t* zones;
424     int                 i;
425
426     zones = &dd->comm->zones;
427
428     zones->cg_range[0] = 0;
429     for (i = 1; i < zones->n + 1; i++)
430     {
431         zones->cg_range[i] = dd->ncg_home;
432     }
433     /* zone_ncg1[0] should always be equal to ncg_home */
434     dd->comm->zone_ncg1[0] = dd->ncg_home;
435 }
436
437 //! Restore atom groups for the charge groups.
438 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
439 {
440     gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
441
442     std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
443
444     globalAtomGroupIndices.resize(atomGroupsState.size());
445
446     /* Copy back the global charge group indices from state
447      * and rebuild the local charge group to atom index.
448      */
449     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
450     {
451         globalAtomGroupIndices[i] = atomGroupsState[i];
452     }
453
454     dd->ncg_home = atomGroupsState.size();
455     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
456
457     set_zones_ncg_home(dd);
458 }
459
460 //! Sets the cginfo structures.
461 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
462 {
463     if (fr != nullptr)
464     {
465         gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
466         gmx::ArrayRef<int>         cginfo    = fr->cginfo;
467
468         for (int cg = cg0; cg < cg1; cg++)
469         {
470             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
471         }
472     }
473 }
474
475 //! Makes the mappings between global and local atom indices during DD repartioning.
476 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
477 {
478     const int                numZones               = dd->comm->zones.n;
479     const int*               zone2cg                = dd->comm->zones.cg_range;
480     const int*               zone_ncg1              = dd->comm->zone_ncg1;
481     gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
482
483     std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
484     gmx_ga2la_t&      ga2la             = *dd->ga2la;
485
486     if (zone2cg[1] != dd->ncg_home)
487     {
488         gmx_incons("dd->ncg_zone is not up to date");
489     }
490
491     /* Make the local to global and global to local atom index */
492     int a = atomStart;
493     globalAtomIndices.resize(a);
494     for (int zone = 0; zone < numZones; zone++)
495     {
496         int cg0;
497         if (zone == 0)
498         {
499             cg0 = atomStart;
500         }
501         else
502         {
503             cg0 = zone2cg[zone];
504         }
505         int cg1    = zone2cg[zone + 1];
506         int cg1_p1 = cg0 + zone_ncg1[zone];
507
508         for (int cg = cg0; cg < cg1; cg++)
509         {
510             int zone1 = zone;
511             if (cg >= cg1_p1)
512             {
513                 /* Signal that this cg is from more than one pulse away */
514                 zone1 += numZones;
515             }
516             int cg_gl = globalAtomGroupIndices[cg];
517             globalAtomIndices.push_back(cg_gl);
518             ga2la.insert(cg_gl, { a, zone1 });
519             a++;
520         }
521     }
522 }
523
524 //! Checks whether global and local atom indices are consistent.
525 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
526 {
527     int nerr = 0;
528
529     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
530
531     if (dd->comm->ddSettings.DD_debug > 1)
532     {
533         std::vector<int> have(natoms_sys);
534         for (int a = 0; a < numAtomsInZones; a++)
535         {
536             int globalAtomIndex = dd->globalAtomIndices[a];
537             if (have[globalAtomIndex] > 0)
538             {
539                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n",
540                         dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a + 1);
541             }
542             else
543             {
544                 have[globalAtomIndex] = a + 1;
545             }
546         }
547     }
548
549     std::vector<int> have(numAtomsInZones);
550
551     int ngl = 0;
552     for (int i = 0; i < natoms_sys; i++)
553     {
554         if (const auto entry = dd->ga2la->find(i))
555         {
556             const int a = entry->la;
557             if (a >= numAtomsInZones)
558             {
559                 fprintf(stderr,
560                         "DD rank %d: global atom %d marked as local atom %d, which is larger than "
561                         "nat_tot (%d)\n",
562                         dd->rank, i + 1, a + 1, numAtomsInZones);
563                 nerr++;
564             }
565             else
566             {
567                 have[a] = 1;
568                 if (dd->globalAtomIndices[a] != i)
569                 {
570                     fprintf(stderr,
571                             "DD rank %d: global atom %d marked as local atom %d, which has global "
572                             "atom index %d\n",
573                             dd->rank, i + 1, a + 1, dd->globalAtomIndices[a] + 1);
574                     nerr++;
575                 }
576             }
577             ngl++;
578         }
579     }
580     if (ngl != numAtomsInZones)
581     {
582         fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where,
583                 ngl, numAtomsInZones);
584     }
585     for (int a = 0; a < numAtomsInZones; a++)
586     {
587         if (have[a] == 0)
588         {
589             fprintf(stderr, "DD rank %d, %s: local atom %d, global %d has no global index\n",
590                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
591         }
592     }
593
594     if (nerr > 0)
595     {
596         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
597     }
598 }
599
600 //! Clear all DD global state indices
601 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
602 {
603     gmx_ga2la_t& ga2la = *dd->ga2la;
604
605     if (!keepLocalAtomIndices)
606     {
607         /* Clear the whole list without the overhead of searching */
608         ga2la.clear();
609     }
610     else
611     {
612         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
613         for (int i = 0; i < numAtomsInZones; i++)
614         {
615             ga2la.erase(dd->globalAtomIndices[i]);
616         }
617     }
618
619     dd_clear_local_vsite_indices(dd);
620
621     if (dd->constraints)
622     {
623         dd_clear_local_constraint_indices(dd);
624     }
625 }
626
627 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, gmx_bool bFatal)
628 {
629     gmx_domdec_comm_t* comm    = dd->comm;
630     bool               invalid = false;
631
632     for (int d = 1; d < dd->ndim; d++)
633     {
634         const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
635         const int                 dim       = dd->dim[d];
636         const real                limit     = grid_jump_limit(comm, cutoff, d);
637         real                      bfac      = ddbox->box_size[dim];
638         if (ddbox->tric_dir[dim])
639         {
640             bfac *= ddbox->skew_fac[dim];
641         }
642         if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
643             || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
644         {
645             invalid = true;
646
647             if (bFatal)
648             {
649                 char buf[22];
650
651                 /* This error should never be triggered under normal
652                  * circumstances, but you never know ...
653                  */
654                 gmx_fatal(FARGS,
655                           "step %s: The domain decomposition grid has shifted too much in the "
656                           "%c-direction around cell %d %d %d. This should not have happened. "
657                           "Running with fewer ranks might avoid this issue.",
658                           gmx_step_str(step, buf), dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
659             }
660         }
661     }
662
663     return invalid;
664 }
665 //! Return the duration of force calculations on this rank.
666 static float dd_force_load(gmx_domdec_comm_t* comm)
667 {
668     float load;
669
670     if (comm->ddSettings.eFlop)
671     {
672         load = comm->flop;
673         if (comm->ddSettings.eFlop > 1)
674         {
675             load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
676         }
677     }
678     else
679     {
680         load = comm->cycl[ddCyclF];
681         if (comm->cycl_n[ddCyclF] > 1)
682         {
683             /* Subtract the maximum of the last n cycle counts
684              * to get rid of possible high counts due to other sources,
685              * for instance system activity, that would otherwise
686              * affect the dynamic load balancing.
687              */
688             load -= comm->cycl_max[ddCyclF];
689         }
690
691 #if GMX_MPI
692         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
693         {
694             float gpu_wait, gpu_wait_sum;
695
696             gpu_wait = comm->cycl[ddCyclWaitGPU];
697             if (comm->cycl_n[ddCyclF] > 1)
698             {
699                 /* We should remove the WaitGPU time of the same MD step
700                  * as the one with the maximum F time, since the F time
701                  * and the wait time are not independent.
702                  * Furthermore, the step for the max F time should be chosen
703                  * the same on all ranks that share the same GPU.
704                  * But to keep the code simple, we remove the average instead.
705                  * The main reason for artificially long times at some steps
706                  * is spurious CPU activity or MPI time, so we don't expect
707                  * that changes in the GPU wait time matter a lot here.
708                  */
709                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
710             }
711             /* Sum the wait times over the ranks that share the same GPU */
712             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
713             /* Replace the wait time by the average over the ranks */
714             load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
715         }
716 #endif
717     }
718
719     return load;
720 }
721
722 //! Runs cell size checks and communicates the boundaries.
723 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
724 {
725     gmx_domdec_comm_t* comm;
726     int                dim_ind, dim;
727
728     comm = dd->comm;
729
730     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
731     {
732         dim = dd->dim[dim_ind];
733
734         /* Without PBC we don't have restrictions on the outer cells */
735         if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1))
736             && isDlbOn(comm)
737             && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
738         {
739             char buf[22];
740             gmx_fatal(FARGS,
741                       "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
742                       "than the smallest allowed cell size (%f) for domain decomposition grid cell "
743                       "%d %d %d",
744                       gmx_step_str(step, buf), dim2char(dim),
745                       comm->cell_x1[dim] - comm->cell_x0[dim], ddbox->skew_fac[dim],
746                       dd->comm->cellsize_min[dim], dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
747         }
748     }
749
750     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
751     {
752         /* Communicate the boundaries and update cell_ns_x0/1 */
753         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
754         if (isDlbOn(dd->comm) && dd->ndim > 1)
755         {
756             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
757         }
758     }
759 }
760
761 //! Compute and communicate to determine the load distribution across PP ranks.
762 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
763 {
764     gmx_domdec_comm_t* comm;
765     domdec_load_t*     load;
766     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
767     gmx_bool           bSepPME;
768
769     if (debug)
770     {
771         fprintf(debug, "get_load_distribution start\n");
772     }
773
774     wallcycle_start(wcycle, ewcDDCOMMLOAD);
775
776     comm = dd->comm;
777
778     bSepPME = (dd->pme_nodeid >= 0);
779
780     if (dd->ndim == 0 && bSepPME)
781     {
782         /* Without decomposition, but with PME nodes, we need the load */
783         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
784         comm->load[0].pme = comm->cycl[ddCyclPME];
785     }
786
787     for (int d = dd->ndim - 1; d >= 0; d--)
788     {
789         const DDCellsizesWithDlb* cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
790         const int                 dim       = dd->dim[d];
791         /* Check if we participate in the communication in this dimension */
792         if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
793         {
794             load = &comm->load[d];
795             if (isDlbOn(dd->comm))
796             {
797                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
798             }
799             int pos = 0;
800             if (d == dd->ndim - 1)
801             {
802                 sbuf[pos++] = dd_force_load(comm);
803                 sbuf[pos++] = sbuf[0];
804                 if (isDlbOn(dd->comm))
805                 {
806                     sbuf[pos++] = sbuf[0];
807                     sbuf[pos++] = cell_frac;
808                     if (d > 0)
809                     {
810                         sbuf[pos++] = cellsizes->fracLowerMax;
811                         sbuf[pos++] = cellsizes->fracUpperMin;
812                     }
813                 }
814                 if (bSepPME)
815                 {
816                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
817                     sbuf[pos++] = comm->cycl[ddCyclPME];
818                 }
819             }
820             else
821             {
822                 sbuf[pos++] = comm->load[d + 1].sum;
823                 sbuf[pos++] = comm->load[d + 1].max;
824                 if (isDlbOn(dd->comm))
825                 {
826                     sbuf[pos++] = comm->load[d + 1].sum_m;
827                     sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
828                     sbuf[pos++] = comm->load[d + 1].flags;
829                     if (d > 0)
830                     {
831                         sbuf[pos++] = cellsizes->fracLowerMax;
832                         sbuf[pos++] = cellsizes->fracUpperMin;
833                     }
834                 }
835                 if (bSepPME)
836                 {
837                     sbuf[pos++] = comm->load[d + 1].mdf;
838                     sbuf[pos++] = comm->load[d + 1].pme;
839                 }
840             }
841             load->nload = pos;
842             /* Communicate a row in DD direction d.
843              * The communicators are setup such that the root always has rank 0.
844              */
845 #if GMX_MPI
846             MPI_Gather(sbuf, load->nload * sizeof(float), MPI_BYTE, load->load,
847                        load->nload * sizeof(float), MPI_BYTE, 0, comm->mpi_comm_load[d]);
848 #endif
849             if (dd->ci[dim] == dd->master_ci[dim])
850             {
851                 /* We are the master along this row, process this row */
852                 RowMaster* rowMaster = nullptr;
853
854                 if (isDlbOn(comm))
855                 {
856                     rowMaster = cellsizes->rowMaster.get();
857                 }
858                 load->sum      = 0;
859                 load->max      = 0;
860                 load->sum_m    = 0;
861                 load->cvol_min = 1;
862                 load->flags    = 0;
863                 load->mdf      = 0;
864                 load->pme      = 0;
865                 int pos        = 0;
866                 for (int i = 0; i < dd->nc[dim]; i++)
867                 {
868                     load->sum += load->load[pos++];
869                     load->max = std::max(load->max, load->load[pos]);
870                     pos++;
871                     if (isDlbOn(dd->comm))
872                     {
873                         if (rowMaster->dlbIsLimited)
874                         {
875                             /* This direction could not be load balanced properly,
876                              * therefore we need to use the maximum iso the average load.
877                              */
878                             load->sum_m = std::max(load->sum_m, load->load[pos]);
879                         }
880                         else
881                         {
882                             load->sum_m += load->load[pos];
883                         }
884                         pos++;
885                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
886                         pos++;
887                         if (d < dd->ndim - 1)
888                         {
889                             load->flags = gmx::roundToInt(load->load[pos++]);
890                         }
891                         if (d > 0)
892                         {
893                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
894                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
895                         }
896                     }
897                     if (bSepPME)
898                     {
899                         load->mdf = std::max(load->mdf, load->load[pos]);
900                         pos++;
901                         load->pme = std::max(load->pme, load->load[pos]);
902                         pos++;
903                     }
904                 }
905                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
906                 {
907                     load->sum_m *= dd->nc[dim];
908                     load->flags |= (1 << d);
909                 }
910             }
911         }
912     }
913
914     if (DDMASTER(dd))
915     {
916         comm->nload += dd_load_count(comm);
917         comm->load_step += comm->cycl[ddCyclStep];
918         comm->load_sum += comm->load[0].sum;
919         comm->load_max += comm->load[0].max;
920         if (isDlbOn(comm))
921         {
922             for (int d = 0; d < dd->ndim; d++)
923             {
924                 if (comm->load[0].flags & (1 << d))
925                 {
926                     comm->load_lim[d]++;
927                 }
928             }
929         }
930         if (bSepPME)
931         {
932             comm->load_mdf += comm->load[0].mdf;
933             comm->load_pme += comm->load[0].pme;
934         }
935     }
936
937     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
938
939     if (debug)
940     {
941         fprintf(debug, "get_load_distribution finished\n");
942     }
943 }
944
945 /*! \brief Return the relative performance loss on the total run time
946  * due to the force calculation load imbalance. */
947 static float dd_force_load_fraction(gmx_domdec_t* dd)
948 {
949     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
950     {
951         return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
952     }
953     else
954     {
955         return 0;
956     }
957 }
958
959 /*! \brief Return the relative performance loss on the total run time
960  * due to the force calculation load imbalance. */
961 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
962 {
963     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
964     {
965         return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
966     }
967     else
968     {
969         return 0;
970     }
971 }
972
973 //! Print load-balance report e.g. at the end of a run.
974 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
975 {
976     gmx_domdec_comm_t* comm = dd->comm;
977
978     /* Only the master rank prints loads and only if we measured loads */
979     if (!DDMASTER(dd) || comm->nload == 0)
980     {
981         return;
982     }
983
984     char buf[STRLEN];
985     int  numPpRanks  = dd->nnodes;
986     int  numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
987     int  numRanks    = numPpRanks + numPmeRanks;
988     float lossFraction = 0;
989
990     /* Print the average load imbalance and performance loss */
991     if (dd->nnodes > 1 && comm->load_sum > 0)
992     {
993         float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
994         lossFraction    = dd_force_imb_perf_loss(dd);
995
996         std::string msg = "\nDynamic load balancing report:\n";
997         std::string dlbStateStr;
998
999         switch (dd->comm->dlbState)
1000         {
1001             case DlbState::offUser:
1002                 dlbStateStr = "DLB was off during the run per user request.";
1003                 break;
1004             case DlbState::offForever:
1005                 /* Currectly this can happen due to performance loss observed, cell size
1006                  * limitations or incompatibility with other settings observed during
1007                  * determineInitialDlbState(). */
1008                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1009                 break;
1010             case DlbState::offCanTurnOn:
1011                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1012                 break;
1013             case DlbState::offTemporarilyLocked:
1014                 dlbStateStr =
1015                         "DLB was locked at the end of the run due to unfinished PP-PME "
1016                         "balancing.";
1017                 break;
1018             case DlbState::onCanTurnOff:
1019                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1020                 break;
1021             case DlbState::onUser:
1022                 dlbStateStr = "DLB was permanently on during the run per user request.";
1023                 break;
1024             default: GMX_ASSERT(false, "Undocumented DLB state");
1025         }
1026
1027         msg += " " + dlbStateStr + "\n";
1028         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1029         msg += gmx::formatString(
1030                 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1031                 "this.\n",
1032                 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1033         msg += gmx::formatString(
1034                 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1035                 lossFraction * 100);
1036         fprintf(fplog, "%s", msg.c_str());
1037         fprintf(stderr, "\n%s", msg.c_str());
1038     }
1039
1040     /* Print during what percentage of steps the  load balancing was limited */
1041     bool dlbWasLimited = false;
1042     if (isDlbOn(comm))
1043     {
1044         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1045         for (int d = 0; d < dd->ndim; d++)
1046         {
1047             int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1048             sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1049             if (limitPercentage >= 50)
1050             {
1051                 dlbWasLimited = true;
1052             }
1053         }
1054         sprintf(buf + strlen(buf), "\n");
1055         fprintf(fplog, "%s", buf);
1056         fprintf(stderr, "%s", buf);
1057     }
1058
1059     /* Print the performance loss due to separate PME - PP rank imbalance */
1060     float lossFractionPme = 0;
1061     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1062     {
1063         float pmeForceRatio = comm->load_pme / comm->load_mdf;
1064         lossFractionPme     = (comm->load_pme - comm->load_mdf) / comm->load_step;
1065         if (lossFractionPme <= 0)
1066         {
1067             lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1068         }
1069         else
1070         {
1071             lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1072         }
1073         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1074         fprintf(fplog, "%s", buf);
1075         fprintf(stderr, "%s", buf);
1076         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1077                 std::fabs(lossFractionPme) * 100);
1078         fprintf(fplog, "%s", buf);
1079         fprintf(stderr, "%s", buf);
1080     }
1081     fprintf(fplog, "\n");
1082     fprintf(stderr, "\n");
1083
1084     if (lossFraction >= DD_PERF_LOSS_WARN)
1085     {
1086         std::string message = gmx::formatString(
1087                 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1088                 "      in the domain decomposition.\n",
1089                 lossFraction * 100);
1090
1091         bool hadSuggestion = false;
1092         if (!isDlbOn(comm))
1093         {
1094             message += "      You might want to use dynamic load balancing (option -dlb.)\n";
1095             hadSuggestion = true;
1096         }
1097         else if (dlbWasLimited)
1098         {
1099             message +=
1100                     "      You might want to decrease the cell size limit (options -rdd, -rcon "
1101                     "and/or -dds).\n";
1102             hadSuggestion = true;
1103         }
1104         message += gmx::formatString(
1105                 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1106                 "      e.g. by using fewer domains along the box dimension in which there is\n"
1107                 "      considerable inhomogeneity in the simulated system.",
1108                 hadSuggestion ? "also " : "");
1109
1110
1111         fprintf(fplog, "%s\n", message.c_str());
1112         fprintf(stderr, "%s\n", message.c_str());
1113     }
1114     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1115     {
1116         sprintf(buf,
1117                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1118                 "      had %s work to do than the PP ranks.\n"
1119                 "      You might want to %s the number of PME ranks\n"
1120                 "      or %s the cut-off and the grid spacing.\n",
1121                 std::fabs(lossFractionPme * 100), (lossFractionPme < 0) ? "less" : "more",
1122                 (lossFractionPme < 0) ? "decrease" : "increase",
1123                 (lossFractionPme < 0) ? "decrease" : "increase");
1124         fprintf(fplog, "%s\n", buf);
1125         fprintf(stderr, "%s\n", buf);
1126     }
1127 }
1128
1129 //! Return the minimum communication volume.
1130 static float dd_vol_min(gmx_domdec_t* dd)
1131 {
1132     return dd->comm->load[0].cvol_min * dd->nnodes;
1133 }
1134
1135 //! Return the DD load flags.
1136 static int dd_load_flags(gmx_domdec_t* dd)
1137 {
1138     return dd->comm->load[0].flags;
1139 }
1140
1141 //! Return the reported load imbalance in force calculations.
1142 static float dd_f_imbal(gmx_domdec_t* dd)
1143 {
1144     if (dd->comm->load[0].sum > 0)
1145     {
1146         return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1147     }
1148     else
1149     {
1150         /* Something is wrong in the cycle counting, report no load imbalance */
1151         return 0.0F;
1152     }
1153 }
1154
1155 //! Returns DD load balance report.
1156 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1157 {
1158     gmx::StringOutputStream stream;
1159     gmx::TextWriter         log(&stream);
1160
1161     int flags = dd_load_flags(dd);
1162     if (flags)
1163     {
1164         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1165         for (int d = 0; d < dd->ndim; d++)
1166         {
1167             if (flags & (1 << d))
1168             {
1169                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1170             }
1171         }
1172         log.ensureLineBreak();
1173     }
1174     log.writeString("DD  step " + gmx::toString(step));
1175     if (isDlbOn(dd->comm))
1176     {
1177         log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1178     }
1179     if (dd->nnodes > 1)
1180     {
1181         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1182     }
1183     if (dd->comm->cycl_n[ddCyclPME])
1184     {
1185         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1186     }
1187     log.ensureLineBreak();
1188     return stream.toString();
1189 }
1190
1191 //! Prints DD load balance report in mdrun verbose mode.
1192 static void dd_print_load_verbose(gmx_domdec_t* dd)
1193 {
1194     if (isDlbOn(dd->comm))
1195     {
1196         fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1197     }
1198     if (dd->nnodes > 1)
1199     {
1200         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1201     }
1202     if (dd->comm->cycl_n[ddCyclPME])
1203     {
1204         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1205     }
1206 }
1207
1208 //! Turns on dynamic load balancing if possible and needed.
1209 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1210 {
1211     gmx_domdec_comm_t* comm = dd->comm;
1212
1213     real cellsize_min = comm->cellsize_min[dd->dim[0]];
1214     for (int d = 1; d < dd->ndim; d++)
1215     {
1216         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1217     }
1218
1219     /* Turn off DLB if we're too close to the cell size limit. */
1220     if (cellsize_min < comm->cellsize_limit * 1.05)
1221     {
1222         GMX_LOG(mdlog.info)
1223                 .appendTextFormatted(
1224                         "step %s Measured %.1f %% performance loss due to load imbalance, "
1225                         "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1226                         "Will no longer try dynamic load balancing.",
1227                         gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1228
1229         comm->dlbState = DlbState::offForever;
1230         return;
1231     }
1232
1233     GMX_LOG(mdlog.info)
1234             .appendTextFormatted(
1235                     "step %s Turning on dynamic load balancing, because the performance loss due "
1236                     "to load imbalance is %.1f %%.",
1237                     gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd) * 100);
1238     comm->dlbState = DlbState::onCanTurnOff;
1239
1240     /* Store the non-DLB performance, so we can check if DLB actually
1241      * improves performance.
1242      */
1243     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1244                        "When we turned on DLB, we should have measured cycles");
1245     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1246
1247     set_dlb_limits(dd);
1248
1249     /* We can set the required cell size info here,
1250      * so we do not need to communicate this.
1251      * The grid is completely uniform.
1252      */
1253     for (int d = 0; d < dd->ndim; d++)
1254     {
1255         RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1256
1257         if (rowMaster)
1258         {
1259             comm->load[d].sum_m = comm->load[d].sum;
1260
1261             int nc = dd->nc[dd->dim[d]];
1262             for (int i = 0; i < nc; i++)
1263             {
1264                 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1265                 if (d > 0)
1266                 {
1267                     rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1268                     rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1269                 }
1270             }
1271             rowMaster->cellFrac[nc] = 1.0;
1272         }
1273     }
1274 }
1275
1276 //! Turns off dynamic load balancing (but leave it able to turn back on).
1277 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1278 {
1279     GMX_LOG(mdlog.info)
1280             .appendText(
1281                     "step " + gmx::toString(step)
1282                     + " Turning off dynamic load balancing, because it is degrading performance.");
1283     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1284     dd->comm->haveTurnedOffDlb             = true;
1285     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1286 }
1287
1288 //! Turns off dynamic load balancing permanently.
1289 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1290 {
1291     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1292                        "Can only turn off DLB forever when it was in the can-turn-on state");
1293     GMX_LOG(mdlog.info)
1294             .appendText(
1295                     "step " + gmx::toString(step)
1296                     + " Will no longer try dynamic load balancing, as it degraded performance.");
1297     dd->comm->dlbState = DlbState::offForever;
1298 }
1299
1300 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1301 {
1302     gmx_domdec_comm_t* comm;
1303
1304     comm = cr->dd->comm;
1305
1306     /* Turn on the DLB limiting (might have been on already) */
1307     comm->bPMELoadBalDLBLimits = TRUE;
1308
1309     /* Change the cut-off limit */
1310     comm->PMELoadBal_max_cutoff = cutoff;
1311
1312     if (debug)
1313     {
1314         fprintf(debug,
1315                 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1316                 "continue to fit\n",
1317                 comm->PMELoadBal_max_cutoff);
1318     }
1319 }
1320
1321 //! Merge atom buffers.
1322 static void merge_cg_buffers(int                            ncell,
1323                              gmx_domdec_comm_dim_t*         cd,
1324                              int                            pulse,
1325                              int*                           ncg_cell,
1326                              gmx::ArrayRef<int>             index_gl,
1327                              const int*                     recv_i,
1328                              gmx::ArrayRef<gmx::RVec>       x,
1329                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1330                              gmx::ArrayRef<cginfo_mb_t>     cginfo_mb,
1331                              gmx::ArrayRef<int>             cginfo)
1332 {
1333     gmx_domdec_ind_t *ind, *ind_p;
1334     int               p, cell, c, cg, cg0, cg1, cg_gl;
1335     int               shift;
1336
1337     ind = &cd->ind[pulse];
1338
1339     /* First correct the already stored data */
1340     shift = ind->nrecv[ncell];
1341     for (cell = ncell - 1; cell >= 0; cell--)
1342     {
1343         shift -= ind->nrecv[cell];
1344         if (shift > 0)
1345         {
1346             /* Move the cg's present from previous grid pulses */
1347             cg0 = ncg_cell[ncell + cell];
1348             cg1 = ncg_cell[ncell + cell + 1];
1349             for (cg = cg1 - 1; cg >= cg0; cg--)
1350             {
1351                 index_gl[cg + shift] = index_gl[cg];
1352                 x[cg + shift]        = x[cg];
1353                 cginfo[cg + shift]   = cginfo[cg];
1354             }
1355             /* Correct the already stored send indices for the shift */
1356             for (p = 1; p <= pulse; p++)
1357             {
1358                 ind_p = &cd->ind[p];
1359                 cg0   = 0;
1360                 for (c = 0; c < cell; c++)
1361                 {
1362                     cg0 += ind_p->nsend[c];
1363                 }
1364                 cg1 = cg0 + ind_p->nsend[cell];
1365                 for (cg = cg0; cg < cg1; cg++)
1366                 {
1367                     ind_p->index[cg] += shift;
1368                 }
1369             }
1370         }
1371     }
1372
1373     /* Merge in the communicated buffers */
1374     shift = 0;
1375     cg0   = 0;
1376     for (cell = 0; cell < ncell; cell++)
1377     {
1378         cg1 = ncg_cell[ncell + cell + 1] + shift;
1379         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1380         {
1381             /* Copy this atom from the buffer */
1382             index_gl[cg1] = recv_i[cg0];
1383             x[cg1]        = recv_vr[cg0];
1384             /* Copy information */
1385             cg_gl       = index_gl[cg1];
1386             cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1387             cg0++;
1388             cg1++;
1389         }
1390         shift += ind->nrecv[cell];
1391         ncg_cell[ncell + cell + 1] = cg1;
1392     }
1393 }
1394
1395 //! Makes a range partitioning for the atom groups wthin a cell
1396 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1397 {
1398     /* Store the atom block boundaries for easy copying of communication buffers
1399      */
1400     int g = atomGroupStart;
1401     for (int zone = 0; zone < nzone; zone++)
1402     {
1403         for (gmx_domdec_ind_t& ind : cd->ind)
1404         {
1405             ind.cell2at0[zone] = g;
1406             g += ind.nrecv[zone];
1407             ind.cell2at1[zone] = g;
1408         }
1409     }
1410 }
1411
1412 //! Returns whether a link is missing.
1413 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1414 {
1415     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1416     {
1417         if (!ga2la.findHome(link.a[i]))
1418         {
1419             return true;
1420         }
1421     }
1422
1423     return false;
1424 }
1425
1426 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1427 typedef struct
1428 {
1429     //! The corners for the non-bonded communication.
1430     real c[DIM][4];
1431     //! Corner for rounding.
1432     real cr0;
1433     //! Corners for rounding.
1434     real cr1[4];
1435     //! Corners for bounded communication.
1436     real bc[DIM];
1437     //! Corner for rounding for bonded communication.
1438     real bcr1;
1439 } dd_corners_t;
1440
1441 //! Determine the corners of the domain(s) we are communicating with.
1442 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1443 {
1444     const gmx_domdec_comm_t*  comm;
1445     const gmx_domdec_zones_t* zones;
1446
1447     comm = dd->comm;
1448
1449     zones = &comm->zones;
1450
1451     /* Keep the compiler happy */
1452     c->cr0  = 0;
1453     c->bcr1 = 0;
1454
1455     /* The first dimension is equal for all cells */
1456     c->c[0][0] = comm->cell_x0[dim0];
1457     if (bDistMB)
1458     {
1459         c->bc[0] = c->c[0][0];
1460     }
1461     if (dd->ndim >= 2)
1462     {
1463         dim1 = dd->dim[1];
1464         /* This cell row is only seen from the first row */
1465         c->c[1][0] = comm->cell_x0[dim1];
1466         /* All rows can see this row */
1467         c->c[1][1] = comm->cell_x0[dim1];
1468         if (isDlbOn(dd->comm))
1469         {
1470             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1471             if (bDistMB)
1472             {
1473                 /* For the multi-body distance we need the maximum */
1474                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1475             }
1476         }
1477         /* Set the upper-right corner for rounding */
1478         c->cr0 = comm->cell_x1[dim0];
1479
1480         if (dd->ndim >= 3)
1481         {
1482             dim2 = dd->dim[2];
1483             for (int j = 0; j < 4; j++)
1484             {
1485                 c->c[2][j] = comm->cell_x0[dim2];
1486             }
1487             if (isDlbOn(dd->comm))
1488             {
1489                 /* Use the maximum of the i-cells that see a j-cell */
1490                 for (const auto& iZone : zones->iZones)
1491                 {
1492                     const int iZoneIndex = iZone.iZoneIndex;
1493                     for (int jZone : iZone.jZoneRange)
1494                     {
1495                         if (jZone >= 4)
1496                         {
1497                             c->c[2][jZone - 4] = std::max(
1498                                     c->c[2][jZone - 4],
1499                                     comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1500                                             .mch0);
1501                         }
1502                     }
1503                 }
1504                 if (bDistMB)
1505                 {
1506                     /* For the multi-body distance we need the maximum */
1507                     c->bc[2] = comm->cell_x0[dim2];
1508                     for (int i = 0; i < 2; i++)
1509                     {
1510                         for (int j = 0; j < 2; j++)
1511                         {
1512                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1513                         }
1514                     }
1515                 }
1516             }
1517
1518             /* Set the upper-right corner for rounding */
1519             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1520              * Only cell (0,0,0) can see cell 7 (1,1,1)
1521              */
1522             c->cr1[0] = comm->cell_x1[dim1];
1523             c->cr1[3] = comm->cell_x1[dim1];
1524             if (isDlbOn(dd->comm))
1525             {
1526                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1527                 if (bDistMB)
1528                 {
1529                     /* For the multi-body distance we need the maximum */
1530                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1531                 }
1532             }
1533         }
1534     }
1535 }
1536
1537 /*! \brief Add the atom groups we need to send in this pulse from this
1538  * zone to \p localAtomGroups and \p work. */
1539 static void get_zone_pulse_cgs(gmx_domdec_t*            dd,
1540                                int                      zonei,
1541                                int                      zone,
1542                                int                      cg0,
1543                                int                      cg1,
1544                                gmx::ArrayRef<const int> globalAtomGroupIndices,
1545                                int                      dim,
1546                                int                      dim_ind,
1547                                int                      dim0,
1548                                int                      dim1,
1549                                int                      dim2,
1550                                real                     r_comm2,
1551                                real                     r_bcomm2,
1552                                matrix                   box,
1553                                bool                     distanceIsTriclinic,
1554                                rvec*                    normal,
1555                                real                     skew_fac2_d,
1556                                real                     skew_fac_01,
1557                                rvec*                    v_d,
1558                                rvec*                    v_0,
1559                                rvec*                    v_1,
1560                                const dd_corners_t*      c,
1561                                const rvec               sf2_round,
1562                                gmx_bool                 bDistBonded,
1563                                gmx_bool                 bBondComm,
1564                                gmx_bool                 bDist2B,
1565                                gmx_bool                 bDistMB,
1566                                rvec*                    cg_cm,
1567                                gmx::ArrayRef<const int> cginfo,
1568                                std::vector<int>*        localAtomGroups,
1569                                dd_comm_setup_work_t*    work)
1570 {
1571     gmx_domdec_comm_t* comm;
1572     gmx_bool           bScrew;
1573     gmx_bool           bDistMB_pulse;
1574     int                cg, i;
1575     real               r2, rb2, r, tric_sh;
1576     rvec               rn, rb;
1577     int                dimd;
1578     int                nsend_z, nat;
1579
1580     comm = dd->comm;
1581
1582     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1583
1584     bDistMB_pulse = (bDistMB && bDistBonded);
1585
1586     /* Unpack the work data */
1587     std::vector<int>&       ibuf = work->atomGroupBuffer;
1588     std::vector<gmx::RVec>& vbuf = work->positionBuffer;
1589     nsend_z                      = 0;
1590     nat                          = work->nat;
1591
1592     for (cg = cg0; cg < cg1; cg++)
1593     {
1594         r2  = 0;
1595         rb2 = 0;
1596         if (!distanceIsTriclinic)
1597         {
1598             /* Rectangular direction, easy */
1599             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1600             if (r > 0)
1601             {
1602                 r2 += r * r;
1603             }
1604             if (bDistMB_pulse)
1605             {
1606                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1607                 if (r > 0)
1608                 {
1609                     rb2 += r * r;
1610                 }
1611             }
1612             /* Rounding gives at most a 16% reduction
1613              * in communicated atoms
1614              */
1615             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1616             {
1617                 r = cg_cm[cg][dim0] - c->cr0;
1618                 /* This is the first dimension, so always r >= 0 */
1619                 r2 += r * r;
1620                 if (bDistMB_pulse)
1621                 {
1622                     rb2 += r * r;
1623                 }
1624             }
1625             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1626             {
1627                 r = cg_cm[cg][dim1] - c->cr1[zone];
1628                 if (r > 0)
1629                 {
1630                     r2 += r * r;
1631                 }
1632                 if (bDistMB_pulse)
1633                 {
1634                     r = cg_cm[cg][dim1] - c->bcr1;
1635                     if (r > 0)
1636                     {
1637                         rb2 += r * r;
1638                     }
1639                 }
1640             }
1641         }
1642         else
1643         {
1644             /* Triclinic direction, more complicated */
1645             clear_rvec(rn);
1646             clear_rvec(rb);
1647             /* Rounding, conservative as the skew_fac multiplication
1648              * will slightly underestimate the distance.
1649              */
1650             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1651             {
1652                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1653                 for (i = dim0 + 1; i < DIM; i++)
1654                 {
1655                     rn[dim0] -= cg_cm[cg][i] * v_0[i][dim0];
1656                 }
1657                 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1658                 if (bDistMB_pulse)
1659                 {
1660                     rb[dim0] = rn[dim0];
1661                     rb2      = r2;
1662                 }
1663                 /* Take care that the cell planes along dim0 might not
1664                  * be orthogonal to those along dim1 and dim2.
1665                  */
1666                 for (i = 1; i <= dim_ind; i++)
1667                 {
1668                     dimd = dd->dim[i];
1669                     if (normal[dim0][dimd] > 0)
1670                     {
1671                         rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1672                         if (bDistMB_pulse)
1673                         {
1674                             rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1675                         }
1676                     }
1677                 }
1678             }
1679             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1680             {
1681                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1682                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1683                 tric_sh = 0;
1684                 for (i = dim1 + 1; i < DIM; i++)
1685                 {
1686                     tric_sh -= cg_cm[cg][i] * v_1[i][dim1];
1687                 }
1688                 rn[dim1] += tric_sh;
1689                 if (rn[dim1] > 0)
1690                 {
1691                     r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1692                     /* Take care of coupling of the distances
1693                      * to the planes along dim0 and dim1 through dim2.
1694                      */
1695                     r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1696                     /* Take care that the cell planes along dim1
1697                      * might not be orthogonal to that along dim2.
1698                      */
1699                     if (normal[dim1][dim2] > 0)
1700                     {
1701                         rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1702                     }
1703                 }
1704                 if (bDistMB_pulse)
1705                 {
1706                     rb[dim1] += cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1707                     if (rb[dim1] > 0)
1708                     {
1709                         rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1710                         /* Take care of coupling of the distances
1711                          * to the planes along dim0 and dim1 through dim2.
1712                          */
1713                         rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1714                         /* Take care that the cell planes along dim1
1715                          * might not be orthogonal to that along dim2.
1716                          */
1717                         if (normal[dim1][dim2] > 0)
1718                         {
1719                             rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1720                         }
1721                     }
1722                 }
1723             }
1724             /* The distance along the communication direction */
1725             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1726             tric_sh = 0;
1727             for (i = dim + 1; i < DIM; i++)
1728             {
1729                 tric_sh -= cg_cm[cg][i] * v_d[i][dim];
1730             }
1731             rn[dim] += tric_sh;
1732             if (rn[dim] > 0)
1733             {
1734                 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1735                 /* Take care of coupling of the distances
1736                  * to the planes along dim0 and dim1 through dim2.
1737                  */
1738                 if (dim_ind == 1 && zonei == 1)
1739                 {
1740                     r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1741                 }
1742             }
1743             if (bDistMB_pulse)
1744             {
1745                 clear_rvec(rb);
1746                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1747                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1748                 if (rb[dim] > 0)
1749                 {
1750                     rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1751                     /* Take care of coupling of the distances
1752                      * to the planes along dim0 and dim1 through dim2.
1753                      */
1754                     if (dim_ind == 1 && zonei == 1)
1755                     {
1756                         rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
1757                     }
1758                 }
1759             }
1760         }
1761
1762         if (r2 < r_comm2
1763             || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1764                 && (!bBondComm
1765                     || (GET_CGINFO_BOND_INTER(cginfo[cg])
1766                         && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1767         {
1768             /* Store the local and global atom group indices and position */
1769             localAtomGroups->push_back(cg);
1770             ibuf.push_back(globalAtomGroupIndices[cg]);
1771             nsend_z++;
1772
1773             rvec posPbc;
1774             if (dd->ci[dim] == 0)
1775             {
1776                 /* Correct cg_cm for pbc */
1777                 rvec_add(cg_cm[cg], box[dim], posPbc);
1778                 if (bScrew)
1779                 {
1780                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1781                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1782                 }
1783             }
1784             else
1785             {
1786                 copy_rvec(cg_cm[cg], posPbc);
1787             }
1788             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1789
1790             nat += 1;
1791         }
1792     }
1793
1794     work->nat        = nat;
1795     work->nsend_zone = nsend_z;
1796 }
1797
1798 //! Clear data.
1799 static void clearCommSetupData(dd_comm_setup_work_t* work)
1800 {
1801     work->localAtomGroupBuffer.clear();
1802     work->atomGroupBuffer.clear();
1803     work->positionBuffer.clear();
1804     work->nat        = 0;
1805     work->nsend_zone = 0;
1806 }
1807
1808 //! Prepare DD communication.
1809 static void setup_dd_communication(gmx_domdec_t*                dd,
1810                                    matrix                       box,
1811                                    gmx_ddbox_t*                 ddbox,
1812                                    t_forcerec*                  fr,
1813                                    t_state*                     state,
1814                                    PaddedHostVector<gmx::RVec>* f)
1815 {
1816     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1817     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1818     int                    c;
1819     int *                  zone_cg_range, pos_cg;
1820     gmx_domdec_comm_t*     comm;
1821     gmx_domdec_zones_t*    zones;
1822     gmx_domdec_comm_dim_t* cd;
1823     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1824     dd_corners_t           corners;
1825     rvec *                 normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1826     real                   skew_fac2_d, skew_fac_01;
1827     rvec                   sf2_round;
1828
1829     if (debug)
1830     {
1831         fprintf(debug, "Setting up DD communication\n");
1832     }
1833
1834     comm = dd->comm;
1835
1836     if (comm->dth.empty())
1837     {
1838         /* Initialize the thread data.
1839          * This can not be done in init_domain_decomposition,
1840          * as the numbers of threads is determined later.
1841          */
1842         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1843         comm->dth.resize(numThreads);
1844     }
1845
1846     bBondComm = comm->systemInfo.filterBondedCommunication;
1847
1848     /* Do we need to determine extra distances for multi-body bondeds? */
1849     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1850
1851     /* Do we need to determine extra distances for only two-body bondeds? */
1852     bDist2B = (bBondComm && !bDistMB);
1853
1854     const real r_comm2 =
1855             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1856     const real r_bcomm2 =
1857             gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1858
1859     if (debug)
1860     {
1861         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1862     }
1863
1864     zones = &comm->zones;
1865
1866     dim0 = dd->dim[0];
1867     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1868     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1869
1870     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1871
1872     /* Triclinic stuff */
1873     normal      = ddbox->normal;
1874     skew_fac_01 = 0;
1875     if (dd->ndim >= 2)
1876     {
1877         v_0 = ddbox->v[dim0];
1878         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1879         {
1880             /* Determine the coupling coefficient for the distances
1881              * to the cell planes along dim0 and dim1 through dim2.
1882              * This is required for correct rounding.
1883              */
1884             skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1885             if (debug)
1886             {
1887                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1888             }
1889         }
1890     }
1891     if (dd->ndim >= 3)
1892     {
1893         v_1 = ddbox->v[dim1];
1894     }
1895
1896     zone_cg_range                        = zones->cg_range;
1897     gmx::ArrayRef<cginfo_mb_t> cginfo_mb = fr->cginfo_mb;
1898
1899     zone_cg_range[0]   = 0;
1900     zone_cg_range[1]   = dd->ncg_home;
1901     comm->zone_ncg1[0] = dd->ncg_home;
1902     pos_cg             = dd->ncg_home;
1903
1904     nat_tot = comm->atomRanges.numHomeAtoms();
1905     nzone   = 1;
1906     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1907     {
1908         dim = dd->dim[dim_ind];
1909         cd  = &comm->cd[dim_ind];
1910
1911         /* Check if we need to compute triclinic distances along this dim */
1912         bool distanceIsTriclinic = false;
1913         for (int i = 0; i <= dim_ind; i++)
1914         {
1915             if (ddbox->tric_dir[dd->dim[i]])
1916             {
1917                 distanceIsTriclinic = true;
1918             }
1919         }
1920
1921         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1922         {
1923             /* No pbc in this dimension, the first node should not comm. */
1924             nzone_send = 0;
1925         }
1926         else
1927         {
1928             nzone_send = nzone;
1929         }
1930
1931         v_d         = ddbox->v[dim];
1932         skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1933
1934         cd->receiveInPlace = true;
1935         for (int p = 0; p < cd->numPulses(); p++)
1936         {
1937             /* Only atoms communicated in the first pulse are used
1938              * for multi-body bonded interactions or for bBondComm.
1939              */
1940             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1941
1942             gmx_domdec_ind_t* ind = &cd->ind[p];
1943
1944             /* Thread 0 writes in the global index array */
1945             ind->index.clear();
1946             clearCommSetupData(&comm->dth[0]);
1947
1948             for (zone = 0; zone < nzone_send; zone++)
1949             {
1950                 if (dim_ind > 0 && distanceIsTriclinic)
1951                 {
1952                     /* Determine slightly more optimized skew_fac's
1953                      * for rounding.
1954                      * This reduces the number of communicated atoms
1955                      * by about 10% for 3D DD of rhombic dodecahedra.
1956                      */
1957                     for (dimd = 0; dimd < dim; dimd++)
1958                     {
1959                         sf2_round[dimd] = 1;
1960                         if (ddbox->tric_dir[dimd])
1961                         {
1962                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1963                             {
1964                                 /* If we are shifted in dimension i
1965                                  * and the cell plane is tilted forward
1966                                  * in dimension i, skip this coupling.
1967                                  */
1968                                 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1969                                 {
1970                                     sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1971                                 }
1972                             }
1973                             sf2_round[dimd] = 1 / sf2_round[dimd];
1974                         }
1975                     }
1976                 }
1977
1978                 zonei = zone_perm[dim_ind][zone];
1979                 if (p == 0)
1980                 {
1981                     /* Here we permutate the zones to obtain a convenient order
1982                      * for neighbor searching
1983                      */
1984                     cg0 = zone_cg_range[zonei];
1985                     cg1 = zone_cg_range[zonei + 1];
1986                 }
1987                 else
1988                 {
1989                     /* Look only at the cg's received in the previous grid pulse
1990                      */
1991                     cg1 = zone_cg_range[nzone + zone + 1];
1992                     cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
1993                 }
1994
1995                 const int numThreads = gmx::ssize(comm->dth);
1996 #pragma omp parallel for num_threads(numThreads) schedule(static)
1997                 for (int th = 0; th < numThreads; th++)
1998                 {
1999                     try
2000                     {
2001                         dd_comm_setup_work_t& work = comm->dth[th];
2002
2003                         /* Retain data accumulated into buffers of thread 0 */
2004                         if (th > 0)
2005                         {
2006                             clearCommSetupData(&work);
2007                         }
2008
2009                         int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2010                         int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2011
2012                         /* Get the cg's for this pulse in this zone */
2013                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th, dd->globalAtomGroupIndices,
2014                                            dim, dim_ind, dim0, dim1, dim2, r_comm2, r_bcomm2, box,
2015                                            distanceIsTriclinic, normal, skew_fac2_d, skew_fac_01,
2016                                            v_d, v_0, v_1, &corners, sf2_round, bDistBonded, bBondComm,
2017                                            bDist2B, bDistMB, state->x.rvec_array(), fr->cginfo,
2018                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer, &work);
2019                     }
2020                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2021                 } // END
2022
2023                 std::vector<int>&       atomGroups = comm->dth[0].atomGroupBuffer;
2024                 std::vector<gmx::RVec>& positions  = comm->dth[0].positionBuffer;
2025                 ind->nsend[zone]                   = comm->dth[0].nsend_zone;
2026                 /* Append data of threads>=1 to the communication buffers */
2027                 for (int th = 1; th < numThreads; th++)
2028                 {
2029                     const dd_comm_setup_work_t& dth = comm->dth[th];
2030
2031                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(),
2032                                       dth.localAtomGroupBuffer.end());
2033                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(),
2034                                       dth.atomGroupBuffer.end());
2035                     positions.insert(positions.end(), dth.positionBuffer.begin(),
2036                                      dth.positionBuffer.end());
2037                     comm->dth[0].nat += dth.nat;
2038                     ind->nsend[zone] += dth.nsend_zone;
2039                 }
2040             }
2041             /* Clear the counts in case we do not have pbc */
2042             for (zone = nzone_send; zone < nzone; zone++)
2043             {
2044                 ind->nsend[zone] = 0;
2045             }
2046             ind->nsend[nzone]     = ind->index.size();
2047             ind->nsend[nzone + 1] = comm->dth[0].nat;
2048             /* Communicate the number of cg's and atoms to receive */
2049             ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2050
2051             if (p > 0)
2052             {
2053                 /* We can receive in place if only the last zone is not empty */
2054                 for (zone = 0; zone < nzone - 1; zone++)
2055                 {
2056                     if (ind->nrecv[zone] > 0)
2057                     {
2058                         cd->receiveInPlace = false;
2059                     }
2060                 }
2061             }
2062
2063             int receiveBufferSize = 0;
2064             if (!cd->receiveInPlace)
2065             {
2066                 receiveBufferSize = ind->nrecv[nzone];
2067             }
2068             /* These buffer are actually only needed with in-place */
2069             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2070             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2071
2072             dd_comm_setup_work_t& work = comm->dth[0];
2073
2074             /* Make space for the global cg indices */
2075             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2076             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2077             /* Communicate the global cg indices */
2078             gmx::ArrayRef<int> integerBufferRef;
2079             if (cd->receiveInPlace)
2080             {
2081                 integerBufferRef = gmx::arrayRefFromArray(
2082                         dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2083             }
2084             else
2085             {
2086                 integerBufferRef = globalAtomGroupBuffer.buffer;
2087             }
2088             ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2089
2090             /* Make space for cg_cm */
2091             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2092
2093             /* Communicate the coordinates */
2094             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2095             if (cd->receiveInPlace)
2096             {
2097                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2098             }
2099             else
2100             {
2101                 rvecBufferRef = rvecBuffer.buffer;
2102             }
2103             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2104
2105             /* Make the charge group index */
2106             if (cd->receiveInPlace)
2107             {
2108                 zone = (p == 0 ? 0 : nzone - 1);
2109                 while (zone < nzone)
2110                 {
2111                     for (int i = 0; i < ind->nrecv[zone]; i++)
2112                     {
2113                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2114                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2115                         pos_cg++;
2116                     }
2117                     if (p == 0)
2118                     {
2119                         comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2120                     }
2121                     zone++;
2122                     zone_cg_range[nzone + zone] = pos_cg;
2123                 }
2124             }
2125             else
2126             {
2127                 /* This part of the code is never executed with bBondComm. */
2128                 merge_cg_buffers(nzone, cd, p, zone_cg_range, dd->globalAtomGroupIndices,
2129                                  integerBufferRef.data(), state->x, rvecBufferRef, fr->cginfo_mb,
2130                                  fr->cginfo);
2131                 pos_cg += ind->nrecv[nzone];
2132             }
2133             nat_tot += ind->nrecv[nzone + 1];
2134         }
2135         if (!cd->receiveInPlace)
2136         {
2137             /* Store the atom block for easy copying of communication buffers */
2138             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2139         }
2140         nzone += nzone;
2141     }
2142
2143     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2144
2145     if (!bBondComm)
2146     {
2147         /* We don't need to update cginfo, since that was alrady done above.
2148          * So we pass NULL for the forcerec.
2149          */
2150         dd_set_cginfo(dd->globalAtomGroupIndices, dd->ncg_home, dd->globalAtomGroupIndices.size(), nullptr);
2151     }
2152
2153     if (debug)
2154     {
2155         fprintf(debug, "Finished setting up DD communication, zones:");
2156         for (c = 0; c < zones->n; c++)
2157         {
2158             fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2159         }
2160         fprintf(debug, "\n");
2161     }
2162 }
2163
2164 //! Set boundaries for the charge group range.
2165 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2166 {
2167     for (auto& iZone : zones->iZones)
2168     {
2169         iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2170         iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2171                                            zones->cg_range[iZone.jZoneRange.end()]);
2172     }
2173 }
2174
2175 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2176  *
2177  * Also sets the atom density for the home zone when \p zone_start=0.
2178  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2179  * how many charge groups will move but are still part of the current range.
2180  * \todo When converting domdec to use proper classes, all these variables
2181  *       should be private and a method should return the correct count
2182  *       depending on an internal state.
2183  *
2184  * \param[in,out] dd          The domain decomposition struct
2185  * \param[in]     box         The box
2186  * \param[in]     ddbox       The domain decomposition box struct
2187  * \param[in]     zone_start  The start of the zone range to set sizes for
2188  * \param[in]     zone_end    The end of the zone range to set sizes for
2189  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2190  */
2191 static void set_zones_size(gmx_domdec_t*      dd,
2192                            matrix             box,
2193                            const gmx_ddbox_t* ddbox,
2194                            int                zone_start,
2195                            int                zone_end,
2196                            int                numMovedChargeGroupsInHomeZone)
2197 {
2198     gmx_domdec_comm_t*  comm;
2199     gmx_domdec_zones_t* zones;
2200     gmx_bool            bDistMB;
2201     int                 z, d, dim;
2202     real                rcs, rcmbs;
2203     int                 i, j;
2204     real                vol;
2205
2206     comm = dd->comm;
2207
2208     zones = &comm->zones;
2209
2210     /* Do we need to determine extra distances for multi-body bondeds? */
2211     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2212
2213     for (z = zone_start; z < zone_end; z++)
2214     {
2215         /* Copy cell limits to zone limits.
2216          * Valid for non-DD dims and non-shifted dims.
2217          */
2218         copy_rvec(comm->cell_x0, zones->size[z].x0);
2219         copy_rvec(comm->cell_x1, zones->size[z].x1);
2220     }
2221
2222     for (d = 0; d < dd->ndim; d++)
2223     {
2224         dim = dd->dim[d];
2225
2226         for (z = 0; z < zones->n; z++)
2227         {
2228             /* With a staggered grid we have different sizes
2229              * for non-shifted dimensions.
2230              */
2231             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2232             {
2233                 if (d == 1)
2234                 {
2235                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2236                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2237                 }
2238                 else if (d == 2)
2239                 {
2240                     zones->size[z].x0[dim] =
2241                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2242                                     .min0;
2243                     zones->size[z].x1[dim] =
2244                             comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2245                                     .max1;
2246                 }
2247             }
2248         }
2249
2250         rcs   = comm->systemInfo.cutoff;
2251         rcmbs = comm->cutoff_mbody;
2252         if (ddbox->tric_dir[dim])
2253         {
2254             rcs /= ddbox->skew_fac[dim];
2255             rcmbs /= ddbox->skew_fac[dim];
2256         }
2257
2258         /* Set the lower limit for the shifted zone dimensions */
2259         for (z = zone_start; z < zone_end; z++)
2260         {
2261             if (zones->shift[z][dim] > 0)
2262             {
2263                 dim = dd->dim[d];
2264                 if (!isDlbOn(dd->comm) || d == 0)
2265                 {
2266                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2267                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2268                 }
2269                 else
2270                 {
2271                     /* Here we take the lower limit of the zone from
2272                      * the lowest domain of the zone below.
2273                      */
2274                     if (z < 4)
2275                     {
2276                         zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2277                     }
2278                     else
2279                     {
2280                         if (d == 1)
2281                         {
2282                             zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2283                         }
2284                         else
2285                         {
2286                             zones->size[z].x0[dim] =
2287                                     comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2288                                             .min1;
2289                         }
2290                     }
2291                     /* A temporary limit, is updated below */
2292                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2293
2294                     if (bDistMB)
2295                     {
2296                         for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2297                         {
2298                             if (zones->shift[zi][dim] == 0)
2299                             {
2300                                 /* This takes the whole zone into account.
2301                                  * With multiple pulses this will lead
2302                                  * to a larger zone then strictly necessary.
2303                                  */
2304                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2305                                                                   zones->size[zi].x1[dim] + rcmbs);
2306                             }
2307                         }
2308                     }
2309                 }
2310             }
2311         }
2312
2313         /* Loop over the i-zones to set the upper limit of each
2314          * j-zone they see.
2315          */
2316         for (const auto& iZone : zones->iZones)
2317         {
2318             const int zi = iZone.iZoneIndex;
2319             if (zones->shift[zi][dim] == 0)
2320             {
2321                 /* We should only use zones up to zone_end */
2322                 const auto& jZoneRangeFull = iZone.jZoneRange;
2323                 if (zone_end <= *jZoneRangeFull.begin())
2324                 {
2325                     continue;
2326                 }
2327                 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2328                                                  std::min(*jZoneRangeFull.end(), zone_end));
2329                 for (int jZone : jZoneRange)
2330                 {
2331                     if (zones->shift[jZone][dim] > 0)
2332                     {
2333                         zones->size[jZone].x1[dim] =
2334                                 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2335                     }
2336                 }
2337             }
2338         }
2339     }
2340
2341     for (z = zone_start; z < zone_end; z++)
2342     {
2343         /* Initialization only required to keep the compiler happy */
2344         rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2345         int  nc, c;
2346
2347         /* To determine the bounding box for a zone we need to find
2348          * the extreme corners of 4, 2 or 1 corners.
2349          */
2350         nc = 1 << (ddbox->nboundeddim - 1);
2351
2352         for (c = 0; c < nc; c++)
2353         {
2354             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2355             corner[XX] = 0;
2356             if ((c & 1) == 0)
2357             {
2358                 corner[YY] = zones->size[z].x0[YY];
2359             }
2360             else
2361             {
2362                 corner[YY] = zones->size[z].x1[YY];
2363             }
2364             if ((c & 2) == 0)
2365             {
2366                 corner[ZZ] = zones->size[z].x0[ZZ];
2367             }
2368             else
2369             {
2370                 corner[ZZ] = zones->size[z].x1[ZZ];
2371             }
2372             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2373                 && box[ZZ][1 - dd->dim[0]] != 0)
2374             {
2375                 /* With 1D domain decomposition the cg's are not in
2376                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2377                  * Shift the corner of the z-vector back to along the box
2378                  * vector of dimension d, so it will later end up at 0 along d.
2379                  * This can affect the location of this corner along dd->dim[0]
2380                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2381                  */
2382                 int d = 1 - dd->dim[0];
2383
2384                 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2385             }
2386             /* Apply the triclinic couplings */
2387             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2388             {
2389                 for (j = XX; j < i; j++)
2390                 {
2391                     corner[j] += corner[i] * box[i][j] / box[i][i];
2392                 }
2393             }
2394             if (c == 0)
2395             {
2396                 copy_rvec(corner, corner_min);
2397                 copy_rvec(corner, corner_max);
2398             }
2399             else
2400             {
2401                 for (i = 0; i < DIM; i++)
2402                 {
2403                     corner_min[i] = std::min(corner_min[i], corner[i]);
2404                     corner_max[i] = std::max(corner_max[i], corner[i]);
2405                 }
2406             }
2407         }
2408         /* Copy the extreme cornes without offset along x */
2409         for (i = 0; i < DIM; i++)
2410         {
2411             zones->size[z].bb_x0[i] = corner_min[i];
2412             zones->size[z].bb_x1[i] = corner_max[i];
2413         }
2414         /* Add the offset along x */
2415         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2416         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2417     }
2418
2419     if (zone_start == 0)
2420     {
2421         vol = 1;
2422         for (dim = 0; dim < DIM; dim++)
2423         {
2424             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2425         }
2426         zones->dens_zone0 =
2427                 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2428     }
2429
2430     if (debug)
2431     {
2432         for (z = zone_start; z < zone_end; z++)
2433         {
2434             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2435                     zones->size[z].x0[XX], zones->size[z].x1[XX], zones->size[z].x0[YY],
2436                     zones->size[z].x1[YY], zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2437             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n", z,
2438                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX], zones->size[z].bb_x0[YY],
2439                     zones->size[z].bb_x1[YY], zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2440         }
2441     }
2442 }
2443
2444 /*! \brief Order data in \p dataToSort according to \p sort
2445  *
2446  * Note: both buffers should have at least \p sort.size() elements.
2447  */
2448 template<typename T>
2449 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2450                         gmx::ArrayRef<T>                  dataToSort,
2451                         gmx::ArrayRef<T>                  sortBuffer)
2452 {
2453     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2454     GMX_ASSERT(sortBuffer.size() >= sort.size(),
2455                "The sorting buffer needs to be sufficiently large");
2456
2457     /* Order the data into the temporary buffer */
2458     size_t i = 0;
2459     for (const gmx_cgsort_t& entry : sort)
2460     {
2461         sortBuffer[i++] = dataToSort[entry.ind];
2462     }
2463
2464     /* Copy back to the original array */
2465     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2466 }
2467
2468 /*! \brief Order data in \p dataToSort according to \p sort
2469  *
2470  * Note: \p vectorToSort should have at least \p sort.size() elements,
2471  *       \p workVector is resized when it is too small.
2472  */
2473 template<typename T>
2474 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2475                         gmx::ArrayRef<T>                  vectorToSort,
2476                         std::vector<T>*                   workVector)
2477 {
2478     if (gmx::index(workVector->size()) < sort.ssize())
2479     {
2480         workVector->resize(sort.size());
2481     }
2482     orderVector<T>(sort, vectorToSort, *workVector);
2483 }
2484
2485 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2486 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2487 {
2488     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2489
2490     /* Using push_back() instead of this resize results in much slower code */
2491     sort->resize(atomOrder.size());
2492     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2493     size_t                      numSorted = 0;
2494     for (int i : atomOrder)
2495     {
2496         if (i >= 0)
2497         {
2498             /* The values of nsc and ind_gl are not used in this case */
2499             buffer[numSorted++].ind = i;
2500         }
2501     }
2502     sort->resize(numSorted);
2503 }
2504
2505 //! Returns the sorting state for DD.
2506 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2507 {
2508     gmx_domdec_sort_t* sort = dd->comm->sort.get();
2509
2510     dd_sort_order_nbnxn(fr, &sort->sorted);
2511
2512     /* We alloc with the old size, since cgindex is still old */
2513     DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2514
2515     /* Set the new home atom/charge group count */
2516     dd->ncg_home = sort->sorted.size();
2517     if (debug)
2518     {
2519         fprintf(debug, "Set the new home atom count to %d\n", dd->ncg_home);
2520     }
2521
2522     /* Reorder the state */
2523     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2524     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2525
2526     if (state->flags & (1 << estX))
2527     {
2528         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2529     }
2530     if (state->flags & (1 << estV))
2531     {
2532         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2533     }
2534     if (state->flags & (1 << estCGP))
2535     {
2536         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2537     }
2538
2539     /* Reorder the global cg index */
2540     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2541     /* Reorder the cginfo */
2542     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2543     /* Set the home atom number */
2544     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2545
2546     /* The atoms are now exactly in grid order, update the grid order */
2547     fr->nbv->setLocalAtomOrder();
2548 }
2549
2550 //! Accumulates load statistics.
2551 static void add_dd_statistics(gmx_domdec_t* dd)
2552 {
2553     gmx_domdec_comm_t* comm = dd->comm;
2554
2555     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2556     {
2557         auto range = static_cast<DDAtomRanges::Type>(i);
2558         comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
2559     }
2560     comm->ndecomp++;
2561 }
2562
2563 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2564 {
2565     gmx_domdec_comm_t* comm = dd->comm;
2566
2567     /* Reset all the statistics and counters for total run counting */
2568     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2569     {
2570         comm->sum_nat[i] = 0;
2571     }
2572     comm->ndecomp   = 0;
2573     comm->nload     = 0;
2574     comm->load_step = 0;
2575     comm->load_sum  = 0;
2576     comm->load_max  = 0;
2577     clear_ivec(comm->load_lim);
2578     comm->load_mdf = 0;
2579     comm->load_pme = 0;
2580 }
2581
2582 void print_dd_statistics(const t_commrec* cr, const t_inputrec* ir, FILE* fplog)
2583 {
2584     gmx_domdec_comm_t* comm = cr->dd->comm;
2585
2586     const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2587     gmx_sumd(numRanges, comm->sum_nat, cr);
2588
2589     if (fplog == nullptr)
2590     {
2591         return;
2592     }
2593
2594     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2595
2596     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2597     {
2598         auto   range = static_cast<DDAtomRanges::Type>(i);
2599         double av    = comm->sum_nat[i] / comm->ndecomp;
2600         switch (range)
2601         {
2602             case DDAtomRanges::Type::Zones:
2603                 fprintf(fplog, " av. #atoms communicated per step for force:  %d x %.1f\n", 2, av);
2604                 break;
2605             case DDAtomRanges::Type::Vsites:
2606                 if (cr->dd->vsite_comm)
2607                 {
2608                     fprintf(fplog, " av. #atoms communicated per step for vsites: %d x %.1f\n",
2609                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2, av);
2610                 }
2611                 break;
2612             case DDAtomRanges::Type::Constraints:
2613                 if (cr->dd->constraint_comm)
2614                 {
2615                     fprintf(fplog, " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2616                             1 + ir->nLincsIter, av);
2617                 }
2618                 break;
2619             default: gmx_incons(" Unknown type for DD statistics");
2620         }
2621     }
2622     fprintf(fplog, "\n");
2623
2624     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2625     {
2626         print_dd_load_av(fplog, cr->dd);
2627     }
2628 }
2629
2630 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2631 void dd_partition_system(FILE*                        fplog,
2632                          const gmx::MDLogger&         mdlog,
2633                          int64_t                      step,
2634                          const t_commrec*             cr,
2635                          gmx_bool                     bMasterState,
2636                          int                          nstglobalcomm,
2637                          t_state*                     state_global,
2638                          const gmx_mtop_t&            top_global,
2639                          const t_inputrec*            ir,
2640                          gmx::ImdSession*             imdSession,
2641                          pull_t*                      pull_work,
2642                          t_state*                     state_local,
2643                          PaddedHostVector<gmx::RVec>* f,
2644                          gmx::MDAtoms*                mdAtoms,
2645                          gmx_localtop_t*              top_local,
2646                          t_forcerec*                  fr,
2647                          gmx_vsite_t*                 vsite,
2648                          gmx::Constraints*            constr,
2649                          t_nrnb*                      nrnb,
2650                          gmx_wallcycle*               wcycle,
2651                          gmx_bool                     bVerbose)
2652 {
2653     gmx_domdec_t*      dd;
2654     gmx_domdec_comm_t* comm;
2655     gmx_ddbox_t        ddbox = { 0 };
2656     int64_t            step_pcoupl;
2657     rvec               cell_ns_x0, cell_ns_x1;
2658     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2659     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2660     gmx_bool           bRedist;
2661     ivec               np;
2662     real               grid_density;
2663     char               sbuf[22];
2664
2665     wallcycle_start(wcycle, ewcDOMDEC);
2666
2667     dd   = cr->dd;
2668     comm = dd->comm;
2669
2670     // TODO if the update code becomes accessible here, use
2671     // upd->deform for this logic.
2672     bBoxChanged = (bMasterState || inputrecDeform(ir));
2673     if (ir->epc != epcNO)
2674     {
2675         /* With nstpcouple > 1 pressure coupling happens.
2676          * one step after calculating the pressure.
2677          * Box scaling happens at the end of the MD step,
2678          * after the DD partitioning.
2679          * We therefore have to do DLB in the first partitioning
2680          * after an MD step where P-coupling occurred.
2681          * We need to determine the last step in which p-coupling occurred.
2682          * MRS -- need to validate this for vv?
2683          */
2684         int n = ir->nstpcouple;
2685         if (n == 1)
2686         {
2687             step_pcoupl = step - 1;
2688         }
2689         else
2690         {
2691             step_pcoupl = ((step - 1) / n) * n + 1;
2692         }
2693         if (step_pcoupl >= comm->partition_step)
2694         {
2695             bBoxChanged = TRUE;
2696         }
2697     }
2698
2699     bNStGlobalComm = (step % nstglobalcomm == 0);
2700
2701     if (!isDlbOn(comm))
2702     {
2703         bDoDLB = FALSE;
2704     }
2705     else
2706     {
2707         /* Should we do dynamic load balacing this step?
2708          * Since it requires (possibly expensive) global communication,
2709          * we might want to do DLB less frequently.
2710          */
2711         if (bBoxChanged || ir->epc != epcNO)
2712         {
2713             bDoDLB = bBoxChanged;
2714         }
2715         else
2716         {
2717             bDoDLB = bNStGlobalComm;
2718         }
2719     }
2720
2721     /* Check if we have recorded loads on the nodes */
2722     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2723     {
2724         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2725
2726         /* Print load every nstlog, first and last step to the log file */
2727         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || comm->n_load_collect == 0
2728                     || (ir->nsteps >= 0 && (step + ir->nstlist > ir->init_step + ir->nsteps)));
2729
2730         /* Avoid extra communication due to verbose screen output
2731          * when nstglobalcomm is set.
2732          */
2733         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2734             || (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2735         {
2736             get_load_distribution(dd, wcycle);
2737             if (DDMASTER(dd))
2738             {
2739                 if (bLogLoad)
2740                 {
2741                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2742                 }
2743                 if (bVerbose)
2744                 {
2745                     dd_print_load_verbose(dd);
2746                 }
2747             }
2748             comm->n_load_collect++;
2749
2750             if (isDlbOn(comm))
2751             {
2752                 if (DDMASTER(dd))
2753                 {
2754                     /* Add the measured cycles to the running average */
2755                     const float averageFactor = 0.1F;
2756                     comm->cyclesPerStepDlbExpAverage =
2757                             (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2758                             + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2759                 }
2760                 if (comm->dlbState == DlbState::onCanTurnOff
2761                     && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2762                 {
2763                     gmx_bool turnOffDlb;
2764                     if (DDMASTER(dd))
2765                     {
2766                         /* If the running averaged cycles with DLB are more
2767                          * than before we turned on DLB, turn off DLB.
2768                          * We will again run and check the cycles without DLB
2769                          * and we can then decide if to turn off DLB forever.
2770                          */
2771                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2772                     }
2773                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2774                     if (turnOffDlb)
2775                     {
2776                         /* To turn off DLB, we need to redistribute the atoms */
2777                         dd_collect_state(dd, state_local, state_global);
2778                         bMasterState = TRUE;
2779                         turn_off_dlb(mdlog, dd, step);
2780                     }
2781                 }
2782             }
2783             else if (bCheckWhetherToTurnDlbOn)
2784             {
2785                 gmx_bool turnOffDlbForever = FALSE;
2786                 gmx_bool turnOnDlb         = FALSE;
2787
2788                 /* Since the timings are node dependent, the master decides */
2789                 if (DDMASTER(dd))
2790                 {
2791                     /* If we recently turned off DLB, we want to check if
2792                      * performance is better without DLB. We want to do this
2793                      * ASAP to minimize the chance that external factors
2794                      * slowed down the DLB step are gone here and we
2795                      * incorrectly conclude that DLB was causing the slowdown.
2796                      * So we measure one nstlist block, no running average.
2797                      */
2798                     if (comm->haveTurnedOffDlb
2799                         && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2800                     {
2801                         /* After turning off DLB we ran nstlist steps in fewer
2802                          * cycles than with DLB. This likely means that DLB
2803                          * in not benefical, but this could be due to a one
2804                          * time unlucky fluctuation, so we require two such
2805                          * observations in close succession to turn off DLB
2806                          * forever.
2807                          */
2808                         if (comm->dlbSlowerPartitioningCount > 0
2809                             && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2810                         {
2811                             turnOffDlbForever = TRUE;
2812                         }
2813                         comm->haveTurnedOffDlb = false;
2814                         /* Register when we last measured DLB slowdown */
2815                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2816                     }
2817                     else
2818                     {
2819                         /* Here we check if the max PME rank load is more than 0.98
2820                          * the max PP force load. If so, PP DLB will not help,
2821                          * since we are (almost) limited by PME. Furthermore,
2822                          * DLB will cause a significant extra x/f redistribution
2823                          * cost on the PME ranks, which will then surely result
2824                          * in lower total performance.
2825                          */
2826                         if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2827                         {
2828                             turnOnDlb = FALSE;
2829                         }
2830                         else
2831                         {
2832                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2833                         }
2834                     }
2835                 }
2836                 struct
2837                 {
2838                     gmx_bool turnOffDlbForever;
2839                     gmx_bool turnOnDlb;
2840                 } bools{ turnOffDlbForever, turnOnDlb };
2841                 dd_bcast(dd, sizeof(bools), &bools);
2842                 if (bools.turnOffDlbForever)
2843                 {
2844                     turn_off_dlb_forever(mdlog, dd, step);
2845                 }
2846                 else if (bools.turnOnDlb)
2847                 {
2848                     turn_on_dlb(mdlog, dd, step);
2849                     bDoDLB = TRUE;
2850                 }
2851             }
2852         }
2853         comm->n_load_have++;
2854     }
2855
2856     bRedist = FALSE;
2857     if (bMasterState)
2858     {
2859         /* Clear the old state */
2860         clearDDStateIndices(dd, false);
2861         ncgindex_set = 0;
2862
2863         auto xGlobal = positionsFromStatePointer(state_global);
2864
2865         set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2866
2867         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
2868
2869         /* Ensure that we have space for the new distribution */
2870         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
2871
2872         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2873
2874         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2875     }
2876     else if (state_local->ddp_count != dd->ddp_count)
2877     {
2878         if (state_local->ddp_count > dd->ddp_count)
2879         {
2880             gmx_fatal(FARGS,
2881                       "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2882                       ")",
2883                       state_local->ddp_count, dd->ddp_count);
2884         }
2885
2886         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2887         {
2888             gmx_fatal(FARGS,
2889                       "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2890                       "state_local->ddp_count (%d)",
2891                       state_local->ddp_count_cg_gl, state_local->ddp_count);
2892         }
2893
2894         /* Clear the old state */
2895         clearDDStateIndices(dd, false);
2896
2897         /* Restore the atom group indices from state_local */
2898         restoreAtomGroups(dd, state_local);
2899         make_dd_indices(dd, 0);
2900         ncgindex_set = dd->ncg_home;
2901
2902         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2903
2904         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2905
2906         set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
2907
2908         bRedist = isDlbOn(comm);
2909     }
2910     else
2911     {
2912         /* We have the full state, only redistribute the cgs */
2913
2914         /* Clear the non-home indices */
2915         clearDDStateIndices(dd, true);
2916         ncgindex_set = 0;
2917
2918         /* To avoid global communication, we do not recompute the extent
2919          * of the system for dims without pbc. Therefore we need to copy
2920          * the previously computed values when we do not communicate.
2921          */
2922         if (!bNStGlobalComm)
2923         {
2924             copy_rvec(comm->box0, ddbox.box0);
2925             copy_rvec(comm->box_size, ddbox.box_size);
2926         }
2927         set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
2928
2929         bBoxChanged = TRUE;
2930         bRedist     = TRUE;
2931     }
2932     /* Copy needed for dim's without pbc when avoiding communication */
2933     copy_rvec(ddbox.box0, comm->box0);
2934     copy_rvec(ddbox.box_size, comm->box_size);
2935
2936     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
2937
2938     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
2939     {
2940         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
2941     }
2942
2943     if (comm->systemInfo.useUpdateGroups)
2944     {
2945         comm->updateGroupsCog->addCogs(
2946                 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home), state_local->x);
2947     }
2948
2949     /* Check if we should sort the charge groups */
2950     const bool bSortCG = (bMasterState || bRedist);
2951
2952     /* When repartitioning we mark atom groups that will move to neighboring
2953      * DD cells, but we do not move them right away for performance reasons.
2954      * Thus we need to keep track of how many charge groups will move for
2955      * obtaining correct local charge group / atom counts.
2956      */
2957     ncg_moved = 0;
2958     if (bRedist)
2959     {
2960         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
2961
2962         ncgindex_set = dd->ncg_home;
2963         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, f, fr, nrnb, &ncg_moved);
2964
2965         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
2966
2967         if (comm->systemInfo.useUpdateGroups)
2968         {
2969             comm->updateGroupsCog->addCogs(
2970                     gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
2971                     state_local->x);
2972         }
2973
2974         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
2975     }
2976
2977     // TODO: Integrate this code in the nbnxm module
2978     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box, dd, &ddbox, &comm->cell_x0,
2979                           &comm->cell_x1, dd->ncg_home, as_rvec_array(state_local->x.data()),
2980                           cell_ns_x0, cell_ns_x1, &grid_density);
2981
2982     if (bBoxChanged)
2983     {
2984         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
2985     }
2986
2987     if (bSortCG)
2988     {
2989         wallcycle_sub_start(wcycle, ewcsDD_GRID);
2990
2991         /* Sort the state on charge group position.
2992          * This enables exact restarts from this step.
2993          * It also improves performance by about 15% with larger numbers
2994          * of atoms per node.
2995          */
2996
2997         /* Fill the ns grid with the home cell,
2998          * so we can sort with the indices.
2999          */
3000         set_zones_ncg_home(dd);
3001
3002         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3003
3004         nbnxn_put_on_grid(fr->nbv.get(), state_local->box, 0, comm->zones.size[0].bb_x0,
3005                           comm->zones.size[0].bb_x1, comm->updateGroupsCog.get(),
3006                           { 0, dd->ncg_home }, comm->zones.dens_zone0, fr->cginfo, state_local->x,
3007                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3008
3009         if (debug)
3010         {
3011             fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf),
3012                     dd->ncg_home);
3013         }
3014         dd_sort_state(dd, fr, state_local);
3015
3016         /* After sorting and compacting we set the correct size */
3017         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3018
3019         /* Rebuild all the indices */
3020         dd->ga2la->clear();
3021         ncgindex_set = 0;
3022
3023         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3024     }
3025     else
3026     {
3027         /* With the group scheme the sorting array is part of the DD state,
3028          * but it just got out of sync, so mark as invalid by emptying it.
3029          */
3030         if (ir->cutoff_scheme == ecutsGROUP)
3031         {
3032             comm->sort->sorted.clear();
3033         }
3034     }
3035
3036     if (comm->systemInfo.useUpdateGroups)
3037     {
3038         /* The update groups cog's are invalid after sorting
3039          * and need to be cleared before the next partitioning anyhow.
3040          */
3041         comm->updateGroupsCog->clear();
3042     }
3043
3044     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3045
3046     /* Set the induces for the home atoms */
3047     set_zones_ncg_home(dd);
3048     make_dd_indices(dd, ncgindex_set);
3049
3050     /* Setup up the communication and communicate the coordinates */
3051     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3052
3053     /* Set the indices for the halo atoms */
3054     make_dd_indices(dd, dd->ncg_home);
3055
3056     /* Set the charge group boundaries for neighbor searching */
3057     set_cg_boundaries(&comm->zones);
3058
3059     if (fr->cutoff_scheme == ecutsVERLET)
3060     {
3061         /* When bSortCG=true, we have already set the size for zone 0 */
3062         set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3063     }
3064
3065     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3066
3067     /*
3068        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3069                  -1,state_local->x.rvec_array(),state_local->box);
3070      */
3071
3072     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3073
3074     /* Extract a local topology from the global topology */
3075     for (int i = 0; i < dd->ndim; i++)
3076     {
3077         np[dd->dim[i]] = comm->cd[i].numPulses();
3078     }
3079     dd_make_local_top(dd, &comm->zones, dd->unitCellInfo.npbcdim, state_local->box,
3080                       comm->cellsize_min, np, fr, state_local->x.rvec_array(), top_global, top_local);
3081
3082     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3083
3084     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3085
3086     /* Set up the special atom communication */
3087     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3088     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3089          i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3090     {
3091         auto range = static_cast<DDAtomRanges::Type>(i);
3092         switch (range)
3093         {
3094             case DDAtomRanges::Type::Vsites:
3095                 if (vsite && vsite->numInterUpdategroupVsites)
3096                 {
3097                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3098                 }
3099                 break;
3100             case DDAtomRanges::Type::Constraints:
3101                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3102                 {
3103                     /* Only for inter-cg constraints we need special code */
3104                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(), constr,
3105                                                   ir->nProjOrder, top_local->idef.il);
3106                 }
3107                 break;
3108             default: gmx_incons("Unknown special atom type setup");
3109         }
3110         comm->atomRanges.setEnd(range, n);
3111     }
3112
3113     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3114
3115     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3116
3117     /* Make space for the extra coordinates for virtual site
3118      * or constraint communication.
3119      */
3120     state_local->natoms = comm->atomRanges.numAtomsTotal();
3121
3122     dd_resize_state(state_local, f, state_local->natoms);
3123
3124     if (fr->haveDirectVirialContributions)
3125     {
3126         if (vsite && vsite->numInterUpdategroupVsites)
3127         {
3128             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3129         }
3130         else
3131         {
3132             if (EEL_FULL(ir->coulombtype) && dd->haveExclusions)
3133             {
3134                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3135             }
3136             else
3137             {
3138                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3139             }
3140         }
3141     }
3142     else
3143     {
3144         nat_f_novirsum = 0;
3145     }
3146
3147     /* Set the number of atoms required for the force calculation.
3148      * Forces need to be constrained when doing energy
3149      * minimization. For simple simulations we could avoid some
3150      * allocation, zeroing and copying, but this is probably not worth
3151      * the complications and checking.
3152      */
3153     forcerec_set_ranges(fr, comm->atomRanges.end(DDAtomRanges::Type::Zones),
3154                         comm->atomRanges.end(DDAtomRanges::Type::Constraints), nat_f_novirsum);
3155
3156     /* Update atom data for mdatoms and several algorithms */
3157     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr, nullptr, mdAtoms, constr, vsite, nullptr);
3158
3159     auto mdatoms = mdAtoms->mdatoms();
3160     if (!thisRankHasDuty(cr, DUTY_PME))
3161     {
3162         /* Send the charges and/or c6/sigmas to our PME only node */
3163         gmx_pme_send_parameters(cr, fr->ic, mdatoms->nChargePerturbed != 0,
3164                                 mdatoms->nTypePerturbed != 0, mdatoms->chargeA, mdatoms->chargeB,
3165                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B, mdatoms->sigmaA,
3166                                 mdatoms->sigmaB, dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3167     }
3168
3169     if (ir->bPull)
3170     {
3171         /* Update the local pull groups */
3172         dd_make_local_pull_groups(cr, pull_work);
3173     }
3174
3175     if (dd->atomSets != nullptr)
3176     {
3177         /* Update the local atom sets */
3178         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3179     }
3180
3181     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3182     imdSession->dd_make_local_IMD_atoms(dd);
3183
3184     add_dd_statistics(dd);
3185
3186     /* Make sure we only count the cycles for this DD partitioning */
3187     clear_dd_cycle_counts(dd);
3188
3189     /* Because the order of the atoms might have changed since
3190      * the last vsite construction, we need to communicate the constructing
3191      * atom coordinates again (for spreading the forces this MD step).
3192      */
3193     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3194
3195     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3196
3197     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3198     {
3199         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3200         write_dd_pdb("dd_dump", step, "dump", &top_global, cr, -1, state_local->x.rvec_array(),
3201                      state_local->box);
3202     }
3203
3204     /* Store the partitioning step */
3205     comm->partition_step = step;
3206
3207     /* Increase the DD partitioning counter */
3208     dd->ddp_count++;
3209     /* The state currently matches this DD partitioning count, store it */
3210     state_local->ddp_count = dd->ddp_count;
3211     if (bMasterState)
3212     {
3213         /* The DD master node knows the complete cg distribution,
3214          * store the count so we can possibly skip the cg info communication.
3215          */
3216         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3217     }
3218
3219     if (comm->ddSettings.DD_debug > 0)
3220     {
3221         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3222         check_index_consistency(dd, top_global.natoms, "after partitioning");
3223     }
3224
3225     wallcycle_stop(wcycle, ewcDOMDEC);
3226 }
3227
3228 /*! \brief Check whether bonded interactions are missing, if appropriate */
3229 void checkNumberOfBondedInteractions(const gmx::MDLogger&  mdlog,
3230                                      t_commrec*            cr,
3231                                      int                   totalNumberOfBondedInteractions,
3232                                      const gmx_mtop_t*     top_global,
3233                                      const gmx_localtop_t* top_local,
3234                                      const rvec*           x,
3235                                      const matrix          box,
3236                                      bool*                 shouldCheckNumberOfBondedInteractions)
3237 {
3238     if (*shouldCheckNumberOfBondedInteractions)
3239     {
3240         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3241         {
3242             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
3243                                           top_local, x, box); // Does not return
3244         }
3245         *shouldCheckNumberOfBondedInteractions = false;
3246     }
3247 }