e6642598b79b735b830a6c2fd1cb48aabe2b6d3c
[alexxy/gromacs.git] / src / gromacs / domdec / partition.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief This file defines functions for mdrun to call to make a new
38  * domain decomposition, and check it.
39  *
40  * \author Berk Hess <hess@kth.se>
41  * \ingroup module_domdec
42  */
43
44 #include "gmxpre.h"
45
46 #include "partition.h"
47
48 #include "config.h"
49
50 #include <cassert>
51 #include <cstdio>
52
53 #include <algorithm>
54
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/mdsetup.h"
63 #include "gromacs/ewald/pme.h"
64 #include "gromacs/gmxlib/chargegroup.h"
65 #include "gromacs/gmxlib/network.h"
66 #include "gromacs/gmxlib/nrnb.h"
67 #include "gromacs/imd/imd.h"
68 #include "gromacs/math/functions.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/gmx_omp_nthreads.h"
72 #include "gromacs/mdlib/mdatoms.h"
73 #include "gromacs/mdlib/nsgrid.h"
74 #include "gromacs/mdlib/vsite.h"
75 #include "gromacs/mdtypes/commrec.h"
76 #include "gromacs/mdtypes/forcerec.h"
77 #include "gromacs/mdtypes/inputrec.h"
78 #include "gromacs/mdtypes/md_enums.h"
79 #include "gromacs/mdtypes/nblist.h"
80 #include "gromacs/mdtypes/state.h"
81 #include "gromacs/nbnxm/nbnxm.h"
82 #include "gromacs/pulling/pull.h"
83 #include "gromacs/timing/wallcycle.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/cstringutil.h"
87 #include "gromacs/utility/fatalerror.h"
88 #include "gromacs/utility/logger.h"
89 #include "gromacs/utility/real.h"
90 #include "gromacs/utility/smalloc.h"
91 #include "gromacs/utility/strconvert.h"
92 #include "gromacs/utility/stringstream.h"
93 #include "gromacs/utility/stringutil.h"
94 #include "gromacs/utility/textwriter.h"
95
96 #include "box.h"
97 #include "cellsizes.h"
98 #include "distribute.h"
99 #include "domdec_constraints.h"
100 #include "domdec_internal.h"
101 #include "domdec_vsite.h"
102 #include "dump.h"
103 #include "redistribute.h"
104 #include "utility.h"
105
106 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
107  *
108  * There is a bit of overhead with DLB and it's difficult to achieve
109  * a load imbalance of less than 2% with DLB.
110  */
111 #define DD_PERF_LOSS_DLB_ON  0.02
112
113 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
114 #define DD_PERF_LOSS_WARN    0.05
115
116
117 //! Debug helper printing a DD zone
118 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
119 {
120     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
121             d, i, j,
122             zone->min0, zone->max1,
123             zone->mch0, zone->mch0,
124             zone->p1_0, zone->p1_1);
125 }
126
127 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
128  * takes the extremes over all home and remote zones in the halo
129  * and returns the results in cell_ns_x0 and cell_ns_x1.
130  * Note: only used with the group cut-off scheme.
131  */
132 static void dd_move_cellx(gmx_domdec_t      *dd,
133                           const gmx_ddbox_t *ddbox,
134                           rvec               cell_ns_x0,
135                           rvec               cell_ns_x1)
136 {
137     constexpr int      c_ddZoneCommMaxNumZones = 5;
138     gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
139     gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
140     gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
141     gmx_domdec_comm_t *comm = dd->comm;
142
143     rvec               extr_s[2];
144     rvec               extr_r[2];
145     for (int d = 1; d < dd->ndim; d++)
146     {
147         int           dim = dd->dim[d];
148         gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
149
150         /* Copy the base sizes of the home zone */
151         zp.min0    = cell_ns_x0[dim];
152         zp.max1    = cell_ns_x1[dim];
153         zp.min1    = cell_ns_x1[dim];
154         zp.mch0    = cell_ns_x0[dim];
155         zp.mch1    = cell_ns_x1[dim];
156         zp.p1_0    = cell_ns_x0[dim];
157         zp.p1_1    = cell_ns_x1[dim];
158         zp.dataSet = 1;
159     }
160
161     gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
162
163     /* Loop backward over the dimensions and aggregate the extremes
164      * of the cell sizes.
165      */
166     for (int d = dd->ndim - 2; d >= 0; d--)
167     {
168         const int  dim      = dd->dim[d];
169         const bool applyPbc = (dim < ddbox->npbcdim);
170
171         /* Use an rvec to store two reals */
172         extr_s[d][0] = cellsizes[d + 1].fracLower;
173         extr_s[d][1] = cellsizes[d + 1].fracUpper;
174         extr_s[d][2] = cellsizes[d + 1].fracUpper;
175
176         int pos = 0;
177         GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
178         /* Store the extremes in the backward sending buffer,
179          * so they get updated separately from the forward communication.
180          */
181         for (int d1 = d; d1 < dd->ndim-1; d1++)
182         {
183             gmx_ddzone_t &buf = buf_s[pos];
184
185             /* We invert the order to be able to use the same loop for buf_e */
186             buf.min0    = extr_s[d1][1];
187             buf.max1    = extr_s[d1][0];
188             buf.min1    = extr_s[d1][2];
189             buf.mch0    = 0;
190             buf.mch1    = 0;
191             /* Store the cell corner of the dimension we communicate along */
192             buf.p1_0    = comm->cell_x0[dim];
193             buf.p1_1    = 0;
194             buf.dataSet = 1;
195             pos++;
196         }
197
198         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
199         pos++;
200
201         if (dd->ndim == 3 && d == 0)
202         {
203             buf_s[pos] = comm->zone_d2[0][1];
204             pos++;
205             buf_s[pos] = comm->zone_d1[0];
206             pos++;
207         }
208
209         /* We only need to communicate the extremes
210          * in the forward direction
211          */
212         int numPulses = comm->cd[d].numPulses();
213         int numPulsesMin;
214         if (applyPbc)
215         {
216             /* Take the minimum to avoid double communication */
217             numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
218         }
219         else
220         {
221             /* Without PBC we should really not communicate over
222              * the boundaries, but implementing that complicates
223              * the communication setup and therefore we simply
224              * do all communication, but ignore some data.
225              */
226             numPulsesMin = numPulses;
227         }
228         for (int pulse = 0; pulse < numPulsesMin; pulse++)
229         {
230             /* Communicate the extremes forward */
231             bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
232
233             int  numElements      = dd->ndim - d - 1;
234             ddSendrecv(dd, d, dddirForward,
235                        extr_s + d, numElements,
236                        extr_r + d, numElements);
237
238             if (receiveValidData)
239             {
240                 for (int d1 = d; d1 < dd->ndim - 1; d1++)
241                 {
242                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
243                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
244                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
245                 }
246             }
247         }
248
249         const int numElementsInBuffer = pos;
250         for (int pulse = 0; pulse < numPulses; pulse++)
251         {
252             /* Communicate all the zone information backward */
253             bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
254
255             static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
256
257             int numReals = numElementsInBuffer*c_ddzoneNumReals;
258             ddSendrecv(dd, d, dddirBackward,
259                        gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
260                        gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
261
262             rvec dh = { 0 };
263             if (pulse > 0)
264             {
265                 for (int d1 = d + 1; d1 < dd->ndim; d1++)
266                 {
267                     /* Determine the decrease of maximum required
268                      * communication height along d1 due to the distance along d,
269                      * this avoids a lot of useless atom communication.
270                      */
271                     real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
272
273                     real c;
274                     if (ddbox->tric_dir[dim])
275                     {
276                         /* c is the off-diagonal coupling between the cell planes
277                          * along directions d and d1.
278                          */
279                         c = ddbox->v[dim][dd->dim[d1]][dim];
280                     }
281                     else
282                     {
283                         c = 0;
284                     }
285                     real det = (1 + c*c)*gmx::square(comm->systemInfo.cutoff) - dist_d*dist_d;
286                     if (det > 0)
287                     {
288                         dh[d1] = comm->systemInfo.cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
289                     }
290                     else
291                     {
292                         /* A negative value signals out of range */
293                         dh[d1] = -1;
294                     }
295                 }
296             }
297
298             /* Accumulate the extremes over all pulses */
299             for (int i = 0; i < numElementsInBuffer; i++)
300             {
301                 if (pulse == 0)
302                 {
303                     buf_e[i] = buf_r[i];
304                 }
305                 else
306                 {
307                     if (receiveValidData)
308                     {
309                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
310                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
311                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
312                     }
313
314                     int d1;
315                     if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
316                     {
317                         d1 = 1;
318                     }
319                     else
320                     {
321                         d1 = d + 1;
322                     }
323                     if (receiveValidData && dh[d1] >= 0)
324                     {
325                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
326                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
327                     }
328                 }
329                 /* Copy the received buffer to the send buffer,
330                  * to pass the data through with the next pulse.
331                  */
332                 buf_s[i] = buf_r[i];
333             }
334             if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
335                 (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
336             {
337                 /* Store the extremes */
338                 int pos = 0;
339
340                 for (int d1 = d; d1 < dd->ndim-1; d1++)
341                 {
342                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
343                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
344                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
345                     pos++;
346                 }
347
348                 if (d == 1 || (d == 0 && dd->ndim == 3))
349                 {
350                     for (int i = d; i < 2; i++)
351                     {
352                         comm->zone_d2[1-d][i] = buf_e[pos];
353                         pos++;
354                     }
355                 }
356                 if (d == 0)
357                 {
358                     comm->zone_d1[1] = buf_e[pos];
359                     pos++;
360                 }
361             }
362             else
363             {
364                 if (d == 1 || (d == 0 && dd->ndim == 3))
365                 {
366                     for (int i = d; i < 2; i++)
367                     {
368                         comm->zone_d2[1 - d][i].dataSet = 0;
369                     }
370                 }
371                 if (d == 0)
372                 {
373                     comm->zone_d1[1].dataSet = 0;
374                 }
375             }
376         }
377     }
378
379     if (dd->ndim >= 2)
380     {
381         int dim = dd->dim[1];
382         for (int i = 0; i < 2; i++)
383         {
384             if (comm->zone_d1[i].dataSet != 0)
385             {
386                 if (debug)
387                 {
388                     print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
389                 }
390                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
391                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
392             }
393         }
394     }
395     if (dd->ndim >= 3)
396     {
397         int dim = dd->dim[2];
398         for (int i = 0; i < 2; i++)
399         {
400             for (int j = 0; j < 2; j++)
401             {
402                 if (comm->zone_d2[i][j].dataSet != 0)
403                 {
404                     if (debug)
405                     {
406                         print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
407                     }
408                     cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
409                     cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
410                 }
411             }
412         }
413     }
414     for (int d = 1; d < dd->ndim; d++)
415     {
416         cellsizes[d].fracLowerMax = extr_s[d-1][0];
417         cellsizes[d].fracUpperMin = extr_s[d-1][1];
418         if (debug)
419         {
420             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
421                     d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
422         }
423     }
424 }
425
426 //! Sets the charge-group zones to be equal to the home zone.
427 static void set_zones_ncg_home(gmx_domdec_t *dd)
428 {
429     gmx_domdec_zones_t *zones;
430     int                 i;
431
432     zones = &dd->comm->zones;
433
434     zones->cg_range[0] = 0;
435     for (i = 1; i < zones->n+1; i++)
436     {
437         zones->cg_range[i] = dd->ncg_home;
438     }
439     /* zone_ncg1[0] should always be equal to ncg_home */
440     dd->comm->zone_ncg1[0] = dd->ncg_home;
441 }
442
443 //! Restore atom groups for the charge groups.
444 static void restoreAtomGroups(gmx_domdec_t  *dd,
445                               const t_state *state)
446 {
447     gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;
448
449     std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
450
451     globalAtomGroupIndices.resize(atomGroupsState.size());
452
453     /* Copy back the global charge group indices from state
454      * and rebuild the local charge group to atom index.
455      */
456     for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
457     {
458         globalAtomGroupIndices[i] = atomGroupsState[i];
459     }
460
461     dd->ncg_home = atomGroupsState.size();
462     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
463
464     set_zones_ncg_home(dd);
465 }
466
467 //! Sets the cginfo structures.
468 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
469                           t_forcerec *fr)
470 {
471     if (fr != nullptr)
472     {
473         const cginfo_mb_t *cginfo_mb = fr->cginfo_mb;
474         gmx::ArrayRef<int> cginfo    = fr->cginfo;
475
476         for (int cg = cg0; cg < cg1; cg++)
477         {
478             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
479         }
480     }
481 }
482
483 //! Makes the mappings between global and local atom indices during DD repartioning.
484 static void make_dd_indices(gmx_domdec_t *dd,
485                             const int     atomStart)
486 {
487     const int                 numZones               = dd->comm->zones.n;
488     const int                *zone2cg                = dd->comm->zones.cg_range;
489     const int                *zone_ncg1              = dd->comm->zone_ncg1;
490     gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
491
492     std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
493     gmx_ga2la_t              &ga2la                  = *dd->ga2la;
494
495     if (zone2cg[1] != dd->ncg_home)
496     {
497         gmx_incons("dd->ncg_zone is not up to date");
498     }
499
500     /* Make the local to global and global to local atom index */
501     int a = atomStart;
502     globalAtomIndices.resize(a);
503     for (int zone = 0; zone < numZones; zone++)
504     {
505         int cg0;
506         if (zone == 0)
507         {
508             cg0 = atomStart;
509         }
510         else
511         {
512             cg0 = zone2cg[zone];
513         }
514         int cg1    = zone2cg[zone+1];
515         int cg1_p1 = cg0 + zone_ncg1[zone];
516
517         for (int cg = cg0; cg < cg1; cg++)
518         {
519             int zone1 = zone;
520             if (cg >= cg1_p1)
521             {
522                 /* Signal that this cg is from more than one pulse away */
523                 zone1 += numZones;
524             }
525             int cg_gl = globalAtomGroupIndices[cg];
526             globalAtomIndices.push_back(cg_gl);
527             ga2la.insert(cg_gl, {a, zone1});
528             a++;
529         }
530     }
531 }
532
533 //! Checks whether global and local atom indices are consistent.
534 static void check_index_consistency(const gmx_domdec_t *dd,
535                                     int                 natoms_sys,
536                                     const char         *where)
537 {
538     int       nerr = 0;
539
540     const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
541
542     if (dd->comm->ddSettings.DD_debug > 1)
543     {
544         std::vector<int> have(natoms_sys);
545         for (int a = 0; a < numAtomsInZones; a++)
546         {
547             int globalAtomIndex = dd->globalAtomIndices[a];
548             if (have[globalAtomIndex] > 0)
549             {
550                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
551             }
552             else
553             {
554                 have[globalAtomIndex] = a + 1;
555             }
556         }
557     }
558
559     std::vector<int> have(numAtomsInZones);
560
561     int              ngl = 0;
562     for (int i = 0; i < natoms_sys; i++)
563     {
564         if (const auto entry = dd->ga2la->find(i))
565         {
566             const int a = entry->la;
567             if (a >= numAtomsInZones)
568             {
569                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
570                 nerr++;
571             }
572             else
573             {
574                 have[a] = 1;
575                 if (dd->globalAtomIndices[a] != i)
576                 {
577                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
578                     nerr++;
579                 }
580             }
581             ngl++;
582         }
583     }
584     if (ngl != numAtomsInZones)
585     {
586         fprintf(stderr,
587                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
588                 dd->rank, where, ngl, numAtomsInZones);
589     }
590     for (int a = 0; a < numAtomsInZones; a++)
591     {
592         if (have[a] == 0)
593         {
594             fprintf(stderr,
595                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
596                     dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
597         }
598     }
599
600     if (nerr > 0)
601     {
602         gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
603                   dd->rank, where, nerr);
604     }
605 }
606
607 //! Clear all DD global state indices
608 static void clearDDStateIndices(gmx_domdec_t *dd,
609                                 const bool    keepLocalAtomIndices)
610 {
611     gmx_ga2la_t &ga2la = *dd->ga2la;
612
613     if (!keepLocalAtomIndices)
614     {
615         /* Clear the whole list without the overhead of searching */
616         ga2la.clear();
617     }
618     else
619     {
620         const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
621         for (int i = 0; i < numAtomsInZones; i++)
622         {
623             ga2la.erase(dd->globalAtomIndices[i]);
624         }
625     }
626
627     dd_clear_local_vsite_indices(dd);
628
629     if (dd->constraints)
630     {
631         dd_clear_local_constraint_indices(dd);
632     }
633 }
634
635 bool check_grid_jump(int64_t             step,
636                      const gmx_domdec_t *dd,
637                      real                cutoff,
638                      const gmx_ddbox_t  *ddbox,
639                      gmx_bool            bFatal)
640 {
641     gmx_domdec_comm_t *comm    = dd->comm;
642     bool               invalid = false;
643
644     for (int d = 1; d < dd->ndim; d++)
645     {
646         const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
647         const int                 dim       = dd->dim[d];
648         const real                limit     = grid_jump_limit(comm, cutoff, d);
649         real                      bfac      = ddbox->box_size[dim];
650         if (ddbox->tric_dir[dim])
651         {
652             bfac *= ddbox->skew_fac[dim];
653         }
654         if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
655             (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
656         {
657             invalid = true;
658
659             if (bFatal)
660             {
661                 char buf[22];
662
663                 /* This error should never be triggered under normal
664                  * circumstances, but you never know ...
665                  */
666                 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
667                           gmx_step_str(step, buf),
668                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
669             }
670         }
671     }
672
673     return invalid;
674 }
675 //! Return the duration of force calculations on this rank.
676 static float dd_force_load(gmx_domdec_comm_t *comm)
677 {
678     float load;
679
680     if (comm->ddSettings.eFlop)
681     {
682         load = comm->flop;
683         if (comm->ddSettings.eFlop > 1)
684         {
685             load *= 1.0 + (comm->ddSettings.eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
686         }
687     }
688     else
689     {
690         load = comm->cycl[ddCyclF];
691         if (comm->cycl_n[ddCyclF] > 1)
692         {
693             /* Subtract the maximum of the last n cycle counts
694              * to get rid of possible high counts due to other sources,
695              * for instance system activity, that would otherwise
696              * affect the dynamic load balancing.
697              */
698             load -= comm->cycl_max[ddCyclF];
699         }
700
701 #if GMX_MPI
702         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
703         {
704             float gpu_wait, gpu_wait_sum;
705
706             gpu_wait = comm->cycl[ddCyclWaitGPU];
707             if (comm->cycl_n[ddCyclF] > 1)
708             {
709                 /* We should remove the WaitGPU time of the same MD step
710                  * as the one with the maximum F time, since the F time
711                  * and the wait time are not independent.
712                  * Furthermore, the step for the max F time should be chosen
713                  * the same on all ranks that share the same GPU.
714                  * But to keep the code simple, we remove the average instead.
715                  * The main reason for artificially long times at some steps
716                  * is spurious CPU activity or MPI time, so we don't expect
717                  * that changes in the GPU wait time matter a lot here.
718                  */
719                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
720             }
721             /* Sum the wait times over the ranks that share the same GPU */
722             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
723                           comm->mpi_comm_gpu_shared);
724             /* Replace the wait time by the average over the ranks */
725             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
726         }
727 #endif
728     }
729
730     return load;
731 }
732
733 //! Runs cell size checks and communicates the boundaries.
734 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
735                                   gmx_ddbox_t *ddbox,
736                                   rvec cell_ns_x0, rvec cell_ns_x1,
737                                   int64_t step)
738 {
739     gmx_domdec_comm_t *comm;
740     int                dim_ind, dim;
741
742     comm = dd->comm;
743
744     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
745     {
746         dim = dd->dim[dim_ind];
747
748         /* Without PBC we don't have restrictions on the outer cells */
749         if (!(dim >= ddbox->npbcdim &&
750               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
751             isDlbOn(comm) &&
752             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
753             comm->cellsize_min[dim])
754         {
755             char buf[22];
756             gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
757                       gmx_step_str(step, buf), dim2char(dim),
758                       comm->cell_x1[dim] - comm->cell_x0[dim],
759                       ddbox->skew_fac[dim],
760                       dd->comm->cellsize_min[dim],
761                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
762         }
763     }
764
765     if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
766     {
767         /* Communicate the boundaries and update cell_ns_x0/1 */
768         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
769         if (isDlbOn(dd->comm) && dd->ndim > 1)
770         {
771             check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
772         }
773     }
774 }
775
776 //! Compute and communicate to determine the load distribution across PP ranks.
777 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
778 {
779     gmx_domdec_comm_t *comm;
780     domdec_load_t     *load;
781     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
782     gmx_bool           bSepPME;
783
784     if (debug)
785     {
786         fprintf(debug, "get_load_distribution start\n");
787     }
788
789     wallcycle_start(wcycle, ewcDDCOMMLOAD);
790
791     comm = dd->comm;
792
793     bSepPME = (dd->pme_nodeid >= 0);
794
795     if (dd->ndim == 0 && bSepPME)
796     {
797         /* Without decomposition, but with PME nodes, we need the load */
798         comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
799         comm->load[0].pme = comm->cycl[ddCyclPME];
800     }
801
802     for (int d = dd->ndim - 1; d >= 0; d--)
803     {
804         const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
805         const int                 dim       = dd->dim[d];
806         /* Check if we participate in the communication in this dimension */
807         if (d == dd->ndim-1 ||
808             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
809         {
810             load = &comm->load[d];
811             if (isDlbOn(dd->comm))
812             {
813                 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
814             }
815             int pos = 0;
816             if (d == dd->ndim-1)
817             {
818                 sbuf[pos++] = dd_force_load(comm);
819                 sbuf[pos++] = sbuf[0];
820                 if (isDlbOn(dd->comm))
821                 {
822                     sbuf[pos++] = sbuf[0];
823                     sbuf[pos++] = cell_frac;
824                     if (d > 0)
825                     {
826                         sbuf[pos++] = cellsizes->fracLowerMax;
827                         sbuf[pos++] = cellsizes->fracUpperMin;
828                     }
829                 }
830                 if (bSepPME)
831                 {
832                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
833                     sbuf[pos++] = comm->cycl[ddCyclPME];
834                 }
835             }
836             else
837             {
838                 sbuf[pos++] = comm->load[d+1].sum;
839                 sbuf[pos++] = comm->load[d+1].max;
840                 if (isDlbOn(dd->comm))
841                 {
842                     sbuf[pos++] = comm->load[d+1].sum_m;
843                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
844                     sbuf[pos++] = comm->load[d+1].flags;
845                     if (d > 0)
846                     {
847                         sbuf[pos++] = cellsizes->fracLowerMax;
848                         sbuf[pos++] = cellsizes->fracUpperMin;
849                     }
850                 }
851                 if (bSepPME)
852                 {
853                     sbuf[pos++] = comm->load[d+1].mdf;
854                     sbuf[pos++] = comm->load[d+1].pme;
855                 }
856             }
857             load->nload = pos;
858             /* Communicate a row in DD direction d.
859              * The communicators are setup such that the root always has rank 0.
860              */
861 #if GMX_MPI
862             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
863                        load->load, load->nload*sizeof(float), MPI_BYTE,
864                        0, comm->mpi_comm_load[d]);
865 #endif
866             if (dd->ci[dim] == dd->master_ci[dim])
867             {
868                 /* We are the master along this row, process this row */
869                 RowMaster *rowMaster = nullptr;
870
871                 if (isDlbOn(comm))
872                 {
873                     rowMaster = cellsizes->rowMaster.get();
874                 }
875                 load->sum      = 0;
876                 load->max      = 0;
877                 load->sum_m    = 0;
878                 load->cvol_min = 1;
879                 load->flags    = 0;
880                 load->mdf      = 0;
881                 load->pme      = 0;
882                 int pos        = 0;
883                 for (int i = 0; i < dd->nc[dim]; i++)
884                 {
885                     load->sum += load->load[pos++];
886                     load->max  = std::max(load->max, load->load[pos]);
887                     pos++;
888                     if (isDlbOn(dd->comm))
889                     {
890                         if (rowMaster->dlbIsLimited)
891                         {
892                             /* This direction could not be load balanced properly,
893                              * therefore we need to use the maximum iso the average load.
894                              */
895                             load->sum_m = std::max(load->sum_m, load->load[pos]);
896                         }
897                         else
898                         {
899                             load->sum_m += load->load[pos];
900                         }
901                         pos++;
902                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
903                         pos++;
904                         if (d < dd->ndim-1)
905                         {
906                             load->flags = gmx::roundToInt(load->load[pos++]);
907                         }
908                         if (d > 0)
909                         {
910                             rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
911                             rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
912                         }
913                     }
914                     if (bSepPME)
915                     {
916                         load->mdf = std::max(load->mdf, load->load[pos]);
917                         pos++;
918                         load->pme = std::max(load->pme, load->load[pos]);
919                         pos++;
920                     }
921                 }
922                 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
923                 {
924                     load->sum_m *= dd->nc[dim];
925                     load->flags |= (1<<d);
926                 }
927             }
928         }
929     }
930
931     if (DDMASTER(dd))
932     {
933         comm->nload      += dd_load_count(comm);
934         comm->load_step  += comm->cycl[ddCyclStep];
935         comm->load_sum   += comm->load[0].sum;
936         comm->load_max   += comm->load[0].max;
937         if (isDlbOn(comm))
938         {
939             for (int d = 0; d < dd->ndim; d++)
940             {
941                 if (comm->load[0].flags & (1<<d))
942                 {
943                     comm->load_lim[d]++;
944                 }
945             }
946         }
947         if (bSepPME)
948         {
949             comm->load_mdf += comm->load[0].mdf;
950             comm->load_pme += comm->load[0].pme;
951         }
952     }
953
954     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
955
956     if (debug)
957     {
958         fprintf(debug, "get_load_distribution finished\n");
959     }
960 }
961
962 /*! \brief Return the relative performance loss on the total run time
963  * due to the force calculation load imbalance. */
964 static float dd_force_load_fraction(gmx_domdec_t *dd)
965 {
966     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
967     {
968         return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
969     }
970     else
971     {
972         return 0;
973     }
974 }
975
976 /*! \brief Return the relative performance loss on the total run time
977  * due to the force calculation load imbalance. */
978 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
979 {
980     if (dd->comm->nload > 0 && dd->comm->load_step > 0)
981     {
982         return
983             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
984             (dd->comm->load_step*dd->nnodes);
985     }
986     else
987     {
988         return 0;
989     }
990 }
991
992 //! Print load-balance report e.g. at the end of a run.
993 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
994 {
995     gmx_domdec_comm_t *comm = dd->comm;
996
997     /* Only the master rank prints loads and only if we measured loads */
998     if (!DDMASTER(dd) || comm->nload == 0)
999     {
1000         return;
1001     }
1002
1003     char  buf[STRLEN];
1004     int   numPpRanks   = dd->nnodes;
1005     int   numPmeRanks  = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
1006     int   numRanks     = numPpRanks + numPmeRanks;
1007     float lossFraction = 0;
1008
1009     /* Print the average load imbalance and performance loss */
1010     if (dd->nnodes > 1 && comm->load_sum > 0)
1011     {
1012         float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
1013         lossFraction    = dd_force_imb_perf_loss(dd);
1014
1015         std::string msg         = "\nDynamic load balancing report:\n";
1016         std::string dlbStateStr;
1017
1018         switch (dd->comm->dlbState)
1019         {
1020             case DlbState::offUser:
1021                 dlbStateStr = "DLB was off during the run per user request.";
1022                 break;
1023             case DlbState::offForever:
1024                 /* Currectly this can happen due to performance loss observed, cell size
1025                  * limitations or incompatibility with other settings observed during
1026                  * determineInitialDlbState(). */
1027                 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1028                 break;
1029             case DlbState::offCanTurnOn:
1030                 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1031                 break;
1032             case DlbState::offTemporarilyLocked:
1033                 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1034                 break;
1035             case DlbState::onCanTurnOff:
1036                 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1037                 break;
1038             case DlbState::onUser:
1039                 dlbStateStr = "DLB was permanently on during the run per user request.";
1040                 break;
1041             default:
1042                 GMX_ASSERT(false, "Undocumented DLB state");
1043         }
1044
1045         msg += " " + dlbStateStr + "\n";
1046         msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
1047         msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1048                                  gmx::roundToInt(dd_force_load_fraction(dd)*100));
1049         msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1050                                  lossFraction*100);
1051         fprintf(fplog, "%s", msg.c_str());
1052         fprintf(stderr, "\n%s", msg.c_str());
1053     }
1054
1055     /* Print during what percentage of steps the  load balancing was limited */
1056     bool dlbWasLimited = false;
1057     if (isDlbOn(comm))
1058     {
1059         sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1060         for (int d = 0; d < dd->ndim; d++)
1061         {
1062             int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
1063             sprintf(buf+strlen(buf), " %c %d %%",
1064                     dim2char(dd->dim[d]), limitPercentage);
1065             if (limitPercentage >= 50)
1066             {
1067                 dlbWasLimited = true;
1068             }
1069         }
1070         sprintf(buf + strlen(buf), "\n");
1071         fprintf(fplog, "%s", buf);
1072         fprintf(stderr, "%s", buf);
1073     }
1074
1075     /* Print the performance loss due to separate PME - PP rank imbalance */
1076     float lossFractionPme = 0;
1077     if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1078     {
1079         float pmeForceRatio = comm->load_pme/comm->load_mdf;
1080         lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
1081         if (lossFractionPme <= 0)
1082         {
1083             lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
1084         }
1085         else
1086         {
1087             lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
1088         }
1089         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1090         fprintf(fplog, "%s", buf);
1091         fprintf(stderr, "%s", buf);
1092         sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
1093         fprintf(fplog, "%s", buf);
1094         fprintf(stderr, "%s", buf);
1095     }
1096     fprintf(fplog, "\n");
1097     fprintf(stderr, "\n");
1098
1099     if (lossFraction >= DD_PERF_LOSS_WARN)
1100     {
1101         std::string message = gmx::formatString(
1102                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1103                     "      in the domain decomposition.\n", lossFraction*100);
1104
1105         bool hadSuggestion = false;
1106         if (!isDlbOn(comm))
1107         {
1108             message      += "      You might want to use dynamic load balancing (option -dlb.)\n";
1109             hadSuggestion = true;
1110         }
1111         else if (dlbWasLimited)
1112         {
1113             message      += "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1114             hadSuggestion = true;
1115         }
1116         message += gmx::formatString(
1117                     "      You can %sconsider manually changing the decomposition (option -dd);\n"
1118                     "      e.g. by using fewer domains along the box dimension in which there is\n"
1119                     "      considerable inhomogeneity in the simulated system.",
1120                     hadSuggestion ? "also " : "");
1121
1122
1123         fprintf(fplog, "%s\n", message.c_str());
1124         fprintf(stderr, "%s\n", message.c_str());
1125     }
1126     if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1127     {
1128         sprintf(buf,
1129                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1130                 "      had %s work to do than the PP ranks.\n"
1131                 "      You might want to %s the number of PME ranks\n"
1132                 "      or %s the cut-off and the grid spacing.\n",
1133                 std::fabs(lossFractionPme*100),
1134                 (lossFractionPme < 0) ? "less"     : "more",
1135                 (lossFractionPme < 0) ? "decrease" : "increase",
1136                 (lossFractionPme < 0) ? "decrease" : "increase");
1137         fprintf(fplog, "%s\n", buf);
1138         fprintf(stderr, "%s\n", buf);
1139     }
1140 }
1141
1142 //! Return the minimum communication volume.
1143 static float dd_vol_min(gmx_domdec_t *dd)
1144 {
1145     return dd->comm->load[0].cvol_min*dd->nnodes;
1146 }
1147
1148 //! Return the DD load flags.
1149 static int dd_load_flags(gmx_domdec_t *dd)
1150 {
1151     return dd->comm->load[0].flags;
1152 }
1153
1154 //! Return the reported load imbalance in force calculations.
1155 static float dd_f_imbal(gmx_domdec_t *dd)
1156 {
1157     if (dd->comm->load[0].sum > 0)
1158     {
1159         return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0F;
1160     }
1161     else
1162     {
1163         /* Something is wrong in the cycle counting, report no load imbalance */
1164         return 0.0F;
1165     }
1166 }
1167
1168 //! Returns DD load balance report.
1169 static std::string
1170 dd_print_load(gmx_domdec_t *dd,
1171               int64_t       step)
1172 {
1173     gmx::StringOutputStream stream;
1174     gmx::TextWriter         log(&stream);
1175
1176     int                     flags = dd_load_flags(dd);
1177     if (flags)
1178     {
1179         log.writeString("DD  load balancing is limited by minimum cell size in dimension");
1180         for (int d = 0; d < dd->ndim; d++)
1181         {
1182             if (flags & (1<<d))
1183             {
1184                 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1185             }
1186         }
1187         log.ensureLineBreak();
1188     }
1189     log.writeString("DD  step " + gmx::toString(step));
1190     if (isDlbOn(dd->comm))
1191     {
1192         log.writeStringFormatted("  vol min/aver %5.3f%c",
1193                                  dd_vol_min(dd), flags ? '!' : ' ');
1194     }
1195     if (dd->nnodes > 1)
1196     {
1197         log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
1198     }
1199     if (dd->comm->cycl_n[ddCyclPME])
1200     {
1201         log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1202     }
1203     log.ensureLineBreak();
1204     return stream.toString();
1205 }
1206
1207 //! Prints DD load balance report in mdrun verbose mode.
1208 static void dd_print_load_verbose(gmx_domdec_t *dd)
1209 {
1210     if (isDlbOn(dd->comm))
1211     {
1212         fprintf(stderr, "vol %4.2f%c ",
1213                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1214     }
1215     if (dd->nnodes > 1)
1216     {
1217         fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
1218     }
1219     if (dd->comm->cycl_n[ddCyclPME])
1220     {
1221         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1222     }
1223 }
1224
1225 //! Turns on dynamic load balancing if possible and needed.
1226 static void turn_on_dlb(const gmx::MDLogger &mdlog,
1227                         gmx_domdec_t        *dd,
1228                         int64_t              step)
1229 {
1230     gmx_domdec_comm_t *comm         = dd->comm;
1231
1232     real               cellsize_min = comm->cellsize_min[dd->dim[0]];
1233     for (int d = 1; d < dd->ndim; d++)
1234     {
1235         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1236     }
1237
1238     /* Turn off DLB if we're too close to the cell size limit. */
1239     if (cellsize_min < comm->cellsize_limit*1.05)
1240     {
1241         GMX_LOG(mdlog.info).appendTextFormatted(
1242                 "step %s Measured %.1f %% performance loss due to load imbalance, "
1243                 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1244                 "Will no longer try dynamic load balancing.",
1245                 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1246
1247         comm->dlbState = DlbState::offForever;
1248         return;
1249     }
1250
1251     GMX_LOG(mdlog.info).appendTextFormatted(
1252             "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1253             gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1254     comm->dlbState = DlbState::onCanTurnOff;
1255
1256     /* Store the non-DLB performance, so we can check if DLB actually
1257      * improves performance.
1258      */
1259     GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
1260     comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
1261
1262     set_dlb_limits(dd);
1263
1264     /* We can set the required cell size info here,
1265      * so we do not need to communicate this.
1266      * The grid is completely uniform.
1267      */
1268     for (int d = 0; d < dd->ndim; d++)
1269     {
1270         RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1271
1272         if (rowMaster)
1273         {
1274             comm->load[d].sum_m = comm->load[d].sum;
1275
1276             int nc = dd->nc[dd->dim[d]];
1277             for (int i = 0; i < nc; i++)
1278             {
1279                 rowMaster->cellFrac[i] = i/static_cast<real>(nc);
1280                 if (d > 0)
1281                 {
1282                     rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
1283                     rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
1284                 }
1285             }
1286             rowMaster->cellFrac[nc] = 1.0;
1287         }
1288     }
1289 }
1290
1291 //! Turns off dynamic load balancing (but leave it able to turn back on).
1292 static void turn_off_dlb(const gmx::MDLogger &mdlog,
1293                          gmx_domdec_t        *dd,
1294                          int64_t              step)
1295 {
1296     GMX_LOG(mdlog.info).appendText(
1297             "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
1298     dd->comm->dlbState                     = DlbState::offCanTurnOn;
1299     dd->comm->haveTurnedOffDlb             = true;
1300     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1301 }
1302
1303 //! Turns off dynamic load balancing permanently.
1304 static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
1305                                  gmx_domdec_t        *dd,
1306                                  int64_t              step)
1307 {
1308     GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
1309     GMX_LOG(mdlog.info).appendText(
1310             "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
1311     dd->comm->dlbState = DlbState::offForever;
1312 }
1313
1314 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
1315 {
1316     gmx_domdec_comm_t *comm;
1317
1318     comm = cr->dd->comm;
1319
1320     /* Turn on the DLB limiting (might have been on already) */
1321     comm->bPMELoadBalDLBLimits = TRUE;
1322
1323     /* Change the cut-off limit */
1324     comm->PMELoadBal_max_cutoff = cutoff;
1325
1326     if (debug)
1327     {
1328         fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1329                 comm->PMELoadBal_max_cutoff);
1330     }
1331 }
1332
1333 //! Merge atom buffers.
1334 static void merge_cg_buffers(int ncell,
1335                              gmx_domdec_comm_dim_t *cd, int pulse,
1336                              int  *ncg_cell,
1337                              gmx::ArrayRef<int> index_gl,
1338                              const int  *recv_i,
1339                              gmx::ArrayRef<gmx::RVec> x,
1340                              gmx::ArrayRef<const gmx::RVec> recv_vr,
1341                              cginfo_mb_t *cginfo_mb,
1342                              gmx::ArrayRef<int> cginfo)
1343 {
1344     gmx_domdec_ind_t *ind, *ind_p;
1345     int               p, cell, c, cg, cg0, cg1, cg_gl;
1346     int               shift;
1347
1348     ind = &cd->ind[pulse];
1349
1350     /* First correct the already stored data */
1351     shift = ind->nrecv[ncell];
1352     for (cell = ncell-1; cell >= 0; cell--)
1353     {
1354         shift -= ind->nrecv[cell];
1355         if (shift > 0)
1356         {
1357             /* Move the cg's present from previous grid pulses */
1358             cg0                = ncg_cell[ncell+cell];
1359             cg1                = ncg_cell[ncell+cell+1];
1360             for (cg = cg1-1; cg >= cg0; cg--)
1361             {
1362                 index_gl[cg + shift] = index_gl[cg];
1363                 x[cg + shift]        = x[cg];
1364                 cginfo[cg + shift]   = cginfo[cg];
1365             }
1366             /* Correct the already stored send indices for the shift */
1367             for (p = 1; p <= pulse; p++)
1368             {
1369                 ind_p = &cd->ind[p];
1370                 cg0   = 0;
1371                 for (c = 0; c < cell; c++)
1372                 {
1373                     cg0 += ind_p->nsend[c];
1374                 }
1375                 cg1 = cg0 + ind_p->nsend[cell];
1376                 for (cg = cg0; cg < cg1; cg++)
1377                 {
1378                     ind_p->index[cg] += shift;
1379                 }
1380             }
1381         }
1382     }
1383
1384     /* Merge in the communicated buffers */
1385     shift    = 0;
1386     cg0      = 0;
1387     for (cell = 0; cell < ncell; cell++)
1388     {
1389         cg1 = ncg_cell[ncell+cell+1] + shift;
1390         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1391         {
1392             /* Copy this atom from the buffer */
1393             index_gl[cg1] = recv_i[cg0];
1394             x[cg1]        = recv_vr[cg0];
1395             /* Copy information */
1396             cg_gl          = index_gl[cg1];
1397             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
1398             cg0++;
1399             cg1++;
1400         }
1401         shift                 += ind->nrecv[cell];
1402         ncg_cell[ncell+cell+1] = cg1;
1403     }
1404 }
1405
1406 //! Makes a range partitioning for the atom groups wthin a cell
1407 static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
1408                                int                           nzone,
1409                                int                           atomGroupStart)
1410 {
1411     /* Store the atom block boundaries for easy copying of communication buffers
1412      */
1413     int g = atomGroupStart;
1414     for (int zone = 0; zone < nzone; zone++)
1415     {
1416         for (gmx_domdec_ind_t &ind : cd->ind)
1417         {
1418             ind.cell2at0[zone]  = g;
1419             g                  += ind.nrecv[zone];
1420             ind.cell2at1[zone]  = g;
1421         }
1422     }
1423 }
1424
1425 //! Returns whether a link is missing.
1426 static gmx_bool missing_link(const t_blocka    &link,
1427                              const int          globalAtomIndex,
1428                              const gmx_ga2la_t &ga2la)
1429 {
1430     for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1431     {
1432         if (!ga2la.findHome(link.a[i]))
1433         {
1434             return true;
1435         }
1436     }
1437
1438     return false;
1439 }
1440
1441 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1442 typedef struct {
1443     //! The corners for the non-bonded communication.
1444     real c[DIM][4];
1445     //! Corner for rounding.
1446     real cr0;
1447     //! Corners for rounding.
1448     real cr1[4];
1449     //! Corners for bounded communication.
1450     real bc[DIM];
1451     //! Corner for rounding for bonded communication.
1452     real bcr1;
1453 } dd_corners_t;
1454
1455 //! Determine the corners of the domain(s) we are communicating with.
1456 static void
1457 set_dd_corners(const gmx_domdec_t *dd,
1458                int dim0, int dim1, int dim2,
1459                gmx_bool bDistMB,
1460                dd_corners_t *c)
1461 {
1462     const gmx_domdec_comm_t  *comm;
1463     const gmx_domdec_zones_t *zones;
1464     int i, j;
1465
1466     comm = dd->comm;
1467
1468     zones = &comm->zones;
1469
1470     /* Keep the compiler happy */
1471     c->cr0  = 0;
1472     c->bcr1 = 0;
1473
1474     /* The first dimension is equal for all cells */
1475     c->c[0][0] = comm->cell_x0[dim0];
1476     if (bDistMB)
1477     {
1478         c->bc[0] = c->c[0][0];
1479     }
1480     if (dd->ndim >= 2)
1481     {
1482         dim1 = dd->dim[1];
1483         /* This cell row is only seen from the first row */
1484         c->c[1][0] = comm->cell_x0[dim1];
1485         /* All rows can see this row */
1486         c->c[1][1] = comm->cell_x0[dim1];
1487         if (isDlbOn(dd->comm))
1488         {
1489             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1490             if (bDistMB)
1491             {
1492                 /* For the multi-body distance we need the maximum */
1493                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1494             }
1495         }
1496         /* Set the upper-right corner for rounding */
1497         c->cr0 = comm->cell_x1[dim0];
1498
1499         if (dd->ndim >= 3)
1500         {
1501             dim2 = dd->dim[2];
1502             for (j = 0; j < 4; j++)
1503             {
1504                 c->c[2][j] = comm->cell_x0[dim2];
1505             }
1506             if (isDlbOn(dd->comm))
1507             {
1508                 /* Use the maximum of the i-cells that see a j-cell */
1509                 for (i = 0; i < zones->nizone; i++)
1510                 {
1511                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1512                     {
1513                         if (j >= 4)
1514                         {
1515                             c->c[2][j-4] =
1516                                 std::max(c->c[2][j-4],
1517                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1518                         }
1519                     }
1520                 }
1521                 if (bDistMB)
1522                 {
1523                     /* For the multi-body distance we need the maximum */
1524                     c->bc[2] = comm->cell_x0[dim2];
1525                     for (i = 0; i < 2; i++)
1526                     {
1527                         for (j = 0; j < 2; j++)
1528                         {
1529                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1530                         }
1531                     }
1532                 }
1533             }
1534
1535             /* Set the upper-right corner for rounding */
1536             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1537              * Only cell (0,0,0) can see cell 7 (1,1,1)
1538              */
1539             c->cr1[0] = comm->cell_x1[dim1];
1540             c->cr1[3] = comm->cell_x1[dim1];
1541             if (isDlbOn(dd->comm))
1542             {
1543                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1544                 if (bDistMB)
1545                 {
1546                     /* For the multi-body distance we need the maximum */
1547                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1548                 }
1549             }
1550         }
1551     }
1552 }
1553
1554 /*! \brief Add the atom groups we need to send in this pulse from this
1555  * zone to \p localAtomGroups and \p work. */
1556 static void
1557 get_zone_pulse_cgs(gmx_domdec_t *dd,
1558                    int zonei, int zone,
1559                    int cg0, int cg1,
1560                    gmx::ArrayRef<const int> globalAtomGroupIndices,
1561                    int dim, int dim_ind,
1562                    int dim0, int dim1, int dim2,
1563                    real r_comm2, real r_bcomm2,
1564                    matrix box,
1565                    bool distanceIsTriclinic,
1566                    rvec *normal,
1567                    real skew_fac2_d, real skew_fac_01,
1568                    rvec *v_d, rvec *v_0, rvec *v_1,
1569                    const dd_corners_t *c,
1570                    const rvec sf2_round,
1571                    gmx_bool bDistBonded,
1572                    gmx_bool bBondComm,
1573                    gmx_bool bDist2B,
1574                    gmx_bool bDistMB,
1575                    rvec *cg_cm,
1576                    gmx::ArrayRef<const int> cginfo,
1577                    std::vector<int>     *localAtomGroups,
1578                    dd_comm_setup_work_t *work)
1579 {
1580     gmx_domdec_comm_t *comm;
1581     gmx_bool           bScrew;
1582     gmx_bool           bDistMB_pulse;
1583     int                cg, i;
1584     real               r2, rb2, r, tric_sh;
1585     rvec               rn, rb;
1586     int                dimd;
1587     int                nsend_z, nat;
1588
1589     comm = dd->comm;
1590
1591     bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1592
1593     bDistMB_pulse = (bDistMB && bDistBonded);
1594
1595     /* Unpack the work data */
1596     std::vector<int>       &ibuf = work->atomGroupBuffer;
1597     std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1598     nsend_z                      = 0;
1599     nat                          = work->nat;
1600
1601     for (cg = cg0; cg < cg1; cg++)
1602     {
1603         r2  = 0;
1604         rb2 = 0;
1605         if (!distanceIsTriclinic)
1606         {
1607             /* Rectangular direction, easy */
1608             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1609             if (r > 0)
1610             {
1611                 r2 += r*r;
1612             }
1613             if (bDistMB_pulse)
1614             {
1615                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1616                 if (r > 0)
1617                 {
1618                     rb2 += r*r;
1619                 }
1620             }
1621             /* Rounding gives at most a 16% reduction
1622              * in communicated atoms
1623              */
1624             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1625             {
1626                 r = cg_cm[cg][dim0] - c->cr0;
1627                 /* This is the first dimension, so always r >= 0 */
1628                 r2 += r*r;
1629                 if (bDistMB_pulse)
1630                 {
1631                     rb2 += r*r;
1632                 }
1633             }
1634             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1635             {
1636                 r = cg_cm[cg][dim1] - c->cr1[zone];
1637                 if (r > 0)
1638                 {
1639                     r2 += r*r;
1640                 }
1641                 if (bDistMB_pulse)
1642                 {
1643                     r = cg_cm[cg][dim1] - c->bcr1;
1644                     if (r > 0)
1645                     {
1646                         rb2 += r*r;
1647                     }
1648                 }
1649             }
1650         }
1651         else
1652         {
1653             /* Triclinic direction, more complicated */
1654             clear_rvec(rn);
1655             clear_rvec(rb);
1656             /* Rounding, conservative as the skew_fac multiplication
1657              * will slightly underestimate the distance.
1658              */
1659             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1660             {
1661                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1662                 for (i = dim0+1; i < DIM; i++)
1663                 {
1664                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1665                 }
1666                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1667                 if (bDistMB_pulse)
1668                 {
1669                     rb[dim0] = rn[dim0];
1670                     rb2      = r2;
1671                 }
1672                 /* Take care that the cell planes along dim0 might not
1673                  * be orthogonal to those along dim1 and dim2.
1674                  */
1675                 for (i = 1; i <= dim_ind; i++)
1676                 {
1677                     dimd = dd->dim[i];
1678                     if (normal[dim0][dimd] > 0)
1679                     {
1680                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1681                         if (bDistMB_pulse)
1682                         {
1683                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1684                         }
1685                     }
1686                 }
1687             }
1688             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1689             {
1690                 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1691                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1692                 tric_sh   = 0;
1693                 for (i = dim1+1; i < DIM; i++)
1694                 {
1695                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1696                 }
1697                 rn[dim1] += tric_sh;
1698                 if (rn[dim1] > 0)
1699                 {
1700                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1701                     /* Take care of coupling of the distances
1702                      * to the planes along dim0 and dim1 through dim2.
1703                      */
1704                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1705                     /* Take care that the cell planes along dim1
1706                      * might not be orthogonal to that along dim2.
1707                      */
1708                     if (normal[dim1][dim2] > 0)
1709                     {
1710                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1711                     }
1712                 }
1713                 if (bDistMB_pulse)
1714                 {
1715                     rb[dim1] +=
1716                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1717                     if (rb[dim1] > 0)
1718                     {
1719                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1720                         /* Take care of coupling of the distances
1721                          * to the planes along dim0 and dim1 through dim2.
1722                          */
1723                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1724                         /* Take care that the cell planes along dim1
1725                          * might not be orthogonal to that along dim2.
1726                          */
1727                         if (normal[dim1][dim2] > 0)
1728                         {
1729                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1730                         }
1731                     }
1732                 }
1733             }
1734             /* The distance along the communication direction */
1735             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1736             tric_sh  = 0;
1737             for (i = dim+1; i < DIM; i++)
1738             {
1739                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1740             }
1741             rn[dim] += tric_sh;
1742             if (rn[dim] > 0)
1743             {
1744                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1745                 /* Take care of coupling of the distances
1746                  * to the planes along dim0 and dim1 through dim2.
1747                  */
1748                 if (dim_ind == 1 && zonei == 1)
1749                 {
1750                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1751                 }
1752             }
1753             if (bDistMB_pulse)
1754             {
1755                 clear_rvec(rb);
1756                 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1757                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1758                 if (rb[dim] > 0)
1759                 {
1760                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1761                     /* Take care of coupling of the distances
1762                      * to the planes along dim0 and dim1 through dim2.
1763                      */
1764                     if (dim_ind == 1 && zonei == 1)
1765                     {
1766                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1767                     }
1768                 }
1769             }
1770         }
1771
1772         if (r2 < r_comm2 ||
1773             (bDistBonded &&
1774              ((bDistMB && rb2 < r_bcomm2) ||
1775               (bDist2B && r2  < r_bcomm2)) &&
1776              (!bBondComm ||
1777               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1778                missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg],
1779                             *dd->ga2la)))))
1780         {
1781             /* Store the local and global atom group indices and position */
1782             localAtomGroups->push_back(cg);
1783             ibuf.push_back(globalAtomGroupIndices[cg]);
1784             nsend_z++;
1785
1786             rvec posPbc;
1787             if (dd->ci[dim] == 0)
1788             {
1789                 /* Correct cg_cm for pbc */
1790                 rvec_add(cg_cm[cg], box[dim], posPbc);
1791                 if (bScrew)
1792                 {
1793                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1794                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1795                 }
1796             }
1797             else
1798             {
1799                 copy_rvec(cg_cm[cg], posPbc);
1800             }
1801             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1802
1803             nat += 1;
1804         }
1805     }
1806
1807     work->nat        = nat;
1808     work->nsend_zone = nsend_z;
1809 }
1810
1811 //! Clear data.
1812 static void clearCommSetupData(dd_comm_setup_work_t *work)
1813 {
1814     work->localAtomGroupBuffer.clear();
1815     work->atomGroupBuffer.clear();
1816     work->positionBuffer.clear();
1817     work->nat        = 0;
1818     work->nsend_zone = 0;
1819 }
1820
1821 //! Prepare DD communication.
1822 static void setup_dd_communication(gmx_domdec_t *dd,
1823                                    matrix box, gmx_ddbox_t *ddbox,
1824                                    t_forcerec *fr,
1825                                    t_state *state,
1826                                    PaddedHostVector<gmx::RVec> *f)
1827 {
1828     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1829     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1830     int                    c;
1831     int                   *zone_cg_range, pos_cg;
1832     gmx_domdec_comm_t     *comm;
1833     gmx_domdec_zones_t    *zones;
1834     gmx_domdec_comm_dim_t *cd;
1835     cginfo_mb_t           *cginfo_mb;
1836     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1837     dd_corners_t           corners;
1838     rvec                  *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1839     real                   skew_fac2_d, skew_fac_01;
1840     rvec                   sf2_round;
1841
1842     if (debug)
1843     {
1844         fprintf(debug, "Setting up DD communication\n");
1845     }
1846
1847     comm  = dd->comm;
1848
1849     if (comm->dth.empty())
1850     {
1851         /* Initialize the thread data.
1852          * This can not be done in init_domain_decomposition,
1853          * as the numbers of threads is determined later.
1854          */
1855         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1856         comm->dth.resize(numThreads);
1857     }
1858
1859     bBondComm = comm->systemInfo.filterBondedCommunication;
1860
1861     /* Do we need to determine extra distances for multi-body bondeds? */
1862     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1863
1864     /* Do we need to determine extra distances for only two-body bondeds? */
1865     bDist2B = (bBondComm && !bDistMB);
1866
1867     const real r_comm2  = gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1868     const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1869
1870     if (debug)
1871     {
1872         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1873     }
1874
1875     zones = &comm->zones;
1876
1877     dim0 = dd->dim[0];
1878     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1879     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1880
1881     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1882
1883     /* Triclinic stuff */
1884     normal      = ddbox->normal;
1885     skew_fac_01 = 0;
1886     if (dd->ndim >= 2)
1887     {
1888         v_0 = ddbox->v[dim0];
1889         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1890         {
1891             /* Determine the coupling coefficient for the distances
1892              * to the cell planes along dim0 and dim1 through dim2.
1893              * This is required for correct rounding.
1894              */
1895             skew_fac_01 =
1896                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
1897             if (debug)
1898             {
1899                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1900             }
1901         }
1902     }
1903     if (dd->ndim >= 3)
1904     {
1905         v_1 = ddbox->v[dim1];
1906     }
1907
1908     zone_cg_range = zones->cg_range;
1909     cginfo_mb     = fr->cginfo_mb;
1910
1911     zone_cg_range[0]   = 0;
1912     zone_cg_range[1]   = dd->ncg_home;
1913     comm->zone_ncg1[0] = dd->ncg_home;
1914     pos_cg             = dd->ncg_home;
1915
1916     nat_tot = comm->atomRanges.numHomeAtoms();
1917     nzone   = 1;
1918     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1919     {
1920         dim = dd->dim[dim_ind];
1921         cd  = &comm->cd[dim_ind];
1922
1923         /* Check if we need to compute triclinic distances along this dim */
1924         bool distanceIsTriclinic = false;
1925         for (int i = 0; i <= dim_ind; i++)
1926         {
1927             if (ddbox->tric_dir[dd->dim[i]])
1928             {
1929                 distanceIsTriclinic = true;
1930             }
1931         }
1932
1933         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1934         {
1935             /* No pbc in this dimension, the first node should not comm. */
1936             nzone_send = 0;
1937         }
1938         else
1939         {
1940             nzone_send = nzone;
1941         }
1942
1943         v_d                = ddbox->v[dim];
1944         skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
1945
1946         cd->receiveInPlace = true;
1947         for (int p = 0; p < cd->numPulses(); p++)
1948         {
1949             /* Only atoms communicated in the first pulse are used
1950              * for multi-body bonded interactions or for bBondComm.
1951              */
1952             bDistBonded = ((bDistMB || bDist2B) && p == 0);
1953
1954             gmx_domdec_ind_t *ind = &cd->ind[p];
1955
1956             /* Thread 0 writes in the global index array */
1957             ind->index.clear();
1958             clearCommSetupData(&comm->dth[0]);
1959
1960             for (zone = 0; zone < nzone_send; zone++)
1961             {
1962                 if (dim_ind > 0 && distanceIsTriclinic)
1963                 {
1964                     /* Determine slightly more optimized skew_fac's
1965                      * for rounding.
1966                      * This reduces the number of communicated atoms
1967                      * by about 10% for 3D DD of rhombic dodecahedra.
1968                      */
1969                     for (dimd = 0; dimd < dim; dimd++)
1970                     {
1971                         sf2_round[dimd] = 1;
1972                         if (ddbox->tric_dir[dimd])
1973                         {
1974                             for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1975                             {
1976                                 /* If we are shifted in dimension i
1977                                  * and the cell plane is tilted forward
1978                                  * in dimension i, skip this coupling.
1979                                  */
1980                                 if (!(zones->shift[nzone+zone][i] &&
1981                                       ddbox->v[dimd][i][dimd] >= 0))
1982                                 {
1983                                     sf2_round[dimd] +=
1984                                         gmx::square(ddbox->v[dimd][i][dimd]);
1985                                 }
1986                             }
1987                             sf2_round[dimd] = 1/sf2_round[dimd];
1988                         }
1989                     }
1990                 }
1991
1992                 zonei = zone_perm[dim_ind][zone];
1993                 if (p == 0)
1994                 {
1995                     /* Here we permutate the zones to obtain a convenient order
1996                      * for neighbor searching
1997                      */
1998                     cg0 = zone_cg_range[zonei];
1999                     cg1 = zone_cg_range[zonei+1];
2000                 }
2001                 else
2002                 {
2003                     /* Look only at the cg's received in the previous grid pulse
2004                      */
2005                     cg1 = zone_cg_range[nzone+zone+1];
2006                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2007                 }
2008
2009                 const int numThreads = gmx::ssize(comm->dth);
2010 #pragma omp parallel for num_threads(numThreads) schedule(static)
2011                 for (int th = 0; th < numThreads; th++)
2012                 {
2013                     try
2014                     {
2015                         dd_comm_setup_work_t &work = comm->dth[th];
2016
2017                         /* Retain data accumulated into buffers of thread 0 */
2018                         if (th > 0)
2019                         {
2020                             clearCommSetupData(&work);
2021                         }
2022
2023                         int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
2024                         int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2025
2026                         /* Get the cg's for this pulse in this zone */
2027                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2028                                            dd->globalAtomGroupIndices,
2029                                            dim, dim_ind, dim0, dim1, dim2,
2030                                            r_comm2, r_bcomm2,
2031                                            box, distanceIsTriclinic,
2032                                            normal, skew_fac2_d, skew_fac_01,
2033                                            v_d, v_0, v_1, &corners, sf2_round,
2034                                            bDistBonded, bBondComm,
2035                                            bDist2B, bDistMB,
2036                                            state->x.rvec_array(),
2037                                            fr->cginfo,
2038                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2039                                            &work);
2040                     }
2041                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2042                 } // END
2043
2044                 std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
2045                 std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
2046                 ind->nsend[zone]  = comm->dth[0].nsend_zone;
2047                 /* Append data of threads>=1 to the communication buffers */
2048                 for (int th = 1; th < numThreads; th++)
2049                 {
2050                     const dd_comm_setup_work_t &dth = comm->dth[th];
2051
2052                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2053                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2054                     positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2055                     comm->dth[0].nat += dth.nat;
2056                     ind->nsend[zone] += dth.nsend_zone;
2057                 }
2058             }
2059             /* Clear the counts in case we do not have pbc */
2060             for (zone = nzone_send; zone < nzone; zone++)
2061             {
2062                 ind->nsend[zone] = 0;
2063             }
2064             ind->nsend[nzone]     = ind->index.size();
2065             ind->nsend[nzone + 1] = comm->dth[0].nat;
2066             /* Communicate the number of cg's and atoms to receive */
2067             ddSendrecv(dd, dim_ind, dddirBackward,
2068                        ind->nsend, nzone+2,
2069                        ind->nrecv, nzone+2);
2070
2071             if (p > 0)
2072             {
2073                 /* We can receive in place if only the last zone is not empty */
2074                 for (zone = 0; zone < nzone-1; zone++)
2075                 {
2076                     if (ind->nrecv[zone] > 0)
2077                     {
2078                         cd->receiveInPlace = false;
2079                     }
2080                 }
2081             }
2082
2083             int receiveBufferSize = 0;
2084             if (!cd->receiveInPlace)
2085             {
2086                 receiveBufferSize = ind->nrecv[nzone];
2087             }
2088             /* These buffer are actually only needed with in-place */
2089             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2090             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2091
2092             dd_comm_setup_work_t     &work = comm->dth[0];
2093
2094             /* Make space for the global cg indices */
2095             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2096             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2097             /* Communicate the global cg indices */
2098             gmx::ArrayRef<int> integerBufferRef;
2099             if (cd->receiveInPlace)
2100             {
2101                 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2102             }
2103             else
2104             {
2105                 integerBufferRef = globalAtomGroupBuffer.buffer;
2106             }
2107             ddSendrecv<int>(dd, dim_ind, dddirBackward,
2108                             work.atomGroupBuffer, integerBufferRef);
2109
2110             /* Make space for cg_cm */
2111             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2112
2113             /* Communicate the coordinates */
2114             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2115             if (cd->receiveInPlace)
2116             {
2117                 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2118             }
2119             else
2120             {
2121                 rvecBufferRef = rvecBuffer.buffer;
2122             }
2123             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2124                                   work.positionBuffer, rvecBufferRef);
2125
2126             /* Make the charge group index */
2127             if (cd->receiveInPlace)
2128             {
2129                 zone = (p == 0 ? 0 : nzone - 1);
2130                 while (zone < nzone)
2131                 {
2132                     for (int i = 0; i < ind->nrecv[zone]; i++)
2133                     {
2134                         int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2135                         fr->cginfo[pos_cg]  = ddcginfo(cginfo_mb, globalAtomIndex);
2136                         pos_cg++;
2137                     }
2138                     if (p == 0)
2139                     {
2140                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2141                     }
2142                     zone++;
2143                     zone_cg_range[nzone+zone] = pos_cg;
2144                 }
2145             }
2146             else
2147             {
2148                 /* This part of the code is never executed with bBondComm. */
2149                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2150                                  dd->globalAtomGroupIndices, integerBufferRef.data(),
2151                                  state->x, rvecBufferRef,
2152                                  fr->cginfo_mb, fr->cginfo);
2153                 pos_cg += ind->nrecv[nzone];
2154             }
2155             nat_tot += ind->nrecv[nzone+1];
2156         }
2157         if (!cd->receiveInPlace)
2158         {
2159             /* Store the atom block for easy copying of communication buffers */
2160             make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2161         }
2162         nzone += nzone;
2163     }
2164
2165     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2166
2167     if (!bBondComm)
2168     {
2169         /* We don't need to update cginfo, since that was alrady done above.
2170          * So we pass NULL for the forcerec.
2171          */
2172         dd_set_cginfo(dd->globalAtomGroupIndices,
2173                       dd->ncg_home, dd->globalAtomGroupIndices.size(),
2174                       nullptr);
2175     }
2176
2177     if (debug)
2178     {
2179         fprintf(debug, "Finished setting up DD communication, zones:");
2180         for (c = 0; c < zones->n; c++)
2181         {
2182             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2183         }
2184         fprintf(debug, "\n");
2185     }
2186 }
2187
2188 //! Set boundaries for the charge group range.
2189 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2190 {
2191     int c;
2192
2193     for (c = 0; c < zones->nizone; c++)
2194     {
2195         zones->izone[c].cg1  = zones->cg_range[c+1];
2196         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2197         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2198     }
2199 }
2200
2201 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2202  *
2203  * Also sets the atom density for the home zone when \p zone_start=0.
2204  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2205  * how many charge groups will move but are still part of the current range.
2206  * \todo When converting domdec to use proper classes, all these variables
2207  *       should be private and a method should return the correct count
2208  *       depending on an internal state.
2209  *
2210  * \param[in,out] dd          The domain decomposition struct
2211  * \param[in]     box         The box
2212  * \param[in]     ddbox       The domain decomposition box struct
2213  * \param[in]     zone_start  The start of the zone range to set sizes for
2214  * \param[in]     zone_end    The end of the zone range to set sizes for
2215  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2216  */
2217 static void set_zones_size(gmx_domdec_t *dd,
2218                            matrix box, const gmx_ddbox_t *ddbox,
2219                            int zone_start, int zone_end,
2220                            int numMovedChargeGroupsInHomeZone)
2221 {
2222     gmx_domdec_comm_t  *comm;
2223     gmx_domdec_zones_t *zones;
2224     gmx_bool            bDistMB;
2225     int                 z, zi, d, dim;
2226     real                rcs, rcmbs;
2227     int                 i, j;
2228     real                vol;
2229
2230     comm = dd->comm;
2231
2232     zones = &comm->zones;
2233
2234     /* Do we need to determine extra distances for multi-body bondeds? */
2235     bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds &&
2236                isDlbOn(dd->comm) &&
2237                dd->ndim > 1);
2238
2239     for (z = zone_start; z < zone_end; z++)
2240     {
2241         /* Copy cell limits to zone limits.
2242          * Valid for non-DD dims and non-shifted dims.
2243          */
2244         copy_rvec(comm->cell_x0, zones->size[z].x0);
2245         copy_rvec(comm->cell_x1, zones->size[z].x1);
2246     }
2247
2248     for (d = 0; d < dd->ndim; d++)
2249     {
2250         dim = dd->dim[d];
2251
2252         for (z = 0; z < zones->n; z++)
2253         {
2254             /* With a staggered grid we have different sizes
2255              * for non-shifted dimensions.
2256              */
2257             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2258             {
2259                 if (d == 1)
2260                 {
2261                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2262                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2263                 }
2264                 else if (d == 2)
2265                 {
2266                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2267                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2268                 }
2269             }
2270         }
2271
2272         rcs   = comm->systemInfo.cutoff;
2273         rcmbs = comm->cutoff_mbody;
2274         if (ddbox->tric_dir[dim])
2275         {
2276             rcs   /= ddbox->skew_fac[dim];
2277             rcmbs /= ddbox->skew_fac[dim];
2278         }
2279
2280         /* Set the lower limit for the shifted zone dimensions */
2281         for (z = zone_start; z < zone_end; z++)
2282         {
2283             if (zones->shift[z][dim] > 0)
2284             {
2285                 dim = dd->dim[d];
2286                 if (!isDlbOn(dd->comm) || d == 0)
2287                 {
2288                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2289                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2290                 }
2291                 else
2292                 {
2293                     /* Here we take the lower limit of the zone from
2294                      * the lowest domain of the zone below.
2295                      */
2296                     if (z < 4)
2297                     {
2298                         zones->size[z].x0[dim] =
2299                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2300                     }
2301                     else
2302                     {
2303                         if (d == 1)
2304                         {
2305                             zones->size[z].x0[dim] =
2306                                 zones->size[zone_perm[2][z-4]].x0[dim];
2307                         }
2308                         else
2309                         {
2310                             zones->size[z].x0[dim] =
2311                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2312                         }
2313                     }
2314                     /* A temporary limit, is updated below */
2315                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2316
2317                     if (bDistMB)
2318                     {
2319                         for (zi = 0; zi < zones->nizone; zi++)
2320                         {
2321                             if (zones->shift[zi][dim] == 0)
2322                             {
2323                                 /* This takes the whole zone into account.
2324                                  * With multiple pulses this will lead
2325                                  * to a larger zone then strictly necessary.
2326                                  */
2327                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2328                                                                   zones->size[zi].x1[dim]+rcmbs);
2329                             }
2330                         }
2331                     }
2332                 }
2333             }
2334         }
2335
2336         /* Loop over the i-zones to set the upper limit of each
2337          * j-zone they see.
2338          */
2339         for (zi = 0; zi < zones->nizone; zi++)
2340         {
2341             if (zones->shift[zi][dim] == 0)
2342             {
2343                 /* We should only use zones up to zone_end */
2344                 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2345                 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2346                 {
2347                     if (zones->shift[z][dim] > 0)
2348                     {
2349                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2350                                                           zones->size[zi].x1[dim]+rcs);
2351                     }
2352                 }
2353             }
2354         }
2355     }
2356
2357     for (z = zone_start; z < zone_end; z++)
2358     {
2359         /* Initialization only required to keep the compiler happy */
2360         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2361         int  nc, c;
2362
2363         /* To determine the bounding box for a zone we need to find
2364          * the extreme corners of 4, 2 or 1 corners.
2365          */
2366         nc = 1 << (ddbox->nboundeddim - 1);
2367
2368         for (c = 0; c < nc; c++)
2369         {
2370             /* Set up a zone corner at x=0, ignoring trilinic couplings */
2371             corner[XX] = 0;
2372             if ((c & 1) == 0)
2373             {
2374                 corner[YY] = zones->size[z].x0[YY];
2375             }
2376             else
2377             {
2378                 corner[YY] = zones->size[z].x1[YY];
2379             }
2380             if ((c & 2) == 0)
2381             {
2382                 corner[ZZ] = zones->size[z].x0[ZZ];
2383             }
2384             else
2385             {
2386                 corner[ZZ] = zones->size[z].x1[ZZ];
2387             }
2388             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim &&
2389                 box[ZZ][1 - dd->dim[0]] != 0)
2390             {
2391                 /* With 1D domain decomposition the cg's are not in
2392                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2393                  * Shift the corner of the z-vector back to along the box
2394                  * vector of dimension d, so it will later end up at 0 along d.
2395                  * This can affect the location of this corner along dd->dim[0]
2396                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2397                  */
2398                 int d = 1 - dd->dim[0];
2399
2400                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2401             }
2402             /* Apply the triclinic couplings */
2403             for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2404             {
2405                 for (j = XX; j < i; j++)
2406                 {
2407                     corner[j] += corner[i]*box[i][j]/box[i][i];
2408                 }
2409             }
2410             if (c == 0)
2411             {
2412                 copy_rvec(corner, corner_min);
2413                 copy_rvec(corner, corner_max);
2414             }
2415             else
2416             {
2417                 for (i = 0; i < DIM; i++)
2418                 {
2419                     corner_min[i] = std::min(corner_min[i], corner[i]);
2420                     corner_max[i] = std::max(corner_max[i], corner[i]);
2421                 }
2422             }
2423         }
2424         /* Copy the extreme cornes without offset along x */
2425         for (i = 0; i < DIM; i++)
2426         {
2427             zones->size[z].bb_x0[i] = corner_min[i];
2428             zones->size[z].bb_x1[i] = corner_max[i];
2429         }
2430         /* Add the offset along x */
2431         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2432         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2433     }
2434
2435     if (zone_start == 0)
2436     {
2437         vol = 1;
2438         for (dim = 0; dim < DIM; dim++)
2439         {
2440             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2441         }
2442         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2443     }
2444
2445     if (debug)
2446     {
2447         for (z = zone_start; z < zone_end; z++)
2448         {
2449             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2450                     z,
2451                     zones->size[z].x0[XX], zones->size[z].x1[XX],
2452                     zones->size[z].x0[YY], zones->size[z].x1[YY],
2453                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2454             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2455                     z,
2456                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2457                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2458                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2459         }
2460     }
2461 }
2462
2463 /*! \brief Order data in \p dataToSort according to \p sort
2464  *
2465  * Note: both buffers should have at least \p sort.size() elements.
2466  */
2467 template <typename T>
2468 static void
2469 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2470             gmx::ArrayRef<T>                   dataToSort,
2471             gmx::ArrayRef<T>                   sortBuffer)
2472 {
2473     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2474     GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2475
2476     /* Order the data into the temporary buffer */
2477     size_t i = 0;
2478     for (const gmx_cgsort_t &entry : sort)
2479     {
2480         sortBuffer[i++] = dataToSort[entry.ind];
2481     }
2482
2483     /* Copy back to the original array */
2484     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2485               dataToSort.begin());
2486 }
2487
2488 /*! \brief Order data in \p dataToSort according to \p sort
2489  *
2490  * Note: \p vectorToSort should have at least \p sort.size() elements,
2491  *       \p workVector is resized when it is too small.
2492  */
2493 template <typename T>
2494 static void
2495 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2496             gmx::ArrayRef<T>                   vectorToSort,
2497             std::vector<T>                    *workVector)
2498 {
2499     if (gmx::index(workVector->size()) < sort.ssize())
2500     {
2501         workVector->resize(sort.size());
2502     }
2503     orderVector<T>(sort, vectorToSort, *workVector);
2504 }
2505
2506 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2507 static void dd_sort_order_nbnxn(const t_forcerec          *fr,
2508                                 std::vector<gmx_cgsort_t> *sort)
2509 {
2510     gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2511
2512     /* Using push_back() instead of this resize results in much slower code */
2513     sort->resize(atomOrder.size());
2514     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2515     size_t                      numSorted = 0;
2516     for (int i : atomOrder)
2517     {
2518         if (i >= 0)
2519         {
2520             /* The values of nsc and ind_gl are not used in this case */
2521             buffer[numSorted++].ind = i;
2522         }
2523     }
2524     sort->resize(numSorted);
2525 }
2526
2527 //! Returns the sorting state for DD.
2528 static void dd_sort_state(gmx_domdec_t *dd, t_forcerec *fr, t_state *state)
2529 {
2530     gmx_domdec_sort_t *sort = dd->comm->sort.get();
2531
2532     dd_sort_order_nbnxn(fr, &sort->sorted);
2533
2534     /* We alloc with the old size, since cgindex is still old */
2535     DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, dd->ncg_home);
2536
2537     /* Set the new home atom/charge group count */
2538     dd->ncg_home = sort->sorted.size();
2539     if (debug)
2540     {
2541         fprintf(debug, "Set the new home atom count to %d\n",
2542                 dd->ncg_home);
2543     }
2544
2545     /* Reorder the state */
2546     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2547     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2548
2549     if (state->flags & (1 << estX))
2550     {
2551         orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2552     }
2553     if (state->flags & (1 << estV))
2554     {
2555         orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2556     }
2557     if (state->flags & (1 << estCGP))
2558     {
2559         orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2560     }
2561
2562     /* Reorder the global cg index */
2563     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2564     /* Reorder the cginfo */
2565     orderVector<int>(cgsort, fr->cginfo, &sort->intBuffer);
2566     /* Set the home atom number */
2567     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->ncg_home);
2568
2569     /* The atoms are now exactly in grid order, update the grid order */
2570     fr->nbv->setLocalAtomOrder();
2571 }
2572
2573 //! Accumulates load statistics.
2574 static void add_dd_statistics(gmx_domdec_t *dd)
2575 {
2576     gmx_domdec_comm_t *comm = dd->comm;
2577
2578     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2579     {
2580         auto range = static_cast<DDAtomRanges::Type>(i);
2581         comm->sum_nat[i] +=
2582             comm->atomRanges.end(range) - comm->atomRanges.start(range);
2583     }
2584     comm->ndecomp++;
2585 }
2586
2587 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2588 {
2589     gmx_domdec_comm_t *comm = dd->comm;
2590
2591     /* Reset all the statistics and counters for total run counting */
2592     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2593     {
2594         comm->sum_nat[i] = 0;
2595     }
2596     comm->ndecomp   = 0;
2597     comm->nload     = 0;
2598     comm->load_step = 0;
2599     comm->load_sum  = 0;
2600     comm->load_max  = 0;
2601     clear_ivec(comm->load_lim);
2602     comm->load_mdf = 0;
2603     comm->load_pme = 0;
2604 }
2605
2606 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2607 {
2608     gmx_domdec_comm_t *comm      = cr->dd->comm;
2609
2610     const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2611     gmx_sumd(numRanges, comm->sum_nat, cr);
2612
2613     if (fplog == nullptr)
2614     {
2615         return;
2616     }
2617
2618     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2619
2620     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2621     {
2622         auto   range = static_cast<DDAtomRanges::Type>(i);
2623         double av    = comm->sum_nat[i]/comm->ndecomp;
2624         switch (range)
2625         {
2626             case DDAtomRanges::Type::Zones:
2627                 fprintf(fplog,
2628                         " av. #atoms communicated per step for force:  %d x %.1f\n",
2629                         2, av);
2630                 break;
2631             case DDAtomRanges::Type::Vsites:
2632                 if (cr->dd->vsite_comm)
2633                 {
2634                     fprintf(fplog,
2635                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2636                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2637                             av);
2638                 }
2639                 break;
2640             case DDAtomRanges::Type::Constraints:
2641                 if (cr->dd->constraint_comm)
2642                 {
2643                     fprintf(fplog,
2644                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2645                             1 + ir->nLincsIter, av);
2646                 }
2647                 break;
2648             default:
2649                 gmx_incons(" Unknown type for DD statistics");
2650         }
2651     }
2652     fprintf(fplog, "\n");
2653
2654     if (comm->ddSettings.recordLoad && EI_DYNAMICS(ir->eI))
2655     {
2656         print_dd_load_av(fplog, cr->dd);
2657     }
2658 }
2659
2660 //!\brief TODO Remove fplog when group scheme and charge groups are gone
2661 void dd_partition_system(FILE                        *fplog,
2662                          const gmx::MDLogger         &mdlog,
2663                          int64_t                      step,
2664                          const t_commrec             *cr,
2665                          gmx_bool                     bMasterState,
2666                          int                          nstglobalcomm,
2667                          t_state                     *state_global,
2668                          const gmx_mtop_t            &top_global,
2669                          const t_inputrec            *ir,
2670                          gmx::ImdSession             *imdSession,
2671                          pull_t                      *pull_work,
2672                          t_state                     *state_local,
2673                          PaddedHostVector<gmx::RVec> *f,
2674                          gmx::MDAtoms                *mdAtoms,
2675                          gmx_localtop_t              *top_local,
2676                          t_forcerec                  *fr,
2677                          gmx_vsite_t                 *vsite,
2678                          gmx::Constraints            *constr,
2679                          t_nrnb                      *nrnb,
2680                          gmx_wallcycle               *wcycle,
2681                          gmx_bool                     bVerbose)
2682 {
2683     gmx_domdec_t      *dd;
2684     gmx_domdec_comm_t *comm;
2685     gmx_ddbox_t        ddbox = {0};
2686     int64_t            step_pcoupl;
2687     rvec               cell_ns_x0, cell_ns_x1;
2688     int                ncgindex_set, ncg_moved, nat_f_novirsum;
2689     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
2690     gmx_bool           bRedist;
2691     ivec               np;
2692     real               grid_density;
2693     char               sbuf[22];
2694
2695     wallcycle_start(wcycle, ewcDOMDEC);
2696
2697     dd   = cr->dd;
2698     comm = dd->comm;
2699
2700     // TODO if the update code becomes accessible here, use
2701     // upd->deform for this logic.
2702     bBoxChanged = (bMasterState || inputrecDeform(ir));
2703     if (ir->epc != epcNO)
2704     {
2705         /* With nstpcouple > 1 pressure coupling happens.
2706          * one step after calculating the pressure.
2707          * Box scaling happens at the end of the MD step,
2708          * after the DD partitioning.
2709          * We therefore have to do DLB in the first partitioning
2710          * after an MD step where P-coupling occurred.
2711          * We need to determine the last step in which p-coupling occurred.
2712          * MRS -- need to validate this for vv?
2713          */
2714         int n = ir->nstpcouple;
2715         if (n == 1)
2716         {
2717             step_pcoupl = step - 1;
2718         }
2719         else
2720         {
2721             step_pcoupl = ((step - 1)/n)*n + 1;
2722         }
2723         if (step_pcoupl >= comm->partition_step)
2724         {
2725             bBoxChanged = TRUE;
2726         }
2727     }
2728
2729     bNStGlobalComm = (step % nstglobalcomm == 0);
2730
2731     if (!isDlbOn(comm))
2732     {
2733         bDoDLB = FALSE;
2734     }
2735     else
2736     {
2737         /* Should we do dynamic load balacing this step?
2738          * Since it requires (possibly expensive) global communication,
2739          * we might want to do DLB less frequently.
2740          */
2741         if (bBoxChanged || ir->epc != epcNO)
2742         {
2743             bDoDLB = bBoxChanged;
2744         }
2745         else
2746         {
2747             bDoDLB = bNStGlobalComm;
2748         }
2749     }
2750
2751     /* Check if we have recorded loads on the nodes */
2752     if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2753     {
2754         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2755
2756         /* Print load every nstlog, first and last step to the log file */
2757         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
2758                     comm->n_load_collect == 0 ||
2759                     (ir->nsteps >= 0 &&
2760                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
2761
2762         /* Avoid extra communication due to verbose screen output
2763          * when nstglobalcomm is set.
2764          */
2765         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
2766             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
2767         {
2768             get_load_distribution(dd, wcycle);
2769             if (DDMASTER(dd))
2770             {
2771                 if (bLogLoad)
2772                 {
2773                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
2774                 }
2775                 if (bVerbose)
2776                 {
2777                     dd_print_load_verbose(dd);
2778                 }
2779             }
2780             comm->n_load_collect++;
2781
2782             if (isDlbOn(comm))
2783             {
2784                 if (DDMASTER(dd))
2785                 {
2786                     /* Add the measured cycles to the running average */
2787                     const float averageFactor        = 0.1F;
2788                     comm->cyclesPerStepDlbExpAverage =
2789                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
2790                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
2791                 }
2792                 if (comm->dlbState == DlbState::onCanTurnOff &&
2793                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2794                 {
2795                     gmx_bool turnOffDlb;
2796                     if (DDMASTER(dd))
2797                     {
2798                         /* If the running averaged cycles with DLB are more
2799                          * than before we turned on DLB, turn off DLB.
2800                          * We will again run and check the cycles without DLB
2801                          * and we can then decide if to turn off DLB forever.
2802                          */
2803                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
2804                                       comm->cyclesPerStepBeforeDLB);
2805                     }
2806                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2807                     if (turnOffDlb)
2808                     {
2809                         /* To turn off DLB, we need to redistribute the atoms */
2810                         dd_collect_state(dd, state_local, state_global);
2811                         bMasterState = TRUE;
2812                         turn_off_dlb(mdlog, dd, step);
2813                     }
2814                 }
2815             }
2816             else if (bCheckWhetherToTurnDlbOn)
2817             {
2818                 gmx_bool turnOffDlbForever = FALSE;
2819                 gmx_bool turnOnDlb         = FALSE;
2820
2821                 /* Since the timings are node dependent, the master decides */
2822                 if (DDMASTER(dd))
2823                 {
2824                     /* If we recently turned off DLB, we want to check if
2825                      * performance is better without DLB. We want to do this
2826                      * ASAP to minimize the chance that external factors
2827                      * slowed down the DLB step are gone here and we
2828                      * incorrectly conclude that DLB was causing the slowdown.
2829                      * So we measure one nstlist block, no running average.
2830                      */
2831                     if (comm->haveTurnedOffDlb &&
2832                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
2833                         comm->cyclesPerStepDlbExpAverage)
2834                     {
2835                         /* After turning off DLB we ran nstlist steps in fewer
2836                          * cycles than with DLB. This likely means that DLB
2837                          * in not benefical, but this could be due to a one
2838                          * time unlucky fluctuation, so we require two such
2839                          * observations in close succession to turn off DLB
2840                          * forever.
2841                          */
2842                         if (comm->dlbSlowerPartitioningCount > 0 &&
2843                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
2844                         {
2845                             turnOffDlbForever = TRUE;
2846                         }
2847                         comm->haveTurnedOffDlb           = false;
2848                         /* Register when we last measured DLB slowdown */
2849                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
2850                     }
2851                     else
2852                     {
2853                         /* Here we check if the max PME rank load is more than 0.98
2854                          * the max PP force load. If so, PP DLB will not help,
2855                          * since we are (almost) limited by PME. Furthermore,
2856                          * DLB will cause a significant extra x/f redistribution
2857                          * cost on the PME ranks, which will then surely result
2858                          * in lower total performance.
2859                          */
2860                         if (comm->ddRankSetup.usePmeOnlyRanks &&
2861                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2862                         {
2863                             turnOnDlb = FALSE;
2864                         }
2865                         else
2866                         {
2867                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2868                         }
2869                     }
2870                 }
2871                 struct
2872                 {
2873                     gmx_bool turnOffDlbForever;
2874                     gmx_bool turnOnDlb;
2875                 }
2876                 bools {
2877                     turnOffDlbForever, turnOnDlb
2878                 };
2879                 dd_bcast(dd, sizeof(bools), &bools);
2880                 if (bools.turnOffDlbForever)
2881                 {
2882                     turn_off_dlb_forever(mdlog, dd, step);
2883                 }
2884                 else if (bools.turnOnDlb)
2885                 {
2886                     turn_on_dlb(mdlog, dd, step);
2887                     bDoDLB = TRUE;
2888                 }
2889             }
2890         }
2891         comm->n_load_have++;
2892     }
2893
2894     bRedist = FALSE;
2895     if (bMasterState)
2896     {
2897         /* Clear the old state */
2898         clearDDStateIndices(dd, false);
2899         ncgindex_set = 0;
2900
2901         auto xGlobal = positionsFromStatePointer(state_global);
2902
2903         set_ddbox(*dd, true,
2904                   DDMASTER(dd) ? state_global->box : nullptr,
2905                   true, xGlobal,
2906                   &ddbox);
2907
2908         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
2909
2910         /* Ensure that we have space for the new distribution */
2911         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
2912
2913         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2914
2915         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2916     }
2917     else if (state_local->ddp_count != dd->ddp_count)
2918     {
2919         if (state_local->ddp_count > dd->ddp_count)
2920         {
2921             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
2922         }
2923
2924         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2925         {
2926             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
2927         }
2928
2929         /* Clear the old state */
2930         clearDDStateIndices(dd, false);
2931
2932         /* Restore the atom group indices from state_local */
2933         restoreAtomGroups(dd, state_local);
2934         make_dd_indices(dd, 0);
2935         ncgindex_set = dd->ncg_home;
2936
2937         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2938
2939         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr);
2940
2941         set_ddbox(*dd, bMasterState, state_local->box,
2942                   true, state_local->x, &ddbox);
2943
2944         bRedist = isDlbOn(comm);
2945     }
2946     else
2947     {
2948         /* We have the full state, only redistribute the cgs */
2949
2950         /* Clear the non-home indices */
2951         clearDDStateIndices(dd, true);
2952         ncgindex_set = 0;
2953
2954         /* To avoid global communication, we do not recompute the extent
2955          * of the system for dims without pbc. Therefore we need to copy
2956          * the previously computed values when we do not communicate.
2957          */
2958         if (!bNStGlobalComm)
2959         {
2960             copy_rvec(comm->box0, ddbox.box0    );
2961             copy_rvec(comm->box_size, ddbox.box_size);
2962         }
2963         set_ddbox(*dd, bMasterState, state_local->box,
2964                   bNStGlobalComm, state_local->x, &ddbox);
2965
2966         bBoxChanged = TRUE;
2967         bRedist     = TRUE;
2968     }
2969     /* Copy needed for dim's without pbc when avoiding communication */
2970     copy_rvec(ddbox.box0, comm->box0    );
2971     copy_rvec(ddbox.box_size, comm->box_size);
2972
2973     set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB,
2974                       step, wcycle);
2975
2976     if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
2977     {
2978         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
2979     }
2980
2981     if (comm->systemInfo.useUpdateGroups)
2982     {
2983         comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
2984                                        state_local->x);
2985     }
2986
2987     /* Check if we should sort the charge groups */
2988     const bool bSortCG = (bMasterState || bRedist);
2989
2990     /* When repartitioning we mark atom groups that will move to neighboring
2991      * DD cells, but we do not move them right away for performance reasons.
2992      * Thus we need to keep track of how many charge groups will move for
2993      * obtaining correct local charge group / atom counts.
2994      */
2995     ncg_moved = 0;
2996     if (bRedist)
2997     {
2998         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
2999
3000         ncgindex_set = dd->ncg_home;
3001         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3002                            state_local, f, fr,
3003                            nrnb, &ncg_moved);
3004
3005         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3006
3007         if (comm->systemInfo.useUpdateGroups)
3008         {
3009             comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3010                                            state_local->x);
3011         }
3012
3013         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3014     }
3015
3016     // TODO: Integrate this code in the nbnxm module
3017     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3018                           dd, &ddbox,
3019                           &comm->cell_x0, &comm->cell_x1,
3020                           dd->ncg_home, as_rvec_array(state_local->x.data()),
3021                           cell_ns_x0, cell_ns_x1, &grid_density);
3022
3023     if (bBoxChanged)
3024     {
3025         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3026     }
3027
3028     if (bSortCG)
3029     {
3030         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3031
3032         /* Sort the state on charge group position.
3033          * This enables exact restarts from this step.
3034          * It also improves performance by about 15% with larger numbers
3035          * of atoms per node.
3036          */
3037
3038         /* Fill the ns grid with the home cell,
3039          * so we can sort with the indices.
3040          */
3041         set_zones_ncg_home(dd);
3042
3043         set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3044
3045         nbnxn_put_on_grid(fr->nbv.get(), state_local->box,
3046                           0,
3047                           comm->zones.size[0].bb_x0,
3048                           comm->zones.size[0].bb_x1,
3049                           comm->updateGroupsCog.get(),
3050                           0, dd->ncg_home,
3051                           comm->zones.dens_zone0,
3052                           fr->cginfo,
3053                           state_local->x,
3054                           ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr);
3055
3056         if (debug)
3057         {
3058             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3059                     gmx_step_str(step, sbuf), dd->ncg_home);
3060         }
3061         dd_sort_state(dd, fr, state_local);
3062
3063         /* After sorting and compacting we set the correct size */
3064         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3065
3066         /* Rebuild all the indices */
3067         dd->ga2la->clear();
3068         ncgindex_set = 0;
3069
3070         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3071     }
3072     else
3073     {
3074         /* With the group scheme the sorting array is part of the DD state,
3075          * but it just got out of sync, so mark as invalid by emptying it.
3076          */
3077         if (ir->cutoff_scheme == ecutsGROUP)
3078         {
3079             comm->sort->sorted.clear();
3080         }
3081     }
3082
3083     if (comm->systemInfo.useUpdateGroups)
3084     {
3085         /* The update groups cog's are invalid after sorting
3086          * and need to be cleared before the next partitioning anyhow.
3087          */
3088         comm->updateGroupsCog->clear();
3089     }
3090
3091     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3092
3093     /* Setup up the communication and communicate the coordinates */
3094     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3095
3096     /* Set the indices */
3097     make_dd_indices(dd, ncgindex_set);
3098
3099     /* Set the charge group boundaries for neighbor searching */
3100     set_cg_boundaries(&comm->zones);
3101
3102     if (fr->cutoff_scheme == ecutsVERLET)
3103     {
3104         /* When bSortCG=true, we have already set the size for zone 0 */
3105         set_zones_size(dd, state_local->box, &ddbox,
3106                        bSortCG ? 1 : 0, comm->zones.n,
3107                        0);
3108     }
3109
3110     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3111
3112     /*
3113        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3114                  -1,state_local->x.rvec_array(),state_local->box);
3115      */
3116
3117     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3118
3119     /* Extract a local topology from the global topology */
3120     for (int i = 0; i < dd->ndim; i++)
3121     {
3122         np[dd->dim[i]] = comm->cd[i].numPulses();
3123     }
3124     dd_make_local_top(dd, &comm->zones, dd->unitCellInfo.npbcdim, state_local->box,
3125                       comm->cellsize_min, np,
3126                       fr,
3127                       state_local->x.rvec_array(),
3128                       top_global, top_local);
3129
3130     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3131
3132     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3133
3134     /* Set up the special atom communication */
3135     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3136     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3137     {
3138         auto range = static_cast<DDAtomRanges::Type>(i);
3139         switch (range)
3140         {
3141             case DDAtomRanges::Type::Vsites:
3142                 if (vsite && vsite->numInterUpdategroupVsites)
3143                 {
3144                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3145                 }
3146                 break;
3147             case DDAtomRanges::Type::Constraints:
3148                 if (dd->comm->systemInfo.haveSplitConstraints || dd->comm->systemInfo.haveSplitSettles)
3149                 {
3150                     /* Only for inter-cg constraints we need special code */
3151                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo.data(),
3152                                                   constr, ir->nProjOrder,
3153                                                   top_local->idef.il);
3154                 }
3155                 break;
3156             default:
3157                 gmx_incons("Unknown special atom type setup");
3158         }
3159         comm->atomRanges.setEnd(range, n);
3160     }
3161
3162     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3163
3164     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3165
3166     /* Make space for the extra coordinates for virtual site
3167      * or constraint communication.
3168      */
3169     state_local->natoms = comm->atomRanges.numAtomsTotal();
3170
3171     dd_resize_state(state_local, f, state_local->natoms);
3172
3173     if (fr->haveDirectVirialContributions)
3174     {
3175         if (vsite && vsite->numInterUpdategroupVsites)
3176         {
3177             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3178         }
3179         else
3180         {
3181             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3182             {
3183                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3184             }
3185             else
3186             {
3187                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3188             }
3189         }
3190     }
3191     else
3192     {
3193         nat_f_novirsum = 0;
3194     }
3195
3196     /* Set the number of atoms required for the force calculation.
3197      * Forces need to be constrained when doing energy
3198      * minimization. For simple simulations we could avoid some
3199      * allocation, zeroing and copying, but this is probably not worth
3200      * the complications and checking.
3201      */
3202     forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3203                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3204                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3205                         nat_f_novirsum);
3206
3207     /* Update atom data for mdatoms and several algorithms */
3208     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3209                               nullptr, mdAtoms, constr, vsite, nullptr);
3210
3211     auto mdatoms = mdAtoms->mdatoms();
3212     if (!thisRankHasDuty(cr, DUTY_PME))
3213     {
3214         /* Send the charges and/or c6/sigmas to our PME only node */
3215         gmx_pme_send_parameters(cr,
3216                                 fr->ic,
3217                                 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3218                                 mdatoms->chargeA, mdatoms->chargeB,
3219                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3220                                 mdatoms->sigmaA, mdatoms->sigmaB,
3221                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3222     }
3223
3224     if (ir->bPull)
3225     {
3226         /* Update the local pull groups */
3227         dd_make_local_pull_groups(cr, pull_work);
3228     }
3229
3230     if (dd->atomSets != nullptr)
3231     {
3232         /* Update the local atom sets */
3233         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3234     }
3235
3236     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3237     imdSession->dd_make_local_IMD_atoms(dd);
3238
3239     add_dd_statistics(dd);
3240
3241     /* Make sure we only count the cycles for this DD partitioning */
3242     clear_dd_cycle_counts(dd);
3243
3244     /* Because the order of the atoms might have changed since
3245      * the last vsite construction, we need to communicate the constructing
3246      * atom coordinates again (for spreading the forces this MD step).
3247      */
3248     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3249
3250     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3251
3252     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3253     {
3254         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3255         write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3256                      -1, state_local->x.rvec_array(), state_local->box);
3257     }
3258
3259     /* Store the partitioning step */
3260     comm->partition_step = step;
3261
3262     /* Increase the DD partitioning counter */
3263     dd->ddp_count++;
3264     /* The state currently matches this DD partitioning count, store it */
3265     state_local->ddp_count = dd->ddp_count;
3266     if (bMasterState)
3267     {
3268         /* The DD master node knows the complete cg distribution,
3269          * store the count so we can possibly skip the cg info communication.
3270          */
3271         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3272     }
3273
3274     if (comm->ddSettings.DD_debug > 0)
3275     {
3276         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3277         check_index_consistency(dd, top_global.natoms, "after partitioning");
3278     }
3279
3280     wallcycle_stop(wcycle, ewcDOMDEC);
3281 }
3282
3283 /*! \brief Check whether bonded interactions are missing, if appropriate */
3284 void checkNumberOfBondedInteractions(const gmx::MDLogger  &mdlog,
3285                                      t_commrec            *cr,
3286                                      int                   totalNumberOfBondedInteractions,
3287                                      const gmx_mtop_t     *top_global,
3288                                      const gmx_localtop_t *top_local,
3289                                      const rvec           *x,
3290                                      const matrix          box,
3291                                      bool                 *shouldCheckNumberOfBondedInteractions)
3292 {
3293     if (*shouldCheckNumberOfBondedInteractions)
3294     {
3295         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3296         {
3297             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, x, box); // Does not return
3298         }
3299         *shouldCheckNumberOfBondedInteractions = false;
3300     }
3301 }