/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines functions for mdrun to call to make a new
 * domain decomposition, and check it.
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_domdec
 */

#include "gmxpre.h"

#include "partition.h"

#include "config.h"

#include <cassert>
#include <cstdio>

#include <algorithm>

#include "gromacs/domdec/collect.h"
#include "gromacs/domdec/dlb.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
#include "gromacs/domdec/localatomsetmanager.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/imd/imd.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdsetup.h"
#include "gromacs/mdlib/nsgrid.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/stringstream.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/textwriter.h"

#include "box.h"
#include "cellsizes.h"
#include "distribute.h"
#include "domdec_constraints.h"
#include "domdec_internal.h"
#include "domdec_vsite.h"
#include "dump.h"
#include "redistribute.h"
#include "utility.h"

/*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 *
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

//! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
#define DD_PERF_LOSS_WARN    0.05
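
/* Illustration: a measured loss fraction of e.g. 0.03 would turn on DLB
 * (0.03 >= DD_PERF_LOSS_DLB_ON) without triggering the imbalance warning
 * (0.03 < DD_PERF_LOSS_WARN).
 */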


//! Debug helper printing a DD zone
static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
{
    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d, i, j,
            zone->min0, zone->max1,
            zone->mch0, zone->mch1,
            zone->p1_0, zone->p1_1);
}

/*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
 * takes the extremes over all home and remote zones in the halo
 * and returns the results in cell_ns_x0 and cell_ns_x1.
 * Note: only used with the group cut-off scheme.
 */
static void dd_move_cellx(gmx_domdec_t      *dd,
                          const gmx_ddbox_t *ddbox,
                          rvec               cell_ns_x0,
                          rvec               cell_ns_x1)
{
    constexpr int      c_ddZoneCommMaxNumZones = 5;
    gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
    gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
    gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
    gmx_domdec_comm_t *comm = dd->comm;

    rvec               extr_s[2];
    rvec               extr_r[2];
    for (int d = 1; d < dd->ndim; d++)
    {
        int           dim = dd->dim[d];
        gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];

        /* Copy the base sizes of the home zone */
        zp.min0    = cell_ns_x0[dim];
        zp.max1    = cell_ns_x1[dim];
        zp.min1    = cell_ns_x1[dim];
        zp.mch0    = cell_ns_x0[dim];
        zp.mch1    = cell_ns_x1[dim];
        zp.p1_0    = cell_ns_x0[dim];
        zp.p1_1    = cell_ns_x1[dim];
        zp.dataSet = 1;
    }

    gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;

    /* Loop backward over the dimensions and aggregate the extremes
     * of the cell sizes.
     */
    for (int d = dd->ndim - 2; d >= 0; d--)
    {
        const int  dim      = dd->dim[d];
        const bool applyPbc = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = cellsizes[d + 1].fracLower;
        extr_s[d][1] = cellsizes[d + 1].fracUpper;
        extr_s[d][2] = cellsizes[d + 1].fracUpper;

        int pos = 0;
        GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (int d1 = d; d1 < dd->ndim-1; d1++)
        {
            gmx_ddzone_t &buf = buf_s[pos];

            /* We invert the order to be able to use the same loop for buf_e */
            buf.min0    = extr_s[d1][1];
            buf.max1    = extr_s[d1][0];
            buf.min1    = extr_s[d1][2];
            buf.mch0    = 0;
            buf.mch1    = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf.p1_0    = comm->cell_x0[dim];
            buf.p1_1    = 0;
            buf.dataSet = 1;
            pos++;
        }

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        int numPulses = comm->cd[d].numPulses();
        int numPulsesMin;
        if (applyPbc)
        {
            /* Take the minimum to avoid double communication */
            numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            numPulsesMin = numPulses;
        }
        for (int pulse = 0; pulse < numPulsesMin; pulse++)
        {
            /* Communicate the extremes forward */
            bool receiveValidData = (applyPbc || dd->ci[dim] > 0);

            int  numElements      = dd->ndim - d - 1;
            ddSendrecv(dd, d, dddirForward,
                       extr_s + d, numElements,
                       extr_r + d, numElements);

            if (receiveValidData)
            {
                for (int d1 = d; d1 < dd->ndim - 1; d1++)
                {
                    extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        const int numElementsInBuffer = pos;
        for (int pulse = 0; pulse < numPulses; pulse++)
        {
            /* Communicate all the zone information backward */
            bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);

            static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");

            int numReals = numElementsInBuffer*c_ddzoneNumReals;
            ddSendrecv(dd, d, dddirBackward,
                       gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
                       gmx::arrayRefFromArray(&buf_r[0].min0, numReals));

            rvec dh = { 0 };
            if (pulse > 0)
            {
                for (int d1 = d + 1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
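                    /* Geometric sketch: with skew coupling c between the
                     * cell planes of d and d1, the largest height h along d1
                     * still within the cut-off at plane distance dist_d
                     * along d solves (dist_d - c*h)^2 + h^2 = cutoff^2, i.e.
                     * h = (c*dist_d + sqrt((1 + c^2)*cutoff^2 - dist_d^2))/(1 + c^2).
                     * Below we compute dh = cutoff - h; a non-positive
                     * discriminant means dist_d >= sqrt(1 + c^2)*cutoff, so
                     * nothing along d1 can be within range.
                     */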
                    real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    real c;
                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (int i = 0; i < numElementsInBuffer; i++)
            {
                if (pulse == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (receiveValidData)
                    {
                        buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                    }

                    int d1;
                    if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (receiveValidData && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
                (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
            {
                /* Store the extremes */
                int pos = 0;

                for (int d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (int i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
            else
            {
                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (int i = d; i < 2; i++)
                    {
                        comm->zone_d2[1 - d][i].dataSet = 0;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1].dataSet = 0;
                }
            }
        }
    }

    if (dd->ndim >= 2)
    {
        int dim = dd->dim[1];
        for (int i = 0; i < 2; i++)
        {
            if (comm->zone_d1[i].dataSet != 0)
            {
                if (debug)
                {
                    print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
                }
                cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
                cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
            }
        }
    }
    if (dd->ndim >= 3)
    {
        int dim = dd->dim[2];
        for (int i = 0; i < 2; i++)
        {
            for (int j = 0; j < 2; j++)
            {
                if (comm->zone_d2[i][j].dataSet != 0)
                {
                    if (debug)
                    {
                        print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                    }
                    cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                    cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
                }
            }
        }
    }
    for (int d = 1; d < dd->ndim; d++)
    {
        cellsizes[d].fracLowerMax = extr_s[d-1][0];
        cellsizes[d].fracUpperMin = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
        }
    }
}

//! Sets the charge-group zones to be equal to the home zone.
static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;
    int                 i;

    zones = &dd->comm->zones;

    zones->cg_range[0] = 0;
    for (i = 1; i < zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
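    /* Illustration: with e.g. 8 zones and ncg_home = 100, cg_range is now
     * {0, 100, 100, ..., 100}, i.e. all non-home zones are (still) empty.
     */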
    /* zone_ncg1[0] should always be equal to ncg_home */
    dd->comm->zone_ncg1[0] = dd->ncg_home;
}

//! Restore atom groups for the charge groups.
static void restoreAtomGroups(gmx_domdec_t *dd,
                              const int *gcgs_index, const t_state *state)
{
    gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;

    std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
    gmx::RangePartitioning   &atomGrouping           = dd->atomGrouping_;

    globalAtomGroupIndices.resize(atomGroupsState.size());
    atomGrouping.clear();

    /* Copy back the global charge group indices from state
     * and rebuild the local charge group to atom index.
     */
    for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
    {
        const int atomGroupGlobal  = atomGroupsState[i];
        const int groupSize        = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
        globalAtomGroupIndices[i]  = atomGroupGlobal;
        atomGrouping.appendBlock(groupSize);
    }

    dd->ncg_home = atomGroupsState.size();
    dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGrouping.fullRange().end());

    set_zones_ncg_home(dd);
}

//! Sets the cginfo structures.
static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
                          t_forcerec *fr, char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;
    int         *cginfo;
    int          cg;

    if (fr != nullptr)
    {
        cginfo_mb = fr->cginfo_mb;
        cginfo    = fr->cginfo;

        for (cg = cg0; cg < cg1; cg++)
        {
            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
        }
    }

    if (bLocalCG != nullptr)
    {
        for (cg = cg0; cg < cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}

//! Makes the mappings between global and local atom indices during DD repartitioning.
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index, int cg_start)
{
    const int                 numZones               = dd->comm->zones.n;
    const int                *zone2cg                = dd->comm->zones.cg_range;
    const int                *zone_ncg1              = dd->comm->zone_ncg1;
    gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
    const gmx_bool            bCGs                   = dd->comm->bCGs;

    std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
    gmx_ga2la_t              &ga2la                  = *dd->ga2la;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    int a = dd->atomGrouping().subRange(cg_start, cg_start).begin();
    globalAtomIndices.resize(a);
    for (int zone = 0; zone < numZones; zone++)
    {
        int cg0;
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        int cg1    = zone2cg[zone+1];
        int cg1_p1 = cg0 + zone_ncg1[zone];

        for (int cg = cg0; cg < cg1; cg++)
        {
            int zone1 = zone;
            if (cg >= cg1_p1)
            {
                /* Signal that this cg is from more than one pulse away */
                zone1 += numZones;
            }
            int cg_gl = globalAtomGroupIndices[cg];
            if (bCGs)
            {
                for (int a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                {
                    globalAtomIndices.push_back(a_gl);
                    ga2la.insert(a_gl, {a, zone1});
                    a++;
                }
            }
            else
            {
                globalAtomIndices.push_back(cg_gl);
                ga2la.insert(cg_gl, {a, zone1});
                a++;
            }
        }
    }
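
    /* Post-condition sketch: every local atom index la assigned above
     * satisfies ga2la.find(globalAtomIndices[la])->la == la (stated for
     * clarity, not checked here).
     */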
}

//! Checks the charge-group assignments.
static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                          const char *where)
{
    int nerr = 0;
    if (bLocalCG == nullptr)
    {
        return nerr;
    }
    for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
    {
        if (!bLocalCG[dd->globalAtomGroupIndices[i]])
        {
            fprintf(stderr,
                    "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
            nerr++;
        }
    }
    size_t ngl = 0;
    for (int i = 0; i < ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->globalAtomGroupIndices.size())
    {
        fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
        nerr++;
    }

    return nerr;
}

//! Checks whether global and local atom indices are consistent.
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys, int ncg_sys,
                                    const char *where)
{
    int       nerr = 0;

    const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);

    if (dd->comm->DD_debug > 1)
    {
        std::vector<int> have(natoms_sys);
        for (int a = 0; a < numAtomsInZones; a++)
        {
            int globalAtomIndex = dd->globalAtomIndices[a];
            if (have[globalAtomIndex] > 0)
            {
                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
            }
            else
            {
                have[globalAtomIndex] = a + 1;
            }
        }
    }

    std::vector<int> have(numAtomsInZones);

    int              ngl = 0;
    for (int i = 0; i < natoms_sys; i++)
    {
        if (const auto entry = dd->ga2la->find(i))
        {
            const int a = entry->la;
            if (a >= numAtomsInZones)
            {
                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
                nerr++;
            }
            else
            {
                have[a] = 1;
                if (dd->globalAtomIndices[a] != i)
                {
                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != numAtomsInZones)
    {
        fprintf(stderr,
                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank, where, ngl, numAtomsInZones);
    }
    for (int a = 0; a < numAtomsInZones; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
        }
    }

    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
                  dd->rank, where, nerr);
    }
}

//! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
static void clearDDStateIndices(gmx_domdec_t *dd,
                                int           atomGroupStart,
                                int           atomStart)
{
    gmx_ga2la_t &ga2la = *dd->ga2la;

    if (atomStart == 0)
    {
        /* Clear the whole list without the overhead of searching */
        ga2la.clear();
    }
    else
    {
        const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
        for (int i = 0; i < numAtomsInZones; i++)
        {
            ga2la.erase(dd->globalAtomIndices[i]);
        }
    }

    char *bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
        {
            bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}

bool check_grid_jump(int64_t             step,
                     const gmx_domdec_t *dd,
                     real                cutoff,
                     const gmx_ddbox_t  *ddbox,
                     gmx_bool            bFatal)
{
    gmx_domdec_comm_t *comm    = dd->comm;
    bool               invalid = false;

    for (int d = 1; d < dd->ndim; d++)
    {
        const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
        const int                 dim       = dd->dim[d];
        const real                limit     = grid_jump_limit(comm, cutoff, d);
        real                      bfac      = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
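        /* bfac converts cell-size fractions into (skew-corrected) real-space
         * distances, so the checks below compare the actual gap between
         * neighboring staggered cells against the grid jump limit.
         */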
        if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
            (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
        {
            invalid = true;

            if (bFatal)
            {
                char buf[22];

                /* This error should never be triggered under normal
                 * circumstances, but you never know ...
                 */
                gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
                          gmx_step_str(step, buf),
                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
            }
        }
    }

    return invalid;
}

//! Return the duration of force calculations on this rank.
static float dd_force_load(gmx_domdec_comm_t *comm)
{
    float load;

    if (comm->eFlop)
    {
        load = comm->flop;
        if (comm->eFlop > 1)
        {
            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
        }
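        /* With eFlop > 1 the flop-based load is thus randomly perturbed,
         * by up to +/-5% per unit of eFlop above 1, which can be used to
         * artificially exercise the dynamic load balancer.
         */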
    }
    else
    {
        load = comm->cycl[ddCyclF];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* Subtract the maximum of the last n cycle counts
             * to get rid of possible high counts due to other sources,
             * for instance system activity, that would otherwise
             * affect the dynamic load balancing.
             */
            load -= comm->cycl_max[ddCyclF];
        }

#if GMX_MPI
        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
        {
            float gpu_wait, gpu_wait_sum;

            gpu_wait = comm->cycl[ddCyclWaitGPU];
            if (comm->cycl_n[ddCyclF] > 1)
            {
                /* We should remove the WaitGPU time of the same MD step
                 * as the one with the maximum F time, since the F time
                 * and the wait time are not independent.
                 * Furthermore, the step for the max F time should be chosen
                 * the same on all ranks that share the same GPU.
                 * But to keep the code simple, we remove the average instead.
                 * The main reason for artificially long times at some steps
                 * is spurious CPU activity or MPI time, so we don't expect
                 * that changes in the GPU wait time matter a lot here.
                 */
                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
            }
            /* Sum the wait times over the ranks that share the same GPU */
            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                          comm->mpi_comm_gpu_shared);
            /* Replace the wait time by the average over the ranks */
            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
        }
#endif
    }

    return load;
}

//! Runs cell size checks and communicates the boundaries.
static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,
                                  rvec cell_ns_x0, rvec cell_ns_x1,
                                  int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                dim_ind, dim;

    comm = dd->comm;

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Without PBC we don't have restrictions on the outer cells */
        if (!(dim >= ddbox->npbcdim &&
              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
            isDlbOn(comm) &&
            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
            comm->cellsize_min[dim])
        {
            char buf[22];
            gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                      gmx_step_str(step, buf), dim2char(dim),
                      comm->cell_x1[dim] - comm->cell_x0[dim],
                      ddbox->skew_fac[dim],
                      dd->comm->cellsize_min[dim],
                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }
    }

    if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
    {
        /* Communicate the boundaries and update cell_ns_x0/1 */
        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
        if (isDlbOn(dd->comm) && dd->ndim > 1)
        {
            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
        }
    }
}

//! Compute and communicate to determine the load distribution across PP ranks.
static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    domdec_load_t     *load;
    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
    gmx_bool           bSepPME;

    if (debug)
    {
        fprintf(debug, "get_load_distribution start\n");
    }

    wallcycle_start(wcycle, ewcDDCOMMLOAD);

    comm = dd->comm;

    bSepPME = (dd->pme_nodeid >= 0);

    if (dd->ndim == 0 && bSepPME)
    {
        /* Without decomposition, but with PME nodes, we need the load */
        comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
        comm->load[0].pme = comm->cycl[ddCyclPME];
    }

    for (int d = dd->ndim - 1; d >= 0; d--)
    {
        const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
        const int                 dim       = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
        {
            load = &comm->load[d];
            if (isDlbOn(dd->comm))
            {
                cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
            }
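            /* Per-rank send buffer layout, matching the unpacking by the
             * row master below: [sum, max], with DLB additionally
             * [sum_m, cvol] plus [flags] for non-leaf dimensions and the
             * two cell-fraction bounds for d > 0, and with separate PME
             * ranks finally [mdf, pme].
             */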
            int pos = 0;
            if (d == dd->ndim-1)
            {
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = cellsizes->fracLowerMax;
                        sbuf[pos++] = cellsizes->fracUpperMin;
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = cellsizes->fracLowerMax;
                        sbuf[pos++] = cellsizes->fracUpperMin;
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are set up such that the root always has rank 0.
             */
#if GMX_MPI
            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
                       load->load, load->nload*sizeof(float), MPI_BYTE,
                       0, comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the master along this row, process this row */
                RowMaster *rowMaster = nullptr;

                if (isDlbOn(comm))
                {
                    rowMaster = cellsizes->rowMaster.get();
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                int pos        = 0;
                for (int i = 0; i < dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max  = std::max(load->max, load->load[pos]);
                    pos++;
                    if (isDlbOn(dd->comm))
                    {
                        if (rowMaster->dlbIsLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum instead of
                             * the average load.
                             */
                            load->sum_m = std::max(load->sum_m, load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = std::min(load->cvol_min, load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            load->flags = gmx::roundToInt(load->load[pos++]);
                        }
                        if (d > 0)
                        {
                            rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
                            rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = std::max(load->mdf, load->load[pos]);
                        pos++;
                        load->pme = std::max(load->pme, load->load[pos]);
                        pos++;
                    }
                }
                if (isDlbOn(comm) && rowMaster->dlbIsLimited)
                {
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        comm->nload      += dd_load_count(comm);
        comm->load_step  += comm->cycl[ddCyclStep];
        comm->load_sum   += comm->load[0].sum;
        comm->load_max   += comm->load[0].max;
        if (isDlbOn(comm))
        {
            for (int d = 0; d < dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle, ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug, "get_load_distribution finished\n");
    }
}

/*! \brief Return the fraction of the total run time spent on the force
 * calculation, i.e. the balanceable part of the MD step. */
static float dd_force_load_fraction(gmx_domdec_t *dd)
{
    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
    {
        return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}

/*! \brief Return the relative performance loss on the total run time
 * due to the force calculation load imbalance. */
static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
{
    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
    {
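        /* Worked example: 4 PP ranks with accumulated maximum load 100
         * and load sum 360 over steps totalling 100 cycles give
         * (100*4 - 360)/(100*4) = 0.1, i.e. a 10% loss.
         */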
        return
            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
            (dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}

//! Print load-balance report e.g. at the end of a run.
static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm = dd->comm;

    /* Only the master rank prints loads and only if we measured loads */
    if (!DDMASTER(dd) || comm->nload == 0)
    {
        return;
    }

    char  buf[STRLEN];
    int   numPpRanks   = dd->nnodes;
    int   numPmeRanks  = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
    int   numRanks     = numPpRanks + numPmeRanks;
    float lossFraction = 0;

    /* Print the average load imbalance and performance loss */
    if (dd->nnodes > 1 && comm->load_sum > 0)
    {
        float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
        lossFraction    = dd_force_imb_perf_loss(dd);

        std::string msg         = "\nDynamic load balancing report:\n";
        std::string dlbStateStr;

        switch (dd->comm->dlbState)
        {
            case DlbState::offUser:
                dlbStateStr = "DLB was off during the run per user request.";
                break;
            case DlbState::offForever:
                /* Currently this can happen due to performance loss observed, cell size
                 * limitations or incompatibility with other settings observed during
                 * determineInitialDlbState(). */
                dlbStateStr = "DLB got disabled because it was unsuitable to use.";
                break;
            case DlbState::offCanTurnOn:
                dlbStateStr = "DLB was off during the run due to low measured imbalance.";
                break;
            case DlbState::offTemporarilyLocked:
                dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
                break;
            case DlbState::onCanTurnOff:
                dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
                break;
            case DlbState::onUser:
                dlbStateStr = "DLB was permanently on during the run per user request.";
                break;
            default:
                GMX_ASSERT(false, "Undocumented DLB state");
        }

        msg += " " + dlbStateStr + "\n";
        msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
        msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
                                 gmx::roundToInt(dd_force_load_fraction(dd)*100));
        msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
                                 lossFraction*100);
        fprintf(fplog, "%s", msg.c_str());
        fprintf(stderr, "\n%s", msg.c_str());
    }

    /* Print the percentage of steps during which the load balancing was limited */
    bool dlbWasLimited = false;
    if (isDlbOn(comm))
    {
        sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
        for (int d = 0; d < dd->ndim; d++)
        {
            int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
            sprintf(buf+strlen(buf), " %c %d %%",
                    dim2char(dd->dim[d]), limitPercentage);
            if (limitPercentage >= 50)
            {
                dlbWasLimited = true;
            }
        }
        sprintf(buf + strlen(buf), "\n");
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }

    /* Print the performance loss due to separate PME - PP rank imbalance */
    float lossFractionPme = 0;
    if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
    {
        float pmeForceRatio = comm->load_pme/comm->load_mdf;
        lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
        if (lossFractionPme <= 0)
        {
            lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
        }
        else
        {
            lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
        }
        sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
        sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }
    fprintf(fplog, "\n");
    fprintf(stderr, "\n");

    if (lossFraction >= DD_PERF_LOSS_WARN)
    {
        std::string message = gmx::formatString(
                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                    "      in the domain decomposition.\n", lossFraction*100);

        bool hadSuggestion = false;
        if (!isDlbOn(comm))
        {
            message      += "      You might want to use dynamic load balancing (option -dlb).\n";
            hadSuggestion = true;
        }
        else if (dlbWasLimited)
        {
            message      += "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
            hadSuggestion = true;
        }
        message += gmx::formatString(
                    "      You can %sconsider manually changing the decomposition (option -dd);\n"
                    "      e.g. by using fewer domains along the box dimension in which there is\n"
                    "      considerable inhomogeneity in the simulated system.",
                    hadSuggestion ? "also " : "");


        fprintf(fplog, "%s\n", message.c_str());
        fprintf(stderr, "%s\n", message.c_str());
    }
    if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
    {
        sprintf(buf,
                "NOTE: %.1f %% performance was lost because the PME ranks\n"
                "      had %s work to do than the PP ranks.\n"
                "      You might want to %s the number of PME ranks\n"
                "      or %s the cut-off and the grid spacing.\n",
                std::fabs(lossFractionPme*100),
                (lossFractionPme < 0) ? "less"     : "more",
                (lossFractionPme < 0) ? "decrease" : "increase",
                (lossFractionPme < 0) ? "decrease" : "increase");
        fprintf(fplog, "%s\n", buf);
        fprintf(stderr, "%s\n", buf);
    }
}

//! Return the minimum communication volume.
static float dd_vol_min(gmx_domdec_t *dd)
{
    return dd->comm->load[0].cvol_min*dd->nnodes;
}

//! Return the DD load flags.
static int dd_load_flags(gmx_domdec_t *dd)
{
    return dd->comm->load[0].flags;
}

//! Return the reported load imbalance in force calculations.
static float dd_f_imbal(gmx_domdec_t *dd)
{
    if (dd->comm->load[0].sum > 0)
    {
        return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
    }
    else
    {
        /* Something is wrong in the cycle counting, report no load imbalance */
        return 0.0f;
    }
}

//! Returns DD load balance report.
static std::string
dd_print_load(gmx_domdec_t *dd,
              int64_t       step)
{
    gmx::StringOutputStream stream;
    gmx::TextWriter         log(&stream);

    int                     flags = dd_load_flags(dd);
    if (flags)
    {
        log.writeString("DD  load balancing is limited by minimum cell size in dimension");
        for (int d = 0; d < dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
            }
        }
        log.ensureLineBreak();
    }
    log.writeString("DD  step " + gmx::toString(step));
    if (isDlbOn(dd->comm))
    {
        log.writeStringFormatted("  vol min/aver %5.3f%c",
                                 dd_vol_min(dd), flags ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
    }
    log.ensureLineBreak();
    return stream.toString();
}

//! Prints DD load balance report in mdrun verbose mode.
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (isDlbOn(dd->comm))
    {
        fprintf(stderr, "vol %4.2f%c ",
                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
    }
}

//! Turns on dynamic load balancing if possible and needed.
static void turn_on_dlb(const gmx::MDLogger &mdlog,
                        gmx_domdec_t        *dd,
                        int64_t              step)
{
    gmx_domdec_comm_t *comm         = dd->comm;

    real               cellsize_min = comm->cellsize_min[dd->dim[0]];
    for (int d = 1; d < dd->ndim; d++)
    {
        cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
    }

    /* Turn off DLB if we're too close to the cell size limit. */
    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        GMX_LOG(mdlog.info).appendTextFormatted(
                "step %s Measured %.1f %% performance loss due to load imbalance, "
                "but the minimum cell size is smaller than 1.05 times the cell size limit. "
                "Will no longer try dynamic load balancing.",
                gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);

        comm->dlbState = DlbState::offForever;
        return;
    }

    GMX_LOG(mdlog.info).appendTextFormatted(
            "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
            gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
    comm->dlbState = DlbState::onCanTurnOff;

    /* Store the non-DLB performance, so we can check if DLB actually
     * improves performance.
     */
    GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
    comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];

    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
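    /* For example, with nc = 4 cells along a dimension the loop below
     * produces cellFrac = {0, 0.25, 0.5, 0.75, 1.0}.
     */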
    for (int d = 0; d < dd->ndim; d++)
    {
        RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();

        if (rowMaster)
        {
            comm->load[d].sum_m = comm->load[d].sum;

            int nc = dd->nc[dd->dim[d]];
            for (int i = 0; i < nc; i++)
            {
                rowMaster->cellFrac[i] = i/static_cast<real>(nc);
                if (d > 0)
                {
                    rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
                    rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
                }
            }
            rowMaster->cellFrac[nc] = 1.0;
        }
    }
}

//! Turns off dynamic load balancing (but leaves it able to turn back on).
static void turn_off_dlb(const gmx::MDLogger &mdlog,
                         gmx_domdec_t        *dd,
                         int64_t              step)
{
    GMX_LOG(mdlog.info).appendText(
            "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
    dd->comm->dlbState                     = DlbState::offCanTurnOn;
    dd->comm->haveTurnedOffDlb             = true;
    dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
}

//! Turns off dynamic load balancing permanently.
static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
                                 gmx_domdec_t        *dd,
                                 int64_t              step)
{
    GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
    GMX_LOG(mdlog.info).appendText(
            "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
    dd->comm->dlbState = DlbState::offForever;
}

void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
{
    gmx_domdec_comm_t *comm;

    comm = cr->dd->comm;

    /* Turn on the DLB limiting (might have been on already) */
    comm->bPMELoadBalDLBLimits = TRUE;

    /* Change the cut-off limit */
    comm->PMELoadBal_max_cutoff = cutoff;

    if (debug)
    {
        fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
                comm->PMELoadBal_max_cutoff);
    }
}

//! Merges charge-group buffers.
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int  *ncg_cell,
                             gmx::ArrayRef<int> index_gl,
                             const int  *recv_i,
                             rvec *cg_cm,    rvec *recv_vr,
                             gmx::ArrayRef<int> cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
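    /* Overview: received charge-group data arrives pulse by pulse. Blocks
     * already stored for later zones are first shifted towards the end to
     * make room, then the newly received groups are merged in per zone.
     */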
1420     gmx_domdec_ind_t *ind, *ind_p;
1421     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
1422     int               shift, shift_at;
1423
1424     ind = &cd->ind[pulse];
1425
1426     /* First correct the already stored data */
1427     shift = ind->nrecv[ncell];
1428     for (cell = ncell-1; cell >= 0; cell--)
1429     {
1430         shift -= ind->nrecv[cell];
1431         if (shift > 0)
1432         {
1433             /* Move the cg's present from previous grid pulses */
1434             cg0                = ncg_cell[ncell+cell];
1435             cg1                = ncg_cell[ncell+cell+1];
1436             cgindex[cg1+shift] = cgindex[cg1];
1437             for (cg = cg1-1; cg >= cg0; cg--)
1438             {
1439                 index_gl[cg+shift] = index_gl[cg];
1440                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
1441                 cgindex[cg+shift] = cgindex[cg];
1442                 cginfo[cg+shift]  = cginfo[cg];
1443             }
1444             /* Correct the already stored send indices for the shift */
1445             for (p = 1; p <= pulse; p++)
1446             {
1447                 ind_p = &cd->ind[p];
1448                 cg0   = 0;
1449                 for (c = 0; c < cell; c++)
1450                 {
1451                     cg0 += ind_p->nsend[c];
1452                 }
1453                 cg1 = cg0 + ind_p->nsend[cell];
1454                 for (cg = cg0; cg < cg1; cg++)
1455                 {
1456                     ind_p->index[cg] += shift;
1457                 }
1458             }
1459         }
1460     }
1461
1462     /* Merge in the communicated buffers */
1463     shift    = 0;
1464     shift_at = 0;
1465     cg0      = 0;
1466     for (cell = 0; cell < ncell; cell++)
1467     {
1468         cg1 = ncg_cell[ncell+cell+1] + shift;
1469         if (shift_at > 0)
1470         {
1471             /* Correct the old cg indices */
1472             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
1473             {
1474                 cgindex[cg+1] += shift_at;
1475             }
1476         }
1477         for (cg = 0; cg < ind->nrecv[cell]; cg++)
1478         {
1479             /* Copy this charge group from the buffer */
1480             index_gl[cg1] = recv_i[cg0];
1481             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
1482             /* Add it to the cgindex */
1483             cg_gl          = index_gl[cg1];
1484             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
1485             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
1486             cgindex[cg1+1] = cgindex[cg1] + nat;
1487             cg0++;
1488             cg1++;
1489             shift_at += nat;
1490         }
1491         shift                 += ind->nrecv[cell];
1492         ncg_cell[ncell+cell+1] = cg1;
1493     }
1494 }
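
/* Illustration of the shift logic above (hypothetical numbers): with
 * ncell = 2 and this pulse receiving nrecv[0] = 2 and nrecv[1] = 1
 * groups, the first loop runs with shift = 2 for cell 1 and shift = 0
 * for cell 0, i.e. the groups already stored for cell 1 are moved up
 * by two slots to make room for the two new cell-0 groups, after which
 * the second loop copies the received groups into the gaps in cell
 * order.
 */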
1495
1496 //! Makes a range partitioning for the atom groups within a cell.
1497 static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
1498                                int                           nzone,
1499                                int                           atomGroupStart,
1500                                const gmx::RangePartitioning &atomGroups)
1501 {
1502     /* Store the atom block boundaries for easy copying of communication buffers
1503      */
1504     int g = atomGroupStart;
1505     for (int zone = 0; zone < nzone; zone++)
1506     {
1507         for (gmx_domdec_ind_t &ind : cd->ind)
1508         {
1509             const auto range    = atomGroups.subRange(g, g + ind.nrecv[zone]);
1510             ind.cell2at0[zone]  = range.begin();
1511             ind.cell2at1[zone]  = range.end();
1512             g                  += ind.nrecv[zone];
1513         }
1514     }
1515 }
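
/* Example (hypothetical counts): with atomGroupStart = 10 and a zone
 * for which pulse 0 received 3 groups and pulse 1 received 2, pulse 0
 * covers atom-group blocks [10, 13) and pulse 1 covers [13, 15);
 * cell2at0/cell2at1 then hold the first and one-past-last atom
 * indices of those group ranges, as given by the RangePartitioning.
 */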
1516
1517 //! Returns whether a link is missing.
1518 static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
1519 {
1520     /* Return as soon as one linked atom group is found to be non-local */
1521     for (int i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
1522     {
1523         if (!bLocalCG[link->a[i]])
1524         {
1525             return TRUE;
1526         }
1527     }
1528
1529     return FALSE;
1530 }
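
/* Usage note: missing_link() is called from get_zone_pulse_cgs() below
 * when bBondComm is set; a group is then only added to the send list
 * for bonded interactions when at least one group it is linked to is
 * not (yet) present locally.
 */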
1534
1535 //! Domain corners for communication; at most 4 i-zones see a j-domain.
1536 typedef struct {
1537     //! The corners for the non-bonded communication.
1538     real c[DIM][4];
1539     //! Corner for rounding.
1540     real cr0;
1541     //! Corners for rounding.
1542     real cr1[4];
1543     //! Corners for bonded communication.
1544     real bc[DIM];
1545     //! Corner for rounding for bonded communication.
1546     real bcr1;
1547 } dd_corners_t;
1548
1549 //! Determine the corners of the domain(s) we are communicating with.
1550 static void
1551 set_dd_corners(const gmx_domdec_t *dd,
1552                int dim0, int dim1, int dim2,
1553                gmx_bool bDistMB,
1554                dd_corners_t *c)
1555 {
1556     const gmx_domdec_comm_t  *comm;
1557     const gmx_domdec_zones_t *zones;
1558     int i, j;
1559
1560     comm = dd->comm;
1561
1562     zones = &comm->zones;
1563
1564     /* Keep the compiler happy */
1565     c->cr0  = 0;
1566     c->bcr1 = 0;
1567
1568     /* The first dimension is equal for all cells */
1569     c->c[0][0] = comm->cell_x0[dim0];
1570     if (bDistMB)
1571     {
1572         c->bc[0] = c->c[0][0];
1573     }
1574     if (dd->ndim >= 2)
1575     {
1576         dim1 = dd->dim[1];
1577         /* This cell row is only seen from the first row */
1578         c->c[1][0] = comm->cell_x0[dim1];
1579         /* All rows can see this row */
1580         c->c[1][1] = comm->cell_x0[dim1];
1581         if (isDlbOn(dd->comm))
1582         {
1583             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1584             if (bDistMB)
1585             {
1586                 /* For the multi-body distance we need the maximum */
1587                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1588             }
1589         }
1590         /* Set the upper-right corner for rounding */
1591         c->cr0 = comm->cell_x1[dim0];
1592
1593         if (dd->ndim >= 3)
1594         {
1595             dim2 = dd->dim[2];
1596             for (j = 0; j < 4; j++)
1597             {
1598                 c->c[2][j] = comm->cell_x0[dim2];
1599             }
1600             if (isDlbOn(dd->comm))
1601             {
1602                 /* Use the maximum of the i-cells that see a j-cell */
1603                 for (i = 0; i < zones->nizone; i++)
1604                 {
1605                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1606                     {
1607                         if (j >= 4)
1608                         {
1609                             c->c[2][j-4] =
1610                                 std::max(c->c[2][j-4],
1611                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1612                         }
1613                     }
1614                 }
1615                 if (bDistMB)
1616                 {
1617                     /* For the multi-body distance we need the maximum */
1618                     c->bc[2] = comm->cell_x0[dim2];
1619                     for (i = 0; i < 2; i++)
1620                     {
1621                         for (j = 0; j < 2; j++)
1622                         {
1623                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1624                         }
1625                     }
1626                 }
1627             }
1628
1629             /* Set the upper-right corner for rounding */
1630             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1631              * Only cell (0,0,0) can see cell 7 (1,1,1)
1632              */
1633             c->cr1[0] = comm->cell_x1[dim1];
1634             c->cr1[3] = comm->cell_x1[dim1];
1635             if (isDlbOn(dd->comm))
1636             {
1637                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1638                 if (bDistMB)
1639                 {
1640                     /* For the multi-body distance we need the maximum */
1641                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1642                 }
1643             }
1644         }
1645     }
1646 }
1647
1648 /*! \brief Add the atom groups we need to send in this pulse from this
1649  * zone to \p localAtomGroups and \p work. */
1650 static void
1651 get_zone_pulse_cgs(gmx_domdec_t *dd,
1652                    int zonei, int zone,
1653                    int cg0, int cg1,
1654                    gmx::ArrayRef<const int> globalAtomGroupIndices,
1655                    const gmx::RangePartitioning &atomGroups,
1656                    int dim, int dim_ind,
1657                    int dim0, int dim1, int dim2,
1658                    real r_comm2, real r_bcomm2,
1659                    matrix box,
1660                    bool distanceIsTriclinic,
1661                    rvec *normal,
1662                    real skew_fac2_d, real skew_fac_01,
1663                    rvec *v_d, rvec *v_0, rvec *v_1,
1664                    const dd_corners_t *c,
1665                    const rvec sf2_round,
1666                    gmx_bool bDistBonded,
1667                    gmx_bool bBondComm,
1668                    gmx_bool bDist2B,
1669                    gmx_bool bDistMB,
1670                    rvec *cg_cm,
1671                    const int *cginfo,
1672                    std::vector<int>     *localAtomGroups,
1673                    dd_comm_setup_work_t *work)
1674 {
1675     gmx_domdec_comm_t *comm;
1676     gmx_bool           bScrew;
1677     gmx_bool           bDistMB_pulse;
1678     int                cg, i;
1679     real               r2, rb2, r, tric_sh;
1680     rvec               rn, rb;
1681     int                dimd;
1682     int                nsend_z, nat;
1683
1684     comm = dd->comm;
1685
1686     bScrew = (dd->bScrewPBC && dim == XX);
1687
1688     bDistMB_pulse = (bDistMB && bDistBonded);
1689
1690     /* Unpack the work data */
1691     std::vector<int>       &ibuf = work->atomGroupBuffer;
1692     std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1693     nsend_z                      = 0;
1694     nat                          = work->nat;
1695
1696     for (cg = cg0; cg < cg1; cg++)
1697     {
1698         r2  = 0;
1699         rb2 = 0;
1700         if (!distanceIsTriclinic)
1701         {
1702             /* Rectangular direction, easy */
1703             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1704             if (r > 0)
1705             {
1706                 r2 += r*r;
1707             }
1708             if (bDistMB_pulse)
1709             {
1710                 r = cg_cm[cg][dim] - c->bc[dim_ind];
1711                 if (r > 0)
1712                 {
1713                     rb2 += r*r;
1714                 }
1715             }
1716             /* Rounding gives at most a 16% reduction
1717              * in communicated atoms
1718              */
1719             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1720             {
1721                 r = cg_cm[cg][dim0] - c->cr0;
1722                 /* This is the first dimension, so always r >= 0 */
1723                 r2 += r*r;
1724                 if (bDistMB_pulse)
1725                 {
1726                     rb2 += r*r;
1727                 }
1728             }
1729             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1730             {
1731                 r = cg_cm[cg][dim1] - c->cr1[zone];
1732                 if (r > 0)
1733                 {
1734                     r2 += r*r;
1735                 }
1736                 if (bDistMB_pulse)
1737                 {
1738                     r = cg_cm[cg][dim1] - c->bcr1;
1739                     if (r > 0)
1740                     {
1741                         rb2 += r*r;
1742                     }
1743                 }
1744             }
1745         }
1746         else
1747         {
1748             /* Triclinic direction, more complicated */
1749             clear_rvec(rn);
1750             clear_rvec(rb);
1751             /* Rounding, conservative as the skew_fac multiplication
1752              * will slightly underestimate the distance.
1753              */
1754             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1755             {
1756                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1757                 for (i = dim0+1; i < DIM; i++)
1758                 {
1759                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1760                 }
1761                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1762                 if (bDistMB_pulse)
1763                 {
1764                     rb[dim0] = rn[dim0];
1765                     rb2      = r2;
1766                 }
1767                 /* Take care that the cell planes along dim0 might not
1768                  * be orthogonal to those along dim1 and dim2.
1769                  */
1770                 for (i = 1; i <= dim_ind; i++)
1771                 {
1772                     dimd = dd->dim[i];
1773                     if (normal[dim0][dimd] > 0)
1774                     {
1775                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1776                         if (bDistMB_pulse)
1777                         {
1778                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1779                         }
1780                     }
1781                 }
1782             }
1783             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1784             {
1785                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1786                 tric_sh   = 0;
1787                 for (i = dim1+1; i < DIM; i++)
1788                 {
1789                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1790                 }
1791                 rn[dim1] += tric_sh;
1792                 if (rn[dim1] > 0)
1793                 {
1794                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1795                     /* Take care of coupling of the distances
1796                      * to the planes along dim0 and dim1 through dim2.
1797                      */
1798                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1799                     /* Take care that the cell planes along dim1
1800                      * might not be orthogonal to that along dim2.
1801                      */
1802                     if (normal[dim1][dim2] > 0)
1803                     {
1804                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1805                     }
1806                 }
1807                 if (bDistMB_pulse)
1808                 {
1809                     rb[dim1] +=
1810                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1811                     if (rb[dim1] > 0)
1812                     {
1813                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1814                         /* Take care of coupling of the distances
1815                          * to the planes along dim0 and dim1 through dim2.
1816                          */
1817                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1818                         /* Take care that the cell planes along dim1
1819                          * might not be orthogonal to that along dim2.
1820                          */
1821                         if (normal[dim1][dim2] > 0)
1822                         {
1823                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1824                         }
1825                     }
1826                 }
1827             }
1828             /* The distance along the communication direction */
1829             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1830             tric_sh  = 0;
1831             for (i = dim+1; i < DIM; i++)
1832             {
1833                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1834             }
1835             rn[dim] += tric_sh;
1836             if (rn[dim] > 0)
1837             {
1838                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1839                 /* Take care of coupling of the distances
1840                  * to the planes along dim0 and dim1 through dim2.
1841                  */
1842                 if (dim_ind == 1 && zonei == 1)
1843                 {
1844                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1845                 }
1846             }
1847             if (bDistMB_pulse)
1848             {
1849                 clear_rvec(rb);
1850                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1851                 if (rb[dim] > 0)
1852                 {
1853                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1854                     /* Take care of coupling of the distances
1855                      * to the planes along dim0 and dim1 through dim2.
1856                      */
1857                     if (dim_ind == 1 && zonei == 1)
1858                     {
1859                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1860                     }
1861                 }
1862             }
1863         }
1864
1865         if (r2 < r_comm2 ||
1866             (bDistBonded &&
1867              ((bDistMB && rb2 < r_bcomm2) ||
1868               (bDist2B && r2  < r_bcomm2)) &&
1869              (!bBondComm ||
1870               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1871                missing_link(comm->cglink, globalAtomGroupIndices[cg],
1872                             comm->bLocalCG)))))
1873         {
1874             /* Store the local and global atom group indices and position */
1875             localAtomGroups->push_back(cg);
1876             ibuf.push_back(globalAtomGroupIndices[cg]);
1877             nsend_z++;
1878
1879             rvec posPbc;
1880             if (dd->ci[dim] == 0)
1881             {
1882                 /* Correct cg_cm for pbc */
1883                 rvec_add(cg_cm[cg], box[dim], posPbc);
1884                 if (bScrew)
1885                 {
1886                     posPbc[YY] = box[YY][YY] - posPbc[YY];
1887                     posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1888                 }
1889             }
1890             else
1891             {
1892                 copy_rvec(cg_cm[cg], posPbc);
1893             }
1894             vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1895
1896             nat += atomGroups.block(cg).size();
1897         }
1898     }
1899
1900     work->nat        = nat;
1901     work->nsend_zone = nsend_z;
1902 }
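
/* Worked example for the rectangular branch above (hypothetical
 * numbers): with the zone boundary c->c[dim_ind][zone] at 2.0 nm, a
 * group at cg_cm[cg][dim] = 2.9 nm and a 1.0 nm communication cut-off
 * (r_comm2 = 1.0 nm^2), we get r2 = 0.9*0.9 = 0.81 < 1.0, so the group
 * is sent. The rounding corrections along dim0/dim1 can only increase
 * r2, so rounding never adds groups, it only removes them.
 */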
1903
1904 //! Clears the communication setup work data.
1905 static void clearCommSetupData(dd_comm_setup_work_t *work)
1906 {
1907     work->localAtomGroupBuffer.clear();
1908     work->atomGroupBuffer.clear();
1909     work->positionBuffer.clear();
1910     work->nat        = 0;
1911     work->nsend_zone = 0;
1912 }
1913
1914 //! Sets up the DD halo communication for all decomposition dimensions and pulses.
1915 static void setup_dd_communication(gmx_domdec_t *dd,
1916                                    matrix box, gmx_ddbox_t *ddbox,
1917                                    t_forcerec *fr,
1918                                    t_state *state,
1919                                    PaddedVector<gmx::RVec> *f)
1920 {
1921     int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1922     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
1923     int                    c, i, cg, cg_gl, nrcg;
1924     int                   *zone_cg_range, pos_cg;
1925     gmx_domdec_comm_t     *comm;
1926     gmx_domdec_zones_t    *zones;
1927     gmx_domdec_comm_dim_t *cd;
1928     cginfo_mb_t           *cginfo_mb;
1929     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
1930     dd_corners_t           corners;
1931     rvec                  *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1932     real                   skew_fac2_d, skew_fac_01;
1933     rvec                   sf2_round;
1934
1935     if (debug)
1936     {
1937         fprintf(debug, "Setting up DD communication\n");
1938     }
1939
1940     comm  = dd->comm;
1941
1942     if (comm->dth.empty())
1943     {
1944         /* Initialize the thread data.
1945          * This cannot be done in init_domain_decomposition,
1946          * as the number of threads is determined later.
1947          */
1948         int numThreads = gmx_omp_nthreads_get(emntDomdec);
1949         comm->dth.resize(numThreads);
1950     }
1951
1952     switch (fr->cutoff_scheme)
1953     {
1954         case ecutsGROUP:
1955             cg_cm = fr->cg_cm;
1956             break;
1957         case ecutsVERLET:
1958             cg_cm = state->x.rvec_array();
1959             break;
1960         default:
1961             gmx_incons("unimplemented");
1962     }
1963
1964     bBondComm = comm->bBondComm;
1965
1966     /* Do we need to determine extra distances for multi-body bondeds? */
1967     bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
1968
1969     /* Do we need to determine extra distances for only two-body bondeds? */
1970     bDist2B = (bBondComm && !bDistMB);
1971
1972     const real r_comm2  = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff));
1973     const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff_mbody));
1974
1975     if (debug)
1976     {
1977         fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1978     }
1979
1980     zones = &comm->zones;
1981
1982     dim0 = dd->dim[0];
1983     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1984     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1985
1986     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1987
1988     /* Triclinic stuff */
1989     normal      = ddbox->normal;
1990     skew_fac_01 = 0;
1991     if (dd->ndim >= 2)
1992     {
1993         v_0 = ddbox->v[dim0];
1994         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1995         {
1996             /* Determine the coupling coefficient for the distances
1997              * to the cell planes along dim0 and dim1 through dim2.
1998              * This is required for correct rounding.
1999              */
2000             skew_fac_01 =
2001                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
2002             if (debug)
2003             {
2004                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
2005             }
2006         }
2007     }
2008     if (dd->ndim >= 3)
2009     {
2010         v_1 = ddbox->v[dim1];
2011     }
2012
2013     zone_cg_range = zones->cg_range;
2014     cginfo_mb     = fr->cginfo_mb;
2015
2016     zone_cg_range[0]   = 0;
2017     zone_cg_range[1]   = dd->ncg_home;
2018     comm->zone_ncg1[0] = dd->ncg_home;
2019     pos_cg             = dd->ncg_home;
2020
2021     nat_tot = comm->atomRanges.numHomeAtoms();
2022     nzone   = 1;
2023     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
2024     {
2025         dim = dd->dim[dim_ind];
2026         cd  = &comm->cd[dim_ind];
2027
2028         /* Check if we need to compute triclinic distances along this dim */
2029         bool distanceIsTriclinic = false;
2030         for (i = 0; i <= dim_ind; i++)
2031         {
2032             if (ddbox->tric_dir[dd->dim[i]])
2033             {
2034                 distanceIsTriclinic = true;
2035             }
2036         }
2037
2038         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
2039         {
2040             /* No pbc in this dimension, the first node should not communicate. */
2041             nzone_send = 0;
2042         }
2043         else
2044         {
2045             nzone_send = nzone;
2046         }
2047
2048         v_d                = ddbox->v[dim];
2049         skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
2050
2051         cd->receiveInPlace = true;
2052         for (int p = 0; p < cd->numPulses(); p++)
2053         {
2054             /* Only atoms communicated in the first pulse are used
2055              * for multi-body bonded interactions or for bBondComm.
2056              */
2057             bDistBonded = ((bDistMB || bDist2B) && p == 0);
2058
2059             gmx_domdec_ind_t *ind = &cd->ind[p];
2060
2061             /* Thread 0 writes in the global index array */
2062             ind->index.clear();
2063             clearCommSetupData(&comm->dth[0]);
2064
2065             for (zone = 0; zone < nzone_send; zone++)
2066             {
2067                 if (dim_ind > 0 && distanceIsTriclinic)
2068                 {
2069                     /* Determine slightly more optimized skew_fac's
2070                      * for rounding.
2071                      * This reduces the number of communicated atoms
2072                      * by about 10% for 3D DD of rhombic dodecahedra.
2073                      */
2074                     for (dimd = 0; dimd < dim; dimd++)
2075                     {
2076                         sf2_round[dimd] = 1;
2077                         if (ddbox->tric_dir[dimd])
2078                         {
2079                             for (i = dd->dim[dimd]+1; i < DIM; i++)
2080                             {
2081                                 /* If we are shifted in dimension i
2082                                  * and the cell plane is tilted forward
2083                                  * in dimension i, skip this coupling.
2084                                  */
2085                                 if (!(zones->shift[nzone+zone][i] &&
2086                                       ddbox->v[dimd][i][dimd] >= 0))
2087                                 {
2088                                     sf2_round[dimd] +=
2089                                         gmx::square(ddbox->v[dimd][i][dimd]);
2090                                 }
2091                             }
2092                             sf2_round[dimd] = 1/sf2_round[dimd];
2093                         }
2094                     }
2095                 }
2096
2097                 zonei = zone_perm[dim_ind][zone];
2098                 if (p == 0)
2099                 {
2100                     /* Here we permute the zones to obtain a convenient order
2101                      * for neighbor searching
2102                      */
2103                     cg0 = zone_cg_range[zonei];
2104                     cg1 = zone_cg_range[zonei+1];
2105                 }
2106                 else
2107                 {
2108                     /* Look only at the cg's received in the previous grid pulse
2109                      */
2110                     cg1 = zone_cg_range[nzone+zone+1];
2111                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2112                 }
2113
2114                 const int numThreads = gmx::ssize(comm->dth);
2115 #pragma omp parallel for num_threads(numThreads) schedule(static)
2116                 for (int th = 0; th < numThreads; th++)
2117                 {
2118                     try
2119                     {
2120                         dd_comm_setup_work_t &work = comm->dth[th];
2121
2122                         /* Retain data accumulated into buffers of thread 0 */
2123                         if (th > 0)
2124                         {
2125                             clearCommSetupData(&work);
2126                         }
2127
2128                         int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
2129                         int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2130
2131                         /* Get the cg's for this pulse in this zone */
2132                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2133                                            dd->globalAtomGroupIndices,
2134                                            dd->atomGrouping(),
2135                                            dim, dim_ind, dim0, dim1, dim2,
2136                                            r_comm2, r_bcomm2,
2137                                            box, distanceIsTriclinic,
2138                                            normal, skew_fac2_d, skew_fac_01,
2139                                            v_d, v_0, v_1, &corners, sf2_round,
2140                                            bDistBonded, bBondComm,
2141                                            bDist2B, bDistMB,
2142                                            cg_cm, fr->cginfo,
2143                                            th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2144                                            &work);
2145                     }
2146                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2147                 } // end of the parallel loop over threads
2148
2149                 std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
2150                 std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
2151                 ind->nsend[zone]  = comm->dth[0].nsend_zone;
2152                 /* Append data of threads>=1 to the communication buffers */
2153                 for (int th = 1; th < numThreads; th++)
2154                 {
2155                     const dd_comm_setup_work_t &dth = comm->dth[th];
2156
2157                     ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2158                     atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2159                     positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2160                     comm->dth[0].nat += dth.nat;
2161                     ind->nsend[zone] += dth.nsend_zone;
2162                 }
2163             }
2164             /* Clear the counts in case we do not have pbc */
2165             for (zone = nzone_send; zone < nzone; zone++)
2166             {
2167                 ind->nsend[zone] = 0;
2168             }
2169             ind->nsend[nzone]     = ind->index.size();
2170             ind->nsend[nzone + 1] = comm->dth[0].nat;
2171             /* Communicate the number of cg's and atoms to receive */
2172             ddSendrecv(dd, dim_ind, dddirBackward,
2173                        ind->nsend, nzone+2,
2174                        ind->nrecv, nzone+2);
2175
2176             if (p > 0)
2177             {
2178                 /* We can receive in place if only the last zone is not empty */
2179                 for (zone = 0; zone < nzone-1; zone++)
2180                 {
2181                     if (ind->nrecv[zone] > 0)
2182                     {
2183                         cd->receiveInPlace = false;
2184                     }
2185                 }
2186             }
2187
2188             int receiveBufferSize = 0;
2189             if (!cd->receiveInPlace)
2190             {
2191                 receiveBufferSize = ind->nrecv[nzone];
2192             }
2193             /* These buffers are only needed when not receiving in place */
2194             DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2195             DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2196
2197             dd_comm_setup_work_t     &work = comm->dth[0];
2198
2199             /* Make space for the global cg indices */
2200             int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2201             dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2202             /* Communicate the global cg indices */
2203             gmx::ArrayRef<int> integerBufferRef;
2204             if (cd->receiveInPlace)
2205             {
2206                 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2207             }
2208             else
2209             {
2210                 integerBufferRef = globalAtomGroupBuffer.buffer;
2211             }
2212             ddSendrecv<int>(dd, dim_ind, dddirBackward,
2213                             work.atomGroupBuffer, integerBufferRef);
2214
2215             /* Make space for cg_cm */
2216             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2217             if (fr->cutoff_scheme == ecutsGROUP)
2218             {
2219                 cg_cm = fr->cg_cm;
2220             }
2221             else
2222             {
2223                 cg_cm = state->x.rvec_array();
2224             }
2225             /* Communicate cg_cm */
2226             gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2227             if (cd->receiveInPlace)
2228             {
2229                 rvecBufferRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cg_cm + pos_cg), ind->nrecv[nzone]);
2230             }
2231             else
2232             {
2233                 rvecBufferRef = rvecBuffer.buffer;
2234             }
2235             ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2236                                   work.positionBuffer, rvecBufferRef);
2237
2238             /* Make the charge group index */
2239             if (cd->receiveInPlace)
2240             {
2241                 zone = (p == 0 ? 0 : nzone - 1);
2242                 while (zone < nzone)
2243                 {
2244                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
2245                     {
2246                         cg_gl                              = dd->globalAtomGroupIndices[pos_cg];
2247                         fr->cginfo[pos_cg]                 = ddcginfo(cginfo_mb, cg_gl);
2248                         nrcg                               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
2249                         dd->atomGrouping_.appendBlock(nrcg);
2250                         if (bBondComm)
2251                         {
2252                             /* Update the charge group presence,
2253                              * so we can use it in the next pass of the loop.
2254                              */
2255                             comm->bLocalCG[cg_gl] = TRUE;
2256                         }
2257                         pos_cg++;
2258                     }
2259                     if (p == 0)
2260                     {
2261                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2262                     }
2263                     zone++;
2264                     zone_cg_range[nzone+zone] = pos_cg;
2265                 }
2266             }
2267             else
2268             {
2269                 /* This part of the code is never executed with bBondComm. */
2270                 std::vector<int> &atomGroupsIndex = dd->atomGrouping_.rawIndex();
2271                 atomGroupsIndex.resize(numAtomGroupsNew + 1);
2272
2273                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2274                                  dd->globalAtomGroupIndices, integerBufferRef.data(),
2275                                  cg_cm, as_rvec_array(rvecBufferRef.data()),
2276                                  atomGroupsIndex,
2277                                  fr->cginfo_mb, fr->cginfo);
2278                 pos_cg += ind->nrecv[nzone];
2279             }
2280             nat_tot += ind->nrecv[nzone+1];
2281         }
2282         if (!cd->receiveInPlace)
2283         {
2284             /* Store the atom block for easy copying of communication buffers */
2285             make_cell2at_index(cd, nzone, zone_cg_range[nzone], dd->atomGrouping());
2286         }
2287         nzone += nzone;
2288     }
2289
2290     comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2291
2292     if (!bBondComm)
2293     {
2294         /* We don't need to update cginfo, since that was already done above.
2295          * So we pass nullptr for the forcerec.
2296          */
2297         dd_set_cginfo(dd->globalAtomGroupIndices,
2298                       dd->ncg_home, dd->globalAtomGroupIndices.size(),
2299                       nullptr, comm->bLocalCG);
2300     }
2301
2302     if (debug)
2303     {
2304         fprintf(debug, "Finished setting up DD communication, zones:");
2305         for (c = 0; c < zones->n; c++)
2306         {
2307             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2308         }
2309         fprintf(debug, "\n");
2310     }
2311 }
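
/* Summary of the routine above (for orientation): the number of zones
 * doubles for every decomposition dimension (nzone: 1 -> 2 -> 4 -> 8),
 * and for every pulse the send list is built in parallel with
 * get_zone_pulse_cgs(), the send/receive counts are exchanged with
 * ddSendrecv(), and finally the global group indices and positions are
 * communicated, either in place or via the temporary buffers that are
 * merged with merge_cg_buffers().
 */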
2312
2313 //! Set boundaries for the charge group range.
2314 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2315 {
2316     int c;
2317
2318     for (c = 0; c < zones->nizone; c++)
2319     {
2320         zones->izone[c].cg1  = zones->cg_range[c+1];
2321         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2322         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2323     }
2324 }
2325
2326 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2327  *
2328  * Also sets the atom density for the home zone when \p zone_start=0.
2329  * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2330  * how many charge groups will move but are still part of the current range.
2331  * \todo When converting domdec to use proper classes, all these variables
2332  *       should be private and a method should return the correct count
2333  *       depending on an internal state.
2334  *
2335  * \param[in,out] dd          The domain decomposition struct
2336  * \param[in]     box         The box
2337  * \param[in]     ddbox       The domain decomposition box struct
2338  * \param[in]     zone_start  The start of the zone range to set sizes for
2339  * \param[in]     zone_end    The end of the zone range to set sizes for
2340  * \param[in]     numMovedChargeGroupsInHomeZone  The number of charge groups in the home zone that should be moved but are still present in dd->comm->zones.cg_range
2341  */
2342 static void set_zones_size(gmx_domdec_t *dd,
2343                            matrix box, const gmx_ddbox_t *ddbox,
2344                            int zone_start, int zone_end,
2345                            int numMovedChargeGroupsInHomeZone)
2346 {
2347     gmx_domdec_comm_t  *comm;
2348     gmx_domdec_zones_t *zones;
2349     gmx_bool            bDistMB;
2350     int                 z, zi, d, dim;
2351     real                rcs, rcmbs;
2352     int                 i, j;
2353     real                vol;
2354
2355     comm = dd->comm;
2356
2357     zones = &comm->zones;
2358
2359     /* Do we need to determine extra distances for multi-body bondeds? */
2360     bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
2361
2362     for (z = zone_start; z < zone_end; z++)
2363     {
2364         /* Copy cell limits to zone limits.
2365          * Valid for non-DD dims and non-shifted dims.
2366          */
2367         copy_rvec(comm->cell_x0, zones->size[z].x0);
2368         copy_rvec(comm->cell_x1, zones->size[z].x1);
2369     }
2370
2371     for (d = 0; d < dd->ndim; d++)
2372     {
2373         dim = dd->dim[d];
2374
2375         for (z = 0; z < zones->n; z++)
2376         {
2377             /* With a staggered grid we have different sizes
2378              * for non-shifted dimensions.
2379              */
2380             if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2381             {
2382                 if (d == 1)
2383                 {
2384                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2385                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2386                 }
2387                 else if (d == 2)
2388                 {
2389                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2390                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2391                 }
2392             }
2393         }
2394
2395         rcs   = comm->cutoff;
2396         rcmbs = comm->cutoff_mbody;
2397         if (ddbox->tric_dir[dim])
2398         {
2399             rcs   /= ddbox->skew_fac[dim];
2400             rcmbs /= ddbox->skew_fac[dim];
2401         }
2402
2403         /* Set the lower limit for the shifted zone dimensions */
2404         for (z = zone_start; z < zone_end; z++)
2405         {
2406             if (zones->shift[z][dim] > 0)
2407             {
2408                 dim = dd->dim[d];
2409                 if (!isDlbOn(dd->comm) || d == 0)
2410                 {
2411                     zones->size[z].x0[dim] = comm->cell_x1[dim];
2412                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2413                 }
2414                 else
2415                 {
2416                     /* Here we take the lower limit of the zone from
2417                      * the lowest domain of the zone below.
2418                      */
2419                     if (z < 4)
2420                     {
2421                         zones->size[z].x0[dim] =
2422                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2423                     }
2424                     else
2425                     {
2426                         if (d == 1)
2427                         {
2428                             zones->size[z].x0[dim] =
2429                                 zones->size[zone_perm[2][z-4]].x0[dim];
2430                         }
2431                         else
2432                         {
2433                             zones->size[z].x0[dim] =
2434                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2435                         }
2436                     }
2437                     /* A temporary limit; it is updated below */
2438                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
2439
2440                     if (bDistMB)
2441                     {
2442                         for (zi = 0; zi < zones->nizone; zi++)
2443                         {
2444                             if (zones->shift[zi][dim] == 0)
2445                             {
2446                                 /* This takes the whole zone into account.
2447                                  * With multiple pulses this will lead
2448                                  * to a larger zone than strictly necessary.
2449                                  */
2450                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2451                                                                   zones->size[zi].x1[dim]+rcmbs);
2452                             }
2453                         }
2454                     }
2455                 }
2456             }
2457         }
2458
2459         /* Loop over the i-zones to set the upper limit of each
2460          * j-zone they see.
2461          */
2462         for (zi = 0; zi < zones->nizone; zi++)
2463         {
2464             if (zones->shift[zi][dim] == 0)
2465             {
2466                 /* We should only use zones up to zone_end */
2467                 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2468                 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2469                 {
2470                     if (zones->shift[z][dim] > 0)
2471                     {
2472                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2473                                                           zones->size[zi].x1[dim]+rcs);
2474                     }
2475                 }
2476             }
2477         }
2478     }
2479
2480     for (z = zone_start; z < zone_end; z++)
2481     {
2482         /* Initialization only required to keep the compiler happy */
2483         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2484         int  nc, c;
2485
2486         /* To determine the bounding box for a zone we need to find
2487          * the extremes over 4, 2 or 1 corners.
2488          */
2489         nc = 1 << (ddbox->nboundeddim - 1);
2490
2491         for (c = 0; c < nc; c++)
2492         {
2493             /* Set up a zone corner at x=0, ignoring triclinic couplings */
2494             corner[XX] = 0;
2495             if ((c & 1) == 0)
2496             {
2497                 corner[YY] = zones->size[z].x0[YY];
2498             }
2499             else
2500             {
2501                 corner[YY] = zones->size[z].x1[YY];
2502             }
2503             if ((c & 2) == 0)
2504             {
2505                 corner[ZZ] = zones->size[z].x0[ZZ];
2506             }
2507             else
2508             {
2509                 corner[ZZ] = zones->size[z].x1[ZZ];
2510             }
2511             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
2512                 box[ZZ][1 - dd->dim[0]] != 0)
2513             {
2514                 /* With 1D domain decomposition the cg's are not in
2515                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
2516                  * Shift the corner of the z-vector back along the box
2517                  * vector of dimension d, so that it later ends up at 0 along d.
2518                  * This can affect the location of this corner along dd->dim[0]
2519                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
2520                  */
2521                 int d = 1 - dd->dim[0];
2522
2523                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2524             }
2525             /* Apply the triclinic couplings */
2526             assert(ddbox->npbcdim <= DIM);
2527             for (i = YY; i < ddbox->npbcdim; i++)
2528             {
2529                 for (j = XX; j < i; j++)
2530                 {
2531                     corner[j] += corner[i]*box[i][j]/box[i][i];
2532                 }
2533             }
2534             if (c == 0)
2535             {
2536                 copy_rvec(corner, corner_min);
2537                 copy_rvec(corner, corner_max);
2538             }
2539             else
2540             {
2541                 for (i = 0; i < DIM; i++)
2542                 {
2543                     corner_min[i] = std::min(corner_min[i], corner[i]);
2544                     corner_max[i] = std::max(corner_max[i], corner[i]);
2545                 }
2546             }
2547         }
2548         /* Copy the extreme corners without offset along x */
2549         for (i = 0; i < DIM; i++)
2550         {
2551             zones->size[z].bb_x0[i] = corner_min[i];
2552             zones->size[z].bb_x1[i] = corner_max[i];
2553         }
2554         /* Add the offset along x */
2555         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2556         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2557     }
2558
2559     if (zone_start == 0)
2560     {
2561         vol = 1;
2562         for (dim = 0; dim < DIM; dim++)
2563         {
2564             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2565         }
2566         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2567     }
2568
2569     if (debug)
2570     {
2571         for (z = zone_start; z < zone_end; z++)
2572         {
2573             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2574                     z,
2575                     zones->size[z].x0[XX], zones->size[z].x1[XX],
2576                     zones->size[z].x0[YY], zones->size[z].x1[YY],
2577                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2578             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
2579                     z,
2580                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2581                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2582                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2583         }
2584     }
2585 }
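
/* Note on the corner enumeration above: nc = 1 << (nboundeddim - 1)
 * yields 4 corners for three bounded dimensions, 2 for two and 1 for
 * one; bit 0 of c selects the low/high y limit and bit 1 the low/high
 * z limit, while x is handled separately through the bb_x0/bb_x1
 * offsets added at the end.
 */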
2586
2587 //! Comparator for sorting charge groups.
2588 static bool comp_cgsort(const gmx_cgsort_t &a, const gmx_cgsort_t &b)
2589 {
2591     if (a.nsc == b.nsc)
2592     {
2593         return a.ind_gl < b.ind_gl;
2594     }
2595     return a.nsc < b.nsc;
2596 }
2597
2598 /*! \brief Order data in \p dataToSort according to \p sort
2599  *
2600  * Note: both buffers should have at least \p sort.size() elements.
2601  */
2602 template <typename T>
2603 static void
2604 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2605             gmx::ArrayRef<T>                   dataToSort,
2606             gmx::ArrayRef<T>                   sortBuffer)
2607 {
2608     GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2609     GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2610
2611     /* Order the data into the temporary buffer */
2612     size_t i = 0;
2613     for (const gmx_cgsort_t &entry : sort)
2614     {
2615         sortBuffer[i++] = dataToSort[entry.ind];
2616     }
2617
2618     /* Copy back to the original array */
2619     std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2620               dataToSort.begin());
2621 }
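
/* Example (hypothetical data): with sort = [{ind=2}, {ind=0}, {ind=1}]
 * and dataToSort = [a, b, c], the buffer is filled as [c, a, b] and
 * copied back, i.e. element i of the result equals the old
 * dataToSort[sort[i].ind].
 */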
2622
2623 /*! \brief Order data in \p dataToSort according to \p sort
2624  *
2625  * Note: \p vectorToSort should have at least \p sort.size() elements,
2626  *       \p workVector is resized when it is too small.
2627  */
2628 template <typename T>
2629 static void
2630 orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
2631             gmx::ArrayRef<T>                   vectorToSort,
2632             std::vector<T>                    *workVector)
2633 {
2634     if (gmx::index(workVector->size()) < sort.ssize())
2635     {
2636         workVector->resize(sort.size());
2637     }
2638     orderVector<T>(sort, vectorToSort, *workVector);
2639 }
2640
2641 //! Order vectors of atoms.
2642 static void order_vec_atom(const gmx::RangePartitioning      *atomGroups,
2643                            gmx::ArrayRef<const gmx_cgsort_t>  sort,
2644                            gmx::ArrayRef<gmx::RVec>           v,
2645                            gmx::ArrayRef<gmx::RVec>           buf)
2646 {
2647     if (atomGroups == nullptr)
2648     {
2649         /* Avoid the useless loop over the atoms within a cg */
2650         orderVector(sort, v, buf);
2651
2652         return;
2653     }
2654
2655     /* Order the data */
2656     int a = 0;
2657     for (const gmx_cgsort_t &entry : sort)
2658     {
2659         for (int i : atomGroups->block(entry.ind))
2660         {
2661             copy_rvec(v[i], buf[a]);
2662             a++;
2663         }
2664     }
2665     int atot = a;
2666
2667     /* Copy back to the original array */
2668     for (int a = 0; a < atot; a++)
2669     {
2670         copy_rvec(buf[a], v[a]);
2671     }
2672 }
2673
2674 //! Returns whether a < b.
2675 static bool compareCgsort(const gmx_cgsort_t &a,
2676                           const gmx_cgsort_t &b)
2677 {
2678     return (a.nsc < b.nsc ||
2679             (a.nsc == b.nsc && a.ind_gl < b.ind_gl));
2680 }
2681
2682 //! Do sorting of charge groups.
2683 static void orderedSort(gmx::ArrayRef<const gmx_cgsort_t>  stationary,
2684                         gmx::ArrayRef<gmx_cgsort_t>        moved,
2685                         std::vector<gmx_cgsort_t>         *sort1)
2686 {
2687     /* The new indices are not very ordered, so we sort them with std::sort */
2688     std::sort(moved.begin(), moved.end(), comp_cgsort);
2689
2690     /* stationary is already ordered, so now we can merge the two arrays */
2691     sort1->resize(stationary.size() + moved.size());
2692     std::merge(stationary.begin(), stationary.end(),
2693                moved.begin(), moved.end(),
2694                sort1->begin(),
2695                compareCgsort);
2696 }
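
/* Example (hypothetical ns-cell indices): with stationary nsc values
 * [1, 4, 7] and moved nsc values [5, 2], the sort first turns moved
 * into [2, 5] and the merge then produces sort1 with nsc order
 * [1, 2, 4, 5, 7], everything ordered by (nsc, ind_gl).
 */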
2697
2698 /*! \brief Set the sorting order for systems with charge groups, returned in sort->sorted.
2699  *
2700  * The order is according to the global charge group index.
2701  * This adds and removes charge groups that moved between domains.
2702  */
2703 static void dd_sort_order(const gmx_domdec_t *dd,
2704                           const t_forcerec   *fr,
2705                           int                 ncg_home_old,
2706                           gmx_domdec_sort_t  *sort)
2707 {
2708     const int *a          = fr->ns->grid->cell_index;
2709
2710     const int  movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
2711
2712     if (ncg_home_old >= 0)
2713     {
2714         std::vector<gmx_cgsort_t> &stationary = sort->stationary;
2715         std::vector<gmx_cgsort_t> &moved      = sort->moved;
2716
2717         /* The charge groups that remained in the same ns grid cell
2718          * are completely ordered. So we can sort efficiently by sorting
2719          * the charge groups that did move into the stationary list.
2720          * Note: push_back() seems to be slightly slower than direct access.
2721          */
2722         stationary.clear();
2723         moved.clear();
2724         for (int i = 0; i < dd->ncg_home; i++)
2725         {
2726             /* Check if this cg did not move to another node */
2727             if (a[i] < movedValue)
2728             {
2729                 gmx_cgsort_t entry;
2730                 entry.nsc    = a[i];
2731                 entry.ind_gl = dd->globalAtomGroupIndices[i];
2732                 entry.ind    = i;
2733
2734                 if (i >= ncg_home_old || a[i] != sort->sorted[i].nsc)
2735                 {
2736                     /* This cg is new on this node or moved to another ns grid cell */
2737                     moved.push_back(entry);
2738                 }
2739                 else
2740                 {
2741                     /* This cg did not move */
2742                     stationary.push_back(entry);
2743                 }
2744             }
2745         }
2746
2747         if (debug)
2748         {
2749             fprintf(debug, "ordered sort cgs: stationary %zu moved %zu\n",
2750                     stationary.size(), moved.size());
2751         }
2752         /* Sort efficiently */
2753         orderedSort(stationary, moved, &sort->sorted);
2754     }
2755     else
2756     {
2757         std::vector<gmx_cgsort_t> &cgsort   = sort->sorted;
2758         cgsort.clear();
2759         cgsort.reserve(dd->ncg_home);
2760         int                        numCGNew = 0;
2761         for (int i = 0; i < dd->ncg_home; i++)
2762         {
2763             /* Sort on the ns grid cell indices
2764              * and the global topology index
2765              */
2766             gmx_cgsort_t entry;
2767             entry.nsc    = a[i];
2768             entry.ind_gl = dd->globalAtomGroupIndices[i];
2769             entry.ind    = i;
2770             cgsort.push_back(entry);
2771             if (cgsort[i].nsc < movedValue)
2772             {
2773                 numCGNew++;
2774             }
2775         }
2776         if (debug)
2777         {
2778             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, numCGNew);
2779         }
2780         /* Determine the order of the charge groups using std::sort */
2781         std::sort(cgsort.begin(), cgsort.end(), comp_cgsort);
2782
2783         /* Remove the charge groups which are no longer at home here */
2784         cgsort.resize(numCGNew);
2785     }
2786 }
2787
2788 //! Determines the sorting order for atoms based on the nbnxn grid order, stored in \p sort.
2789 static void dd_sort_order_nbnxn(const t_forcerec          *fr,
2790                                 std::vector<gmx_cgsort_t> *sort)
2791 {
2792     gmx::ArrayRef<const int> atomOrder = nbnxn_get_atomorder(fr->nbv->nbs.get());
2793
2794     /* Using push_back() instead of this resize results in much slower code */
2795     sort->resize(atomOrder.size());
2796     gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
2797     size_t                      numSorted = 0;
2798     for (int i : atomOrder)
2799     {
2800         if (i >= 0)
2801         {
2802             /* The values of nsc and ind_gl are not used in this case */
2803             buffer[numSorted++].ind = i;
2804         }
2805     }
2806     sort->resize(numSorted);
2807 }
2808
2809 //! Sorts the DD state according to the new home atom group order.
2810 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
2811                           int ncg_home_old)
2812 {
2813     gmx_domdec_sort_t *sort = dd->comm->sort.get();
2814
2815     switch (fr->cutoff_scheme)
2816     {
2817         case ecutsGROUP:
2818             dd_sort_order(dd, fr, ncg_home_old, sort);
2819             break;
2820         case ecutsVERLET:
2821             dd_sort_order_nbnxn(fr, &sort->sorted);
2822             break;
2823         default:
2824             gmx_incons("unimplemented");
2825     }
2826
2827     const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
2828
2829     /* We alloc with the old size, since cgindex is still old */
2830     GMX_ASSERT(atomGrouping.numBlocks() == dd->ncg_home, "atomGroups and dd should be consistent");
2831     DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, atomGrouping.fullRange().end());
2832
2833     const gmx::RangePartitioning *atomGroupsPtr = (dd->comm->bCGs ? &atomGrouping : nullptr);
2834
2835     /* Set the new home atom/charge group count */
2836     dd->ncg_home = sort->sorted.size();
2837     if (debug)
2838     {
2839         fprintf(debug, "Set the new home %s count to %d\n",
2840                 dd->comm->bCGs ? "charge group" : "atom",
2841                 dd->ncg_home);
2842     }
2843
2844     /* Reorder the state */
2845     gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2846     GMX_RELEASE_ASSERT(cgsort.ssize() == dd->ncg_home, "We should sort all the home atom groups");
2847
2848     if (state->flags & (1 << estX))
2849     {
2850         order_vec_atom(atomGroupsPtr, cgsort, state->x, rvecBuffer.buffer);
2851     }
2852     if (state->flags & (1 << estV))
2853     {
2854         order_vec_atom(atomGroupsPtr, cgsort, state->v, rvecBuffer.buffer);
2855     }
2856     if (state->flags & (1 << estCGP))
2857     {
2858         order_vec_atom(atomGroupsPtr, cgsort, state->cg_p, rvecBuffer.buffer);
2859     }
2860
2861     if (fr->cutoff_scheme == ecutsGROUP)
2862     {
2863         /* Reorder cgcm */
2864         gmx::ArrayRef<gmx::RVec> cgcmRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cgcm[0]), cgsort.size());
2865         orderVector(cgsort, cgcmRef, rvecBuffer.buffer);
2866     }
2867
2868     /* Reorder the global cg index */
2869     orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2870     /* Reorder the cginfo */
2871     orderVector<int>(cgsort, gmx::arrayRefFromArray(fr->cginfo, cgsort.size()), &sort->intBuffer);
2872     /* Rebuild the local cg index */
2873     if (dd->comm->bCGs)
2874     {
2875         /* We make a new, ordered atomGroups object and assign it to
2876          * the old one. This causes some allocation overhead, but saves
2877          * a copy back of the whole index.
2878          */
2879         gmx::RangePartitioning ordered;
2880         for (const gmx_cgsort_t &entry : cgsort)
2881         {
2882             ordered.appendBlock(atomGrouping.block(entry.ind).size());
2883         }
2884         dd->atomGrouping_ = ordered;
2885     }
2886     else
2887     {
2888         dd->atomGrouping_.setAllBlocksSizeOne(dd->ncg_home);
2889     }
2890     /* Set the home atom number */
2891     dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->atomGrouping().fullRange().end());
2892
2893     if (fr->cutoff_scheme == ecutsVERLET)
2894     {
2895         /* The atoms are now exactly in grid order, update the grid order */
2896         nbnxn_set_atomorder(fr->nbv->nbs.get());
2897     }
2898     else
2899     {
2900         /* Copy the sorted ns cell indices back to the ns grid struct */
2901         for (gmx::index i = 0; i < cgsort.ssize(); i++)
2902         {
2903             fr->ns->grid->cell_index[i] = cgsort[i].nsc;
2904         }
2905         fr->ns->grid->nr = cgsort.size();
2906     }
2907 }
2908
2909 //! Accumulates the per-range atom counts used for the DD statistics.
2910 static void add_dd_statistics(gmx_domdec_t *dd)
2911 {
2912     gmx_domdec_comm_t *comm = dd->comm;
2913
2914     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2915     {
2916         auto range = static_cast<DDAtomRanges::Type>(i);
2917         comm->sum_nat[i] +=
2918             comm->atomRanges.end(range) - comm->atomRanges.start(range);
2919     }
2920     comm->ndecomp++;
2921 }
2922
2923 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2924 {
2925     gmx_domdec_comm_t *comm = dd->comm;
2926
2927     /* Reset all the statistics and counters for total run counting */
2928     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2929     {
2930         comm->sum_nat[i] = 0;
2931     }
2932     comm->ndecomp   = 0;
2933     comm->nload     = 0;
2934     comm->load_step = 0;
2935     comm->load_sum  = 0;
2936     comm->load_max  = 0;
2937     clear_ivec(comm->load_lim);
2938     comm->load_mdf = 0;
2939     comm->load_pme = 0;
2940 }
2941
2942 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2943 {
2944     gmx_domdec_comm_t *comm      = cr->dd->comm;
2945
2946     const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2947     gmx_sumd(numRanges, comm->sum_nat, cr);
2948
2949     if (fplog == nullptr)
2950     {
2951         return;
2952     }
2953
2954     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
2955
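    /* The prefactor printed with each average below is the number of
     * communication passes per step in which these atoms are transferred,
     * e.g. coordinates out and forces back for the force computation.
     */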
2956     for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2957     {
2958         auto   range = static_cast<DDAtomRanges::Type>(i);
2959         double av    = comm->sum_nat[i]/comm->ndecomp;
2960         switch (range)
2961         {
2962             case DDAtomRanges::Type::Zones:
2963                 fprintf(fplog,
2964                         " av. #atoms communicated per step for force:  %d x %.1f\n",
2965                         2, av);
2966                 break;
2967             case DDAtomRanges::Type::Vsites:
2968                 if (cr->dd->vsite_comm)
2969                 {
2970                     fprintf(fplog,
2971                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
2972                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2973                             av);
2974                 }
2975                 break;
2976             case DDAtomRanges::Type::Constraints:
2977                 if (cr->dd->constraint_comm)
2978                 {
2979                     fprintf(fplog,
2980                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
2981                             1 + ir->nLincsIter, av);
2982                 }
2983                 break;
2984             default:
2985                 gmx_incons(" Unknown type for DD statistics");
2986         }
2987     }
2988     fprintf(fplog, "\n");
2989
2990     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
2991     {
2992         print_dd_load_av(fplog, cr->dd);
2993     }
2994 }
2995
2996 // TODO Remove fplog when group scheme and charge groups are gone
2997 void dd_partition_system(FILE                    *fplog,
2998                          const gmx::MDLogger     &mdlog,
2999                          int64_t                  step,
3000                          const t_commrec         *cr,
3001                          gmx_bool                 bMasterState,
3002                          int                      nstglobalcomm,
3003                          t_state                 *state_global,
3004                          const gmx_mtop_t        &top_global,
3005                          const t_inputrec        *ir,
3006                          t_state                 *state_local,
3007                          PaddedVector<gmx::RVec> *f,
3008                          gmx::MDAtoms            *mdAtoms,
3009                          gmx_localtop_t          *top_local,
3010                          t_forcerec              *fr,
3011                          gmx_vsite_t             *vsite,
3012                          gmx::Constraints        *constr,
3013                          t_nrnb                  *nrnb,
3014                          gmx_wallcycle           *wcycle,
3015                          gmx_bool                 bVerbose)
3016 {
3017     gmx_domdec_t      *dd;
3018     gmx_domdec_comm_t *comm;
3019     gmx_ddbox_t        ddbox = {0};
3020     t_block           *cgs_gl;
3021     int64_t            step_pcoupl;
3022     rvec               cell_ns_x0, cell_ns_x1;
3023     int                ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
3024     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
3025     gmx_bool           bRedist, bResortAll;
3026     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
3027     real               grid_density;
3028     char               sbuf[22];
3029
3030     wallcycle_start(wcycle, ewcDOMDEC);
3031
3032     dd   = cr->dd;
3033     comm = dd->comm;
3034
3035     // TODO if the update code becomes accessible here, use
3036     // upd->deform for this logic.
3037     bBoxChanged = (bMasterState || inputrecDeform(ir));
3038     if (ir->epc != epcNO)
3039     {
3040         /* With nstpcouple > 1, pressure coupling happens
3041          * one step after calculating the pressure.
3042          * Box scaling happens at the end of the MD step,
3043          * after the DD partitioning.
3044          * We therefore have to do DLB in the first partitioning
3045          * after an MD step where P-coupling occurred.
3046          * We need to determine the last step in which p-coupling occurred.
3047          * MRS -- need to validate this for vv?
3048          */
3049         int n = ir->nstpcouple;
3050         if (n == 1)
3051         {
3052             step_pcoupl = step - 1;
3053         }
3054         else
3055         {
3056             step_pcoupl = ((step - 1)/n)*n + 1;
3057         }
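        /* Illustrative example: with n = nstpcouple = 10 and step = 25,
         * step_pcoupl = ((25 - 1)/10)*10 + 1 = 21, the last step at which
         * pressure coupling could have been applied before this step.
         */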
3058         if (step_pcoupl >= comm->partition_step)
3059         {
3060             bBoxChanged = TRUE;
3061         }
3062     }
3063
3064     bNStGlobalComm = (step % nstglobalcomm == 0);
3065
3066     if (!isDlbOn(comm))
3067     {
3068         bDoDLB = FALSE;
3069     }
3070     else
3071     {
3072         /* Should we do dynamic load balancing this step?
3073          * Since it requires (possibly expensive) global communication,
3074          * we might want to do DLB less frequently.
3075          */
3076         if (bBoxChanged || ir->epc != epcNO)
3077         {
3078             bDoDLB = bBoxChanged;
3079         }
3080         else
3081         {
3082             bDoDLB = bNStGlobalComm;
3083         }
3084     }
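    /* Net effect: DLB is done when the box changed and, without pressure
     * coupling, also on steps with global communication.
     */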
3085
3086     /* Check if we have recorded loads on the nodes */
3087     if (comm->bRecordLoad && dd_load_count(comm) > 0)
3088     {
3089         bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
3090
3091         /* Print the load to the log file every nstlog steps, and at the first and last step */
3092         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
3093                     comm->n_load_collect == 0 ||
3094                     (ir->nsteps >= 0 &&
3095                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
3096
3097         /* Avoid extra communication due to verbose screen output
3098          * when nstglobalcomm is set.
3099          */
3100         if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
3101             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
3102         {
3103             get_load_distribution(dd, wcycle);
3104             if (DDMASTER(dd))
3105             {
3106                 if (bLogLoad)
3107                 {
3108                     GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
3109                 }
3110                 if (bVerbose)
3111                 {
3112                     dd_print_load_verbose(dd);
3113                 }
3114             }
3115             comm->n_load_collect++;
3116
3117             if (isDlbOn(comm))
3118             {
3119                 if (DDMASTER(dd))
3120                 {
3121                     /* Add the measured cycles to the running average */
3122                     const float averageFactor        = 0.1f;
3123                     comm->cyclesPerStepDlbExpAverage =
3124                         (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
3125                         averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
3126                 }
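                /* With averageFactor = 0.1 the average above is an
                 * exponential moving average with a decay time of roughly
                 * ten load collections, so a single noisy measurement only
                 * shifts the average by 10% of its deviation.
                 */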
3127                 if (comm->dlbState == DlbState::onCanTurnOff &&
3128                     dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
3129                 {
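                    /* Decided on the master rank below and then broadcast,
                     * so that all ranks take the same branch.
                     */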
3130                     gmx_bool turnOffDlb;
3131                     if (DDMASTER(dd))
3132                     {
3133                         /* If the running average of the cycles per step
3134                          * with DLB is higher than before we turned DLB on,
3135                          * turn it off. We will then run and measure the cycles
3136                          * without DLB and can decide whether to turn DLB off forever.
3137                          */
3138                         turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
3139                                       comm->cyclesPerStepBeforeDLB);
3140                     }
3141                     dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
3142                     if (turnOffDlb)
3143                     {
3144                         /* To turn off DLB, we need to redistribute the atoms */
3145                         dd_collect_state(dd, state_local, state_global);
3146                         bMasterState = TRUE;
3147                         turn_off_dlb(mdlog, dd, step);
3148                     }
3149                 }
3150             }
3151             else if (bCheckWhetherToTurnDlbOn)
3152             {
3153                 gmx_bool turnOffDlbForever = FALSE;
3154                 gmx_bool turnOnDlb         = FALSE;
3155
3156                 /* Since the timings are node dependent, the master decides */
3157                 if (DDMASTER(dd))
3158                 {
3159                     /* If we recently turned off DLB, we want to check if
3160                      * performance is better without DLB. We want to do this
3161                      * as soon as possible, to minimize the chance that external
3162                      * factors that slowed down the steps with DLB have disappeared
3163                      * by now, making us incorrectly blame DLB for the slowdown.
3164                      * So we measure one nstlist block, no running average.
3165                      */
3166                     if (comm->haveTurnedOffDlb &&
3167                         comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
3168                         comm->cyclesPerStepDlbExpAverage)
3169                     {
3170                         /* After turning off DLB we ran nstlist steps in fewer
3171                          * cycles than with DLB. This likely means that DLB
3172                          * is not beneficial, but this could be due to a
3173                          * one-time unlucky fluctuation, so we require two such
3174                          * observations in close succession to turn off DLB
3175                          * forever.
3176                          */
3177                         if (comm->dlbSlowerPartitioningCount > 0 &&
3178                             dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
3179                         {
3180                             turnOffDlbForever = TRUE;
3181                         }
3182                         comm->haveTurnedOffDlb           = false;
3183                         /* Register when we last measured DLB slowdown */
3184                         comm->dlbSlowerPartitioningCount = dd->ddp_count;
3185                     }
3186                     else
3187                     {
3188                         /* Here we check if the max PME rank load is more than 0.98
3189                          * times the max PP force load. If so, PP DLB will not help,
3190                          * since we are (almost) limited by PME. Furthermore,
3191                          * DLB will cause a significant extra x/f redistribution
3192                          * cost on the PME ranks, which will then surely result
3193                          * in lower total performance.
3194                          */
3195                         if (cr->npmenodes > 0 &&
3196                             dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
3197                         {
3198                             turnOnDlb = FALSE;
3199                         }
3200                         else
3201                         {
3202                             turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
3203                         }
3204                     }
3205                 }
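                /* Pack both decisions into a single struct so that one
                 * broadcast keeps all ranks in sync.
                 */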
3206                 struct
3207                 {
3208                     gmx_bool turnOffDlbForever;
3209                     gmx_bool turnOnDlb;
3210                 }
3211                 bools {
3212                     turnOffDlbForever, turnOnDlb
3213                 };
3214                 dd_bcast(dd, sizeof(bools), &bools);
3215                 if (bools.turnOffDlbForever)
3216                 {
3217                     turn_off_dlb_forever(mdlog, dd, step);
3218                 }
3219                 else if (bools.turnOnDlb)
3220                 {
3221                     turn_on_dlb(mdlog, dd, step);
3222                     bDoDLB = TRUE;
3223                 }
3224             }
3225         }
3226         comm->n_load_have++;
3227     }
3228
3229     cgs_gl = &comm->cgs_gl;
3230
3231     bRedist = FALSE;
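    /* Three cases follow: (1) a new master state that has to be
     * distributed over the ranks; (2) a local state whose DD count does
     * not match ours, so the atom-group distribution is restored from
     * the state itself; (3) an up-to-date local state, for which only
     * the charge groups need to be redistributed.
     */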
3232     if (bMasterState)
3233     {
3234         /* Clear the old state */
3235         clearDDStateIndices(dd, 0, 0);
3236         ncgindex_set = 0;
3237
3238         auto xGlobal = positionsFromStatePointer(state_global);
3239
3240         set_ddbox(*dd, true,
3241                   DDMASTER(dd) ? state_global->box : nullptr,
3242                   true, xGlobal,
3243                   &ddbox);
3244
3245         distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
3246
3247         dd_make_local_cgs(dd, &top_local->cgs);
3248
3249         /* Ensure that we have space for the new distribution */
3250         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
3251
3252         if (fr->cutoff_scheme == ecutsGROUP)
3253         {
3254             calc_cgcm(fplog, 0, dd->ncg_home,
3255                       &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3256         }
3257
3258         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3259
3260         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3261     }
3262     else if (state_local->ddp_count != dd->ddp_count)
3263     {
3264         if (state_local->ddp_count > dd->ddp_count)
3265         {
3266             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
3267         }
3268
3269         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
3270         {
3271             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
3272         }
3273
3274         /* Clear the old state */
3275         clearDDStateIndices(dd, 0, 0);
3276
3277         /* Restore the atom group indices from state_local */
3278         restoreAtomGroups(dd, cgs_gl->index, state_local);
3279         make_dd_indices(dd, cgs_gl->index, 0);
3280         ncgindex_set = dd->ncg_home;
3281
3282         if (fr->cutoff_scheme == ecutsGROUP)
3283         {
3284             /* Redetermine the cg COMs */
3285             calc_cgcm(fplog, 0, dd->ncg_home,
3286                       &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3287         }
3288
3289         inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3290
3291         dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3292
3293         set_ddbox(*dd, bMasterState, state_local->box,
3294                   true, state_local->x, &ddbox);
3295
3296         bRedist = isDlbOn(comm);
3297     }
3298     else
3299     {
3300         /* We have the full state, only redistribute the cgs */
3301
3302         /* Clear the non-home indices */
3303         clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
3304         ncgindex_set = 0;
3305
3306         /* Avoid global communication for dimensions without pbc when -gcom is used */
3307         if (!bNStGlobalComm)
3308         {
3309             copy_rvec(comm->box0, ddbox.box0    );
3310             copy_rvec(comm->box_size, ddbox.box_size);
3311         }
3312         set_ddbox(*dd, bMasterState, state_local->box,
3313                   bNStGlobalComm, state_local->x, &ddbox);
3314
3315         bBoxChanged = TRUE;
3316         bRedist     = TRUE;
3317     }
3318     /* For dimensions without pbc when -gcom is used */
3319     copy_rvec(ddbox.box0, comm->box0    );
3320     copy_rvec(ddbox.box_size, comm->box_size);
3321
3322     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(*dd), bMasterState, bDoDLB,
3323                       step, wcycle);
3324
3325     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
3326     {
3327         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3328     }
3329
3330     if (comm->useUpdateGroups)
3331     {
3332         comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3333                                        state_local->x);
3334     }
3335
3336     /* Check if we should sort the charge groups */
3337     const bool bSortCG = (bMasterState || bRedist);
3338
3339     ncg_home_old = dd->ncg_home;
3340
3341     /* When repartitioning we mark atom groups that will move to neighboring
3342      * DD cells, but we do not move them right away for performance reasons.
3343      * Thus we need to keep track of how many charge groups will move,
3344      * so that we can obtain correct local charge group / atom counts.
3345      */
3346     ncg_moved = 0;
3347     if (bRedist)
3348     {
3349         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3350
3351         ncgindex_set = dd->ncg_home;
3352         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3353                            state_local, f, fr,
3354                            nrnb, &ncg_moved);
3355
3356         GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3357
3358         if (comm->useUpdateGroups)
3359         {
3360             comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3361                                            state_local->x);
3362         }
3363
3364         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3365     }
3366
3367     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3368                           dd, &ddbox,
3369                           &comm->cell_x0, &comm->cell_x1,
3370                           dd->ncg_home, fr->cg_cm,
3371                           cell_ns_x0, cell_ns_x1, &grid_density);
3372
3373     if (bBoxChanged)
3374     {
3375         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3376     }
3377
3378     switch (fr->cutoff_scheme)
3379     {
3380         case ecutsGROUP:
3381             copy_ivec(fr->ns->grid->n, ncells_old);
3382             grid_first(fplog, fr->ns->grid, dd, &ddbox,
3383                        state_local->box, cell_ns_x0, cell_ns_x1,
3384                        fr->rlist, grid_density);
3385             break;
3386         case ecutsVERLET:
3387             nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_old[XX], &ncells_old[YY]);
3388             break;
3389         default:
3390             gmx_incons("unimplemented");
3391     }
3392     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3393     copy_ivec(ddbox.tric_dir, comm->tric_dir);
3394
3395     if (bSortCG)
3396     {
3397         wallcycle_sub_start(wcycle, ewcsDD_GRID);
3398
3399         /* Sort the state on charge group position.
3400          * This enables exact restarts from this step.
3401          * It also improves performance by about 15% with larger numbers
3402          * of atoms per node.
3403          */
3404
3405         /* Fill the ns grid with the home cell,
3406          * so we can sort with the indices.
3407          */
3408         set_zones_ncg_home(dd);
3409
3410         switch (fr->cutoff_scheme)
3411         {
3412             case ecutsVERLET:
3413                 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3414
3415                 nbnxn_put_on_grid(fr->nbv->nbs.get(), fr->ePBC, state_local->box,
3416                                   0,
3417                                   comm->zones.size[0].bb_x0,
3418                                   comm->zones.size[0].bb_x1,
3419                                   comm->updateGroupsCog.get(),
3420                                   0, dd->ncg_home,
3421                                   comm->zones.dens_zone0,
3422                                   fr->cginfo,
3423                                   state_local->x,
3424                                   ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
3425                                   fr->nbv->grp[eintLocal].kernel_type,
3426                                   fr->nbv->nbat);
3427
3428                 nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
3429                 break;
3430             case ecutsGROUP:
3431                 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
3432                           0, dd->ncg_home, fr->cg_cm);
3433
3434                 copy_ivec(fr->ns->grid->n, ncells_new);
3435                 break;
3436             default:
3437                 gmx_incons("unimplemented");
3438         }
3439
3440         bResortAll = bMasterState;
3441
3442         /* Check if we can use the old order and ns grid cell indices
3443          * of the charge groups to sort the charge groups efficiently.
3444          */
3445         if (ncells_new[XX] != ncells_old[XX] ||
3446             ncells_new[YY] != ncells_old[YY] ||
3447             ncells_new[ZZ] != ncells_old[ZZ])
3448         {
3449             bResortAll = TRUE;
3450         }
3451
3452         if (debug)
3453         {
3454             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3455                     gmx_step_str(step, sbuf), dd->ncg_home);
3456         }
3457         dd_sort_state(dd, fr->cg_cm, fr, state_local,
3458                       bResortAll ? -1 : ncg_home_old);
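        /* A negative ncg_home_old forces a full resort instead of
         * reusing the previous order.
         */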
3459
3460         /* After sorting and compacting we set the correct size */
3461         dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3462
3463         /* Rebuild all the indices */
3464         dd->ga2la->clear();
3465         ncgindex_set = 0;
3466
3467         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3468     }
3469
3470     if (comm->useUpdateGroups)
3471     {
3472         /* The update groups' COGs are invalid after sorting
3473          * and need to be cleared before the next partitioning anyhow.
3474          */
3475         comm->updateGroupsCog->clear();
3476     }
3477
3478     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3479
3480     /* Set up the communication and communicate the coordinates */
3481     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3482
3483     /* Set the indices */
3484     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
3485
3486     /* Set the charge group boundaries for neighbor searching */
3487     set_cg_boundaries(&comm->zones);
3488
3489     if (fr->cutoff_scheme == ecutsVERLET)
3490     {
3491         /* When bSortCG=true, we have already set the size for zone 0 */
3492         set_zones_size(dd, state_local->box, &ddbox,
3493                        bSortCG ? 1 : 0, comm->zones.n,
3494                        0);
3495     }
3496
3497     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3498
3499     /*
3500        write_dd_pdb("dd_home",step,"dump",top_global,cr,
3501                  -1,state_local->x.rvec_array(),state_local->box);
3502      */
3503
3504     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3505
3506     /* Extract a local topology from the global topology */
3507     for (int i = 0; i < dd->ndim; i++)
3508     {
3509         np[dd->dim[i]] = comm->cd[i].numPulses();
3510     }
3511     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
3512                       comm->cellsize_min, np,
3513                       fr,
3514                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x.rvec_array(),
3515                       vsite, top_global, top_local);
3516
3517     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3518
3519     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3520
3521     /* Set up the special atom communication */
3522     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3523     for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3524     {
3525         auto range = static_cast<DDAtomRanges::Type>(i);
3526         switch (range)
3527         {
3528             case DDAtomRanges::Type::Vsites:
3529                 if (vsite && vsite->n_intercg_vsite)
3530                 {
3531                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
3532                 }
3533                 break;
3534             case DDAtomRanges::Type::Constraints:
3535                 if (dd->splitConstraints || dd->splitSettles)
3536                 {
3537                     /* Only for inter-cg constraints do we need special code */
3538                     n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
3539                                                   constr, ir->nProjOrder,
3540                                                   top_local->idef.il);
3541                 }
3542                 break;
3543             default:
3544                 gmx_incons("Unknown special atom type setup");
3545         }
3546         comm->atomRanges.setEnd(range, n);
3547     }
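    /* The atom ranges now form nested prefixes of the local atom list,
     * e.g. with purely illustrative counts:
     *   Home        [0, 1000)
     *   Zones       [0, 1500)  adds the communicated zone atoms
     *   Vsites      [0, 1520)  adds the vsite-constructing atoms
     *   Constraints [0, 1540)  adds the constraint-connected atoms
     */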
3548
3549     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3550
3551     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3552
3553     /* Make space for the extra coordinates for virtual site
3554      * or constraint communication.
3555      */
3556     state_local->natoms = comm->atomRanges.numAtomsTotal();
3557
3558     dd_resize_state(state_local, f, state_local->natoms);
3559
3560     if (fr->haveDirectVirialContributions)
3561     {
3562         if (vsite && vsite->n_intercg_vsite)
3563         {
3564             nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3565         }
3566         else
3567         {
3568             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3569             {
3570                 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3571             }
3572             else
3573             {
3574                 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3575             }
3576         }
3577     }
3578     else
3579     {
3580         nat_f_novirsum = 0;
3581     }
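    /* Depending on vsites and full electrostatics, the forces without
     * direct virial contributions thus extend over the home atoms only,
     * all zone atoms, or also the vsite-constructing atoms.
     */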
3582
3583     /* Set the number of atoms required for the force calculation.
3584      * Forces need to be constrained when doing energy
3585      * minimization. For simple simulations we could avoid some
3586      * allocation, zeroing and copying, but this is probably not worth
3587      * the complications and checking.
3588      */
3589     forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3590                         comm->atomRanges.end(DDAtomRanges::Type::Zones),
3591                         comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3592                         nat_f_novirsum);
3593
3594     /* Update atom data for mdatoms and several algorithms */
3595     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3596                               nullptr, mdAtoms, constr, vsite, nullptr);
3597
3598     auto mdatoms = mdAtoms->mdatoms();
3599     if (!thisRankHasDuty(cr, DUTY_PME))
3600     {
3601         /* Send the charges and/or c6/sigmas to our PME-only node */
3602         gmx_pme_send_parameters(cr,
3603                                 fr->ic,
3604                                 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3605                                 mdatoms->chargeA, mdatoms->chargeB,
3606                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3607                                 mdatoms->sigmaA, mdatoms->sigmaB,
3608                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3609     }
3610
3611     if (ir->bPull)
3612     {
3613         /* Update the local pull groups */
3614         dd_make_local_pull_groups(cr, ir->pull_work);
3615     }
3616
3617     if (dd->atomSets != nullptr)
3618     {
3619         /* Update the local atom sets */
3620         dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3621     }
3622
3623     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3624     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
3625
3626     add_dd_statistics(dd);
3627
3628     /* Make sure we only count the cycles for this DD partitioning */
3629     clear_dd_cycle_counts(dd);
3630
3631     /* Because the order of the atoms might have changed since
3632      * the last vsite construction, we need to communicate the constructing
3633      * atom coordinates again (for spreading the forces this MD step).
3634      */
3635     dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3636
3637     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3638
3639     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
3640     {
3641         dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3642         write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3643                      -1, state_local->x.rvec_array(), state_local->box);
3644     }
3645
3646     /* Store the partitioning step */
3647     comm->partition_step = step;
3648
3649     /* Increase the DD partitioning counter */
3650     dd->ddp_count++;
3651     /* The state currently matches this DD partitioning count, store it */
3652     state_local->ddp_count = dd->ddp_count;
3653     if (bMasterState)
3654     {
3655         /* The DD master node knows the complete cg distribution,
3656          * store the count so we can possibly skip the cg info communication.
3657          */
3658         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3659     }
3660
3661     if (comm->DD_debug > 0)
3662     {
3663         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3664         check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
3665                                 "after partitioning");
3666     }
3667
3668     wallcycle_stop(wcycle, ewcDOMDEC);
3669 }
3670
3671 /*! \brief Check whether bonded interactions are missing, if appropriate */
3672 void checkNumberOfBondedInteractions(const gmx::MDLogger  &mdlog,
3673                                      t_commrec            *cr,
3674                                      int                   totalNumberOfBondedInteractions,
3675                                      const gmx_mtop_t     *top_global,
3676                                      const gmx_localtop_t *top_local,
3677                                      const t_state        *state,
3678                                      bool                 *shouldCheckNumberOfBondedInteractions)
3679 {
3680     if (*shouldCheckNumberOfBondedInteractions)
3681     {
3682         if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3683         {
3684             dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
3685         }
3686         *shouldCheckNumberOfBondedInteractions = false;
3687     }
3688 }