1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38
39 #include "gmxpre.h"
40
41 #include "domdec.h"
42
43 #include "config.h"
44
45 #include <cassert>
46 #include <cinttypes>
47 #include <climits>
48 #include <cmath>
49 #include <cstdio>
50 #include <cstdlib>
51 #include <cstring>
52
53 #include <algorithm>
54 #include <memory>
55
56 #include "gromacs/domdec/builder.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/computemultibodycutoffs.h"
59 #include "gromacs/domdec/dlb.h"
60 #include "gromacs/domdec/dlbtiming.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/ga2la.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/options.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/ewald/pme.h"
68 #include "gromacs/domdec/reversetopology.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/hardware/hw_info.h"
74 #include "gromacs/math/vec.h"
75 #include "gromacs/math/vectypes.h"
76 #include "gromacs/mdlib/calc_verletbuf.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/constraintrange.h"
79 #include "gromacs/mdlib/updategroups.h"
80 #include "gromacs/mdlib/vcm.h"
81 #include "gromacs/mdlib/vsite.h"
82 #include "gromacs/mdrun/mdmodules.h"
83 #include "gromacs/mdtypes/commrec.h"
84 #include "gromacs/mdtypes/forceoutput.h"
85 #include "gromacs/mdtypes/forcerec.h"
86 #include "gromacs/mdtypes/inputrec.h"
87 #include "gromacs/mdtypes/mdrunoptions.h"
88 #include "gromacs/mdtypes/state.h"
89 #include "gromacs/pbcutil/ishift.h"
90 #include "gromacs/pbcutil/pbc.h"
91 #include "gromacs/pulling/pull.h"
92 #include "gromacs/timing/wallcycle.h"
93 #include "gromacs/topology/block.h"
94 #include "gromacs/topology/idef.h"
95 #include "gromacs/topology/ifunc.h"
96 #include "gromacs/topology/mtop_lookup.h"
97 #include "gromacs/topology/mtop_util.h"
98 #include "gromacs/topology/topology.h"
99 #include "gromacs/utility/basedefinitions.h"
100 #include "gromacs/utility/basenetwork.h"
101 #include "gromacs/utility/cstringutil.h"
102 #include "gromacs/utility/enumerationhelpers.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/logger.h"
107 #include "gromacs/utility/mdmodulesnotifiers.h"
108 #include "gromacs/utility/real.h"
109 #include "gromacs/utility/smalloc.h"
110 #include "gromacs/utility/strconvert.h"
111 #include "gromacs/utility/stringstream.h"
112 #include "gromacs/utility/stringutil.h"
113 #include "gromacs/utility/textwriter.h"
114
115 #include "atomdistribution.h"
116 #include "box.h"
117 #include "cellsizes.h"
118 #include "distribute.h"
119 #include "domdec_constraints.h"
120 #include "domdec_internal.h"
121 #include "domdec_setup.h"
122 #include "domdec_vsite.h"
123 #include "redistribute.h"
124 #include "utility.h"
125
126 // TODO remove this when moving domdec into gmx namespace
127 using gmx::ArrayRef;
128 using gmx::DdRankOrder;
129 using gmx::DlbOption;
130 using gmx::DomdecOptions;
131 using gmx::RangePartitioning;
132
133 static const char* enumValueToString(DlbState enumValue)
134 {
135     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
136         "off", "off", "auto", "locked", "on", "on"
137     };
138     return dlbStateNames[enumValue];
139 }
140
141 /* The DD zone order */
142 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
143                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
144
145 /* The non-bonded zone-pair setup for domain decomposition
146  * The first number is the i-zone, the second number the first j-zone seen by
147  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
148  * As is, this is for 3D decomposition, where there are 4 i-zones.
149  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
150  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
151  */
152 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
153                                                                { 1, 3, 6 },
154                                                                { 2, 5, 6 },
155                                                                { 3, 5, 7 } };
156
157 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
158 {
159     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
160     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
161     xyz[ZZ] = ind % nc[ZZ];
162 }
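/* A worked example of the index arithmetic above: with nc = {2, 3, 4} the
 * row-major rank index ind = 11 maps to xyz = {11/12, (11/4)%3, 11%4} = {0, 2, 3},
 * i.e. ddindex2xyz inverts the row-major indexing used by dd_index.
 */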
163
164 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
165 {
166     int ddnodeid = -1;
167
168     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
169     const int                 ddindex   = dd_index(dd->numCells, c);
170     if (cartSetup.bCartesianPP_PME)
171     {
172         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
173     }
174     else if (cartSetup.bCartesianPP)
175     {
176 #if GMX_MPI
177         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
178 #endif
179     }
180     else
181     {
182         ddnodeid = ddindex;
183     }
184
185     return ddnodeid;
186 }
187
188 int ddglatnr(const gmx_domdec_t* dd, int i)
189 {
190     int atnr = 0;
191
192     if (dd == nullptr)
193     {
194         atnr = i + 1;
195     }
196     else
197     {
198         if (i >= dd->comm->atomRanges.numAtomsTotal())
199         {
200             gmx_fatal(FARGS,
201                       "glatnr called with %d, which is equal to or larger than the local number of atoms (%d)",
202                       i,
203                       dd->comm->atomRanges.numAtomsTotal());
204         }
205         atnr = dd->globalAtomIndices[i] + 1;
206     }
207
208     return atnr;
209 }
210
211 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
212 {
213     if (state->ddp_count != dd.ddp_count)
214     {
215         gmx_incons("The MD state does not match the domain decomposition state");
216     }
217
218     state->cg_gl.resize(dd.numHomeAtoms);
219     for (int i = 0; i < dd.numHomeAtoms; i++)
220     {
221         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
222     }
223
224     state->ddp_count_cg_gl = dd.ddp_count;
225 }
226
227 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
228 {
229     return &dd->comm->zones;
230 }
231
232 int dd_numAtomsZones(const gmx_domdec_t& dd)
233 {
234     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
235 }
236
237 int dd_numHomeAtoms(const gmx_domdec_t& dd)
238 {
239     return dd.comm->atomRanges.numHomeAtoms();
240 }
241
242 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
243 {
244     /* We currently set mdatoms entries for all atoms:
245      * local + non-local + communicated for vsite + constraints
246      */
247
248     return dd.comm->atomRanges.numAtomsTotal();
249 }
250
251 int dd_natoms_vsite(const gmx_domdec_t& dd)
252 {
253     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
254 }
255
256 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
257 {
258     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
259     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
260 }
261
262 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
263 {
264     wallcycle_start(wcycle, WallCycleCounter::MoveX);
265
266     rvec shift = { 0, 0, 0 };
267
268     gmx_domdec_comm_t* comm = dd->comm;
269
270     int nzone   = 1;
271     int nat_tot = comm->atomRanges.numHomeAtoms();
272     for (int d = 0; d < dd->ndim; d++)
273     {
274         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
275         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
276         if (bPBC)
277         {
278             copy_rvec(box[dd->dim[d]], shift);
279         }
280         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
281         for (const gmx_domdec_ind_t& ind : cd->ind)
282         {
283             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
284             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
285             int                       n          = 0;
286             if (!bPBC)
287             {
288                 for (int j : ind.index)
289                 {
290                     sendBuffer[n] = x[j];
291                     n++;
292                 }
293             }
294             else if (!bScrew)
295             {
296                 for (int j : ind.index)
297                 {
298                     /* We need to shift the coordinates */
299                     for (int d = 0; d < DIM; d++)
300                     {
301                         sendBuffer[n][d] = x[j][d] + shift[d];
302                     }
303                     n++;
304                 }
305             }
306             else
307             {
308                 for (int j : ind.index)
309                 {
310                     /* Shift x */
311                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
312                     /* Rotate y and z.
313                      * This operation requires a special shift force
314                      * treatment, which is performed in calc_vir.
315                      */
316                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
317                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
318                     n++;
319                 }
320             }
321
322             DDBufferAccess<gmx::RVec> receiveBufferAccess(
323                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
324
325             gmx::ArrayRef<gmx::RVec> receiveBuffer;
326             if (cd->receiveInPlace)
327             {
328                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
329             }
330             else
331             {
332                 receiveBuffer = receiveBufferAccess.buffer;
333             }
334             /* Send and receive the coordinates */
335             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
336
337             if (!cd->receiveInPlace)
338             {
339                 int j = 0;
340                 for (int zone = 0; zone < nzone; zone++)
341                 {
342                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
343                     {
344                         x[i] = receiveBuffer[j++];
345                     }
346                 }
347             }
348             nat_tot += ind.nrecv[nzone + 1];
349         }
350         nzone += nzone;
351     }
352
353     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
354 }
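/* Note on dd_move_x above: the number of zones whose coordinates have been
 * communicated doubles after each decomposition dimension (nzone += nzone), so
 * after all dd->ndim dimensions 2^ndim zones (= comm->zones.n) are complete.
 * dd_move_f below walks the same pulse structure in reverse order to sum the
 * received forces back onto the atoms that were originally sent.
 */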
355
356 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
357 {
358     wallcycle_start(wcycle, WallCycleCounter::MoveF);
359
360     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
361     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
362
363     gmx_domdec_comm_t& comm    = *dd->comm;
364     int                nzone   = comm.zones.n / 2;
365     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
366     for (int d = dd->ndim - 1; d >= 0; d--)
367     {
368         /* Only forces in domains near the PBC boundaries need to
369            consider PBC in the treatment of fshift */
370         const bool shiftForcesNeedPbc =
371                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
372         const bool applyScrewPbc =
373                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
374         /* Determine which shift vector we need */
375         ivec vis        = { 0, 0, 0 };
376         vis[dd->dim[d]] = 1;
377         const int is    = gmx::ivecToShiftIndex(vis);
378
379         /* Loop over the pulses */
380         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
381         for (int p = cd.numPulses() - 1; p >= 0; p--)
382         {
383             const gmx_domdec_ind_t&   ind = cd.ind[p];
384             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
385             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
386
387             nat_tot -= ind.nrecv[nzone + 1];
388
389             DDBufferAccess<gmx::RVec> sendBufferAccess(
390                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
391
392             gmx::ArrayRef<gmx::RVec> sendBuffer;
393             if (cd.receiveInPlace)
394             {
395                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
396             }
397             else
398             {
399                 sendBuffer = sendBufferAccess.buffer;
400                 int j      = 0;
401                 for (int zone = 0; zone < nzone; zone++)
402                 {
403                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
404                     {
405                         sendBuffer[j++] = f[i];
406                     }
407                 }
408             }
409             /* Communicate the forces */
410             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
411             /* Add the received forces */
412             int n = 0;
413             if (!shiftForcesNeedPbc)
414             {
415                 for (int j : ind.index)
416                 {
417                     for (int d = 0; d < DIM; d++)
418                     {
419                         f[j][d] += receiveBuffer[n][d];
420                     }
421                     n++;
422                 }
423             }
424             else if (!applyScrewPbc)
425             {
426                 for (int j : ind.index)
427                 {
428                     for (int d = 0; d < DIM; d++)
429                     {
430                         f[j][d] += receiveBuffer[n][d];
431                     }
432                     /* Add this force to the shift force */
433                     for (int d = 0; d < DIM; d++)
434                     {
435                         fshift[is][d] += receiveBuffer[n][d];
436                     }
437                     n++;
438                 }
439             }
440             else
441             {
442                 for (int j : ind.index)
443                 {
444                     /* Rotate the force */
445                     f[j][XX] += receiveBuffer[n][XX];
446                     f[j][YY] -= receiveBuffer[n][YY];
447                     f[j][ZZ] -= receiveBuffer[n][ZZ];
448                     if (shiftForcesNeedPbc)
449                     {
450                         /* Add this force to the shift force */
451                         for (int d = 0; d < DIM; d++)
452                         {
453                             fshift[is][d] += receiveBuffer[n][d];
454                         }
455                     }
456                     n++;
457                 }
458             }
459         }
460         nzone /= 2;
461     }
462     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
463 }
464
465 /* Convenience function for extracting a real buffer from an rvec buffer
466  *
467  * To reduce the number of temporary communication buffers and avoid
468  * cache pollution, we reuse gmx::RVec buffers for storing reals.
469  * This function returns a real buffer reference with the same number
470  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
471  */
472 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
473 {
474     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
475 }
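/* Example: an RVec buffer provisioned for 100 elements yields a real view of
 * 100 values, i.e. only the first third of the 300 underlying reals is used,
 * which is sufficient for the per-atom scalars communicated below.
 */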
476
477 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
478 {
479     gmx_domdec_comm_t* comm = dd->comm;
480
481     int nzone   = 1;
482     int nat_tot = comm->atomRanges.numHomeAtoms();
483     for (int d = 0; d < dd->ndim; d++)
484     {
485         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
486         for (const gmx_domdec_ind_t& ind : cd->ind)
487         {
488             /* Note: We provision for RVec instead of real, so a factor of 3
489              * more than needed. The buffer actually already has this size
490              * and we pass a plain pointer below, so this does not matter.
491              */
492             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
493             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
494             int                 n          = 0;
495             for (int j : ind.index)
496             {
497                 sendBuffer[n++] = v[j];
498             }
499
500             DDBufferAccess<gmx::RVec> receiveBufferAccess(
501                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
502
503             gmx::ArrayRef<real> receiveBuffer;
504             if (cd->receiveInPlace)
505             {
506                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
507             }
508             else
509             {
510                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
511             }
512             /* Send and receive the data */
513             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
514             if (!cd->receiveInPlace)
515             {
516                 int j = 0;
517                 for (int zone = 0; zone < nzone; zone++)
518                 {
519                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
520                     {
521                         v[i] = receiveBuffer[j++];
522                     }
523                 }
524             }
525             nat_tot += ind.nrecv[nzone + 1];
526         }
527         nzone += nzone;
528     }
529 }
530
531 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
532 {
533     gmx_domdec_comm_t* comm = dd->comm;
534
535     int nzone   = comm->zones.n / 2;
536     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
537     for (int d = dd->ndim - 1; d >= 0; d--)
538     {
539         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
540         for (int p = cd->numPulses() - 1; p >= 0; p--)
541         {
542             const gmx_domdec_ind_t& ind = cd->ind[p];
543
544             /* Note: We provision for RVec instead of real, so a factor of 3
545              * more than needed. The buffer actually already has this size
546              * and we typecast, so this works as intended.
547              */
548             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
549             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
550             nat_tot -= ind.nrecv[nzone + 1];
551
552             DDBufferAccess<gmx::RVec> sendBufferAccess(
553                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
554
555             gmx::ArrayRef<real> sendBuffer;
556             if (cd->receiveInPlace)
557             {
558                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
559             }
560             else
561             {
562                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
563                 int j      = 0;
564                 for (int zone = 0; zone < nzone; zone++)
565                 {
566                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
567                     {
568                         sendBuffer[j++] = v[i];
569                     }
570                 }
571             }
572             /* Communicate the data */
573             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
574             /* Add the received contributions */
575             int n = 0;
576             for (int j : ind.index)
577             {
578                 v[j] += receiveBuffer[n];
579                 n++;
580             }
581         }
582         nzone /= 2;
583     }
584 }
585
586 real dd_cutoff_multibody(const gmx_domdec_t* dd)
587 {
588     const gmx_domdec_comm_t& comm       = *dd->comm;
589     const DDSystemInfo&      systemInfo = comm.systemInfo;
590
591     real r = -1;
592     if (systemInfo.haveInterDomainMultiBodyBondeds)
593     {
594         if (comm.cutoff_mbody > 0)
595         {
596             r = comm.cutoff_mbody;
597         }
598         else
599         {
600             /* cutoff_mbody=0 means we do not have DLB */
601             r = comm.cellsize_min[dd->dim[0]];
602             for (int di = 1; di < dd->ndim; di++)
603             {
604                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
605             }
606             if (comm.systemInfo.filterBondedCommunication)
607             {
608                 r = std::max(r, comm.cutoff_mbody);
609             }
610             else
611             {
612                 r = std::min(r, systemInfo.cutoff);
613             }
614         }
615     }
616
617     return r;
618 }
619
620 real dd_cutoff_twobody(const gmx_domdec_t* dd)
621 {
622     const real r_mb = dd_cutoff_multibody(dd);
623
624     return std::max(dd->comm->systemInfo.cutoff, r_mb);
625 }
626
627
628 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
629                                    const CartesianRankSetup& cartSetup,
630                                    const ivec                coord,
631                                    ivec                      coord_pme)
632 {
633     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
634     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
635     copy_ivec(coord, coord_pme);
636     coord_pme[cartSetup.cartpmedim] =
637             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
638 }
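/* A worked example of the mapping above: with nc = 4 PP cells and ntot = 6
 * Cartesian slots along cartpmedim (i.e. 2 PME-only slots), PP coordinates
 * 0 and 1 map to PME coordinate 4, and PP coordinates 2 and 3 map to 5, so
 * each PME-only slot serves a contiguous block of PP cells.
 */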
639
640 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
641 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
642 {
643     const int npp  = ddRankSetup.numPPRanks;
644     const int npme = ddRankSetup.numRanksDoingPme;
645
646     /* Here we assign a PME node to communicate with this DD node
647      * by assuming that the major index of both is x.
648      * We add npme/2 to obtain an even distribution.
649      */
650     return (ddCellIndex * npme + npme / 2) / npp;
651 }
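/* Example: with npp = 6 and npme = 2, PP cell indices 0-2 map to PME index 0
 * and indices 3-5 map to PME index 1, i.e. the PP cells are split evenly over
 * the PME ranks.
 */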
652
653 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
654 {
655     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
656
657     int n = 0;
658     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
659     {
660         const int p0 = ddindex2pmeindex(ddRankSetup, i);
661         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
662         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
663         {
664             if (debug)
665             {
666                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
667             }
668             pmeRanks[n] = i + 1 + n;
669             n++;
670         }
671     }
672
673     return pmeRanks;
674 }
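/* Example of the interleaving above: with 6 PP ranks and 2 PME ranks the PME
 * ranks are placed at global ranks 3 and 7, so each PME rank directly follows
 * the block of PP ranks (0-2 and 4-6, respectively) that it serves.
 */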
675
676 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
677 {
678     ivec coords;
679
680     coords[XX]     = x;
681     coords[YY]     = y;
682     coords[ZZ]     = z;
683     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
684
685     return slab;
686 }
687
688 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
689 {
690     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
691     ivec                      coords    = { x, y, z };
692     int                       nodeid    = -1;
693
694     if (cartSetup.bCartesianPP_PME)
695     {
696 #if GMX_MPI
697         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
698 #endif
699     }
700     else
701     {
702         int ddindex = dd_index(cr->dd->numCells, coords);
703         if (cartSetup.bCartesianPP)
704         {
705             nodeid = cartSetup.ddindex2simnodeid[ddindex];
706         }
707         else
708         {
709             const DDRankSetup& rankSetup = cr->dd->comm->ddRankSetup;
710             if (rankSetup.rankOrder != DdRankOrder::pp_pme && rankSetup.usePmeOnlyRanks)
711             {
712                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
713             }
714             else
715             {
716                 nodeid = ddindex;
717             }
718         }
719     }
720
721     return nodeid;
722 }
723
724 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
725                               const CartesianRankSetup& cartSetup,
726                               gmx::ArrayRef<const int>  pmeRanks,
727                               const t_commrec gmx_unused* cr,
728                               const int                   sim_nodeid)
729 {
730     int pmenode = -1;
731
732     /* This assumes a uniform x domain decomposition grid cell size */
733     if (cartSetup.bCartesianPP_PME)
734     {
735 #if GMX_MPI
736         ivec coord, coord_pme;
737         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
738         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
739         {
740             /* This is a PP rank */
741             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
742             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
743         }
744 #endif
745     }
746     else if (cartSetup.bCartesianPP)
747     {
748         if (sim_nodeid < ddRankSetup.numPPRanks)
749         {
750             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
751         }
752     }
753     else
754     {
755         /* This assumes DD cells with identical x coordinates
756          * are numbered sequentially.
757          */
758         if (pmeRanks.empty())
759         {
760             if (sim_nodeid < ddRankSetup.numPPRanks)
761             {
762                 /* The DD index equals the nodeid */
763                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
764             }
765         }
766         else
767         {
768             int i = 0;
769             while (sim_nodeid > pmeRanks[i])
770             {
771                 i++;
772             }
773             if (sim_nodeid < pmeRanks[i])
774             {
775                 pmenode = pmeRanks[i];
776             }
777         }
778     }
779
780     return pmenode;
781 }
782
783 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
784 {
785     if (dd != nullptr)
786     {
787         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
788     }
789     else
790     {
791         return { 1, 1 };
792     }
793 }
794
795 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
796 {
797     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
798     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
799     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
800                        "This function should only be called when PME-only ranks are in use");
801     std::vector<int> ddranks;
802     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
803
804     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
805     {
806         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
807         {
808             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
809             {
810                 if (cartSetup.bCartesianPP_PME)
811                 {
812                     ivec coord = { x, y, z };
813                     ivec coord_pme;
814                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
815                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
816                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
817                     {
818                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
819                     }
820                 }
821                 else
822                 {
823                     /* The slab corresponds to the nodeid in the PME group */
824                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
825                     {
826                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
827                     }
828                 }
829             }
830         }
831     }
832     return ddranks;
833 }
834
835 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
836 {
837     bool bReceive = true;
838
839     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
840     if (ddRankSetup.usePmeOnlyRanks)
841     {
842         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
843         if (cartSetup.bCartesianPP_PME)
844         {
845 #if GMX_MPI
846             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
847             ivec coords;
848             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
849             coords[cartSetup.cartpmedim]++;
850             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
851             {
852                 int rank = 0;
853                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
854                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
855                 {
856                     /* This is not the last PP node for pmenode */
857                     bReceive = false;
858                 }
859             }
860 #else
861             GMX_RELEASE_ASSERT(
862                     false,
863                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
864 #endif
865         }
866         else
867         {
868             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
869             if (cr->sim_nodeid + 1 < cr->nnodes
870                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
871             {
872                 /* This is not the last PP node for pmenode */
873                 bReceive = false;
874             }
875         }
876     }
877
878     return bReceive;
879 }
880
881 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
882 {
883     gmx_domdec_comm_t* comm = dd->comm;
884
885     snew(*dim_f, dd->numCells[dim] + 1);
886     (*dim_f)[0] = 0;
887     for (int i = 1; i < dd->numCells[dim]; i++)
888     {
889         if (comm->slb_frac[dim])
890         {
891             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
892         }
893         else
894         {
895             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
896         }
897     }
898     (*dim_f)[dd->numCells[dim]] = 1;
899 }
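/* Example: with 4 cells along dim and no static load-balancing fractions set
 * (slb_frac[dim] == nullptr), the stored boundary fractions are uniform:
 * {0, 0.25, 0.5, 0.75, 1}.
 */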
900
901 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
902 {
903     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
904
905     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
906     {
907         ddpme->dim = YY;
908     }
909     else
910     {
911         ddpme->dim = dimind;
912     }
913     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
914
915     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
916
917     if (ddpme->nslab <= 1)
918     {
919         return;
920     }
921
922     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
923     /* Determine for each PME slab the PP location range for dimension dim */
924     snew(ddpme->pp_min, ddpme->nslab);
925     snew(ddpme->pp_max, ddpme->nslab);
926     for (int slab = 0; slab < ddpme->nslab; slab++)
927     {
928         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
929         ddpme->pp_max[slab] = 0;
930     }
931     for (int i = 0; i < dd->nnodes; i++)
932     {
933         ivec xyz;
934         ddindex2xyz(dd->numCells, i, xyz);
935         /* For y only use our y/z slab.
936          * This assumes that the PME x grid size matches the DD grid size.
937          */
938         if (dimind == 0 || xyz[XX] == dd->ci[XX])
939         {
940             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
941             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
942             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
943             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
944         }
945     }
946
947     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
948 }
949
950 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
951 {
952     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
953
954     if (ddRankSetup.ddpme[0].dim == XX)
955     {
956         return ddRankSetup.ddpme[0].maxshift;
957     }
958     else
959     {
960         return 0;
961     }
962 }
963
964 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
965 {
966     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
967
968     if (ddRankSetup.ddpme[0].dim == YY)
969     {
970         return ddRankSetup.ddpme[0].maxshift;
971     }
972     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
973     {
974         return ddRankSetup.ddpme[1].maxshift;
975     }
976     else
977     {
978         return 0;
979     }
980 }
981
982 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
983 {
984     return dd.comm->systemInfo.useUpdateGroups;
985 }
986
987 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
988 {
989     /* Note that the cycles value can be incorrect, either 0 or some
990      * extremely large value, when our thread migrated to another core
991      * with an unsynchronized cycle counter. If this happens less often
992      * that once per nstlist steps, this will not cause issues, since
993      * than once per nstlist steps, this will not cause issues, since
994      * A zero count will slightly lower the total, but that's a small effect.
995      * Note that the main purpose of the subtraction of the maximum value
996      * is to avoid throwing off the load balancing when stalls occur due to
997      * e.g. system activity or network congestion.
998      */
999     dd->comm->cycl[ddCycl] += cycles;
1000     dd->comm->cycl_n[ddCycl]++;
1001     if (cycles > dd->comm->cycl_max[ddCycl])
1002     {
1003         dd->comm->cycl_max[ddCycl] = cycles;
1004     }
1005 }
1006
1007 #if GMX_MPI
1008 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1009 {
1010     MPI_Comm c_row = MPI_COMM_NULL;
1011     ivec     loc_c;
1012     bool     bPartOfGroup = false;
1013
1014     const int dim = dd->dim[dim_ind];
1015     copy_ivec(loc, loc_c);
1016     for (int i = 0; i < dd->numCells[dim]; i++)
1017     {
1018         loc_c[dim]     = i;
1019         const int rank = dd_index(dd->numCells, loc_c);
1020         if (rank == dd->rank)
1021         {
1022             /* This process is part of the group */
1023             bPartOfGroup = true;
1024         }
1025     }
1026     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1027     if (bPartOfGroup)
1028     {
1029         dd->comm->mpi_comm_load[dim_ind] = c_row;
1030         if (!isDlbDisabled(dd->comm))
1031         {
1032             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1033
1034             if (dd->ci[dim] == dd->master_ci[dim])
1035             {
1036                 /* This is the root process of this row */
1037                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1038
1039                 RowMaster& rowMaster = *cellsizes.rowMaster;
1040                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1041                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1042                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1043                 if (dim_ind > 0)
1044                 {
1045                     rowMaster.bounds.resize(dd->numCells[dim]);
1046                 }
1047                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1048             }
1049             else
1050             {
1051                 /* This is not a root process, we only need to receive cell_f */
1052                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1053             }
1054         }
1055         if (dd->ci[dim] == dd->master_ci[dim])
1056         {
1057             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1058         }
1059     }
1060 }
1061 #endif
1062
1063 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1064 {
1065 #if GMX_MPI
1066     MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
1067
1068     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1069     {
1070         /* Only ranks with short-ranged tasks (currently) use GPUs.
1071          * If we don't have GPUs assigned, there are no resources to share.
1072          */
1073         return;
1074     }
1075
1076     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1077
1078     gmx_domdec_t* dd = cr->dd;
1079
1080     if (debug)
1081     {
1082         fprintf(debug, "dd_setup_dlb_resource_sharing:\n");
1083         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1084     }
1085     /* Split the PP communicator over the physical nodes */
1086     /* TODO: See if we should store this (before), as it's also used
1087      * for the nodecomm summation.
1088      */
1089     // TODO PhysicalNodeCommunicator could be extended/used to handle
1090     // the need for per-node per-group communicators.
1091     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1092     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1093     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1094     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1095
1096     if (debug)
1097     {
1098         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1099     }
1100
1101     /* Note that some ranks could share a GPU, while others don't */
1102
1103     if (dd->comm->nrank_gpu_shared == 1)
1104     {
1105         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1106     }
1107 #else
1108     GMX_UNUSED_VALUE(cr);
1109     GMX_UNUSED_VALUE(gpu_id);
1110 #endif
1111 }
1112
1113 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1114 {
1115 #if GMX_MPI
1116     ivec loc;
1117
1118     if (debug)
1119     {
1120         fprintf(debug, "Making load communicators\n");
1121     }
1122
1123     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1124     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1125
1126     if (dd->ndim == 0)
1127     {
1128         return;
1129     }
1130
1131     clear_ivec(loc);
1132     make_load_communicator(dd, 0, loc);
1133     if (dd->ndim > 1)
1134     {
1135         const int dim0 = dd->dim[0];
1136         for (int i = 0; i < dd->numCells[dim0]; i++)
1137         {
1138             loc[dim0] = i;
1139             make_load_communicator(dd, 1, loc);
1140         }
1141     }
1142     if (dd->ndim > 2)
1143     {
1144         const int dim0 = dd->dim[0];
1145         for (int i = 0; i < dd->numCells[dim0]; i++)
1146         {
1147             loc[dim0]      = i;
1148             const int dim1 = dd->dim[1];
1149             for (int j = 0; j < dd->numCells[dim1]; j++)
1150             {
1151                 loc[dim1] = j;
1152                 make_load_communicator(dd, 2, loc);
1153             }
1154         }
1155     }
1156
1157     if (debug)
1158     {
1159         fprintf(debug, "Finished making load communicators\n");
1160     }
1161 #endif
1162 }
1163
1164 /*! \brief Sets up the relation between neighboring domains and zones */
1165 static void setup_neighbor_relations(gmx_domdec_t* dd)
1166 {
1167     ivec tmp, s;
1168     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1169
1170     for (int d = 0; d < dd->ndim; d++)
1171     {
1172         const int dim = dd->dim[d];
1173         copy_ivec(dd->ci, tmp);
1174         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1175         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1176         copy_ivec(dd->ci, tmp);
1177         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1178         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1179         if (debug)
1180         {
1181             fprintf(debug,
1182                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1183                     dd->rank,
1184                     dim,
1185                     dd->neighbor[d][0],
1186                     dd->neighbor[d][1]);
1187         }
1188     }
1189
1190     int nzone  = (1 << dd->ndim);
1191     int nizone = (1 << std::max(dd->ndim - 1, 0));
1192     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1193
1194     gmx_domdec_zones_t* zones = &dd->comm->zones;
1195
1196     for (int i = 0; i < nzone; i++)
1197     {
1198         int m = 0;
1199         clear_ivec(zones->shift[i]);
1200         for (int d = 0; d < dd->ndim; d++)
1201         {
1202             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1203         }
1204     }
1205
1206     zones->n = nzone;
1207     for (int i = 0; i < nzone; i++)
1208     {
1209         for (int d = 0; d < DIM; d++)
1210         {
1211             s[d] = dd->ci[d] - zones->shift[i][d];
1212             if (s[d] < 0)
1213             {
1214                 s[d] += dd->numCells[d];
1215             }
1216             else if (s[d] >= dd->numCells[d])
1217             {
1218                 s[d] -= dd->numCells[d];
1219             }
1220         }
1221     }
1222     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1223     {
1224         GMX_RELEASE_ASSERT(
1225                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1226                 "The first element for each ddNonbondedZonePairRanges should match its index");
1227
1228         DDPairInteractionRanges iZone;
1229         iZone.iZoneIndex = iZoneIndex;
1230         /* ddNonbondedZonePairRanges is for 3D decomposition, for fewer dimensions use only
1231          * j-zones up to nzone.
1232          */
1233         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1234                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1235         for (int dim = 0; dim < DIM; dim++)
1236         {
1237             if (dd->numCells[dim] == 1)
1238             {
1239                 /* All shifts should be allowed */
1240                 iZone.shift0[dim] = -1;
1241                 iZone.shift1[dim] = 1;
1242             }
1243             else
1244             {
1245                 /* Determine the min/max j-zone shift wrt the i-zone */
1246                 iZone.shift0[dim] = 1;
1247                 iZone.shift1[dim] = -1;
1248                 for (int jZone : iZone.jZoneRange)
1249                 {
1250                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1251                     if (shift_diff < iZone.shift0[dim])
1252                     {
1253                         iZone.shift0[dim] = shift_diff;
1254                     }
1255                     if (shift_diff > iZone.shift1[dim])
1256                     {
1257                         iZone.shift1[dim] = shift_diff;
1258                     }
1259                 }
1260             }
1261         }
1262
1263         zones->iZones.push_back(iZone);
1264     }
1265
1266     if (!isDlbDisabled(dd->comm))
1267     {
1268         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1269     }
1270
1271     if (dd->comm->ddSettings.recordLoad)
1272     {
1273         make_load_communicators(dd);
1274     }
1275 }
1276
1277 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1278                                  gmx_domdec_t*        dd,
1279                                  t_commrec gmx_unused* cr,
1280                                  bool gmx_unused       reorder)
1281 {
1282 #if GMX_MPI
1283     gmx_domdec_comm_t*  comm      = dd->comm;
1284     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1285
1286     if (cartSetup.bCartesianPP)
1287     {
1288         /* Set up cartesian communication for the particle-particle part */
1289         GMX_LOG(mdlog.info)
1290                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1291                                      dd->numCells[XX],
1292                                      dd->numCells[YY],
1293                                      dd->numCells[ZZ]);
1294
1295         ivec periods;
1296         for (int i = 0; i < DIM; i++)
1297         {
1298             periods[i] = TRUE;
1299         }
1300         MPI_Comm comm_cart = MPI_COMM_NULL;
1301         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1302         /* We overwrite the old communicator with the new cartesian one */
1303         cr->mpi_comm_mygroup = comm_cart;
1304     }
1305
1306     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1307     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1308
1309     if (cartSetup.bCartesianPP_PME)
1310     {
1311         /* Since we want to use the original cartesian setup for sim,
1312          * and not the one after split, we need to make an index.
1313          */
1314         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1315         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1316         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1317         /* Get the rank of the DD master;
1318          * above we made sure that the master node is a PP node.
1319          */
1320         int rank = MASTER(cr) ? dd->rank : 0;
1321         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1322     }
1323     else if (cartSetup.bCartesianPP)
1324     {
1325         if (!comm->ddRankSetup.usePmeOnlyRanks)
1326         {
1327             /* The PP communicator is also
1328              * the communicator for this simulation
1329              */
1330             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1331         }
1332         cr->nodeid = dd->rank;
1333
1334         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1335
1336         /* We need to make an index to go from the coordinates
1337          * to the nodeid of this simulation.
1338          */
1339         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1340         std::vector<int> buf(dd->nnodes);
1341         if (thisRankHasDuty(cr, DUTY_PP))
1342         {
1343             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1344         }
1345         /* Communicate the ddindex to simulation nodeid index */
1346         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1347
1348         /* Determine the master coordinates and rank.
1349          * The DD master should be the same node as the master of this sim.
1350          */
1351         for (int i = 0; i < dd->nnodes; i++)
1352         {
1353             if (cartSetup.ddindex2simnodeid[i] == 0)
1354             {
1355                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1356                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1357             }
1358         }
1359         if (debug)
1360         {
1361             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1362         }
1363     }
1364     else
1365     {
1366         /* No Cartesian communicators */
1367         /* We use the rank in dd->comm->all as DD index */
1368         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1369         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1370         dd->masterrank = 0;
1371         clear_ivec(dd->master_ci);
1372     }
1373 #endif
1374
1375     GMX_LOG(mdlog.info)
1376             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1377                                  dd->rank,
1378                                  dd->ci[XX],
1379                                  dd->ci[YY],
1380                                  dd->ci[ZZ]);
1381     if (debug)
1382     {
1383         fprintf(debug,
1384                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1385                 dd->rank,
1386                 dd->ci[XX],
1387                 dd->ci[YY],
1388                 dd->ci[ZZ]);
1389     }
1390 }
1391
1392 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1393 {
1394 #if GMX_MPI
1395     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1396
1397     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1398     {
1399         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1400         std::vector<int> buf(dd->nnodes);
1401         if (thisRankHasDuty(cr, DUTY_PP))
1402         {
1403             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1404         }
1405         /* Communicate the ddindex to simulation nodeid index */
1406         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1407     }
1408 #else
1409     GMX_UNUSED_VALUE(dd);
1410     GMX_UNUSED_VALUE(cr);
1411 #endif
1412 }
1413
1414 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1415                                              t_commrec*           cr,
1416                                              const DdRankOrder    ddRankOrder,
1417                                              bool gmx_unused      reorder,
1418                                              const DDRankSetup&   ddRankSetup,
1419                                              ivec                 ddCellIndex,
1420                                              std::vector<int>*    pmeRanks)
1421 {
1422     CartesianRankSetup cartSetup;
1423
1424     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1425     cartSetup.bCartesianPP_PME = false;
1426
1427     const ivec& numDDCells = ddRankSetup.numPPCells;
1428     /* Initially we set ntot to the number of PP cells */
1429     copy_ivec(numDDCells, cartSetup.ntot);
1430
1431     if (cartSetup.bCartesianPP)
1432     {
1433         const int numDDCellsTot = ddRankSetup.numPPRanks;
1434         bool      bDiv[DIM];
1435         for (int i = 1; i < DIM; i++)
1436         {
1437             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1438         }
1439         if (bDiv[YY] || bDiv[ZZ])
1440         {
1441             cartSetup.bCartesianPP_PME = true;
1442             /* If we have 2D PME decomposition, which is always in x+y,
1443              * we stack the PME only nodes in z.
1444              * Otherwise we choose the direction that provides the thinnest slab
1445              * of PME only nodes as this will have the least effect
1446              * on the PP communication.
1447              * But for the PME communication the opposite might be better.
1448              */
1449             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1450             {
1451                 cartSetup.cartpmedim = ZZ;
1452             }
1453             else
1454             {
1455                 cartSetup.cartpmedim = YY;
1456             }
1457             cartSetup.ntot[cartSetup.cartpmedim] +=
1458                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1459         }
1460         else
1461         {
1462             GMX_LOG(mdlog.info)
1463                     .appendTextFormatted(
1464                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1465                             "nx*nz (%d*%d)",
1466                             ddRankSetup.numRanksDoingPme,
1467                             numDDCells[XX],
1468                             numDDCells[YY],
1469                             numDDCells[XX],
1470                             numDDCells[ZZ]);
1471             GMX_LOG(mdlog.info)
1472                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1473         }
1474     }
1475
1476     if (cartSetup.bCartesianPP_PME)
1477     {
1478 #if GMX_MPI
1479         int  rank = 0;
1480         ivec periods;
1481
1482         GMX_LOG(mdlog.info)
1483                 .appendTextFormatted(
1484                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1485                         cartSetup.ntot[XX],
1486                         cartSetup.ntot[YY],
1487                         cartSetup.ntot[ZZ]);
1488
1489         for (int i = 0; i < DIM; i++)
1490         {
1491             periods[i] = TRUE;
1492         }
1493         MPI_Comm comm_cart = MPI_COMM_NULL;
1494         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1495         MPI_Comm_rank(comm_cart, &rank);
1496         if (MASTER(cr) && rank != 0)
1497         {
1498             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1499         }
1500
1501         /* With this assignment we lose the link to the original communicator,
1502          * which will usually be MPI_COMM_WORLD, unless we have multisim.
1503          */
1504         cr->mpi_comm_mysim = comm_cart;
1505         cr->sim_nodeid     = rank;
1506
1507         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1508
1509         GMX_LOG(mdlog.info)
1510                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1511                                      cr->sim_nodeid,
1512                                      ddCellIndex[XX],
1513                                      ddCellIndex[YY],
1514                                      ddCellIndex[ZZ]);
1515
1516         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1517         {
1518             cr->duty = DUTY_PP;
1519         }
1520         if (!ddRankSetup.usePmeOnlyRanks
1521             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1522         {
1523             cr->duty = DUTY_PME;
1524         }
1525
1526         /* Split the sim communicator into PP and PME only nodes */
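        /* The rank duty acts as the color and the DD index as the key,
         * so ranks keep their DD-index order within each duty group.
         */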
1527         MPI_Comm_split(cr->mpi_comm_mysim,
1528                        getThisRankDuties(cr),
1529                        dd_index(cartSetup.ntot, ddCellIndex),
1530                        &cr->mpi_comm_mygroup);
1531 #else
1532         GMX_UNUSED_VALUE(ddCellIndex);
1533 #endif
1534     }
1535     else
1536     {
1537         switch (ddRankOrder)
1538         {
1539             case DdRankOrder::pp_pme:
1540                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1541                 break;
1542             case DdRankOrder::interleave:
1543                 /* Interleave the PP-only and PME-only ranks */
1544                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1545                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1546                 break;
1547             case DdRankOrder::cartesian: break;
1548             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1549         }
1550
1551         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1552         {
1553             cr->duty = DUTY_PME;
1554         }
1555         else
1556         {
1557             cr->duty = DUTY_PP;
1558         }
1559 #if GMX_MPI
1560         /* Split the sim communicator into PP and PME only nodes */
1561         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1562         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1563 #endif
1564     }
1565
1566     GMX_LOG(mdlog.info)
1567             .appendTextFormatted("This rank does only %s work.\n",
1568                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1569
1570     return cartSetup;
1571 }
1572
1573 /*! \brief Makes the PP communicator and the PME communicator, when needed
1574  *
1575  * Returns the Cartesian rank setup.
1576  * Sets \p cr->mpi_comm_mygroup.
1577  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1578  * With separate PME ranks in interleaved order, sets the PME ranks in \p pmeRanks.
1579  */
1580 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1581                                                  const DDSettings&    ddSettings,
1582                                                  const DdRankOrder    ddRankOrder,
1583                                                  const DDRankSetup&   ddRankSetup,
1584                                                  t_commrec*           cr,
1585                                                  ivec                 ddCellIndex,
1586                                                  std::vector<int>*    pmeRanks)
1587 {
1588     CartesianRankSetup cartSetup;
1589
1590     // As a default, both group and sim communicators are equal to the default communicator
1591     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1592     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1593     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1594     cr->nodeid           = cr->rankInDefaultCommunicator;
1595     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1596
1597     if (ddRankSetup.usePmeOnlyRanks)
1598     {
1599         /* Split the communicator into a PP and PME part */
1600         cartSetup = split_communicator(
1601                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1602     }
1603     else
1604     {
1605         /* All nodes do PP and PME */
1606         /* We do not require separate communicators */
1607         cartSetup.bCartesianPP     = false;
1608         cartSetup.bCartesianPP_PME = false;
1609     }
1610
1611     return cartSetup;
1612 }
1613
1614 /*! \brief For PP ranks, copies or makes a new PP communicator
1615  *
1616  * For PME-only ranks, receives the DD index to simulation rank mapping.
1617  * For PP ranks without PME duty, sets the PME-only rank to communicate with.
1618  */
1619 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1620                                     const DDSettings&        ddSettings,
1621                                     gmx::ArrayRef<const int> pmeRanks,
1622                                     t_commrec*               cr,
1623                                     const int                numAtomsInSystem,
1624                                     gmx_domdec_t*            dd)
1625 {
1626     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1627     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1628
1629     if (thisRankHasDuty(cr, DUTY_PP))
1630     {
1631         /* Copy or make a new PP communicator */
1632
1633         /* We (possibly) reordered the nodes in split_communicator,
1634          * so it is no longer required in make_pp_communicator.
1635          */
1636         const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1637
1638         make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1639     }
1640     else
1641     {
1642         receive_ddindex2simnodeid(dd, cr);
1643     }
1644
1645     if (!thisRankHasDuty(cr, DUTY_PME))
1646     {
1647         /* Set up the communication to our PME node */
1648         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1649         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1650         if (debug)
1651         {
1652             fprintf(debug,
1653                     "My pme_nodeid %d receive ener %s\n",
1654                     dd->pme_nodeid,
1655                     gmx::boolToString(dd->pme_receive_vir_ener));
1656         }
1657     }
1658     else
1659     {
1660         dd->pme_nodeid = -1;
1661     }
1662
1663     /* We cannot use DDMASTER(dd), because dd->masterrank is set later */
1664     if (MASTER(cr))
1665     {
1666         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1667     }
1668 }
1669
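/*! \brief Parses a user-provided string of relative cell sizes for static load balancing
 *
 * Returns an array of \p nc normalized fractions, or nullptr when \p nc <= 1
 * or no sizes were provided.
 */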
1670 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1671 {
1672     real* slb_frac = nullptr;
1673     if (nc > 1 && size_string != nullptr)
1674     {
1675         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1676         snew(slb_frac, nc);
1677         real tot = 0;
1678         for (int i = 0; i < nc; i++)
1679         {
1680             double dbl = 0;
1681             int    n   = 0;
1682             sscanf(size_string, "%20lf%n", &dbl, &n);
1683             if (dbl == 0)
1684             {
1685                 gmx_fatal(FARGS,
1686                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1687                           dir,
1688                           size_string);
1689             }
1690             slb_frac[i] = dbl;
1691             size_string += n;
1692             tot += slb_frac[i];
1693         }
1694         /* Normalize */
1695         std::string relativeCellSizes = "Relative cell sizes:";
1696         for (int i = 0; i < nc; i++)
1697         {
1698             slb_frac[i] /= tot;
1699             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1700         }
1701         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1702     }
1703
1704     return slb_frac;
1705 }
1706
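/*! \brief Returns the number of bonded interactions in the system that involve more than two atoms */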
1707 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1708 {
1709     int n = 0;
1710     for (const auto ilists : IListRange(mtop))
1711     {
1712         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1713         {
1714             if (NRAL(ilist.functionType) > 2)
1715             {
1716                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1717             }
1718         }
1719     }
1720
1721     return n;
1722 }
1723
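/*! \brief Returns the integer value of environment variable \p env_var, or \p def when it is not set
 *
 * A value that cannot be parsed as an integer results in 1.
 * For example, with GMX_DD_NPULSE=2 in the environment,
 * dd_getenv(mdlog, "GMX_DD_NPULSE", 0) returns 2.
 */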
1724 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1725 {
1726     int   nst = def;
1727     char* val = getenv(env_var);
1728     if (val)
1729     {
1730         if (sscanf(val, "%20d", &nst) <= 0)
1731         {
1732             nst = 1;
1733         }
1734         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1735     }
1736
1737     return nst;
1738 }
1739
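/*! \brief Checks mdrun input restrictions for domain decomposition; issues a fatal error or a warning when they are violated */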
1740 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1741 {
1742     if (inputrec.pbcType == PbcType::Screw
1743         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1744     {
1745         gmx_fatal(FARGS,
1746                   "With pbc=%s can only do domain decomposition in the x-direction",
1747                   c_pbcTypeNames[inputrec.pbcType].c_str());
1748     }
1749
1750     if (inputrec.nstlist == 0)
1751     {
1752         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1753     }
1754
1755     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1756     {
1757         GMX_LOG(mdlog.warning)
1758                 .appendText(
1759                         "comm-mode angular will give incorrect results when the comm group "
1760                         "partially crosses a periodic boundary");
1761     }
1762 }
1763
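/*! \brief Returns the smallest initial average cell size over the dimensions with more than one domain
 * (or the x box size when there is no decomposition)
 */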
1764 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1765 {
1766     real r = ddbox.box_size[XX];
1767     for (int d = 0; d < DIM; d++)
1768     {
1769         if (numDomains[d] > 1)
1770         {
1771             /* Check using the initial average cell size */
1772             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1773         }
1774     }
1775
1776     return r;
1777 }
1778
1779 /*! \brief Depending on the initial DLB state, returns the DLB switched-off state or issues a fatal error.
1780  */
1781 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1782                                   const std::string&   reasonStr,
1783                                   const gmx::MDLogger& mdlog)
1784 {
1785     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1786     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1787
1788     if (cmdlineDlbState == DlbState::onUser)
1789     {
1790         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1791     }
1792     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1793     {
1794         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1795     }
1796     return DlbState::offForever;
1797 }
1798
1799 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1800  *
1801  * This function parses the value of the "-dlb" command line option and sets the
1802  * corresponding state value. Then it checks the consistency of the determined
1803  * state with other run parameters and settings. As a result, the initial state
1804  * may be altered, or an error may be thrown if incompatible options are detected.
1805  *
1806  * \param [in] mdlog                Logger.
1807  * \param [in] dlbOption            Enum value for the DLB option.
1808  * \param [in] bRecordLoad          True if the load balancer is recording load information.
1809  * \param [in] mdrunOptions         Options for mdrun.
1810  * \param [in] inputrec             The mdrun input parameters.
1811  * \param [in] directGpuCommUsedWithGpuUpdate     Whether direct GPU halo exchange and GPU update are enabled.
1812  * \returns                         DLB initial/startup state.
1813  */
1814 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1815                                          DlbOption                dlbOption,
1816                                          gmx_bool                 bRecordLoad,
1817                                          const gmx::MdrunOptions& mdrunOptions,
1818                                          const t_inputrec&        inputrec,
1819                                          const bool               directGpuCommUsedWithGpuUpdate)
1820 {
1821     DlbState dlbState = DlbState::offCanTurnOn;
1822
1823     switch (dlbOption)
1824     {
1825         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1826         case DlbOption::no: dlbState = DlbState::offUser; break;
1827         case DlbOption::yes: dlbState = DlbState::onUser; break;
1828         default: gmx_incons("Invalid dlbOption enum value");
1829     }
1830
1831     // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple timesteps
1832     // DLB needs to be disabled in that case
1833     if (directGpuCommUsedWithGpuUpdate)
1834     {
1835         std::string reasonStr =
1836                 "it is not supported with GPU direct communication + GPU update enabled.";
1837         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1838     }
1839
1840     /* Reruns don't support DLB: bail or override auto mode */
1841     if (mdrunOptions.rerun)
1842     {
1843         std::string reasonStr = "it is not supported in reruns.";
1844         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1845     }
1846
1847     /* Unsupported integrators */
1848     if (!EI_DYNAMICS(inputrec.eI))
1849     {
1850         auto reasonStr =
1851                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1852                                   enumValueToString(inputrec.eI));
1853         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1854     }
1855
1856     /* Without cycle counters we can't time work to balance on */
1857     if (!bRecordLoad)
1858     {
1859         std::string reasonStr =
1860                 "cycle counters unsupported or not enabled in the operating system kernel.";
1861         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1862     }
1863
1864     if (mdrunOptions.reproducible)
1865     {
1866         std::string reasonStr = "you started a reproducible run.";
1867         switch (dlbState)
1868         {
1869             case DlbState::offUser: break;
1870             case DlbState::offForever:
1871                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1872                 break;
1873             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1874             case DlbState::onCanTurnOff:
1875                 GMX_RELEASE_ASSERT(false, "DlbState::onCanTurnOff is not a valid initial state");
1876                 break;
1877             case DlbState::onUser:
1878                 return forceDlbOffOrBail(
1879                         dlbState,
1880                         reasonStr
1881                                 + " In load balanced runs binary reproducibility cannot be "
1882                                   "ensured.",
1883                         mdlog);
1884             default:
1885                 gmx_fatal(FARGS,
1886                           "Death horror: undefined case (%d) for load balancing choice",
1887                           static_cast<int>(dlbState));
1888         }
1889     }
1890
1891     return dlbState;
1892 }
1893
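/*! \brief Allocates and zero-initializes the domain decomposition communication struct */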
1894 static gmx_domdec_comm_t* init_dd_comm()
1895 {
1896     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1897
1898     comm->n_load_have    = 0;
1899     comm->n_load_collect = 0;
1900
1901     comm->haveTurnedOffDlb = false;
1902
1903     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1904     {
1905         comm->sum_nat[i] = 0;
1906     }
1907     comm->ndecomp   = 0;
1908     comm->nload     = 0;
1909     comm->load_step = 0;
1910     comm->load_sum  = 0;
1911     comm->load_max  = 0;
1912     clear_ivec(comm->load_lim);
1913     comm->load_mdf = 0;
1914     comm->load_pme = 0;
1915
1916     /* This should be replaced by a unique pointer */
1917     comm->balanceRegion = ddBalanceRegionAllocate();
1918
1919     return comm;
1920 }
1921
1922 static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
1923                               const gmx_mtop_t&                 mtop,
1924                               ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1925                               const bool                        useUpdateGroups,
1926                               const real                        maxUpdateGroupRadius,
1927                               DDSystemInfo*                     systemInfo)
1928 {
1929     systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1930     systemInfo->useUpdateGroups                = useUpdateGroups;
1931     systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
1932
1933     if (systemInfo->useUpdateGroups)
1934     {
1935         int numUpdateGroups = 0;
1936         for (const auto& molblock : mtop.molblock)
1937         {
1938             numUpdateGroups += molblock.nmol
1939                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1940         }
1941
1942         GMX_LOG(mdlog.info)
1943                 .appendTextFormatted(
1944                         "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1945                         "nm\n",
1946                         numUpdateGroups,
1947                         mtop.natoms / static_cast<double>(numUpdateGroups),
1948                         systemInfo->maxUpdateGroupRadius);
1949     }
1950 }
1951
1952 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1953     npbcdim(numPbcDimensions(ir.pbcType)),
1954     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1955     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1956     haveScrewPBC(ir.pbcType == PbcType::Screw)
1957 {
1958 }
1959
1960 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1961 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1962                                     const bool                                  useUpdateGroups,
1963                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1964 {
1965     if (useUpdateGroups)
1966     {
1967         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1968                            "Need one grouping per moltype");
1969         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
1970         {
1971             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1972             {
1973                 return false;
1974             }
1975         }
1976     }
1977     else
1978     {
1979         for (const auto& moltype : mtop.moltype)
1980         {
1981             if (moltype.atoms.nr > 1)
1982             {
1983                 return false;
1984             }
1985         }
1986     }
1987
1988     return true;
1989 }
1990
1991 /*! \brief Generate the simulation system information */
1992 static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
1993                                   DDRole                            ddRole,
1994                                   MPI_Comm                          communicator,
1995                                   const DomdecOptions&              options,
1996                                   const gmx_mtop_t&                 mtop,
1997                                   const t_inputrec&                 ir,
1998                                   const matrix                      box,
1999                                   ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2000                                   const bool                        useUpdateGroups,
2001                                   const real                        maxUpdateGroupRadius,
2002                                   gmx::ArrayRef<const gmx::RVec>    xGlobal)
2003 {
2004     const real tenPercentMargin = 1.1;
2005
2006     DDSystemInfo systemInfo;
2007
2008     setupUpdateGroups(
2009             mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
2010
2011     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2012             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2013     systemInfo.haveInterDomainBondeds =
2014             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2015     systemInfo.haveInterDomainMultiBodyBondeds =
2016             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2017
2018     if (systemInfo.useUpdateGroups)
2019     {
2020         systemInfo.mayHaveSplitConstraints = false;
2021         systemInfo.mayHaveSplitSettles     = false;
2022     }
2023     else
2024     {
2025         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2026                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2027         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2028     }
2029
2030     if (ir.rlist == 0)
2031     {
2032         /* Set the cut-off to some very large value,
2033          * so we don't need if statements everywhere in the code.
2034          * We use sqrt, since the cut-off is squared in some places.
2035          */
2036         systemInfo.cutoff = GMX_CUTOFF_INF;
2037     }
2038     else
2039     {
2040         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2041     }
2042     systemInfo.minCutoffForMultiBody = 0;
2043
2044     /* Determine the minimum cell size limit, affected by many factors */
2045     systemInfo.cellsizeLimit             = 0;
2046     systemInfo.filterBondedCommunication = false;
2047
2048     /* We do not allow home atoms to move beyond the neighboring domain
2049      * between domain decomposition steps, which limits the cell size.
2050      * Get an estimate of cell size limit due to atom displacement.
2051      * In most cases this is a large overestimate, because it assumes
2052      * non-interacting atoms.
2053      * We set the chance to 1 in a trillion steps.
2054      */
2055     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2056     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2057             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2058     GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2059
2060     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2061
2062     /* TODO: PME decomposition currently requires atoms not to be more than
2063      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2064      *       In nearly all cases, limitForAtomDisplacement will be smaller
2065      *       than 2/3*rlist, so the PME requirement is satisfied.
2066      *       But it would be better for both correctness and performance
2067      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2068      *       Note that we would need to improve the pairlist buffer case.
2069      */
2070
2071     if (systemInfo.haveInterDomainBondeds)
2072     {
2073         if (options.minimumCommunicationRange > 0)
2074         {
2075             systemInfo.minCutoffForMultiBody =
2076                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2077             if (options.useBondedCommunication)
2078             {
2079                 systemInfo.filterBondedCommunication =
2080                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2081             }
2082             else
2083             {
2084                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2085             }
2086         }
2087         else if (ir.bPeriodicMols)
2088         {
2089             /* Cannot easily determine the required cut-off */
2090             GMX_LOG(mdlog.warning)
2091                     .appendText(
2092                             "NOTE: Periodic molecules are present in this system. Because of this, "
2093                             "the domain decomposition algorithm cannot easily determine the "
2094                             "minimum cell size that it requires for treating bonded interactions. "
2095                             "Instead, domain decomposition will assume that half the non-bonded "
2096                             "cut-off will be a suitable lower bound.");
2097             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2098         }
2099         else
2100         {
2101             real r_2b = 0;
2102             real r_mb = 0;
2103
2104             if (ddRole == DDRole::Master)
2105             {
2106                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2107             }
2108             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2109             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2110
2111             /* We use an initial margin of 10% for the minimum cell size,
2112              * except when we are just below the non-bonded cut-off.
2113              */
2114             if (options.useBondedCommunication)
2115             {
2116                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2117                 {
2118                     const real r_bonded              = std::max(r_2b, r_mb);
2119                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2120                     /* This is the (only) place where we turn on the filtering */
2121                     systemInfo.filterBondedCommunication = true;
2122                 }
2123                 else
2124                 {
2125                     const real r_bonded = r_mb;
2126                     systemInfo.minCutoffForMultiBody =
2127                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2128                 }
2129                 /* We determine cutoff_mbody later */
2130                 systemInfo.increaseMultiBodyCutoff = true;
2131             }
2132             else
2133             {
2134                 /* No special bonded communication,
2135                  * simply increase the DD cut-off.
2136                  */
2137                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2138                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2139             }
2140         }
2141         GMX_LOG(mdlog.info)
2142                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2143                                      systemInfo.minCutoffForMultiBody);
2144
2145         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2146     }
2147
2148     systemInfo.constraintCommunicationRange = 0;
2149     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2150     {
2151         /* There is a cell size limit due to the constraints (P-LINCS) */
2152         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2153         GMX_LOG(mdlog.info)
2154                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2155                                      systemInfo.constraintCommunicationRange);
2156         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2157         {
2158             GMX_LOG(mdlog.info)
2159                     .appendText(
2160                             "This distance will limit the DD cell size, you can override this with "
2161                             "-rcon");
2162         }
2163     }
2164     else if (options.constraintCommunicationRange > 0)
2165     {
2166         /* Here we do not check for dd->splitConstraints,
2167          * because one can also set a cell size limit for virtual sites only,
2168          * and at this point we don't yet know whether there are inter-cg v-sites.
2169          */
2170         GMX_LOG(mdlog.info)
2171                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2172                                      options.constraintCommunicationRange);
2173         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2174     }
2175     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2176
2177     return systemInfo;
2178 }
2179
2180 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2181  * implemented. */
2182 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2183                              DDRole               ddRole,
2184                              MPI_Comm             communicator,
2185                              int                  numNodes,
2186                              const DomdecOptions& options,
2187                              const DDSettings&    ddSettings,
2188                              const DDSystemInfo&  systemInfo,
2189                              const real           cellsizeLimit,
2190                              const gmx_ddbox_t&   ddbox)
2191 {
2192     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2193     {
2194         const bool  bC = (systemInfo.mayHaveSplitConstraints
2195                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2196         std::string message =
2197                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2198                                   !bC ? "-rdd" : "-rcon",
2199                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2200                                   bC ? " or your LINCS settings" : "");
2201
2202         gmx_fatal_collective(FARGS,
2203                              communicator,
2204                              ddRole == DDRole::Master,
2205                              "There is no domain decomposition for %d ranks that is compatible "
2206                              "with the given box and a minimum cell size of %g nm\n"
2207                              "%s\n"
2208                              "Look in the log file for details on the domain decomposition",
2209                              numNodes - ddGridSetup.numPmeOnlyRanks,
2210                              cellsizeLimit,
2211                              message.c_str());
2212     }
2213
2214     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2215     if (acs < cellsizeLimit)
2216     {
2217         if (options.numCells[XX] <= 0)
2218         {
2219             GMX_RELEASE_ASSERT(
2220                     false,
2221                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2222         }
2223         else
2224         {
2225             gmx_fatal_collective(
2226                     FARGS,
2227                     communicator,
2228                     ddRole == DDRole::Master,
2229                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2230                     "options -dd, -rdd or -rcon, see the log file for details",
2231                     acs,
2232                     cellsizeLimit);
2233         }
2234     }
2235
2236     const int numPPRanks =
2237             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2238     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2239     {
2240         gmx_fatal_collective(FARGS,
2241                              communicator,
2242                              ddRole == DDRole::Master,
2243                              "The size of the domain decomposition grid (%d) does not match the "
2244                              "number of PP ranks (%d). The total number of ranks is %d",
2245                              numPPRanks,
2246                              numNodes - ddGridSetup.numPmeOnlyRanks,
2247                              numNodes);
2248     }
2249     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2250     {
2251         gmx_fatal_collective(FARGS,
2252                              communicator,
2253                              ddRole == DDRole::Master,
2254                              "The number of separate PME ranks (%d) is larger than the number of "
2255                              "PP ranks (%d), this is not supported.",
2256                              ddGridSetup.numPmeOnlyRanks,
2257                              numPPRanks);
2258     }
2259 }
2260
2261 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2262 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2263                                   int                  numNodes,
2264                                   const DdRankOrder    rankOrder,
2265                                   const DDGridSetup&   ddGridSetup,
2266                                   const t_inputrec&    ir)
2267 {
2268     GMX_LOG(mdlog.info)
2269             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2270                                  ddGridSetup.numDomains[XX],
2271                                  ddGridSetup.numDomains[YY],
2272                                  ddGridSetup.numDomains[ZZ],
2273                                  ddGridSetup.numPmeOnlyRanks);
2274
2275     DDRankSetup ddRankSetup;
2276
2277     ddRankSetup.rankOrder = rankOrder;
2278
2279     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2280     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2281
2282     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2283     if (ddRankSetup.usePmeOnlyRanks)
2284     {
2285         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2286     }
2287     else
2288     {
2289         ddRankSetup.numRanksDoingPme =
2290                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2291     }
2292
2293     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2294     {
2295         /* The following choices should match those
2296          * in comm_cost_est in domdec_setup.cpp.
2297          * Note that here the checks have to take into account
2298          * that the decomposition might occur in a different order than xyz
2299          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2300          * in which case they will not match those in comm_cost_est,
2301          * but since that is mainly for testing purposes that's fine.
2302          */
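        /* Use a 2D PME decomposition (in x and y) when DD decomposes in x and y
         * first, the number of PME ranks is larger than and divisible by the
         * DD x-count, and GMX_PMEONEDD is not set; otherwise fall back to 1D.
         */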
2303         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2304             && ddGridSetup.ddDimensions[1] == YY
2305             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2306             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2307             && getenv("GMX_PMEONEDD") == nullptr)
2308         {
2309             ddRankSetup.npmedecompdim = 2;
2310             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2311             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2312         }
2313         else
2314         {
2315             /* In case nc is 1 in both x and y we could still choose to
2316              * decompose pme in y instead of x, but we use x for simplicity.
2317              */
2318             ddRankSetup.npmedecompdim = 1;
2319             if (ddGridSetup.ddDimensions[0] == YY)
2320             {
2321                 ddRankSetup.npmenodes_x = 1;
2322                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2323             }
2324             else
2325             {
2326                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2327                 ddRankSetup.npmenodes_y = 1;
2328             }
2329         }
2330         GMX_LOG(mdlog.info)
2331                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2332                                      ddRankSetup.npmenodes_x,
2333                                      ddRankSetup.npmenodes_y,
2334                                      1);
2335     }
2336     else
2337     {
2338         ddRankSetup.npmedecompdim = 0;
2339         ddRankSetup.npmenodes_x   = 0;
2340         ddRankSetup.npmenodes_y   = 0;
2341     }
2342
2343     return ddRankSetup;
2344 }
2345
2346 /*! \brief Set the cell size and interaction limits */
2347 static void set_dd_limits(const gmx::MDLogger& mdlog,
2348                           DDRole               ddRole,
2349                           gmx_domdec_t*        dd,
2350                           const DomdecOptions& options,
2351                           const DDSettings&    ddSettings,
2352                           const DDSystemInfo&  systemInfo,
2353                           const DDGridSetup&   ddGridSetup,
2354                           const int            numPPRanks,
2355                           const gmx_mtop_t&    mtop,
2356                           const t_inputrec&    ir,
2357                           const gmx_ddbox_t&   ddbox)
2358 {
2359     gmx_domdec_comm_t* comm = dd->comm;
2360     comm->ddSettings        = ddSettings;
2361
2362     /* Initialize the GPU share count to 0; it might change later */
2363     comm->nrank_gpu_shared = 0;
2364
2365     comm->dlbState = comm->ddSettings.initialDlbState;
2366     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2367     /* To consider turning DLB on after 2*nstlist steps we need to check
2368      * at partitioning count 3. Thus we need to increase the first count by 2.
2369      */
2370     comm->ddPartioningCountFirstDlbOff += 2;
2371
2372     comm->bPMELoadBalDLBLimits = FALSE;
2373
2374     /* Allocate the charge group/atom sorting struct */
2375     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2376
2377     comm->systemInfo = systemInfo;
2378
2379     if (systemInfo.useUpdateGroups)
2380     {
2381         /* Note: We would like to use dd->nnodes for the atom count estimate,
2382          *       but that is not yet available here. This only affects performance
2383          *       up to the second dd_partition_system call anyway.
2384          */
2385         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2386         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2387                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2388     }
2389
2390     /* Set the DD setup given by ddGridSetup */
2391     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2392     dd->ndim = ddGridSetup.numDDDimensions;
2393     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2394
2395     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2396
2397     snew(comm->slb_frac, DIM);
2398     if (isDlbDisabled(comm))
2399     {
2400         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2401         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2402         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2403     }
2404
2405     /* Set the multi-body cut-off and cellsize limit for DLB */
2406     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2407     comm->cellsize_limit = systemInfo.cellsizeLimit;
2408     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2409     {
2410         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2411         {
2412             /* Set the bonded communication distance to halfway between
2413              * the minimum and the maximum,
2414              * since the extra communication cost is nearly zero.
2415              */
2416             real acs           = average_cellsize_min(ddbox, dd->numCells);
2417             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2418             if (!isDlbDisabled(comm))
2419             {
2420                 /* Check if this does not limit the scaling */
2421                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2422             }
2423             if (!systemInfo.filterBondedCommunication)
2424             {
2425                 /* Without bBondComm do not go beyond the n.b. cut-off */
2426                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2427                 if (comm->cellsize_limit >= systemInfo.cutoff)
2428                 {
2429                     /* We don't lose a lot of efficiency
2430                      * when increasing it to the n.b. cut-off.
2431                      * It can even be slightly faster, because we need
2432                      * fewer checks for the communication setup.
2433                      */
2434                     comm->cutoff_mbody = systemInfo.cutoff;
2435                 }
2436             }
2437             /* Check if we did not end up below our original limit */
2438             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2439
2440             if (comm->cutoff_mbody > comm->cellsize_limit)
2441             {
2442                 comm->cellsize_limit = comm->cutoff_mbody;
2443             }
2444         }
2445         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2446     }
2447
2448     if (debug)
2449     {
2450         fprintf(debug,
2451                 "Bonded atom communication beyond the cut-off: %s\n"
2452                 "cellsize limit %f\n",
2453                 gmx::boolToString(systemInfo.filterBondedCommunication),
2454                 comm->cellsize_limit);
2455     }
2456
2457     if (ddRole == DDRole::Master)
2458     {
2459         check_dd_restrictions(dd, ir, mdlog);
2460     }
2461 }
2462
2463 static void writeSettings(gmx::TextWriter*   log,
2464                           gmx_domdec_t*      dd,
2465                           const gmx_mtop_t&  mtop,
2466                           const t_inputrec&  ir,
2467                           gmx_bool           bDynLoadBal,
2468                           real               dlb_scale,
2469                           const gmx_ddbox_t* ddbox)
2470 {
2471     gmx_domdec_comm_t* comm = dd->comm;
2472
2473     if (bDynLoadBal)
2474     {
2475         log->writeString("The maximum number of communication pulses is:");
2476         for (int d = 0; d < dd->ndim; d++)
2477         {
2478             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2479         }
2480         log->ensureLineBreak();
2481         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2482                                 comm->cellsize_limit);
2483         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2484         log->writeString("The allowed shrink of domain decomposition cells is:");
2485         for (int d = 0; d < DIM; d++)
2486         {
2487             if (dd->numCells[d] > 1)
2488             {
2489                 const real shrink =
2490                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2491                                 ? 0
2492                                 : comm->cellsize_min_dlb[d]
2493                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2494                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2495             }
2496         }
2497         log->ensureLineBreak();
2498     }
2499     else
2500     {
2501         ivec np;
2502         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2503         log->writeString("The initial number of communication pulses is:");
2504         for (int d = 0; d < dd->ndim; d++)
2505         {
2506             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2507         }
2508         log->ensureLineBreak();
2509         log->writeString("The initial domain decomposition cell size is:");
2510         for (int d = 0; d < DIM; d++)
2511         {
2512             if (dd->numCells[d] > 1)
2513             {
2514                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2515             }
2516         }
2517         log->ensureLineBreak();
2518         log->writeLine();
2519     }
2520
2521     const bool haveInterDomainVsites =
2522             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2523
2524     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2525         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2526     {
2527         std::string decompUnits;
2528         if (comm->systemInfo.useUpdateGroups)
2529         {
2530             decompUnits = "atom groups";
2531         }
2532         else
2533         {
2534             decompUnits = "atoms";
2535         }
2536
2537         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2538                                 decompUnits.c_str());
2539         log->writeLineFormatted(
2540                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2541
2542         real limit = 0;
2543         if (bDynLoadBal)
2544         {
2545             limit = dd->comm->cellsize_limit;
2546         }
2547         else
2548         {
2549             if (dd->unitCellInfo.ddBoxIsDynamic)
2550             {
2551                 log->writeLine(
2552                         "(the following are initial values, they could change due to box "
2553                         "deformation)");
2554             }
2555             limit = dd->comm->cellsize_min[XX];
2556             for (int d = 1; d < DIM; d++)
2557             {
2558                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2559             }
2560         }
2561
2562         if (comm->systemInfo.haveInterDomainBondeds)
2563         {
2564             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2565                                     "two-body bonded interactions",
2566                                     "(-rdd)",
2567                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2568             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2569                                     "multi-body bonded interactions",
2570                                     "(-rdd)",
2571                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2572                                             ? comm->cutoff_mbody
2573                                             : std::min(comm->systemInfo.cutoff, limit));
2574         }
2575         if (haveInterDomainVsites)
2576         {
2577             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2578         }
2579         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2580         {
2581             std::string separation =
2582                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2583             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2584         }
2585         log->ensureLineBreak();
2586     }
2587 }
2588
2589 static void logSettings(const gmx::MDLogger& mdlog,
2590                         gmx_domdec_t*        dd,
2591                         const gmx_mtop_t&    mtop,
2592                         const t_inputrec&    ir,
2593                         real                 dlb_scale,
2594                         const gmx_ddbox_t*   ddbox)
2595 {
2596     gmx::StringOutputStream stream;
2597     gmx::TextWriter         log(&stream);
2598     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2599     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2600     {
2601         {
2602             log.ensureEmptyLine();
2603             log.writeLine(
2604                     "When dynamic load balancing gets turned on, these settings will change to:");
2605         }
2606         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2607     }
2608     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2609 }
2610
2611 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2612                                 gmx_domdec_t*        dd,
2613                                 real                 dlb_scale,
2614                                 const t_inputrec&    inputrec,
2615                                 const gmx_ddbox_t*   ddbox)
2616 {
2617     int npulse       = 0;
2618     int npulse_d_max = 0;
2619     int npulse_d     = 0;
2620
2621     gmx_domdec_comm_t* comm = dd->comm;
2622
2623     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2624
2625     /* Determine the maximum number of comm. pulses in one dimension */
2626
2627     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2628
2629     /* Determine the maximum required number of grid pulses */
2630     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2631     {
2632         /* Only a single pulse is required */
2633         npulse = 1;
2634     }
2635     else if (!bNoCutOff && comm->cellsize_limit > 0)
2636     {
2637         /* We round down slightly here to avoid overhead due to the latency
2638          * of extra communication calls when the cut-off
2639          * would be only slightly longer than the cell size.
2640          * Later cellsize_limit is redetermined,
2641          * so we can not miss interactions due to this rounding.
2642              * so we cannot miss interactions due to this rounding.
2643         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
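            /* For example, with cutoff/cellsize_limit = 2.03 this gives npulse = 2 rather than 3. */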
2644     }
2645     else
2646     {
2647         /* There is no cell size limit */
2648         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2649     }
2650
2651     if (!bNoCutOff && npulse > 1)
2652     {
2653         /* See if we can do with less pulses, based on dlb_scale */
2654         npulse_d_max = 0;
2655         for (int d = 0; d < dd->ndim; d++)
2656         {
2657             int dim  = dd->dim[d];
2658             npulse_d = static_cast<int>(
2659                     1
2660                     + dd->numCells[dim] * comm->systemInfo.cutoff
2661                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2662             npulse_d_max = std::max(npulse_d_max, npulse_d);
2663         }
2664         npulse = std::min(npulse, npulse_d_max);
2665     }
2666
2667     /* This env var can override npulse */
2668     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2669     if (ddPulseEnv > 0)
2670     {
2671         npulse = ddPulseEnv;
2672     }
2673
2674     comm->maxpulse       = 1;
2675     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2676     for (int d = 0; d < dd->ndim; d++)
2677     {
2678         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2679         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2680         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2681         {
2682             comm->bVacDLBNoLimit = FALSE;
2683         }
2684     }
2685
2686     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2687     if (!comm->bVacDLBNoLimit)
2688     {
2689         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2690     }
2691     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2692     /* Set the minimum cell size for each DD dimension */
2693     for (int d = 0; d < dd->ndim; d++)
2694     {
2695         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2696         {
2697             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2698         }
2699         else
2700         {
2701             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2702         }
2703     }
2704     if (comm->cutoff_mbody <= 0)
2705     {
2706         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2707     }
2708     if (isDlbOn(comm))
2709     {
2710         set_dlb_limits(dd);
2711     }
2712 }
2713
2714 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2715 {
2716     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2717 }
2718
2719 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2720 {
2721     /* If each molecule is a single charge group
2722      * or we use domain decomposition for each periodic dimension,
2723      * we do not need to take pbc into account for the bonded interactions.
2724      */
2725     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2726             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2727                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2728 }
2729
2730 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2731 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2732                                   gmx_domdec_t*        dd,
2733                                   real                 dlb_scale,
2734                                   const gmx_mtop_t&    mtop,
2735                                   const t_inputrec&    inputrec,
2736                                   const gmx_ddbox_t*   ddbox)
2737 {
2738     gmx_domdec_comm_t* comm        = dd->comm;
2739     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2740
2741     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2742     {
2743         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2744         if (ddRankSetup.npmedecompdim >= 2)
2745         {
2746             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2747         }
2748     }
2749     else
2750     {
2751         ddRankSetup.numRanksDoingPme = 0;
2752         if (dd->pme_nodeid >= 0)
2753         {
2754             gmx_fatal_collective(FARGS,
2755                                  dd->mpi_comm_all,
2756                                  DDMASTER(dd),
2757                                  "Can not have separate PME ranks without PME electrostatics");
2758         }
2759     }
2760
2761     if (debug)
2762     {
2763         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2764     }
2765     if (!isDlbDisabled(comm))
2766     {
2767         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2768     }
2769
2770     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2771
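    /* Estimate the fraction of the system's atoms that this rank will reference
     * (home atoms plus communicated zones); used below to size the
     * global-to-local atom lookup table.
     */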
2772     const real vol_frac = (inputrec.pbcType == PbcType::No)
2773                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2774                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2775                                      / static_cast<double>(dd->nnodes));
2776     if (debug)
2777     {
2778         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2779     }
2780     int natoms_tot = mtop.natoms;
2781
2782     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2783 }
2784
2785 /*! \brief Get some important DD parameters which can be modified by env.vars */
2786 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2787                                 const DomdecOptions&     options,
2788                                 const gmx::MdrunOptions& mdrunOptions,
2789                                 const t_inputrec&        ir,
2790                                 const bool               directGpuCommUsedWithGpuUpdate)
2791 {
2792     DDSettings ddSettings;
2793
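    /* Each setting below is read from an environment variable via dd_getenv();
     * the last argument is the value used when the variable is not set.
     */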
2794     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2795     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2796     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2797     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2798     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2799     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2800     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2801     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2802     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2803
2804     if (ddSettings.useSendRecv2)
2805     {
2806         GMX_LOG(mdlog.info)
2807                 .appendText(
2808                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2809                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2810                         "communication");
2811     }
2812
2813     if (ddSettings.eFlop)
2814     {
2815         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2816         ddSettings.recordLoad = true;
2817     }
2818     else
2819     {
2820         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2821     }
2822
2823     ddSettings.initialDlbState = determineInitialDlbState(
2824             mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir, directGpuCommUsedWithGpuUpdate);
2825     GMX_LOG(mdlog.info)
2826             .appendTextFormatted("Dynamic load balancing: %s",
2827                                  enumValueToString(ddSettings.initialDlbState));
2828
2829     return ddSettings;
2830 }
2831
2832 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2833
2834 gmx_domdec_t::~gmx_domdec_t() = default;
2835
2836 namespace gmx
2837 {
2838
2839 // TODO once the functionality stabilizes, move this class and
2840 // supporting functionality into builder.cpp
2841 /*! \brief Impl class for DD builder */
2842 class DomainDecompositionBuilder::Impl
2843 {
2844 public:
2845     //! Constructor
2846     Impl(const MDLogger&                   mdlog,
2847          t_commrec*                        cr,
2848          const DomdecOptions&              options,
2849          const MdrunOptions&               mdrunOptions,
2850          const gmx_mtop_t&                 mtop,
2851          const t_inputrec&                 ir,
2852          const MDModulesNotifiers&         notifiers,
2853          const matrix                      box,
2854          ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2855          bool                              useUpdateGroups,
2856          real                              maxUpdateGroupRadius,
2857          ArrayRef<const RVec>              xGlobal,
2858          bool                              useGpuForNonbonded,
2859          bool                              useGpuForPme,
2860          bool                              directGpuCommUsedWithGpuUpdate);
2861
2862     //! Build the resulting DD manager
2863     gmx_domdec_t* build(LocalAtomSetManager*       atomSets,
2864                         const gmx_localtop_t&      localTopology,
2865                         const t_state&             localState,
2866                         ObservablesReducerBuilder* observablesReducerBuilder);
2867
2868     //! Objects used in constructing and configuring DD
2869     //! {
2870     //! Logging object
2871     const MDLogger& mdlog_;
2872     //! Communication object
2873     t_commrec* cr_;
2874     //! User-supplied options configuring DD behavior
2875     const DomdecOptions options_;
2876     //! Global system topology
2877     const gmx_mtop_t& mtop_;
2878     //! User input values from the tpr file
2879     const t_inputrec& ir_;
2880     //! MdModules object
2881     const MDModulesNotifiers& notifiers_;
2882     //! }
2883
2884     //! Internal objects used in constructing DD
2885     //! {
2886     //! Settings combined from the user input
2887     DDSettings ddSettings_;
2888     //! Information derived from the simulation system
2889     DDSystemInfo systemInfo_;
2890     //! Box structure
2891     gmx_ddbox_t ddbox_ = { 0 };
2892     //! Organization of the DD grids
2893     DDGridSetup ddGridSetup_;
2894     //! Organization of the DD ranks
2895     DDRankSetup ddRankSetup_;
2896     //! Index of this rank's DD cell in each dimension
2897     ivec ddCellIndex_ = { 0, 0, 0 };
2898     //! IDs of PME-only ranks
2899     std::vector<int> pmeRanks_;
2900     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2901     CartesianRankSetup cartSetup_;
2902     //! }
2903 };
2904
2905 DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
2906                                        t_commrec*                        cr,
2907                                        const DomdecOptions&              options,
2908                                        const MdrunOptions&               mdrunOptions,
2909                                        const gmx_mtop_t&                 mtop,
2910                                        const t_inputrec&                 ir,
2911                                        const MDModulesNotifiers&         notifiers,
2912                                        const matrix                      box,
2913                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2914                                        const bool                        useUpdateGroups,
2915                                        const real                        maxUpdateGroupRadius,
2916                                        ArrayRef<const RVec>              xGlobal,
2917                                        bool                              useGpuForNonbonded,
2918                                        bool                              useGpuForPme,
2919                                        bool directGpuCommUsedWithGpuUpdate) :
2920     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2921 {
2922     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2923
2924     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_, directGpuCommUsedWithGpuUpdate);
2925
2926     if (ddSettings_.eFlop > 1)
2927     {
2928         /* Ensure that we have different random flop counts on different ranks */
2929         srand(1 + cr_->rankInDefaultCommunicator);
2930     }
2931
2932     systemInfo_ = getSystemInfo(mdlog_,
2933                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2934                                 cr->mpiDefaultCommunicator,
2935                                 options_,
2936                                 mtop_,
2937                                 ir_,
2938                                 box,
2939                                 updateGroupingPerMoleculeType,
2940                                 useUpdateGroups,
2941                                 maxUpdateGroupRadius,
2942                                 xGlobal);
2943
2944     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2945     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2946
2947
2949     auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2950             notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2951
2952     /* Checks the validity of the requested rank setup */
2953     checkForValidRankCountRequests(numRanksRequested,
2954                                    EEL_PME(ir_.coulombtype) | EVDW_PME(ir_.vdwtype),
2955                                    options_.numPmeRanks,
2956                                    separatePmeRanksPermitted,
2957                                    checkForLargePrimeFactors);
2958
2959     // DD grid setup uses a different cell size limit for the
2960     // automated setup than the one in systemInfo_. The latter is used
2961     // in set_dd_limits() to configure DLB, for example.
2962     const real gridSetupCellsizeLimit =
2963             getDDGridSetupCellSizeLimit(mdlog_,
2964                                         !isDlbDisabled(ddSettings_.initialDlbState),
2965                                         options_.dlbScaling,
2966                                         ir_,
2967                                         systemInfo_.cellsizeLimit);
2968     ddGridSetup_ = getDDGridSetup(mdlog_,
2969                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2970                                   cr->mpiDefaultCommunicator,
2971                                   numRanksRequested,
2972                                   options_,
2973                                   ddSettings_,
2974                                   systemInfo_,
2975                                   gridSetupCellsizeLimit,
2976                                   mtop_,
2977                                   ir_,
2978                                   separatePmeRanksPermitted,
2979                                   box,
2980                                   xGlobal,
2981                                   &ddbox_);
2982     checkDDGridSetup(ddGridSetup_,
2983                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2984                      cr->mpiDefaultCommunicator,
2985                      cr->sizeOfDefaultCommunicator,
2986                      options_,
2987                      ddSettings_,
2988                      systemInfo_,
2989                      gridSetupCellsizeLimit,
2990                      ddbox_);
2991
2992     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2993
2994     ddRankSetup_ = getDDRankSetup(
2995             mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder, ddGridSetup_, ir_);
2996
2997     /* Generate the group communicator, also decides the duty of each rank */
2998     cartSetup_ = makeGroupCommunicators(
2999             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
3000 }
3001
3002 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager*  atomSets,
3003                                                       const gmx_localtop_t& localTopology,
3004                                                       const t_state&        localState,
3005                                                       ObservablesReducerBuilder* observablesReducerBuilder)
3006 {
3007     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
3008
3009     copy_ivec(ddCellIndex_, dd->ci);
3010
3011     dd->comm = init_dd_comm();
3012
3013     dd->comm->ddRankSetup        = ddRankSetup_;
3014     dd->comm->cartesianRankSetup = cartSetup_;
3015
3016     set_dd_limits(mdlog_,
3017                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
3018                   dd,
3019                   options_,
3020                   ddSettings_,
3021                   systemInfo_,
3022                   ddGridSetup_,
3023                   ddRankSetup_.numPPRanks,
3024                   mtop_,
3025                   ir_,
3026                   ddbox_);
3027
3028     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3029
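    /* Only particle-particle (PP) ranks set up the DD grid parameters and
     * neighbor relations; separate PME-only ranks skip this step.
     */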
3030     if (thisRankHasDuty(cr_, DUTY_PP))
3031     {
3032         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3033
3034         setup_neighbor_relations(dd);
3035     }
3036
3037     /* Set overallocation to avoid frequent reallocation of arrays */
3038     set_over_alloc_dd(true);
3039
3040     dd->atomSets = atomSets;
3041
3042     dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>(
3043             mdlog_, cr_, mtop_, localTopology, localState, dd->comm->systemInfo.useUpdateGroups, observablesReducerBuilder);
3044
3045     return dd;
3046 }
3047
3048 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
3049                                                        t_commrec*                cr,
3050                                                        const DomdecOptions&      options,
3051                                                        const MdrunOptions&       mdrunOptions,
3052                                                        const gmx_mtop_t&         mtop,
3053                                                        const t_inputrec&         ir,
3054                                                        const MDModulesNotifiers& notifiers,
3055                                                        const matrix              box,
3056                                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3057                                                        const bool           useUpdateGroups,
3058                                                        const real           maxUpdateGroupRadius,
3059                                                        ArrayRef<const RVec> xGlobal,
3060                                                        const bool           useGpuForNonbonded,
3061                                                        const bool           useGpuForPme,
3062                                                        const bool directGpuCommUsedWithGpuUpdate) :
3063     impl_(new Impl(mdlog,
3064                    cr,
3065                    options,
3066                    mdrunOptions,
3067                    mtop,
3068                    ir,
3069                    notifiers,
3070                    box,
3071                    updateGroupingPerMoleculeType,
3072                    useUpdateGroups,
3073                    maxUpdateGroupRadius,
3074                    xGlobal,
3075                    useGpuForNonbonded,
3076                    useGpuForPme,
3077                    directGpuCommUsedWithGpuUpdate))
3078 {
3079 }
3080
3081 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager*       atomSets,
3082                                                 const gmx_localtop_t&      localTopology,
3083                                                 const t_state&             localState,
3084                                                 ObservablesReducerBuilder* observablesReducerBuilder)
3085 {
3086     return impl_->build(atomSets, localTopology, localState, observablesReducerBuilder);
3087 }
3088
3089 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3090
3091 } // namespace gmx
3092
3093 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3094 {
3095     gmx_ddbox_t ddbox;
3096     int         LocallyLimited = 0;
3097
3098     const auto* dd = cr->dd;
3099
3100     set_ddbox(*dd, false, box, true, x, &ddbox);
3101
3104     for (int d = 0; d < dd->ndim; d++)
3105     {
3106         const int dim = dd->dim[d];
3107
3108         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3109         if (dd->unitCellInfo.ddBoxIsDynamic)
3110         {
3111             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3112         }
3113
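        /* np is the number of halo communication pulses needed along this
         * dimension to cover the requested cut-off with the current cell size.
         */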
3114         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3115
3116         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3117         {
3118             if (np > dd->comm->cd[d].np_dlb)
3119             {
3120                 return FALSE;
3121             }
3122
3123             /* If the current local cell size is smaller than the requested
3124              * cut-off, we could still fix it, but that gets very complicated.
3125              * Without fixing it here, we might actually need more checks.
3126              */
3127             real cellSizeAlongDim =
3128                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3129             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3130             {
3131                 LocallyLimited = 1;
3132             }
3133         }
3134     }
3135
3136     if (!isDlbDisabled(dd->comm))
3137     {
3138         /* If DLB is not active yet, we don't need to check the grid jumps.
3139          * Actually we shouldn't, because then the grid jump data is not set.
3140          */
3141         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3142         {
3143             LocallyLimited = 1;
3144         }
3145
3146         gmx_sumi(1, &LocallyLimited, cr);
3147
3148         if (LocallyLimited > 0)
3149         {
3150             return FALSE;
3151         }
3152     }
3153
3154     return TRUE;
3155 }
3156
3157 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3158 {
3159     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3160
3161     if (bCutoffAllowed)
3162     {
3163         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3164     }
3165
3166     return bCutoffAllowed;
3167 }
3168
3169 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3170                               const t_commrec&                cr,
3171                               const gmx::DeviceStreamManager& deviceStreamManager,
3172                               gmx_wallcycle*                  wcycle)
3173 {
3174     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3175                        "Local non-bonded stream should be valid when using "
3176                        "GPU halo exchange.");
3177     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3178                        "Non-local non-bonded stream should be valid when using "
3179                        "GPU halo exchange.");
3180
3181     if (cr.dd->gpuHaloExchange[0].empty())
3182     {
3183         GMX_LOG(mdlog.warning)
3184                 .asParagraph()
3185                 .appendTextFormatted(
3186                         "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3187                         "by the "
3188                         "GMX_GPU_DD_COMMS environment variable.");
3189     }
3190
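    /* Create one GpuHaloExchange instance per dimension and pulse. The pulse
     * loop starts from the number of instances already present, so calling
     * this again after repartitioning only constructs the missing pulses.
     */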
3191     for (int d = 0; d < cr.dd->ndim; d++)
3192     {
3193         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3194         {
3195             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3196                     cr.dd,
3197                     d,
3198                     cr.mpi_comm_mygroup,
3199                     deviceStreamManager.context(),
3200                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3201                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3202                     pulse,
3203                     wcycle));
3204         }
3205     }
3206 }
3207
3208 void reinitGpuHaloExchange(const t_commrec&              cr,
3209                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3210                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3211 {
3212     for (int d = 0; d < cr.dd->ndim; d++)
3213     {
3214         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3215         {
3216             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3217         }
3218     }
3219 }
3220
3221 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3222                                    const matrix          box,
3223                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3224 {
3225     for (int d = 0; d < cr.dd->ndim; d++)
3226     {
3227         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3228         {
3229             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3230         }
3231     }
3232 }
3233
3234 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3235 {
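    /* Forces are communicated over dimensions and pulses in the reverse order
     * of the coordinate halo exchange, so halo force contributions are sent
     * back to and accumulated on the ranks that own the atoms.
     */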
3236     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3237     {
3238         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3239         {
3240             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3241         }
3242     }
3243 }
3244
3245 void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
3246 {
3247     std::array<int, 5> buf;
3248
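    /* Pack the sizes and flags of the global state on the DD master rank and
     * broadcast them, so that every rank can set up its local state
     * consistently.
     */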
3249     if (DDMASTER(dd))
3250     {
3251         buf[0] = state_global->flags;
3252         buf[1] = state_global->ngtc;
3253         buf[2] = state_global->nnhpres;
3254         buf[3] = state_global->nhchainlength;
3255         buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
3256     }
3257     dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
3258
3259     init_gtc_state(state_local, buf[1], buf[2], buf[3]);
3260     init_dfhist_state(state_local, buf[4]);
3261     state_local->flags = buf[0];
3262 }
3263
3264 void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t&      dd,
3265                                             const gmx_mtop_t&        mtop,
3266                                             const matrix             box,
3267                                             gmx::ArrayRef<gmx::RVec> positions)
3268 {
3269     int atomOffset = 0;
3270     for (const gmx_molblock_t& molblock : mtop.molblock)
3271     {
3272         const auto& updateGrouping = dd.comm->systemInfo.updateGroupingsPerMoleculeType[molblock.type];
3273
3274         for (int mol = 0; mol < molblock.nmol; mol++)
3275         {
3276             for (int g = 0; g < updateGrouping.numBlocks(); g++)
3277             {
3278                 const auto& block     = updateGrouping.block(g);
3279                 const int   atomBegin = atomOffset + block.begin();
3280                 const int   atomEnd   = atomOffset + block.end();
3281                 for (int a = atomBegin + 1; a < atomEnd; a++)
3282                 {
3283                     // Make sure that atoms in the same update group
3284                     // are in the same periodic image after restarts.
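                    // Dimensions are processed from the last one down because,
                    // with the lower-triangular (triclinic) box convention,
                    // box[d] can also have components along lower dimensions.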
3285                     for (int d = DIM - 1; d >= 0; d--)
3286                     {
3287                         while (positions[a][d] - positions[atomBegin][d] > 0.5_real * box[d][d])
3288                         {
3289                             positions[a] -= box[d];
3290                         }
3291                         while (positions[a][d] - positions[atomBegin][d] < -0.5_real * box[d][d])
3292                         {
3293                             positions[a] += box[d];
3294                         }
3295                     }
3296                 }
3297             }
3298             atomOffset += updateGrouping.fullRange().end();
3299         }
3300     }
3301 }