1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
5  * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
6  * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
7  * Copyright (c) 2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38
39 #include "gmxpre.h"
40
41 #include "domdec.h"
42
43 #include "config.h"
44
45 #include <cassert>
46 #include <cinttypes>
47 #include <climits>
48 #include <cmath>
49 #include <cstdio>
50 #include <cstdlib>
51 #include <cstring>
52
53 #include <algorithm>
54 #include <memory>
55
56 #include "gromacs/domdec/builder.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/computemultibodycutoffs.h"
59 #include "gromacs/domdec/dlb.h"
60 #include "gromacs/domdec/dlbtiming.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/ga2la.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/options.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/domdec/reversetopology.h"
68 #include "gromacs/ewald/pme.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/hardware/hw_info.h"
74 #include "gromacs/math/vec.h"
75 #include "gromacs/math/vectypes.h"
76 #include "gromacs/mdlib/calc_verletbuf.h"
77 #include "gromacs/mdlib/constr.h"
78 #include "gromacs/mdlib/constraintrange.h"
79 #include "gromacs/mdlib/updategroups.h"
80 #include "gromacs/mdlib/vcm.h"
81 #include "gromacs/mdlib/vsite.h"
82 #include "gromacs/mdrun/mdmodules.h"
83 #include "gromacs/mdtypes/commrec.h"
84 #include "gromacs/mdtypes/forceoutput.h"
85 #include "gromacs/mdtypes/forcerec.h"
86 #include "gromacs/mdtypes/inputrec.h"
87 #include "gromacs/mdtypes/mdrunoptions.h"
88 #include "gromacs/mdtypes/state.h"
89 #include "gromacs/pbcutil/ishift.h"
90 #include "gromacs/pbcutil/pbc.h"
91 #include "gromacs/pulling/pull.h"
92 #include "gromacs/timing/wallcycle.h"
93 #include "gromacs/topology/block.h"
94 #include "gromacs/topology/idef.h"
95 #include "gromacs/topology/ifunc.h"
96 #include "gromacs/topology/mtop_lookup.h"
97 #include "gromacs/topology/mtop_util.h"
98 #include "gromacs/topology/topology.h"
99 #include "gromacs/utility/basedefinitions.h"
100 #include "gromacs/utility/basenetwork.h"
101 #include "gromacs/utility/cstringutil.h"
102 #include "gromacs/utility/enumerationhelpers.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/logger.h"
107 #include "gromacs/utility/mdmodulesnotifiers.h"
108 #include "gromacs/utility/real.h"
109 #include "gromacs/utility/smalloc.h"
110 #include "gromacs/utility/strconvert.h"
111 #include "gromacs/utility/stringstream.h"
112 #include "gromacs/utility/stringutil.h"
113 #include "gromacs/utility/textwriter.h"
114
115 #include "atomdistribution.h"
116 #include "box.h"
117 #include "cellsizes.h"
118 #include "distribute.h"
119 #include "domdec_constraints.h"
120 #include "domdec_internal.h"
121 #include "domdec_setup.h"
122 #include "domdec_vsite.h"
123 #include "redistribute.h"
124 #include "utility.h"
125
126 // TODO remove this when moving domdec into gmx namespace
127 using gmx::ArrayRef;
128 using gmx::DdRankOrder;
129 using gmx::DlbOption;
130 using gmx::DomdecOptions;
131 using gmx::RangePartitioning;
132
133 static const char* enumValueToString(DlbState enumValue)
134 {
135     static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
136         "off", "off", "auto", "locked", "on", "on"
137     };
138     return dlbStateNames[enumValue];
139 }
140
141 /* The DD zone order */
142 static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
143                                         { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
144
145 /* The non-bonded zone-pair setup for domain decomposition
146  * The first number is the i-zone, the second number the first j-zone seen by
147  * this i-zone, the third number the last+1 j-zone seen by this i-zone.
148  * As is, this is for 3D decomposition, where there are 4 i-zones.
149  * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
150  * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
151  */
152 static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
153                                                                { 1, 3, 6 },
154                                                                { 2, 5, 6 },
155                                                                { 3, 5, 7 } };
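/* Example (derived from the clamping done in setup_neighbor_relations below):
 * with a 2D decomposition there are 4 zones, so i-zone 0 sees j-zones 0-3 and
 * i-zone 1 sees only j-zone 3; with a 1D decomposition i-zone 0 sees j-zones 0-1.
 */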
156
157 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
158 {
159     xyz[XX] = ind / (nc[YY] * nc[ZZ]);
160     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
161     xyz[ZZ] = ind % nc[ZZ];
162 }
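/* Example: the DD index is row-major with z fastest, so for nc = {2, 3, 4}
 * ind = 17 gives xyz = {17/12, (17/4)%3, 17%4} = {1, 1, 1} and ind = 5
 * gives {0, 1, 1}.
 */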
163
164 static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
165 {
166     int ddnodeid = -1;
167
168     const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
169     const int                 ddindex   = dd_index(dd->numCells, c);
170     if (cartSetup.bCartesianPP_PME)
171     {
172         ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
173     }
174     else if (cartSetup.bCartesianPP)
175     {
176 #if GMX_MPI
177         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
178 #endif
179     }
180     else
181     {
182         ddnodeid = ddindex;
183     }
184
185     return ddnodeid;
186 }
187
188 int ddglatnr(const gmx_domdec_t* dd, int i)
189 {
190     int atnr = 0;
191
192     if (dd == nullptr)
193     {
194         atnr = i + 1;
195     }
196     else
197     {
198         if (i >= dd->comm->atomRanges.numAtomsTotal())
199         {
200             gmx_fatal(FARGS,
201                       "glatnr called with %d, which is larger than the local number of atoms (%d)",
202                       i,
203                       dd->comm->atomRanges.numAtomsTotal());
204         }
205         atnr = dd->globalAtomIndices[i] + 1;
206     }
207
208     return atnr;
209 }
210
211 void dd_store_state(const gmx_domdec_t& dd, t_state* state)
212 {
213     if (state->ddp_count != dd.ddp_count)
214     {
215         gmx_incons("The MD state does not match the domain decomposition state");
216     }
217
218     state->cg_gl.resize(dd.numHomeAtoms);
219     for (int i = 0; i < dd.numHomeAtoms; i++)
220     {
221         state->cg_gl[i] = dd.globalAtomGroupIndices[i];
222     }
223
224     state->ddp_count_cg_gl = dd.ddp_count;
225 }
226
227 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
228 {
229     return &dd->comm->zones;
230 }
231
232 int dd_numAtomsZones(const gmx_domdec_t& dd)
233 {
234     return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
235 }
236
237 int dd_numHomeAtoms(const gmx_domdec_t& dd)
238 {
239     return dd.comm->atomRanges.numHomeAtoms();
240 }
241
242 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
243 {
244     /* We currently set mdatoms entries for all atoms:
245      * local + non-local + communicated for vsite + constraints
246      */
247
248     return dd.comm->atomRanges.numAtomsTotal();
249 }
250
251 int dd_natoms_vsite(const gmx_domdec_t& dd)
252 {
253     return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
254 }
255
256 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
257 {
258     *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
259     *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
260 }
261
262 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
263 {
264     wallcycle_start(wcycle, WallCycleCounter::MoveX);
265
266     rvec shift = { 0, 0, 0 };
267
268     gmx_domdec_comm_t* comm = dd->comm;
269
270     int nzone   = 1;
271     int nat_tot = comm->atomRanges.numHomeAtoms();
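    /* Overview: for each decomposed dimension we loop over the communication
     * pulses. The atoms listed in ind.index are packed into a send buffer
     * (shifted by the box vector when this domain lies at the periodic
     * boundary), exchanged with the neighboring domain and, in the common
     * in-place case, appended to x after the nat_tot atoms already present.
     * The zone count nzone doubles with every dimension treated.
     */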
272     for (int d = 0; d < dd->ndim; d++)
273     {
274         const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
275         const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
276         if (bPBC)
277         {
278             copy_rvec(box[dd->dim[d]], shift);
279         }
280         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
281         for (const gmx_domdec_ind_t& ind : cd->ind)
282         {
283             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
284             gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
285             int                       n          = 0;
286             if (!bPBC)
287             {
288                 for (int j : ind.index)
289                 {
290                     sendBuffer[n] = x[j];
291                     n++;
292                 }
293             }
294             else if (!bScrew)
295             {
296                 for (int j : ind.index)
297                 {
298                     /* We need to shift the coordinates */
299                     for (int d = 0; d < DIM; d++)
300                     {
301                         sendBuffer[n][d] = x[j][d] + shift[d];
302                     }
303                     n++;
304                 }
305             }
306             else
307             {
308                 for (int j : ind.index)
309                 {
310                     /* Shift x */
311                     sendBuffer[n][XX] = x[j][XX] + shift[XX];
312                     /* Rotate y and z.
313                      * This operation requires a special shift force
314                      * treatment, which is performed in calc_vir.
315                      */
316                     sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
317                     sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
318                     n++;
319                 }
320             }
321
322             DDBufferAccess<gmx::RVec> receiveBufferAccess(
323                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
324
325             gmx::ArrayRef<gmx::RVec> receiveBuffer;
326             if (cd->receiveInPlace)
327             {
328                 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
329             }
330             else
331             {
332                 receiveBuffer = receiveBufferAccess.buffer;
333             }
334             /* Send and receive the coordinates */
335             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
336
337             if (!cd->receiveInPlace)
338             {
339                 int j = 0;
340                 for (int zone = 0; zone < nzone; zone++)
341                 {
342                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
343                     {
344                         x[i] = receiveBuffer[j++];
345                     }
346                 }
347             }
348             nat_tot += ind.nrecv[nzone + 1];
349         }
350         nzone += nzone;
351     }
352
353     wallcycle_stop(wcycle, WallCycleCounter::MoveX);
354 }
355
356 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
357 {
358     wallcycle_start(wcycle, WallCycleCounter::MoveF);
359
360     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
361     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
362
363     gmx_domdec_comm_t& comm    = *dd->comm;
364     int                nzone   = comm.zones.n / 2;
365     int                nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
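    /* Overview: this is the reverse of dd_move_x. Dimensions and pulses are
     * traversed in reverse order, the halo force contributions are sent back
     * to the domain owning the atoms and added to f and, when the virial is
     * computed, also accumulated into the shift forces for domains at the
     * periodic boundary.
     */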
366     for (int d = dd->ndim - 1; d >= 0; d--)
367     {
368         /* Only forces in domains near the PBC boundaries need to
369            consider PBC in the treatment of fshift */
370         const bool shiftForcesNeedPbc =
371                 (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
372         const bool applyScrewPbc =
373                 (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
374         /* Determine which shift vector we need */
375         ivec vis        = { 0, 0, 0 };
376         vis[dd->dim[d]] = 1;
377         const int is    = gmx::ivecToShiftIndex(vis);
378
379         /* Loop over the pulses */
380         const gmx_domdec_comm_dim_t& cd = comm.cd[d];
381         for (int p = cd.numPulses() - 1; p >= 0; p--)
382         {
383             const gmx_domdec_ind_t&   ind = cd.ind[p];
384             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
385             gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
386
387             nat_tot -= ind.nrecv[nzone + 1];
388
389             DDBufferAccess<gmx::RVec> sendBufferAccess(
390                     comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
391
392             gmx::ArrayRef<gmx::RVec> sendBuffer;
393             if (cd.receiveInPlace)
394             {
395                 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
396             }
397             else
398             {
399                 sendBuffer = sendBufferAccess.buffer;
400                 int j      = 0;
401                 for (int zone = 0; zone < nzone; zone++)
402                 {
403                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
404                     {
405                         sendBuffer[j++] = f[i];
406                     }
407                 }
408             }
409             /* Communicate the forces */
410             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
411             /* Add the received forces */
412             int n = 0;
413             if (!shiftForcesNeedPbc)
414             {
415                 for (int j : ind.index)
416                 {
417                     for (int d = 0; d < DIM; d++)
418                     {
419                         f[j][d] += receiveBuffer[n][d];
420                     }
421                     n++;
422                 }
423             }
424             else if (!applyScrewPbc)
425             {
426                 for (int j : ind.index)
427                 {
428                     for (int d = 0; d < DIM; d++)
429                     {
430                         f[j][d] += receiveBuffer[n][d];
431                     }
432                     /* Add this force to the shift force */
433                     for (int d = 0; d < DIM; d++)
434                     {
435                         fshift[is][d] += receiveBuffer[n][d];
436                     }
437                     n++;
438                 }
439             }
440             else
441             {
442                 for (int j : ind.index)
443                 {
444                     /* Rotate the force */
445                     f[j][XX] += receiveBuffer[n][XX];
446                     f[j][YY] -= receiveBuffer[n][YY];
447                     f[j][ZZ] -= receiveBuffer[n][ZZ];
448                     if (shiftForcesNeedPbc)
449                     {
450                         /* Add this force to the shift force */
451                         for (int d = 0; d < DIM; d++)
452                         {
453                             fshift[is][d] += receiveBuffer[n][d];
454                         }
455                     }
456                     n++;
457                 }
458             }
459         }
460         nzone /= 2;
461     }
462     wallcycle_stop(wcycle, WallCycleCounter::MoveF);
463 }
464
465 /* Convenience function for extracting a real buffer from an rvec buffer
466  *
467  * To reduce the number of temporary communication buffers and avoid
468  * cache pollution, we reuse gmx::RVec buffers for storing reals.
469  * This function returns a real buffer reference with the same number
470  * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
471  */
472 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
473 {
474     return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
475 }
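/* Note: the returned view has as many real elements as the input has RVec
 * elements, i.e. only the first third of the underlying reals is exposed;
 * dd_atom_spread_real() and dd_atom_sum_real() below rely on this when
 * reusing comm->rvecBuffer for scalar data.
 */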
476
477 void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
478 {
479     gmx_domdec_comm_t* comm = dd->comm;
480
481     int nzone   = 1;
482     int nat_tot = comm->atomRanges.numHomeAtoms();
483     for (int d = 0; d < dd->ndim; d++)
484     {
485         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
486         for (const gmx_domdec_ind_t& ind : cd->ind)
487         {
488             /* Note: We provision for RVec instead of real, so a factor of 3
489              * more than needed. The buffer actually already has this size
490              * and we reinterpret it as a real buffer below, so this does not matter.
491              */
492             DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
493             gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
494             int                 n          = 0;
495             for (int j : ind.index)
496             {
497                 sendBuffer[n++] = v[j];
498             }
499
500             DDBufferAccess<gmx::RVec> receiveBufferAccess(
501                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
502
503             gmx::ArrayRef<real> receiveBuffer;
504             if (cd->receiveInPlace)
505             {
506                 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
507             }
508             else
509             {
510                 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
511             }
512             /* Send and receive the data */
513             ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
514             if (!cd->receiveInPlace)
515             {
516                 int j = 0;
517                 for (int zone = 0; zone < nzone; zone++)
518                 {
519                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
520                     {
521                         v[i] = receiveBuffer[j++];
522                     }
523                 }
524             }
525             nat_tot += ind.nrecv[nzone + 1];
526         }
527         nzone += nzone;
528     }
529 }
530
531 void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
532 {
533     gmx_domdec_comm_t* comm = dd->comm;
534
535     int nzone   = comm->zones.n / 2;
536     int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
537     for (int d = dd->ndim - 1; d >= 0; d--)
538     {
539         gmx_domdec_comm_dim_t* cd = &comm->cd[d];
540         for (int p = cd->numPulses() - 1; p >= 0; p--)
541         {
542             const gmx_domdec_ind_t& ind = cd->ind[p];
543
544             /* Note: We provision for RVec instead of real, so a factor of 3
545              * more than needed. The buffer actually already has this size
546              * and we typecast, so this works as intended.
547              */
548             DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
549             gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
550             nat_tot -= ind.nrecv[nzone + 1];
551
552             DDBufferAccess<gmx::RVec> sendBufferAccess(
553                     comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
554
555             gmx::ArrayRef<real> sendBuffer;
556             if (cd->receiveInPlace)
557             {
558                 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
559             }
560             else
561             {
562                 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
563                 int j      = 0;
564                 for (int zone = 0; zone < nzone; zone++)
565                 {
566                     for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
567                     {
568                         sendBuffer[j++] = v[i];
569                     }
570                 }
571             }
572             /* Communicate the data */
573             ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
574             /* Add the received values */
575             int n = 0;
576             for (int j : ind.index)
577             {
578                 v[j] += receiveBuffer[n];
579                 n++;
580             }
581         }
582         nzone /= 2;
583     }
584 }
585
586 real dd_cutoff_multibody(const gmx_domdec_t* dd)
587 {
588     const gmx_domdec_comm_t& comm       = *dd->comm;
589     const DDSystemInfo&      systemInfo = comm.systemInfo;
590
591     real r = -1;
592     if (systemInfo.haveInterDomainMultiBodyBondeds)
593     {
594         if (comm.cutoff_mbody > 0)
595         {
596             r = comm.cutoff_mbody;
597         }
598         else
599         {
600             /* cutoff_mbody=0 means we do not have DLB */
601             r = comm.cellsize_min[dd->dim[0]];
602             for (int di = 1; di < dd->ndim; di++)
603             {
604                 r = std::min(r, comm.cellsize_min[dd->dim[di]]);
605             }
606             if (comm.systemInfo.filterBondedCommunication)
607             {
608                 r = std::max(r, comm.cutoff_mbody);
609             }
610             else
611             {
612                 r = std::min(r, systemInfo.cutoff);
613             }
614         }
615     }
616
617     return r;
618 }
619
620 real dd_cutoff_twobody(const gmx_domdec_t* dd)
621 {
622     const real r_mb = dd_cutoff_multibody(dd);
623
624     return std::max(dd->comm->systemInfo.cutoff, r_mb);
625 }
626
627
628 static void dd_cart_coord2pmecoord(const DDRankSetup&        ddRankSetup,
629                                    const CartesianRankSetup& cartSetup,
630                                    const ivec                coord,
631                                    ivec                      coord_pme)
632 {
633     const int nc   = ddRankSetup.numPPCells[cartSetup.cartpmedim];
634     const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
635     copy_ivec(coord, coord_pme);
636     coord_pme[cartSetup.cartpmedim] =
637             nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
638 }
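/* Example: with nc = 4 PP cells and ntot = 6 Cartesian cells along
 * cartpmedim (i.e. 2 PME-only layers), PP coordinates 0 and 1 map to PME
 * coordinate 4 and PP coordinates 2 and 3 map to 5.
 */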
639
640 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
641 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
642 {
643     const int npp  = ddRankSetup.numPPRanks;
644     const int npme = ddRankSetup.numRanksDoingPme;
645
646     /* Here we assign a PME node to communicate with this DD node
647      * by assuming that the major index of both is x.
648      * We add npme/2 to obtain an even distribution.
649      */
650     return (ddCellIndex * npme + npme / 2) / npp;
651 }
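/* Worked example: with npp = 6 and npme = 2 the formula maps PP cell
 * indices 0-2 to PME index 0 and 3-5 to PME index 1, i.e. an even split
 * along the major (x) index.
 */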
652
653 static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
654 {
655     std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
656
657     int n = 0;
658     for (int i = 0; i < ddRankSetup.numPPRanks; i++)
659     {
660         const int p0 = ddindex2pmeindex(ddRankSetup, i);
661         const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
662         if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
663         {
664             if (debug)
665             {
666                 fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
667             }
668             pmeRanks[n] = i + 1 + n;
669             n++;
670         }
671     }
672
673     return pmeRanks;
674 }
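/* Example: continuing the npp = 6, npme = 2 case above, the interleaved
 * rank order becomes PP ranks 0-2, PME rank 3, PP ranks 4-6, PME rank 7,
 * so pmeRanks = {3, 7}.
 */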
675
676 static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
677 {
678     ivec coords;
679
680     coords[XX]     = x;
681     coords[YY]     = y;
682     coords[ZZ]     = z;
683     const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
684
685     return slab;
686 }
687
688 static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
689 {
690     const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
691     ivec                      coords    = { x, y, z };
692     int                       nodeid    = -1;
693
694     if (cartSetup.bCartesianPP_PME)
695     {
696 #if GMX_MPI
697         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
698 #endif
699     }
700     else
701     {
702         int ddindex = dd_index(cr->dd->numCells, coords);
703         if (cartSetup.bCartesianPP)
704         {
705             nodeid = cartSetup.ddindex2simnodeid[ddindex];
706         }
707         else
708         {
709             const DDRankSetup& rankSetup = cr->dd->comm->ddRankSetup;
710             if (rankSetup.rankOrder != DdRankOrder::pp_pme && rankSetup.usePmeOnlyRanks)
711             {
712                 nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
713             }
714             else
715             {
716                 nodeid = ddindex;
717             }
718         }
719     }
720
721     return nodeid;
722 }
723
724 static int dd_simnode2pmenode(const DDRankSetup&        ddRankSetup,
725                               const CartesianRankSetup& cartSetup,
726                               gmx::ArrayRef<const int>  pmeRanks,
727                               const t_commrec gmx_unused* cr,
728                               const int                   sim_nodeid)
729 {
730     int pmenode = -1;
731
732     /* This assumes a uniform x domain decomposition grid cell size */
733     if (cartSetup.bCartesianPP_PME)
734     {
735 #if GMX_MPI
736         ivec coord, coord_pme;
737         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
738         if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
739         {
740             /* This is a PP rank */
741             dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
742             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
743         }
744 #endif
745     }
746     else if (cartSetup.bCartesianPP)
747     {
748         if (sim_nodeid < ddRankSetup.numPPRanks)
749         {
750             pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
751         }
752     }
753     else
754     {
755         /* This assumes DD cells with identical x coordinates
756          * are numbered sequentially.
757          */
758         if (pmeRanks.empty())
759         {
760             if (sim_nodeid < ddRankSetup.numPPRanks)
761             {
762                 /* The DD index equals the nodeid */
763                 pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
764             }
765         }
766         else
767         {
768             int i = 0;
769             while (sim_nodeid > pmeRanks[i])
770             {
771                 i++;
772             }
773             if (sim_nodeid < pmeRanks[i])
774             {
775                 pmenode = pmeRanks[i];
776             }
777         }
778     }
779
780     return pmenode;
781 }
782
783 NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
784 {
785     if (dd != nullptr)
786     {
787         return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
788     }
789     else
790     {
791         return { 1, 1 };
792     }
793 }
794
795 std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
796 {
797     const DDRankSetup&        ddRankSetup = cr->dd->comm->ddRankSetup;
798     const CartesianRankSetup& cartSetup   = cr->dd->comm->cartesianRankSetup;
799     GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
800                        "This function should only be called when PME-only ranks are in use");
801     std::vector<int> ddranks;
802     ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
803
804     for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
805     {
806         for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
807         {
808             for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
809             {
810                 if (cartSetup.bCartesianPP_PME)
811                 {
812                     ivec coord = { x, y, z };
813                     ivec coord_pme;
814                     dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
815                     if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
816                         && cr->dd->ci[ZZ] == coord_pme[ZZ])
817                     {
818                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
819                     }
820                 }
821                 else
822                 {
823                     /* The slab corresponds to the nodeid in the PME group */
824                     if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
825                     {
826                         ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
827                     }
828                 }
829             }
830         }
831     }
832     return ddranks;
833 }
834
835 static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
836 {
837     bool bReceive = true;
838
839     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
840     if (ddRankSetup.usePmeOnlyRanks)
841     {
842         const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
843         if (cartSetup.bCartesianPP_PME)
844         {
845 #if GMX_MPI
846             int  pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
847             ivec coords;
848             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
849             coords[cartSetup.cartpmedim]++;
850             if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
851             {
852                 int rank = 0;
853                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
854                 if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
855                 {
856                     /* This is not the last PP node for pmenode */
857                     bReceive = FALSE;
858                 }
859             }
860 #else
861             GMX_RELEASE_ASSERT(
862                     false,
863                     "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
864 #endif
865         }
866         else
867         {
868             int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
869             if (cr->sim_nodeid + 1 < cr->nnodes
870                 && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
871             {
872                 /* This is not the last PP node for pmenode */
873                 bReceive = FALSE;
874             }
875         }
876     }
877
878     return bReceive;
879 }
880
881 static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
882 {
883     gmx_domdec_comm_t* comm = dd->comm;
884
885     snew(*dim_f, dd->numCells[dim] + 1);
886     (*dim_f)[0] = 0;
887     for (int i = 1; i < dd->numCells[dim]; i++)
888     {
889         if (comm->slb_frac[dim])
890         {
891             (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
892         }
893         else
894         {
895             (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
896         }
897     }
898     (*dim_f)[dd->numCells[dim]] = 1;
899 }
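/* Example: without user-supplied cell fractions (slb_frac == nullptr) and
 * 4 cells along dim, dim_f becomes {0, 0.25, 0.5, 0.75, 1}.
 */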
900
901 static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
902 {
903     const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
904
905     if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
906     {
907         ddpme->dim = YY;
908     }
909     else
910     {
911         ddpme->dim = dimind;
912     }
913     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
914
915     ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
916
917     if (ddpme->nslab <= 1)
918     {
919         return;
920     }
921
922     const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
923     /* Determine for each PME slab the PP location range for dimension dim */
924     snew(ddpme->pp_min, ddpme->nslab);
925     snew(ddpme->pp_max, ddpme->nslab);
926     for (int slab = 0; slab < ddpme->nslab; slab++)
927     {
928         ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
929         ddpme->pp_max[slab] = 0;
930     }
931     for (int i = 0; i < dd->nnodes; i++)
932     {
933         ivec xyz;
934         ddindex2xyz(dd->numCells, i, xyz);
935         /* For y only use our y/z slab.
936          * This assumes that the PME x grid size matches the DD grid size.
937          */
938         if (dimind == 0 || xyz[XX] == dd->ci[XX])
939         {
940             const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
941             const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
942             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
943             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
944         }
945     }
946
947     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
948 }
949
950 int dd_pme_maxshift_x(const gmx_domdec_t& dd)
951 {
952     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
953
954     if (ddRankSetup.ddpme[0].dim == XX)
955     {
956         return ddRankSetup.ddpme[0].maxshift;
957     }
958     else
959     {
960         return 0;
961     }
962 }
963
964 int dd_pme_maxshift_y(const gmx_domdec_t& dd)
965 {
966     const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
967
968     if (ddRankSetup.ddpme[0].dim == YY)
969     {
970         return ddRankSetup.ddpme[0].maxshift;
971     }
972     else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
973     {
974         return ddRankSetup.ddpme[1].maxshift;
975     }
976     else
977     {
978         return 0;
979     }
980 }
981
982 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
983 {
984     return dd.comm->systemInfo.useUpdateGroups;
985 }
986
987 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
988 {
989     /* Note that the cycles value can be incorrect, either 0 or some
990      * extremely large value, when our thread migrated to another core
991      * with an unsynchronized cycle counter. If this happens less often
992      * than once per nstlist steps, this will not cause issues, since
993      * we later subtract the maximum value from the sum over nstlist steps.
994      * A zero count will slightly lower the total, but that's a small effect.
995      * Note that the main purpose of the subtraction of the maximum value
996      * is to avoid throwing off the load balancing when stalls occur due to
997      * e.g. system activity or network congestion.
998      */
999     dd->comm->cycl[ddCycl] += cycles;
1000     dd->comm->cycl_n[ddCycl]++;
1001     if (cycles > dd->comm->cycl_max[ddCycl])
1002     {
1003         dd->comm->cycl_max[ddCycl] = cycles;
1004     }
1005 }
1006
1007 #if GMX_MPI
1008 static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
1009 {
1010     MPI_Comm c_row = MPI_COMM_NULL;
1011     ivec     loc_c;
1012     bool     bPartOfGroup = false;
1013
1014     const int dim = dd->dim[dim_ind];
1015     copy_ivec(loc, loc_c);
1016     for (int i = 0; i < dd->numCells[dim]; i++)
1017     {
1018         loc_c[dim]     = i;
1019         const int rank = dd_index(dd->numCells, loc_c);
1020         if (rank == dd->rank)
1021         {
1022             /* This process is part of the group */
1023             bPartOfGroup = TRUE;
1024         }
1025     }
1026     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
1027     if (bPartOfGroup)
1028     {
1029         dd->comm->mpi_comm_load[dim_ind] = c_row;
1030         if (!isDlbDisabled(dd->comm))
1031         {
1032             DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1033
1034             if (dd->ci[dim] == dd->master_ci[dim])
1035             {
1036                 /* This is the root process of this row */
1037                 cellsizes.rowMaster = std::make_unique<RowMaster>();
1038
1039                 RowMaster& rowMaster = *cellsizes.rowMaster;
1040                 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1041                 rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
1042                 rowMaster.isCellMin.resize(dd->numCells[dim]);
1043                 if (dim_ind > 0)
1044                 {
1045                     rowMaster.bounds.resize(dd->numCells[dim]);
1046                 }
1047                 rowMaster.buf_ncd.resize(dd->numCells[dim]);
1048             }
1049             else
1050             {
1051                 /* This is not a root process, we only need to receive cell_f */
1052                 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1053             }
1054         }
1055         if (dd->ci[dim] == dd->master_ci[dim])
1056         {
1057             snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
1058         }
1059     }
1060 }
1061 #endif
1062
1063 void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
1064 {
1065 #if GMX_MPI
1066     MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
1067
1068     if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1069     {
1070         /* Only ranks with short-ranged tasks (currently) use GPUs.
1071          * If we don't have GPUs assigned, there are no resources to share.
1072          */
1073         return;
1074     }
1075
1076     const int physicalnode_id_hash = gmx_physicalnode_id_hash();
1077
1078     gmx_domdec_t* dd = cr->dd;
1079
1080     if (debug)
1081     {
1082         fprintf(debug, "dd_setup_dlb_resource_sharing:\n");
1083         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
1084     }
1085     /* Split the PP communicator over the physical nodes */
1086     /* TODO: See if we should store this (before), as it's also used for
1087      * the nodecomm summation.
1088      */
1089     // TODO PhysicalNodeCommunicator could be extended/used to handle
1090     // the need for per-node per-group communicators.
1091     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
1092     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
1093     MPI_Comm_free(&mpi_comm_pp_physicalnode);
1094     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1095
1096     if (debug)
1097     {
1098         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1099     }
1100
1101     /* Note that some ranks could share a GPU, while others don't */
1102
1103     if (dd->comm->nrank_gpu_shared == 1)
1104     {
1105         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1106     }
1107 #else
1108     GMX_UNUSED_VALUE(cr);
1109     GMX_UNUSED_VALUE(gpu_id);
1110 #endif
1111 }
1112
1113 static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
1114 {
1115 #if GMX_MPI
1116     ivec loc;
1117
1118     if (debug)
1119     {
1120         fprintf(debug, "Making load communicators\n");
1121     }
1122
1123     dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
1124     snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1125
1126     if (dd->ndim == 0)
1127     {
1128         return;
1129     }
1130
1131     clear_ivec(loc);
1132     make_load_communicator(dd, 0, loc);
1133     if (dd->ndim > 1)
1134     {
1135         const int dim0 = dd->dim[0];
1136         for (int i = 0; i < dd->numCells[dim0]; i++)
1137         {
1138             loc[dim0] = i;
1139             make_load_communicator(dd, 1, loc);
1140         }
1141     }
1142     if (dd->ndim > 2)
1143     {
1144         const int dim0 = dd->dim[0];
1145         for (int i = 0; i < dd->numCells[dim0]; i++)
1146         {
1147             loc[dim0]      = i;
1148             const int dim1 = dd->dim[1];
1149             for (int j = 0; j < dd->numCells[dim1]; j++)
1150             {
1151                 loc[dim1] = j;
1152                 make_load_communicator(dd, 2, loc);
1153             }
1154         }
1155     }
1156
1157     if (debug)
1158     {
1159         fprintf(debug, "Finished making load communicators\n");
1160     }
1161 #endif
1162 }
1163
1164 /*! \brief Sets up the relation between neighboring domains and zones */
1165 static void setup_neighbor_relations(gmx_domdec_t* dd)
1166 {
1167     ivec tmp, s;
1168     GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
1169
1170     for (int d = 0; d < dd->ndim; d++)
1171     {
1172         const int dim = dd->dim[d];
1173         copy_ivec(dd->ci, tmp);
1174         tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
1175         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1176         copy_ivec(dd->ci, tmp);
1177         tmp[dim]           = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
1178         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1179         if (debug)
1180         {
1181             fprintf(debug,
1182                     "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1183                     dd->rank,
1184                     dim,
1185                     dd->neighbor[d][0],
1186                     dd->neighbor[d][1]);
1187         }
1188     }
1189
1190     int nzone  = (1 << dd->ndim);
1191     int nizone = (1 << std::max(dd->ndim - 1, 0));
1192     assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1193
1194     gmx_domdec_zones_t* zones = &dd->comm->zones;
1195
1196     for (int i = 0; i < nzone; i++)
1197     {
1198         int m = 0;
1199         clear_ivec(zones->shift[i]);
1200         for (int d = 0; d < dd->ndim; d++)
1201         {
1202             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1203         }
1204     }
1205
1206     zones->n = nzone;
1207     for (int i = 0; i < nzone; i++)
1208     {
1209         for (int d = 0; d < DIM; d++)
1210         {
1211             s[d] = dd->ci[d] - zones->shift[i][d];
1212             if (s[d] < 0)
1213             {
1214                 s[d] += dd->numCells[d];
1215             }
1216             else if (s[d] >= dd->numCells[d])
1217             {
1218                 s[d] -= dd->numCells[d];
1219             }
1220         }
1221     }
1222     for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
1223     {
1224         GMX_RELEASE_ASSERT(
1225                 ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
1226                 "The first element for each ddNonbondedZonePairRanges should match its index");
1227
1228         DDPairInteractionRanges iZone;
1229         iZone.iZoneIndex = iZoneIndex;
1230         /* The ranges in ddNonbondedZonePairRanges are given for 3D decomposition;
1231          * for fewer dimensions use only j-zones up to nzone.
1232          */
1233         iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
1234                                            std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
1235         for (int dim = 0; dim < DIM; dim++)
1236         {
1237             if (dd->numCells[dim] == 1)
1238             {
1239                 /* All shifts should be allowed */
1240                 iZone.shift0[dim] = -1;
1241                 iZone.shift1[dim] = 1;
1242             }
1243             else
1244             {
1245                 /* Determine the min/max j-zone shift wrt the i-zone */
1246                 iZone.shift0[dim] = 1;
1247                 iZone.shift1[dim] = -1;
1248                 for (int jZone : iZone.jZoneRange)
1249                 {
1250                     int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
1251                     if (shift_diff < iZone.shift0[dim])
1252                     {
1253                         iZone.shift0[dim] = shift_diff;
1254                     }
1255                     if (shift_diff > iZone.shift1[dim])
1256                     {
1257                         iZone.shift1[dim] = shift_diff;
1258                     }
1259                 }
1260             }
1261         }
1262
1263         zones->iZones.push_back(iZone);
1264     }
1265
1266     if (!isDlbDisabled(dd->comm))
1267     {
1268         dd->comm->cellsizesWithDlb.resize(dd->ndim);
1269     }
1270
1271     if (dd->comm->ddSettings.recordLoad)
1272     {
1273         make_load_communicators(dd);
1274     }
1275 }
1276
1277 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1278                                  gmx_domdec_t*        dd,
1279                                  t_commrec gmx_unused* cr,
1280                                  bool gmx_unused       reorder)
1281 {
1282 #if GMX_MPI
1283     gmx_domdec_comm_t*  comm      = dd->comm;
1284     CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1285
1286     if (cartSetup.bCartesianPP)
1287     {
1288         /* Set up cartesian communication for the particle-particle part */
1289         GMX_LOG(mdlog.info)
1290                 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1291                                      dd->numCells[XX],
1292                                      dd->numCells[YY],
1293                                      dd->numCells[ZZ]);
1294
1295         ivec periods;
1296         for (int i = 0; i < DIM; i++)
1297         {
1298             periods[i] = TRUE;
1299         }
1300         MPI_Comm comm_cart = MPI_COMM_NULL;
1301         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1302         /* We overwrite the old communicator with the new cartesian one */
1303         cr->mpi_comm_mygroup = comm_cart;
1304     }
1305
1306     dd->mpi_comm_all = cr->mpi_comm_mygroup;
1307     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1308
1309     if (cartSetup.bCartesianPP_PME)
1310     {
1311         /* Since we want to use the original cartesian setup for sim,
1312          * and not the one after split, we need to make an index.
1313          */
1314         cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1315         cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1316         gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1317         /* Get the rank of the DD master;
1318          * above we made sure that the master node is a PP node.
1319          */
1320         int rank = MASTER(cr) ? dd->rank : 0;
1321         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1322     }
1323     else if (cartSetup.bCartesianPP)
1324     {
1325         if (!comm->ddRankSetup.usePmeOnlyRanks)
1326         {
1327             /* The PP communicator is also
1328              * the communicator for this simulation
1329              */
1330             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1331         }
1332         cr->nodeid = dd->rank;
1333
1334         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1335
1336         /* We need to make an index to go from the coordinates
1337          * to the nodeid of this simulation.
1338          */
1339         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1340         std::vector<int> buf(dd->nnodes);
1341         if (thisRankHasDuty(cr, DUTY_PP))
1342         {
1343             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1344         }
1345         /* Communicate the ddindex-to-simulation-nodeid index */
1346         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1347
1348         /* Determine the master coordinates and rank.
1349          * The DD master should be the same node as the master of this sim.
1350          */
1351         for (int i = 0; i < dd->nnodes; i++)
1352         {
1353             if (cartSetup.ddindex2simnodeid[i] == 0)
1354             {
1355                 ddindex2xyz(dd->numCells, i, dd->master_ci);
1356                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1357             }
1358         }
1359         if (debug)
1360         {
1361             fprintf(debug, "The master rank is %d\n", dd->masterrank);
1362         }
1363     }
1364     else
1365     {
1366         /* No Cartesian communicators */
1367         /* We use the rank in dd->comm->all as DD index */
1368         ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1369         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1370         dd->masterrank = 0;
1371         clear_ivec(dd->master_ci);
1372     }
1373 #endif
1374
1375     GMX_LOG(mdlog.info)
1376             .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1377                                  dd->rank,
1378                                  dd->ci[XX],
1379                                  dd->ci[YY],
1380                                  dd->ci[ZZ]);
1381     if (debug)
1382     {
1383         fprintf(debug,
1384                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1385                 dd->rank,
1386                 dd->ci[XX],
1387                 dd->ci[YY],
1388                 dd->ci[ZZ]);
1389     }
1390 }
1391
1392 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1393 {
1394 #if GMX_MPI
1395     CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1396
1397     if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1398     {
1399         cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1400         std::vector<int> buf(dd->nnodes);
1401         if (thisRankHasDuty(cr, DUTY_PP))
1402         {
1403             buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1404         }
1405         /* Communicate the ddindex-to-simulation-nodeid index */
1406         MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1407     }
1408 #else
1409     GMX_UNUSED_VALUE(dd);
1410     GMX_UNUSED_VALUE(cr);
1411 #endif
1412 }
1413
1414 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1415                                              t_commrec*           cr,
1416                                              const DdRankOrder    ddRankOrder,
1417                                              bool gmx_unused      reorder,
1418                                              const DDRankSetup&   ddRankSetup,
1419                                              ivec                 ddCellIndex,
1420                                              std::vector<int>*    pmeRanks)
1421 {
1422     CartesianRankSetup cartSetup;
1423
1424     cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
1425     cartSetup.bCartesianPP_PME = false;
1426
1427     const ivec& numDDCells = ddRankSetup.numPPCells;
1428     /* Initially we set ntot to the number of PP cells */
1429     copy_ivec(numDDCells, cartSetup.ntot);
1430
1431     if (cartSetup.bCartesianPP)
1432     {
1433         const int numDDCellsTot = ddRankSetup.numPPRanks;
1434         bool      bDiv[DIM];
1435         for (int i = 1; i < DIM; i++)
1436         {
1437             bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1438         }
1439         if (bDiv[YY] || bDiv[ZZ])
1440         {
1441             cartSetup.bCartesianPP_PME = TRUE;
1442             /* If we have 2D PME decomposition, which is always in x+y,
1443              * we stack the PME only nodes in z.
1444              * Otherwise we choose the direction that provides the thinnest slab
1445              * of PME only nodes as this will have the least effect
1446              * on the PP communication.
1447              * But for the PME communication the opposite might be better.
1448              */
1449             if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1450             {
1451                 cartSetup.cartpmedim = ZZ;
1452             }
1453             else
1454             {
1455                 cartSetup.cartpmedim = YY;
1456             }
1457             cartSetup.ntot[cartSetup.cartpmedim] +=
1458                     (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1459         }
1460         else
1461         {
1462             GMX_LOG(mdlog.info)
1463                     .appendTextFormatted(
1464                             "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1465                             "nx*nz (%d*%d)",
1466                             ddRankSetup.numRanksDoingPme,
1467                             numDDCells[XX],
1468                             numDDCells[YY],
1469                             numDDCells[XX],
1470                             numDDCells[ZZ]);
1471             GMX_LOG(mdlog.info)
1472                     .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1473         }
1474     }
1475
1476     if (cartSetup.bCartesianPP_PME)
1477     {
1478 #if GMX_MPI
1479         int  rank = 0;
1480         ivec periods;
1481
1482         GMX_LOG(mdlog.info)
1483                 .appendTextFormatted(
1484                         "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1485                         cartSetup.ntot[XX],
1486                         cartSetup.ntot[YY],
1487                         cartSetup.ntot[ZZ]);
1488
1489         for (int i = 0; i < DIM; i++)
1490         {
1491             periods[i] = TRUE;
1492         }
1493         MPI_Comm comm_cart = MPI_COMM_NULL;
1494         MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1495         MPI_Comm_rank(comm_cart, &rank);
1496         if (MASTER(cr) && rank != 0)
1497         {
1498             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1499         }
1500
1501         /* With this assignment we lose the link to the original communicator,
1502          * which will usually be MPI_COMM_WORLD, unless we have multisim.
1503          */
1504         cr->mpi_comm_mysim = comm_cart;
1505         cr->sim_nodeid     = rank;
1506
1507         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1508
1509         GMX_LOG(mdlog.info)
1510                 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
1511                                      cr->sim_nodeid,
1512                                      ddCellIndex[XX],
1513                                      ddCellIndex[YY],
1514                                      ddCellIndex[ZZ]);
1515
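             /* Ranks whose Cartesian coordinate along cartpmedim lies within the
              * PP grid do PP work; the extra layers are PME-only ranks.
              */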
1516         if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1517         {
1518             cr->duty = DUTY_PP;
1519         }
1520         if (!ddRankSetup.usePmeOnlyRanks
1521             || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1522         {
1523             cr->duty = DUTY_PME;
1524         }
1525
1526         /* Split the sim communicator into PP and PME-only ranks (color = duty, key = DD index) */
1527         MPI_Comm_split(cr->mpi_comm_mysim,
1528                        getThisRankDuties(cr),
1529                        dd_index(cartSetup.ntot, ddCellIndex),
1530                        &cr->mpi_comm_mygroup);
1531 #else
1532         GMX_UNUSED_VALUE(ddCellIndex);
1533 #endif
1534     }
1535     else
1536     {
1537         switch (ddRankOrder)
1538         {
1539             case DdRankOrder::pp_pme:
1540                 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1541                 break;
1542             case DdRankOrder::interleave:
1543                 /* Interleave the PP-only and PME-only ranks */
1544                 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1545                 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1546                 break;
1547             case DdRankOrder::cartesian: break;
1548             default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
1549         }
1550
1551         if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1552         {
1553             cr->duty = DUTY_PME;
1554         }
1555         else
1556         {
1557             cr->duty = DUTY_PP;
1558         }
1559 #if GMX_MPI
1560         /* Split the sim communicator into PP and PME-only ranks */
1561         MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1562         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1563 #endif
1564     }
1565
1566     GMX_LOG(mdlog.info)
1567             .appendTextFormatted("This rank does only %s work.\n",
1568                                  thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1569
1570     return cartSetup;
1571 }
1572
1573 /*! \brief Makes the PP communicator and the PME communicator, when needed
1574  *
1575  * Returns the Cartesian rank setup.
1576  * Sets \p cr->mpi_comm_mygroup.
1577  * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1578  * With separate PME ranks in interleaved order, sets the PME ranks in \p pmeRanks.
1579  */
1580 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1581                                                  const DDSettings&    ddSettings,
1582                                                  const DdRankOrder    ddRankOrder,
1583                                                  const DDRankSetup&   ddRankSetup,
1584                                                  t_commrec*           cr,
1585                                                  ivec                 ddCellIndex,
1586                                                  std::vector<int>*    pmeRanks)
1587 {
1588     CartesianRankSetup cartSetup;
1589
1590     // As a default, both group and sim communicators are equal to the default communicator
1591     cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1592     cr->mpi_comm_mysim   = cr->mpiDefaultCommunicator;
1593     cr->nnodes           = cr->sizeOfDefaultCommunicator;
1594     cr->nodeid           = cr->rankInDefaultCommunicator;
1595     cr->sim_nodeid       = cr->rankInDefaultCommunicator;
1596
1597     if (ddRankSetup.usePmeOnlyRanks)
1598     {
1599         /* Split the communicator into a PP and PME part */
1600         cartSetup = split_communicator(
1601                 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1602     }
1603     else
1604     {
1605         /* All nodes do PP and PME */
1606         /* We do not require separate communicators */
1607         cartSetup.bCartesianPP     = false;
1608         cartSetup.bCartesianPP_PME = false;
1609     }
1610
1611     return cartSetup;
1612 }
1613
1614 /*! \brief For PP ranks, sets or makes the communicator
1615  *
1616  * For PME ranks, gets the rank id.
1617  * For PP-only ranks, sets the PME-only rank.
1618  */
1619 static void setupGroupCommunication(const gmx::MDLogger&     mdlog,
1620                                     const DDSettings&        ddSettings,
1621                                     gmx::ArrayRef<const int> pmeRanks,
1622                                     t_commrec*               cr,
1623                                     const int                numAtomsInSystem,
1624                                     gmx_domdec_t*            dd)
1625 {
1626     const DDRankSetup&        ddRankSetup = dd->comm->ddRankSetup;
1627     const CartesianRankSetup& cartSetup   = dd->comm->cartesianRankSetup;
1628
1629     if (thisRankHasDuty(cr, DUTY_PP))
1630     {
1631         /* Copy or make a new PP communicator */
1632
1633         /* We (possibly) reordered the nodes in split_communicator,
1634          * so it is no longer required in make_pp_communicator.
1635          */
1636         const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1637
1638         make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1639     }
1640     else
1641     {
1642         receive_ddindex2simnodeid(dd, cr);
1643     }
1644
1645     if (!thisRankHasDuty(cr, DUTY_PME))
1646     {
1647         /* Set up the communication to our PME node */
1648         dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1649         dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1650         if (debug)
1651         {
1652             fprintf(debug,
1653                     "My pme_nodeid %d receive ener %s\n",
1654                     dd->pme_nodeid,
1655                     gmx::boolToString(dd->pme_receive_vir_ener));
1656         }
1657     }
1658     else
1659     {
1660         dd->pme_nodeid = -1;
1661     }
1662
1663     /* We cannot use DDMASTER(dd), because dd->masterrank is set later */
1664     if (MASTER(cr))
1665     {
1666         dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
1667     }
1668 }
1669
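     /*! \brief Parses the user-provided relative cell sizes for static load balancing in one dimension
      *
      * Returns nullptr when fewer than two cells are used or no sizes were given.
      * Otherwise returns \p nc normalized fractions, e.g. "1 2 1" gives 0.25 0.50 0.25.
      */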
1670 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1671 {
1672     real* slb_frac = nullptr;
1673     if (nc > 1 && size_string != nullptr)
1674     {
1675         GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1676         snew(slb_frac, nc);
1677         real tot = 0;
1678         for (int i = 0; i < nc; i++)
1679         {
1680             double dbl = 0;
1681             int    n   = 0;
1682             sscanf(size_string, "%20lf%n", &dbl, &n);
1683             if (dbl == 0)
1684             {
1685                 gmx_fatal(FARGS,
1686                           "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1687                           dir,
1688                           size_string);
1689             }
1690             slb_frac[i] = dbl;
1691             size_string += n;
1692             tot += slb_frac[i];
1693         }
1694         /* Normalize */
1695         std::string relativeCellSizes = "Relative cell sizes:";
1696         for (int i = 0; i < nc; i++)
1697         {
1698             slb_frac[i] /= tot;
1699             relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1700         }
1701         GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1702     }
1703
1704     return slb_frac;
1705 }
1706
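     /*! \brief Returns the number of bonded interactions in \p mtop that involve more than two atoms */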
1707 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1708 {
1709     int n = 0;
1710     for (const auto ilists : IListRange(mtop))
1711     {
1712         for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1713         {
1714             if (NRAL(ilist.functionType) > 2)
1715             {
1716                 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
1717             }
1718         }
1719     }
1720
1721     return n;
1722 }
1723
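     /*! \brief Returns the value of the integer environment variable \p env_var, or \p def when it is not set
      *
      * When the variable is set but cannot be parsed as an integer, 1 is returned.
      */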
1724 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1725 {
1726     int   nst = def;
1727     char* val = getenv(env_var);
1728     if (val)
1729     {
1730         if (sscanf(val, "%20d", &nst) <= 0)
1731         {
1732             nst = 1;
1733         }
1734         GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
1735     }
1736
1737     return nst;
1738 }
1739
1740 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1741 {
1742     if (inputrec.pbcType == PbcType::Screw
1743         && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1744     {
1745         gmx_fatal(FARGS,
1746                   "With pbc=%s can only do domain decomposition in the x-direction",
1747                   c_pbcTypeNames[inputrec.pbcType].c_str());
1748     }
1749
1750     if (inputrec.nstlist == 0)
1751     {
1752         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1753     }
1754
1755     if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1756     {
1757         GMX_LOG(mdlog.warning)
1758                 .appendText(
1759                         "comm-mode angular will give incorrect results when the comm group "
1760                         "partially crosses a periodic boundary");
1761     }
1762 }
1763
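     /*! \brief Returns an estimate of the smallest initial average cell size for the given box and domain grid */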
1764 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1765 {
1766     real r = ddbox.box_size[XX];
1767     for (int d = 0; d < DIM; d++)
1768     {
1769         if (numDomains[d] > 1)
1770         {
1771             /* Check using the initial average cell size */
1772             r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1773         }
1774     }
1775
1776     return r;
1777 }
1778
1779 /*! \brief Depending on the initial DLB state, either returns the switched-off DLB state or issues an error.
1780  */
1781 static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
1782                                   const std::string&   reasonStr,
1783                                   const gmx::MDLogger& mdlog)
1784 {
1785     std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1786     std::string dlbDisableNote     = "NOTE: disabling dynamic load balancing as ";
1787
1788     if (cmdlineDlbState == DlbState::onUser)
1789     {
1790         gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1791     }
1792     else if (cmdlineDlbState == DlbState::offCanTurnOn)
1793     {
1794         GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1795     }
1796     return DlbState::offForever;
1797 }
1798
1799 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1800  *
1801  * This function parses the parameters of the "-dlb" command-line option and sets
1802  * the corresponding state values. Then it checks the consistency of the determined
1803  * state with other run parameters and settings. As a result, the initial state
1804  * may be altered, or an error may be raised if incompatible options are detected.
1805  *
1806  * \param [in] mdlog       Logger.
1807  * \param [in] dlbOption   Enum value for the DLB option.
1808  * \param [in] bRecordLoad True if the load balancer is recording load information.
1809  * \param [in] mdrunOptions  Options for mdrun.
1810  * \param [in] inputrec    The mdrun input parameters.
1811  * \returns                DLB initial/startup state.
1812  */
1813 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
1814                                          DlbOption                dlbOption,
1815                                          gmx_bool                 bRecordLoad,
1816                                          const gmx::MdrunOptions& mdrunOptions,
1817                                          const t_inputrec&        inputrec)
1818 {
1819     DlbState dlbState = DlbState::offCanTurnOn;
1820
1821     switch (dlbOption)
1822     {
1823         case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1824         case DlbOption::no: dlbState = DlbState::offUser; break;
1825         case DlbOption::yes: dlbState = DlbState::onUser; break;
1826         default: gmx_incons("Invalid dlbOption enum value");
1827     }
1828
1829     /* Reruns don't support DLB: bail or override auto mode */
1830     if (mdrunOptions.rerun)
1831     {
1832         std::string reasonStr = "it is not supported in reruns.";
1833         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1834     }
1835
1836     /* Unsupported integrators */
1837     if (!EI_DYNAMICS(inputrec.eI))
1838     {
1839         auto reasonStr =
1840                 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1841                                   enumValueToString(inputrec.eI));
1842         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1843     }
1844
1845     /* Without cycle counters we can't time work to balance on */
1846     if (!bRecordLoad)
1847     {
1848         std::string reasonStr =
1849                 "cycle counters unsupported or not enabled in the operating system kernel.";
1850         return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1851     }
1852
1853     if (mdrunOptions.reproducible)
1854     {
1855         std::string reasonStr = "you started a reproducible run.";
1856         switch (dlbState)
1857         {
1858             case DlbState::offUser: break;
1859             case DlbState::offForever:
1860                 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1861                 break;
1862             case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1863             case DlbState::onCanTurnOff:
1864                 GMX_RELEASE_ASSERT(false, "DlbState::onCanTurnOff is not a valid initial state");
1865                 break;
1866             case DlbState::onUser:
1867                 return forceDlbOffOrBail(
1868                         dlbState,
1869                         reasonStr
1870                                 + " In load balanced runs binary reproducibility cannot be "
1871                                   "ensured.",
1872                         mdlog);
1873             default:
1874                 gmx_fatal(FARGS,
1875                           "Death horror: undefined case (%d) for load balancing choice",
1876                           static_cast<int>(dlbState));
1877         }
1878     }
1879
1880     return dlbState;
1881 }
1882
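     /*! \brief Allocates the DD communication struct and initializes its load-balancing counters */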
1883 static gmx_domdec_comm_t* init_dd_comm()
1884 {
1885     gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1886
1887     comm->n_load_have    = 0;
1888     comm->n_load_collect = 0;
1889
1890     comm->haveTurnedOffDlb = false;
1891
1892     for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1893     {
1894         comm->sum_nat[i] = 0;
1895     }
1896     comm->ndecomp   = 0;
1897     comm->nload     = 0;
1898     comm->load_step = 0;
1899     comm->load_sum  = 0;
1900     comm->load_max  = 0;
1901     clear_ivec(comm->load_lim);
1902     comm->load_mdf = 0;
1903     comm->load_pme = 0;
1904
1905     /* This should be replaced by a unique pointer */
1906     comm->balanceRegion = ddBalanceRegionAllocate();
1907
1908     return comm;
1909 }
1910
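     /*! \brief Stores the update-group settings in \p systemInfo and logs a summary when update groups are used */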
1911 static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
1912                               const gmx_mtop_t&                 mtop,
1913                               ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1914                               const bool                        useUpdateGroups,
1915                               const real                        maxUpdateGroupRadius,
1916                               DDSystemInfo*                     systemInfo)
1917 {
1918     systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1919     systemInfo->useUpdateGroups                = useUpdateGroups;
1920     systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
1921
1922     if (systemInfo->useUpdateGroups)
1923     {
1924         int numUpdateGroups = 0;
1925         for (const auto& molblock : mtop.molblock)
1926         {
1927             numUpdateGroups += molblock.nmol
1928                                * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1929         }
1930
1931         GMX_LOG(mdlog.info)
1932                 .appendTextFormatted(
1933                         "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1934                         "nm\n",
1935                         numUpdateGroups,
1936                         mtop.natoms / static_cast<double>(numUpdateGroups),
1937                         systemInfo->maxUpdateGroupRadius);
1938     }
1939 }
1940
1941 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1942     npbcdim(numPbcDimensions(ir.pbcType)),
1943     numBoundedDimensions(inputrec2nboundeddim(&ir)),
1944     ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1945     haveScrewPBC(ir.pbcType == PbcType::Screw)
1946 {
1947 }
1948
1949 /* Returns whether molecules are always whole, i.e. not broken by PBC */
1950 static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
1951                                     const bool                                  useUpdateGroups,
1952                                     gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1953 {
1954     if (useUpdateGroups)
1955     {
1956         GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1957                            "Need one grouping per moltype");
1958         for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
1959         {
1960             if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1961             {
1962                 return false;
1963             }
1964         }
1965     }
1966     else
1967     {
1968         for (const auto& moltype : mtop.moltype)
1969         {
1970             if (moltype.atoms.nr > 1)
1971             {
1972                 return false;
1973             }
1974         }
1975     }
1976
1977     return true;
1978 }
1979
1980 /*! \brief Generate the simulation system information */
1981 static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
1982                                   DDRole                            ddRole,
1983                                   MPI_Comm                          communicator,
1984                                   const DomdecOptions&              options,
1985                                   const gmx_mtop_t&                 mtop,
1986                                   const t_inputrec&                 ir,
1987                                   const matrix                      box,
1988                                   ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
1989                                   const bool                        useUpdateGroups,
1990                                   const real                        maxUpdateGroupRadius,
1991                                   gmx::ArrayRef<const gmx::RVec>    xGlobal)
1992 {
1993     const real tenPercentMargin = 1.1;
1994
1995     DDSystemInfo systemInfo;
1996
1997     setupUpdateGroups(
1998             mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
1999
2000     systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2001             mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2002     systemInfo.haveInterDomainBondeds =
2003             (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2004     systemInfo.haveInterDomainMultiBodyBondeds =
2005             (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
2006
2007     if (systemInfo.useUpdateGroups)
2008     {
2009         systemInfo.mayHaveSplitConstraints = false;
2010         systemInfo.mayHaveSplitSettles     = false;
2011     }
2012     else
2013     {
2014         systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2015                                               || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2016         systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2017     }
2018
2019     if (ir.rlist == 0)
2020     {
2021         /* Set the cut-off to some very large value,
2022          * so we don't need if statements everywhere in the code.
2023          * We use sqrt, since the cut-off is squared in some places.
2024          */
2025         systemInfo.cutoff = GMX_CUTOFF_INF;
2026     }
2027     else
2028     {
2029         systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2030     }
2031     systemInfo.minCutoffForMultiBody = 0;
2032
2033     /* Determine the minimum cell size limit, affected by many factors */
2034     systemInfo.cellsizeLimit             = 0;
2035     systemInfo.filterBondedCommunication = false;
2036
2037     /* We do not allow home atoms to move beyond the neighboring domain
2038      * between domain decomposition steps, which limits the cell size.
2039      * Get an estimate of cell size limit due to atom displacement.
2040      * In most cases this is a large overestimate, because it assumes
2041      * non-interacting atoms.
2042      * We set the chance to 1 in a trillion steps.
2043      */
2044     constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2045     const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
2046             mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2047     GMX_LOG(mdlog.info)
             .appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm",
                                  limitForAtomDisplacement);
2048
2049     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2050
2051     /* TODO: PME decomposition currently requires atoms not to be more than
2052      *       2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2053      *       In nearly all cases, limitForAtomDisplacement will be smaller
2054      *       than 2/3*rlist, so the PME requirement is satisfied.
2055      *       But it would be better for both correctness and performance
2056      *       to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2057      *       Note that we would need to improve the pairlist buffer case.
2058      */
2059
2060     if (systemInfo.haveInterDomainBondeds)
2061     {
2062         if (options.minimumCommunicationRange > 0)
2063         {
2064             systemInfo.minCutoffForMultiBody =
2065                     atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2066             if (options.useBondedCommunication)
2067             {
2068                 systemInfo.filterBondedCommunication =
2069                         (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2070             }
2071             else
2072             {
2073                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2074             }
2075         }
2076         else if (ir.bPeriodicMols)
2077         {
2078             /* Can not easily determine the required cut-off */
2079             GMX_LOG(mdlog.warning)
2080                     .appendText(
2081                             "NOTE: Periodic molecules are present in this system. Because of this, "
2082                             "the domain decomposition algorithm cannot easily determine the "
2083                             "minimum cell size that it requires for treating bonded interactions. "
2084                             "Instead, domain decomposition will assume that half the non-bonded "
2085                             "cut-off will be a suitable lower bound.");
2086             systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
2087         }
2088         else
2089         {
2090             real r_2b = 0;
2091             real r_mb = 0;
2092
2093             if (ddRole == DDRole::Master)
2094             {
2095                 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2096             }
2097             gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2098             gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2099
2100             /* We use an initial margin of 10% for the minimum cell size,
2101              * except when we are just below the non-bonded cut-off.
2102              */
2103             if (options.useBondedCommunication)
2104             {
2105                 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2106                 {
2107                     const real r_bonded              = std::max(r_2b, r_mb);
2108                     systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2109                     /* This is the (only) place where we turn on the filtering */
2110                     systemInfo.filterBondedCommunication = true;
2111                 }
2112                 else
2113                 {
2114                     const real r_bonded = r_mb;
2115                     systemInfo.minCutoffForMultiBody =
2116                             std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2117                 }
2118                 /* We determine cutoff_mbody later */
2119                 systemInfo.increaseMultiBodyCutoff = true;
2120             }
2121             else
2122             {
2123                 /* No special bonded communication,
2124                  * simply increase the DD cut-off.
2125                  */
2126                 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2127                 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2128             }
2129         }
2130         GMX_LOG(mdlog.info)
2131                 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2132                                      systemInfo.minCutoffForMultiBody);
2133
2134         systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2135     }
2136
2137     systemInfo.constraintCommunicationRange = 0;
2138     if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2139     {
2140         /* There is a cell size limit due to the constraints (P-LINCS) */
2141         systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2142         GMX_LOG(mdlog.info)
2143                 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2144                                      systemInfo.constraintCommunicationRange);
2145         if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2146         {
2147             GMX_LOG(mdlog.info)
2148                     .appendText(
2149                             "This distance will limit the DD cell size, you can override this with "
2150                             "-rcon");
2151         }
2152     }
2153     else if (options.constraintCommunicationRange > 0)
2154     {
2155         /* Here we do not check for dd->splitConstraints,
2156          * because one can also set a cell size limit for virtual sites only
2157          * and at this point we don't know yet if there are intercg v-sites.
2158          */
2159         GMX_LOG(mdlog.info)
2160                 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2161                                      options.constraintCommunicationRange);
2162         systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2163     }
2164     systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2165
2166     return systemInfo;
2167 }
2168
2169 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
2170  * implemented. */
2171 static void checkDDGridSetup(const DDGridSetup&   ddGridSetup,
2172                              DDRole               ddRole,
2173                              MPI_Comm             communicator,
2174                              int                  numNodes,
2175                              const DomdecOptions& options,
2176                              const DDSettings&    ddSettings,
2177                              const DDSystemInfo&  systemInfo,
2178                              const real           cellsizeLimit,
2179                              const gmx_ddbox_t&   ddbox)
2180 {
2181     if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2182     {
2183         const bool  bC = (systemInfo.mayHaveSplitConstraints
2184                          && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
2185         std::string message =
2186                 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2187                                   !bC ? "-rdd" : "-rcon",
2188                                   ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2189                                   bC ? " or your LINCS settings" : "");
2190
2191         gmx_fatal_collective(FARGS,
2192                              communicator,
2193                              ddRole == DDRole::Master,
2194                              "There is no domain decomposition for %d ranks that is compatible "
2195                              "with the given box and a minimum cell size of %g nm\n"
2196                              "%s\n"
2197                              "Look in the log file for details on the domain decomposition",
2198                              numNodes - ddGridSetup.numPmeOnlyRanks,
2199                              cellsizeLimit,
2200                              message.c_str());
2201     }
2202
2203     const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2204     if (acs < cellsizeLimit)
2205     {
2206         if (options.numCells[XX] <= 0)
2207         {
2208             GMX_RELEASE_ASSERT(
2209                     false,
2210                     "dd_choose_grid() should return a grid that satisfies the cell size limits");
2211         }
2212         else
2213         {
2214             gmx_fatal_collective(
2215                     FARGS,
2216                     communicator,
2217                     ddRole == DDRole::Master,
2218                     "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2219                     "options -dd, -rdd or -rcon, see the log file for details",
2220                     acs,
2221                     cellsizeLimit);
2222         }
2223     }
2224
2225     const int numPPRanks =
2226             ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2227     if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2228     {
2229         gmx_fatal_collective(FARGS,
2230                              communicator,
2231                              ddRole == DDRole::Master,
2232                              "The size of the domain decomposition grid (%d) does not match the "
2233                              "number of PP ranks (%d). The total number of ranks is %d",
2234                              numPPRanks,
2235                              numNodes - ddGridSetup.numPmeOnlyRanks,
2236                              numNodes);
2237     }
2238     if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2239     {
2240         gmx_fatal_collective(FARGS,
2241                              communicator,
2242                              ddRole == DDRole::Master,
2243                              "The number of separate PME ranks (%d) is larger than the number of "
2244                              "PP ranks (%d), this is not supported.",
2245                              ddGridSetup.numPmeOnlyRanks,
2246                              numPPRanks);
2247     }
2248 }
2249
2250 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2251 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2252                                   int                  numNodes,
2253                                   const DdRankOrder    rankOrder,
2254                                   const DDGridSetup&   ddGridSetup,
2255                                   const t_inputrec&    ir)
2256 {
2257     GMX_LOG(mdlog.info)
2258             .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2259                                  ddGridSetup.numDomains[XX],
2260                                  ddGridSetup.numDomains[YY],
2261                                  ddGridSetup.numDomains[ZZ],
2262                                  ddGridSetup.numPmeOnlyRanks);
2263
2264     DDRankSetup ddRankSetup;
2265
2266     ddRankSetup.rankOrder = rankOrder;
2267
2268     ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2269     copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2270
2271     ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2272     if (ddRankSetup.usePmeOnlyRanks)
2273     {
2274         ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2275     }
2276     else
2277     {
2278         ddRankSetup.numRanksDoingPme =
2279                 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2280     }
2281
2282     if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2283     {
2284         /* The following choices should match those
2285          * in comm_cost_est in domdec_setup.c.
2286          * Note that here the checks have to take into account
2287          * that the decomposition might occur in a different order than xyz
2288          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2289          * in which case they will not match those in comm_cost_est,
2290          * but since that is mainly for testing purposes that's fine.
2291          */
2292         if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2293             && ddGridSetup.ddDimensions[1] == YY
2294             && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2295             && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2296             && getenv("GMX_PMEONEDD") == nullptr)
2297         {
2298             ddRankSetup.npmedecompdim = 2;
2299             ddRankSetup.npmenodes_x   = ddGridSetup.numDomains[XX];
2300             ddRankSetup.npmenodes_y   = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2301         }
2302         else
2303         {
2304             /* In case nc is 1 in both x and y we could still choose to
2305              * decompose pme in y instead of x, but we use x for simplicity.
2306              */
2307             ddRankSetup.npmedecompdim = 1;
2308             if (ddGridSetup.ddDimensions[0] == YY)
2309             {
2310                 ddRankSetup.npmenodes_x = 1;
2311                 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2312             }
2313             else
2314             {
2315                 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2316                 ddRankSetup.npmenodes_y = 1;
2317             }
2318         }
2319         GMX_LOG(mdlog.info)
2320                 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2321                                      ddRankSetup.npmenodes_x,
2322                                      ddRankSetup.npmenodes_y,
2323                                      1);
2324     }
2325     else
2326     {
2327         ddRankSetup.npmedecompdim = 0;
2328         ddRankSetup.npmenodes_x   = 0;
2329         ddRankSetup.npmenodes_y   = 0;
2330     }
2331
2332     return ddRankSetup;
2333 }
2334
2335 /*! \brief Set the cell size and interaction limits */
2336 static void set_dd_limits(const gmx::MDLogger& mdlog,
2337                           DDRole               ddRole,
2338                           gmx_domdec_t*        dd,
2339                           const DomdecOptions& options,
2340                           const DDSettings&    ddSettings,
2341                           const DDSystemInfo&  systemInfo,
2342                           const DDGridSetup&   ddGridSetup,
2343                           const int            numPPRanks,
2344                           const gmx_mtop_t&    mtop,
2345                           const t_inputrec&    ir,
2346                           const gmx_ddbox_t&   ddbox)
2347 {
2348     gmx_domdec_comm_t* comm = dd->comm;
2349     comm->ddSettings        = ddSettings;
2350
2351     /* Initialize the GPU share count to 0; it might change later */
2352     comm->nrank_gpu_shared = 0;
2353
2354     comm->dlbState = comm->ddSettings.initialDlbState;
2355     dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2356     /* To consider turning DLB on after 2*nstlist steps we need to check
2357      * at partitioning count 3. Thus we need to increase the first count by 2.
2358      */
2359     comm->ddPartioningCountFirstDlbOff += 2;
2360
2361     comm->bPMELoadBalDLBLimits = FALSE;
2362
2363     /* Allocate the charge group/atom sorting struct */
2364     comm->sort = std::make_unique<gmx_domdec_sort_t>();
2365
2366     comm->systemInfo = systemInfo;
2367
2368     if (systemInfo.useUpdateGroups)
2369     {
2370         /* Note: We would like to use dd->nnodes for the atom count estimate,
2371          *       but that is not yet available here. In any case, this only
2372          *       affects performance up to the second dd_partition_system call.
2373          */
2374         const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2375         comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
2376                 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2377     }
2378
2379     /* Set the DD setup given by ddGridSetup */
2380     copy_ivec(ddGridSetup.numDomains, dd->numCells);
2381     dd->ndim = ddGridSetup.numDDDimensions;
2382     copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2383
2384     dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2385
2386     snew(comm->slb_frac, DIM);
2387     if (isDlbDisabled(comm))
2388     {
2389         comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2390         comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2391         comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2392     }
2393
2394     /* Set the multi-body cut-off and cellsize limit for DLB */
2395     comm->cutoff_mbody   = systemInfo.minCutoffForMultiBody;
2396     comm->cellsize_limit = systemInfo.cellsizeLimit;
2397     if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2398     {
2399         if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2400         {
2401             /* Set the bonded communication distance to halfway between
2402              * the minimum and the maximum,
2403              * since the extra communication cost is nearly zero.
2404              */
2405             real acs           = average_cellsize_min(ddbox, dd->numCells);
2406             comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2407             if (!isDlbDisabled(comm))
2408             {
2409                 /* Check if this does not limit the scaling */
2410                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2411             }
2412             if (!systemInfo.filterBondedCommunication)
2413             {
2414                 /* Without bBondComm do not go beyond the n.b. cut-off */
2415                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2416                 if (comm->cellsize_limit >= systemInfo.cutoff)
2417                 {
2418                     /* We don't lose a lot of efficiency
2419                      * when increasing it to the n.b. cut-off.
2420                      * It can even be slightly faster, because we need
2421                      * fewer checks for the communication setup.
2422                      */
2423                     comm->cutoff_mbody = systemInfo.cutoff;
2424                 }
2425             }
2426             /* Check if we did not end up below our original limit */
2427             comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2428
2429             if (comm->cutoff_mbody > comm->cellsize_limit)
2430             {
2431                 comm->cellsize_limit = comm->cutoff_mbody;
2432             }
2433         }
2434         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2435     }
2436
2437     if (debug)
2438     {
2439         fprintf(debug,
2440                 "Bonded atom communication beyond the cut-off: %s\n"
2441                 "cellsize limit %f\n",
2442                 gmx::boolToString(systemInfo.filterBondedCommunication),
2443                 comm->cellsize_limit);
2444     }
2445
2446     if (ddRole == DDRole::Master)
2447     {
2448         check_dd_restrictions(dd, ir, mdlog);
2449     }
2450 }
2451
2452 static void writeSettings(gmx::TextWriter*   log,
2453                           gmx_domdec_t*      dd,
2454                           const gmx_mtop_t&  mtop,
2455                           const t_inputrec&  ir,
2456                           gmx_bool           bDynLoadBal,
2457                           real               dlb_scale,
2458                           const gmx_ddbox_t* ddbox)
2459 {
2460     gmx_domdec_comm_t* comm = dd->comm;
2461
2462     if (bDynLoadBal)
2463     {
2464         log->writeString("The maximum number of communication pulses is:");
2465         for (int d = 0; d < dd->ndim; d++)
2466         {
2467             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2468         }
2469         log->ensureLineBreak();
2470         log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2471                                 comm->cellsize_limit);
2472         log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2473         log->writeString("The allowed shrink of domain decomposition cells is:");
2474         for (int d = 0; d < DIM; d++)
2475         {
2476             if (dd->numCells[d] > 1)
2477             {
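                     /* The allowed shrink is the DLB minimum cell size relative to
                      * the initial average cell size; 0 is reported for a
                      * non-periodic dimension with only two cells.
                      */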
2478                 const real shrink =
2479                         (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2480                                 ? 0
2481                                 : comm->cellsize_min_dlb[d]
2482                                           / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2483                 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2484             }
2485         }
2486         log->ensureLineBreak();
2487     }
2488     else
2489     {
2490         ivec np;
2491         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2492         log->writeString("The initial number of communication pulses is:");
2493         for (int d = 0; d < dd->ndim; d++)
2494         {
2495             log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2496         }
2497         log->ensureLineBreak();
2498         log->writeString("The initial domain decomposition cell size is:");
2499         for (int d = 0; d < DIM; d++)
2500         {
2501             if (dd->numCells[d] > 1)
2502             {
2503                 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2504             }
2505         }
2506         log->ensureLineBreak();
2507         log->writeLine();
2508     }
2509
2510     const bool haveInterDomainVsites =
2511             (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2512
2513     if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2514         || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2515     {
2516         std::string decompUnits;
2517         if (comm->systemInfo.useUpdateGroups)
2518         {
2519             decompUnits = "atom groups";
2520         }
2521         else
2522         {
2523             decompUnits = "atoms";
2524         }
2525
2526         log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2527                                 decompUnits.c_str());
2528         log->writeLineFormatted(
2529                 "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
2530
2531         real limit = 0;
2532         if (bDynLoadBal)
2533         {
2534             limit = dd->comm->cellsize_limit;
2535         }
2536         else
2537         {
2538             if (dd->unitCellInfo.ddBoxIsDynamic)
2539             {
2540                 log->writeLine(
2541                         "(the following are initial values, they could change due to box "
2542                         "deformation)");
2543             }
2544             limit = dd->comm->cellsize_min[XX];
2545             for (int d = 1; d < DIM; d++)
2546             {
2547                 limit = std::min(limit, dd->comm->cellsize_min[d]);
2548             }
2549         }
2550
2551         if (comm->systemInfo.haveInterDomainBondeds)
2552         {
2553             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2554                                     "two-body bonded interactions",
2555                                     "(-rdd)",
2556                                     std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2557             log->writeLineFormatted("%40s  %-7s %6.3f nm",
2558                                     "multi-body bonded interactions",
2559                                     "(-rdd)",
2560                                     (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2561                                             ? comm->cutoff_mbody
2562                                             : std::min(comm->systemInfo.cutoff, limit));
2563         }
2564         if (haveInterDomainVsites)
2565         {
2566             log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2567         }
2568         if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2569         {
2570             std::string separation =
2571                     gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2572             log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2573         }
2574         log->ensureLineBreak();
2575     }
2576 }
2577
2578 static void logSettings(const gmx::MDLogger& mdlog,
2579                         gmx_domdec_t*        dd,
2580                         const gmx_mtop_t&    mtop,
2581                         const t_inputrec&    ir,
2582                         real                 dlb_scale,
2583                         const gmx_ddbox_t*   ddbox)
2584 {
2585     gmx::StringOutputStream stream;
2586     gmx::TextWriter         log(&stream);
2587     writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2588     if (dd->comm->dlbState == DlbState::offCanTurnOn)
2589     {
2590         {
2591             log.ensureEmptyLine();
2592             log.writeLine(
2593                     "When dynamic load balancing gets turned on, these settings will change to:");
2594         }
2595         writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2596     }
2597     GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2598 }
2599
2600 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2601                                 gmx_domdec_t*        dd,
2602                                 real                 dlb_scale,
2603                                 const t_inputrec&    inputrec,
2604                                 const gmx_ddbox_t*   ddbox)
2605 {
2606     int npulse       = 0;
2607     int npulse_d_max = 0;
2608     int npulse_d     = 0;
2609
2610     gmx_domdec_comm_t* comm = dd->comm;
2611
2612     bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2613
2614     /* Determine the maximum number of comm. pulses in one dimension */
2615
2616     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2617
2618     /* Determine the maximum required number of grid pulses */
2619     if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2620     {
2621         /* Only a single pulse is required */
2622         npulse = 1;
2623     }
2624     else if (!bNoCutOff && comm->cellsize_limit > 0)
2625     {
2626         /* We round down slightly here to avoid overhead due to the latency
2627          * of extra communication calls when the cut-off
2628          * would be only slightly longer than the cell size.
2629          * Later cellsize_limit is redetermined,
2630          * so we cannot miss interactions due to this rounding.
2631          */
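             /* For example, a 1.2 nm cut-off with a 0.5 nm cell size limit
              * gives npulse = int(0.96 + 2.4) = 3.
              */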
2632         npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
2633     }
2634     else
2635     {
2636         /* There is no cell size limit */
2637         npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2638     }
2639
2640     if (!bNoCutOff && npulse > 1)
2641     {
2642         /* See if we can do with fewer pulses, based on dlb_scale */
2643         npulse_d_max = 0;
2644         for (int d = 0; d < dd->ndim; d++)
2645         {
2646             int dim  = dd->dim[d];
2647             npulse_d = static_cast<int>(
2648                     1
2649                     + dd->numCells[dim] * comm->systemInfo.cutoff
2650                               / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2651             npulse_d_max = std::max(npulse_d_max, npulse_d);
2652         }
2653         npulse = std::min(npulse, npulse_d_max);
2654     }
2655
2656     /* This env var can override npulse */
2657     const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2658     if (ddPulseEnv > 0)
2659     {
2660         npulse = ddPulseEnv;
2661     }
2662
2663     comm->maxpulse       = 1;
2664     comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2665     for (int d = 0; d < dd->ndim; d++)
2666     {
2667         comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2668         comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2669         if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2670         {
2671             comm->bVacDLBNoLimit = FALSE;
2672         }
2673     }
2674
2675     /* cellsize_limit is set for LINCS in init_domain_decomposition */
2676     if (!comm->bVacDLBNoLimit)
2677     {
2678         comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2679     }
2680     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2681     /* Set the minimum cell size for each DD dimension */
2682     for (int d = 0; d < dd->ndim; d++)
2683     {
2684         if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2685         {
2686             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2687         }
2688         else
2689         {
2690             comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2691         }
2692     }
2693     if (comm->cutoff_mbody <= 0)
2694     {
2695         comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2696     }
2697     if (isDlbOn(comm))
2698     {
2699         set_dlb_limits(dd);
2700     }
2701 }
2702
2703 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2704 {
2705     return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2706 }
2707
2708 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2709 {
2710     /* If each molecule is a single charge group
2711      * or we use domain decomposition for each periodic dimension,
2712      * we do not need to take pbc into account for the bonded interactions.
2713      */
2714     return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2715             && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2716                  && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2717 }
2718
2719 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2720 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2721                                   gmx_domdec_t*        dd,
2722                                   real                 dlb_scale,
2723                                   const gmx_mtop_t&    mtop,
2724                                   const t_inputrec&    inputrec,
2725                                   const gmx_ddbox_t*   ddbox)
2726 {
2727     gmx_domdec_comm_t* comm        = dd->comm;
2728     DDRankSetup&       ddRankSetup = comm->ddRankSetup;
2729
2730     if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2731     {
2732         init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2733         if (ddRankSetup.npmedecompdim >= 2)
2734         {
2735             init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
2736         }
2737     }
2738     else
2739     {
2740         ddRankSetup.numRanksDoingPme = 0;
2741         if (dd->pme_nodeid >= 0)
2742         {
2743             gmx_fatal_collective(FARGS,
2744                                  dd->mpi_comm_all,
2745                                  DDMASTER(dd),
2746                                  "Can not have separate PME ranks without PME electrostatics");
2747         }
2748     }
2749
2750     if (debug)
2751     {
2752         fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2753     }
2754     if (!isDlbDisabled(comm))
2755     {
2756         set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2757     }
2758
2759     logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
2760
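         /* Estimate the fraction of all atoms that a rank will encounter locally
          * (home atoms plus communicated zones); this is passed below when
          * constructing the global-to-local atom lookup.
          */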
2761     const real vol_frac = (inputrec.pbcType == PbcType::No)
2762                                   ? (1 - 1 / static_cast<double>(dd->nnodes))
2763                                   : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2764                                      / static_cast<double>(dd->nnodes));
2765     if (debug)
2766     {
2767         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2768     }
2769     int natoms_tot = mtop.natoms;
2770
2771     dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2772 }
2773
2774 /*! \brief Get some important DD parameters which can be modified by env.vars */
2775 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
2776                                 const DomdecOptions&     options,
2777                                 const gmx::MdrunOptions& mdrunOptions,
2778                                 const t_inputrec&        ir)
2779 {
2780     DDSettings ddSettings;
2781
2782     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2783     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2784     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2785     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2786     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2787     const int recload              = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2788     ddSettings.nstDDDump           = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2789     ddSettings.nstDDDumpGrid       = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2790     ddSettings.DD_debug            = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2791
2792     if (ddSettings.useSendRecv2)
2793     {
2794         GMX_LOG(mdlog.info)
2795                 .appendText(
2796                         "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2797                         "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
2798                         "communication");
2799     }
2800
2801     if (ddSettings.eFlop)
2802     {
2803         GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2804         ddSettings.recordLoad = true;
2805     }
2806     else
2807     {
2808         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2809     }
2810
2811     ddSettings.initialDlbState =
2812             determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
2813     GMX_LOG(mdlog.info)
2814             .appendTextFormatted("Dynamic load balancing: %s",
2815                                  enumValueToString(ddSettings.initialDlbState));
2816
2817     return ddSettings;
2818 }
2819
2820 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
2821
2822 gmx_domdec_t::~gmx_domdec_t() = default;
2823
2824 namespace gmx
2825 {
2826
2827 // TODO once the functionality stabilizes, move this class and
2828 // supporting functionality into builder.cpp
2829 /*! \brief Impl class for DD builder */
2830 class DomainDecompositionBuilder::Impl
2831 {
2832 public:
2833     //! Constructor
2834     Impl(const MDLogger&                   mdlog,
2835          t_commrec*                        cr,
2836          const DomdecOptions&              options,
2837          const MdrunOptions&               mdrunOptions,
2838          const gmx_mtop_t&                 mtop,
2839          const t_inputrec&                 ir,
2840          const MDModulesNotifiers&         notifiers,
2841          const matrix                      box,
2842          ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2843          bool                              useUpdateGroups,
2844          real                              maxUpdateGroupRadius,
2845          ArrayRef<const RVec>              xGlobal,
2846          bool                              useGpuForNonbonded,
2847          bool                              useGpuForPme);
2848
2849     //! Build the resulting DD manager
2850     gmx_domdec_t* build(LocalAtomSetManager* atomSets);
2851
2852     //! Objects used in constructing and configuring DD
2853     //! {
2854     //! Logging object
2855     const MDLogger& mdlog_;
2856     //! Communication object
2857     t_commrec* cr_;
2858     //! User-supplied options configuring DD behavior
2859     const DomdecOptions options_;
2860     //! Global system topology
2861     const gmx_mtop_t& mtop_;
2862     //! User input values from the tpr file
2863     const t_inputrec& ir_;
2864     //! MdModules object
2865     const MDModulesNotifiers& notifiers_;
2866     //! }
2867
2868     //! Internal objects used in constructing DD
2869     //! {
2870     //! Settings combined from the user input
2871     DDSettings ddSettings_;
2872     //! Information derived from the simulation system
2873     DDSystemInfo systemInfo_;
2874     //! Box structure
2875     gmx_ddbox_t ddbox_ = { 0 };
2876     //! Organization of the DD grids
2877     DDGridSetup ddGridSetup_;
2878     //! Organization of the DD ranks
2879     DDRankSetup ddRankSetup_;
2880     //! Number of DD cells in each dimension
2881     ivec ddCellIndex_ = { 0, 0, 0 };
2882     //! IDs of PME-only ranks
2883     std::vector<int> pmeRanks_;
2884     //! Contains a valid Cartesian-communicator-based setup, or defaults.
2885     CartesianRankSetup cartSetup_;
2886     //! }
2887 };
2888
2889 DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
2890                                        t_commrec*                        cr,
2891                                        const DomdecOptions&              options,
2892                                        const MdrunOptions&               mdrunOptions,
2893                                        const gmx_mtop_t&                 mtop,
2894                                        const t_inputrec&                 ir,
2895                                        const MDModulesNotifiers&         notifiers,
2896                                        const matrix                      box,
2897                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2898                                        const bool                        useUpdateGroups,
2899                                        const real                        maxUpdateGroupRadius,
2900                                        ArrayRef<const RVec>              xGlobal,
2901                                        bool                              useGpuForNonbonded,
2902                                        bool                              useGpuForPme) :
2903     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2904 {
2905     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
2906
2907     ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
2908
2909     if (ddSettings_.eFlop > 1)
2910     {
2911         /* Ensure that we have different random flop counts on different ranks */
2912         srand(1 + cr_->rankInDefaultCommunicator);
2913     }
2914
2915     systemInfo_ = getSystemInfo(mdlog_,
2916                                 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2917                                 cr->mpiDefaultCommunicator,
2918                                 options_,
2919                                 mtop_,
2920                                 ir_,
2921                                 box,
2922                                 updateGroupingPerMoleculeType,
2923                                 useUpdateGroups,
2924                                 maxUpdateGroupRadius,
2925                                 xGlobal);
2926
2927     const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
2928     const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2929
2931     /* Checks for ability to use PME-only ranks */
2932     auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2933             notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2934
2935     /* Checks for validity of the requested rank setup */
2936     checkForValidRankCountRequests(numRanksRequested,
2937                                    EEL_PME(ir_.coulombtype) || EVDW_PME(ir_.vdwtype),
2938                                    options_.numPmeRanks,
2939                                    separatePmeRanksPermitted,
2940                                    checkForLargePrimeFactors);
2941
2942     // DD grid setup uses a different cell size limit for
2943     // automated setup than the one in systemInfo_. The latter is used
2944     // in set_dd_limits() to configure DLB, for example.
2945     const real gridSetupCellsizeLimit =
2946             getDDGridSetupCellSizeLimit(mdlog_,
2947                                         !isDlbDisabled(ddSettings_.initialDlbState),
2948                                         options_.dlbScaling,
2949                                         ir_,
2950                                         systemInfo_.cellsizeLimit);
2951     ddGridSetup_ = getDDGridSetup(mdlog_,
2952                                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2953                                   cr->mpiDefaultCommunicator,
2954                                   numRanksRequested,
2955                                   options_,
2956                                   ddSettings_,
2957                                   systemInfo_,
2958                                   gridSetupCellsizeLimit,
2959                                   mtop_,
2960                                   ir_,
2961                                   separatePmeRanksPermitted,
2962                                   box,
2963                                   xGlobal,
2964                                   &ddbox_);
2965     checkDDGridSetup(ddGridSetup_,
2966                      MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2967                      cr->mpiDefaultCommunicator,
2968                      cr->sizeOfDefaultCommunicator,
2969                      options_,
2970                      ddSettings_,
2971                      systemInfo_,
2972                      gridSetupCellsizeLimit,
2973                      ddbox_);
2974
2975     cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2976
2977     ddRankSetup_ = getDDRankSetup(
2978             mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder, ddGridSetup_, ir_);
2979
2980     /* Generate the group communicator, also decides the duty of each rank */
2981     cartSetup_ = makeGroupCommunicators(
2982             mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
2983 }
2984
2985 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets)
2986 {
2987     gmx_domdec_t* dd = new gmx_domdec_t(ir_);
2988
2989     copy_ivec(ddCellIndex_, dd->ci);
2990
2991     dd->comm = init_dd_comm();
2992
2993     dd->comm->ddRankSetup        = ddRankSetup_;
2994     dd->comm->cartesianRankSetup = cartSetup_;
2995
2996     set_dd_limits(mdlog_,
2997                   MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2998                   dd,
2999                   options_,
3000                   ddSettings_,
3001                   systemInfo_,
3002                   ddGridSetup_,
3003                   ddRankSetup_.numPPRanks,
3004                   mtop_,
3005                   ir_,
3006                   ddbox_);
3007
3008     setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
3009
3010     if (thisRankHasDuty(cr_, DUTY_PP))
3011     {
3012         set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3013
3014         setup_neighbor_relations(dd);
3015     }
3016
3017     /* Set overallocation to avoid frequent reallocation of arrays */
3018     set_over_alloc_dd(true);
3019
3020     dd->atomSets = atomSets;
3021
3022     dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>(
3023             mdlog_, cr_, mtop_, dd->comm->systemInfo.useUpdateGroups);
3024
3025     return dd;
3026 }
3027
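/* Typical usage, as a minimal sketch only: the real call site in the mdrun
 * setup code supplies the actual arguments, and the variable names below are
 * placeholders for illustration:
 *
 *   gmx::DomainDecompositionBuilder ddBuilder(mdlog, cr, ddOptions, mdrunOptions,
 *                                             mtop, inputrec, notifiers, box,
 *                                             updateGroupingPerMoleculeType,
 *                                             useUpdateGroups, maxUpdateGroupRadius,
 *                                             xGlobal, useGpuForNonbonded, useGpuForPme);
 *   gmx_domdec_t* dd = ddBuilder.build(&localAtomSetManager);
 */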
3028 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
3029                                                        t_commrec*                cr,
3030                                                        const DomdecOptions&      options,
3031                                                        const MdrunOptions&       mdrunOptions,
3032                                                        const gmx_mtop_t&         mtop,
3033                                                        const t_inputrec&         ir,
3034                                                        const MDModulesNotifiers& notifiers,
3035                                                        const matrix              box,
3036                                                        ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3037                                                        const bool           useUpdateGroups,
3038                                                        const real           maxUpdateGroupRadius,
3039                                                        ArrayRef<const RVec> xGlobal,
3040                                                        const bool           useGpuForNonbonded,
3041                                                        const bool           useGpuForPme) :
3042     impl_(new Impl(mdlog,
3043                    cr,
3044                    options,
3045                    mdrunOptions,
3046                    mtop,
3047                    ir,
3048                    notifiers,
3049                    box,
3050                    updateGroupingPerMoleculeType,
3051                    useUpdateGroups,
3052                    maxUpdateGroupRadius,
3053                    xGlobal,
3054                    useGpuForNonbonded,
3055                    useGpuForPme))
3056 {
3057 }
3058
3059 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager* atomSets)
3060 {
3061     return impl_->build(atomSets);
3062 }
3063
3064 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
3065
3066 } // namespace gmx
3067
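/*! \brief Returns whether the DD grid can support the requested cut-off
 *
 * Checks, for each decomposed dimension, that the cut-off fits within the
 * available cells/pulses; when DLB is in use, local limitations are summed
 * over all ranks before the final decision.
 */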
3068 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3069 {
3070     gmx_ddbox_t ddbox;
3071     int         LocallyLimited = 0;
3072
3073     const auto* dd = cr->dd;
3074
3075     set_ddbox(*dd, false, box, true, x, &ddbox);
3076
3079     for (int d = 0; d < dd->ndim; d++)
3080     {
3081         const int dim = dd->dim[d];
3082
3083         real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3084         if (dd->unitCellInfo.ddBoxIsDynamic)
3085         {
3086             inv_cell_size *= DD_PRES_SCALE_MARGIN;
3087         }
3088
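        /* Number of cells (communication pulses) along this dimension that the
         * requested cut-off needs to span */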
3089         const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3090
3091         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3092         {
3093             if (np > dd->comm->cd[d].np_dlb)
3094             {
3095                 return FALSE;
3096             }
3097
3098             /* If the current local cell size is smaller than the requested
3099              * cut-off, we could still fix it, but this gets very complicated.
3100              * Without fixing it here, we might need additional checks elsewhere.
3101              */
3102             real cellSizeAlongDim =
3103                     (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3104             if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3105             {
3106                 LocallyLimited = 1;
3107             }
3108         }
3109     }
3110
3111     if (!isDlbDisabled(dd->comm))
3112     {
3113         /* If DLB is not active yet, we don't need to check the grid jumps.
3114          * Actually we shouldn't, because then the grid jump data is not set.
3115          */
3116         if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3117         {
3118             LocallyLimited = 1;
3119         }
3120
3121         gmx_sumi(1, &LocallyLimited, cr);
3122
3123         if (LocallyLimited > 0)
3124         {
3125             return FALSE;
3126         }
3127     }
3128
3129     return TRUE;
3130 }
3131
3132 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3133 {
3134     bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3135
3136     if (bCutoffAllowed)
3137     {
3138         cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3139     }
3140
3141     return bCutoffAllowed;
3142 }
3143
3144 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
3145                               const t_commrec&                cr,
3146                               const gmx::DeviceStreamManager& deviceStreamManager,
3147                               gmx_wallcycle*                  wcycle)
3148 {
3149     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3150                        "Local non-bonded stream should be valid when using "
3151                        "GPU halo exchange.");
3152     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3153                        "Non-local non-bonded stream should be valid when using "
3154                        "GPU halo exchange.");
3155
3156     if (cr.dd->gpuHaloExchange[0].empty())
3157     {
3158         GMX_LOG(mdlog.warning)
3159                 .asParagraph()
3160                 .appendTextFormatted(
3161                         "NOTE: Activating the 'GPU halo exchange' feature, enabled by the "
3162                         "GMX_GPU_DD_COMMS environment variable.");
3164     }
3165
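    /* One GpuHaloExchange object is needed per decomposed dimension and per
     * communication pulse; only pulses that do not exist yet are added, so
     * calling this again after repartitioning extends the existing set. */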
3166     for (int d = 0; d < cr.dd->ndim; d++)
3167     {
3168         for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3169         {
3170             cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3171                     cr.dd,
3172                     d,
3173                     cr.mpi_comm_mygroup,
3174                     deviceStreamManager.context(),
3175                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3176                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3177                     pulse,
3178                     wcycle));
3179         }
3180     }
3181 }
3182
3183 void reinitGpuHaloExchange(const t_commrec&              cr,
3184                            const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3185                            const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3186 {
3187     for (int d = 0; d < cr.dd->ndim; d++)
3188     {
3189         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3190         {
3191             cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3192         }
3193     }
3194 }
3195
3196 void communicateGpuHaloCoordinates(const t_commrec&      cr,
3197                                    const matrix          box,
3198                                    GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3199 {
3200     for (int d = 0; d < cr.dd->ndim; d++)
3201     {
3202         for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3203         {
3204             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3205         }
3206     }
3207 }
3208
3209 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3210 {
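    /* Reduce forces in the reverse order of the coordinate halo exchange:
     * highest dimension and outermost pulse first. */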
3211     for (int d = cr.dd->ndim - 1; d >= 0; d--)
3212     {
3213         for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3214         {
3215             cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3216         }
3217     }
3218 }
3219
3220 const gmx::LocalTopologyChecker& dd_localTopologyChecker(const gmx_domdec_t& dd)
3221 {
3222     return *dd.localTopologyChecker;
3223 }
3224
3225 gmx::LocalTopologyChecker* dd_localTopologyChecker(gmx_domdec_t* dd)
3226 {
3227     return dd->localTopologyChecker.get();
3228 }
3229
3230 void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
3231 {
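    /* Broadcast from the DD master the sizes needed to set up the local state:
     * state flags, number of T-coupling groups, number of Nose-Hoover pressure
     * chains, NH chain length and the number of dfhist lambda values. */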
3232     std::array<int, 5> buf;
3233
3234     if (DDMASTER(dd))
3235     {
3236         buf[0] = state_global->flags;
3237         buf[1] = state_global->ngtc;
3238         buf[2] = state_global->nnhpres;
3239         buf[3] = state_global->nhchainlength;
3240         buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
3241     }
3242     dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
3243
3244     init_gtc_state(state_local, buf[1], buf[2], buf[3]);
3245     init_dfhist_state(state_local, buf[4]);
3246     state_local->flags = buf[0];
3247 }
3248
3249 void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t&      dd,
3250                                             const gmx_mtop_t&        mtop,
3251                                             const matrix             box,
3252                                             gmx::ArrayRef<gmx::RVec> positions)
3253 {
3254     int atomOffset = 0;
3255     for (const gmx_molblock_t& molblock : mtop.molblock)
3256     {
3257         const auto& updateGrouping = dd.comm->systemInfo.updateGroupingsPerMoleculeType[molblock.type];
3258
3259         for (int mol = 0; mol < molblock.nmol; mol++)
3260         {
3261             for (int g = 0; g < updateGrouping.numBlocks(); g++)
3262             {
3263                 const auto& block     = updateGrouping.block(g);
3264                 const int   atomBegin = atomOffset + block.begin();
3265                 const int   atomEnd   = atomOffset + block.end();
3266                 for (int a = atomBegin + 1; a < atomEnd; a++)
3267                 {
3268                     // Make sure that atoms in the same update group
3269                     // are in the same periodic image after restarts.
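                    // Dimensions are handled from high to low: box vectors of
                    // lower dimensions have no components in higher dimensions,
                    // so later shifts cannot undo the corrections already made.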
3270                     for (int d = DIM - 1; d >= 0; d--)
3271                     {
3272                         while (positions[a][d] - positions[atomBegin][d] > 0.5_real * box[d][d])
3273                         {
3274                             positions[a] -= box[d];
3275                         }
3276                         while (positions[a][d] - positions[atomBegin][d] < -0.5_real * box[d][d])
3277                         {
3278                             positions[a] += box[d];
3279                         }
3280                     }
3281                 }
3282             }
3283             atomOffset += updateGrouping.fullRange().end();
3284         }
3285     }
3286 }