2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/compat/make_unique.h"
53 #include "gromacs/domdec/collect.h"
54 #include "gromacs/domdec/dlb.h"
55 #include "gromacs/domdec/dlbtiming.h"
56 #include "gromacs/domdec/domdec_network.h"
57 #include "gromacs/domdec/ga2la.h"
58 #include "gromacs/domdec/partition.h"
59 #include "gromacs/gmxlib/network.h"
60 #include "gromacs/gmxlib/nrnb.h"
61 #include "gromacs/gpu_utils/gpu_utils.h"
62 #include "gromacs/hardware/hw_info.h"
63 #include "gromacs/listed_forces/manage-threading.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/math/vectypes.h"
66 #include "gromacs/mdlib/calc_verletbuf.h"
67 #include "gromacs/mdlib/constr.h"
68 #include "gromacs/mdlib/constraintrange.h"
69 #include "gromacs/mdlib/mdrun.h"
70 #include "gromacs/mdlib/updategroups.h"
71 #include "gromacs/mdlib/vsite.h"
72 #include "gromacs/mdtypes/commrec.h"
73 #include "gromacs/mdtypes/inputrec.h"
74 #include "gromacs/mdtypes/state.h"
75 #include "gromacs/pbcutil/ishift.h"
76 #include "gromacs/pbcutil/pbc.h"
77 #include "gromacs/pulling/pull.h"
78 #include "gromacs/timing/wallcycle.h"
79 #include "gromacs/topology/block.h"
80 #include "gromacs/topology/idef.h"
81 #include "gromacs/topology/ifunc.h"
82 #include "gromacs/topology/mtop_lookup.h"
83 #include "gromacs/topology/mtop_util.h"
84 #include "gromacs/topology/topology.h"
85 #include "gromacs/utility/basedefinitions.h"
86 #include "gromacs/utility/basenetwork.h"
87 #include "gromacs/utility/cstringutil.h"
88 #include "gromacs/utility/exceptions.h"
89 #include "gromacs/utility/fatalerror.h"
90 #include "gromacs/utility/gmxmpi.h"
91 #include "gromacs/utility/logger.h"
92 #include "gromacs/utility/real.h"
93 #include "gromacs/utility/smalloc.h"
94 #include "gromacs/utility/strconvert.h"
95 #include "gromacs/utility/stringstream.h"
96 #include "gromacs/utility/stringutil.h"
97 #include "gromacs/utility/textwriter.h"
99 #include "atomdistribution.h"
101 #include "cellsizes.h"
102 #include "distribute.h"
103 #include "domdec_constraints.h"
104 #include "domdec_internal.h"
105 #include "domdec_vsite.h"
106 #include "redistribute.h"
109 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
111 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
114 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
115 #define DD_FLAG_NRCG 65535
116 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
117 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
119 /* The DD zone order */
120 static const ivec dd_zo[DD_MAXZONE] =
121 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
123 /* The non-bonded zone-pair setup for domain decomposition
124 * The first number is the i-zone, the second number the first j-zone seen by
125 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
126 * As is, this is for 3D decomposition, where there are 4 i-zones.
127 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
128 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
131 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
138 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
140 static void index2xyz(ivec nc,int ind,ivec xyz)
142 xyz[XX] = ind % nc[XX];
143 xyz[YY] = (ind / nc[XX]) % nc[YY];
144 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
148 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
150 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
151 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
152 xyz[ZZ] = ind % nc[ZZ];
155 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
160 ddindex = dd_index(dd->nc, c);
161 if (dd->comm->bCartesianPP_PME)
163 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
165 else if (dd->comm->bCartesianPP)
168 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
179 int ddglatnr(const gmx_domdec_t *dd, int i)
189 if (i >= dd->comm->atomRanges.numAtomsTotal())
191 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
193 atnr = dd->globalAtomIndices[i] + 1;
199 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
201 return &dd->comm->cgs_gl;
204 void dd_store_state(gmx_domdec_t *dd, t_state *state)
208 if (state->ddp_count != dd->ddp_count)
210 gmx_incons("The MD state does not match the domain decomposition state");
213 state->cg_gl.resize(dd->ncg_home);
214 for (i = 0; i < dd->ncg_home; i++)
216 state->cg_gl[i] = dd->globalAtomGroupIndices[i];
219 state->ddp_count_cg_gl = dd->ddp_count;
222 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
224 return &dd->comm->zones;
227 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
228 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
230 gmx_domdec_zones_t *zones;
233 zones = &dd->comm->zones;
236 while (icg >= zones->izone[izone].cg1)
245 else if (izone < zones->nizone)
247 *jcg0 = zones->izone[izone].jcg0;
251 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
252 icg, izone, zones->nizone);
255 *jcg1 = zones->izone[izone].jcg1;
257 for (d = 0; d < dd->ndim; d++)
260 shift0[dim] = zones->izone[izone].shift0[dim];
261 shift1[dim] = zones->izone[izone].shift1[dim];
262 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
264 /* A conservative approach, this can be optimized */
271 int dd_numHomeAtoms(const gmx_domdec_t &dd)
273 return dd.comm->atomRanges.numHomeAtoms();
276 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
278 /* We currently set mdatoms entries for all atoms:
279 * local + non-local + communicated for vsite + constraints
282 return dd->comm->atomRanges.numAtomsTotal();
285 int dd_natoms_vsite(const gmx_domdec_t *dd)
287 return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
290 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
292 *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
293 *at_end = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
296 void dd_move_x(gmx_domdec_t *dd,
298 gmx::ArrayRef<gmx::RVec> x,
299 gmx_wallcycle *wcycle)
301 wallcycle_start(wcycle, ewcMOVEX);
304 gmx_domdec_comm_t *comm;
305 gmx_domdec_comm_dim_t *cd;
306 rvec shift = {0, 0, 0};
307 gmx_bool bPBC, bScrew;
311 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
314 nat_tot = comm->atomRanges.numHomeAtoms();
315 for (int d = 0; d < dd->ndim; d++)
317 bPBC = (dd->ci[dd->dim[d]] == 0);
318 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
321 copy_rvec(box[dd->dim[d]], shift);
324 for (const gmx_domdec_ind_t &ind : cd->ind)
326 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
327 gmx::ArrayRef<gmx::RVec> &sendBuffer = sendBufferAccess.buffer;
331 for (int g : ind.index)
333 for (int j : atomGrouping.block(g))
335 sendBuffer[n] = x[j];
342 for (int g : ind.index)
344 for (int j : atomGrouping.block(g))
346 /* We need to shift the coordinates */
347 for (int d = 0; d < DIM; d++)
349 sendBuffer[n][d] = x[j][d] + shift[d];
357 for (int g : ind.index)
359 for (int j : atomGrouping.block(g))
362 sendBuffer[n][XX] = x[j][XX] + shift[XX];
364 * This operation requires a special shift force
365 * treatment, which is performed in calc_vir.
367 sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
368 sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
374 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
376 gmx::ArrayRef<gmx::RVec> receiveBuffer;
377 if (cd->receiveInPlace)
379 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
383 receiveBuffer = receiveBufferAccess.buffer;
385 /* Send and receive the coordinates */
386 ddSendrecv(dd, d, dddirBackward,
387 sendBuffer, receiveBuffer);
389 if (!cd->receiveInPlace)
392 for (int zone = 0; zone < nzone; zone++)
394 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
396 x[i] = receiveBuffer[j++];
400 nat_tot += ind.nrecv[nzone+1];
405 wallcycle_stop(wcycle, ewcMOVEX);
408 void dd_move_f(gmx_domdec_t *dd,
409 gmx::ArrayRef<gmx::RVec> f,
411 gmx_wallcycle *wcycle)
413 wallcycle_start(wcycle, ewcMOVEF);
416 gmx_domdec_comm_t *comm;
417 gmx_domdec_comm_dim_t *cd;
420 gmx_bool bShiftForcesNeedPbc, bScrew;
424 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
426 nzone = comm->zones.n/2;
427 nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
428 for (int d = dd->ndim-1; d >= 0; d--)
430 /* Only forces in domains near the PBC boundaries need to
431 consider PBC in the treatment of fshift */
432 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
433 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
434 if (fshift == nullptr && !bScrew)
436 bShiftForcesNeedPbc = FALSE;
438 /* Determine which shift vector we need */
444 for (int p = cd->numPulses() - 1; p >= 0; p--)
446 const gmx_domdec_ind_t &ind = cd->ind[p];
447 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
448 gmx::ArrayRef<gmx::RVec> &receiveBuffer = receiveBufferAccess.buffer;
450 nat_tot -= ind.nrecv[nzone+1];
452 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
454 gmx::ArrayRef<gmx::RVec> sendBuffer;
455 if (cd->receiveInPlace)
457 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
461 sendBuffer = sendBufferAccess.buffer;
463 for (int zone = 0; zone < nzone; zone++)
465 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
467 sendBuffer[j++] = f[i];
471 /* Communicate the forces */
472 ddSendrecv(dd, d, dddirForward,
473 sendBuffer, receiveBuffer);
474 /* Add the received forces */
476 if (!bShiftForcesNeedPbc)
478 for (int g : ind.index)
480 for (int j : atomGrouping.block(g))
482 for (int d = 0; d < DIM; d++)
484 f[j][d] += receiveBuffer[n][d];
492 /* fshift should always be defined if this function is
493 * called when bShiftForcesNeedPbc is true */
494 assert(nullptr != fshift);
495 for (int g : ind.index)
497 for (int j : atomGrouping.block(g))
499 for (int d = 0; d < DIM; d++)
501 f[j][d] += receiveBuffer[n][d];
503 /* Add this force to the shift force */
504 for (int d = 0; d < DIM; d++)
506 fshift[is][d] += receiveBuffer[n][d];
514 for (int g : ind.index)
516 for (int j : atomGrouping.block(g))
518 /* Rotate the force */
519 f[j][XX] += receiveBuffer[n][XX];
520 f[j][YY] -= receiveBuffer[n][YY];
521 f[j][ZZ] -= receiveBuffer[n][ZZ];
524 /* Add this force to the shift force */
525 for (int d = 0; d < DIM; d++)
527 fshift[is][d] += receiveBuffer[n][d];
537 wallcycle_stop(wcycle, ewcMOVEF);
540 /* Convenience function for extracting a real buffer from an rvec buffer
542 * To reduce the number of temporary communication buffers and avoid
543 * cache polution, we reuse gmx::RVec buffers for storing reals.
544 * This functions return a real buffer reference with the same number
545 * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
547 static gmx::ArrayRef<real>
548 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
550 return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
554 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
557 gmx_domdec_comm_t *comm;
558 gmx_domdec_comm_dim_t *cd;
562 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
565 nat_tot = comm->atomRanges.numHomeAtoms();
566 for (int d = 0; d < dd->ndim; d++)
569 for (const gmx_domdec_ind_t &ind : cd->ind)
571 /* Note: We provision for RVec instead of real, so a factor of 3
572 * more than needed. The buffer actually already has this size
573 * and we pass a plain pointer below, so this does not matter.
575 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
576 gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
578 for (int g : ind.index)
580 for (int j : atomGrouping.block(g))
582 sendBuffer[n++] = v[j];
586 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
588 gmx::ArrayRef<real> receiveBuffer;
589 if (cd->receiveInPlace)
591 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
595 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
597 /* Send and receive the data */
598 ddSendrecv(dd, d, dddirBackward,
599 sendBuffer, receiveBuffer);
600 if (!cd->receiveInPlace)
603 for (int zone = 0; zone < nzone; zone++)
605 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
607 v[i] = receiveBuffer[j++];
611 nat_tot += ind.nrecv[nzone+1];
617 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
620 gmx_domdec_comm_t *comm;
621 gmx_domdec_comm_dim_t *cd;
625 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
627 nzone = comm->zones.n/2;
628 nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
629 for (int d = dd->ndim-1; d >= 0; d--)
632 for (int p = cd->numPulses() - 1; p >= 0; p--)
634 const gmx_domdec_ind_t &ind = cd->ind[p];
636 /* Note: We provision for RVec instead of real, so a factor of 3
637 * more than needed. The buffer actually already has this size
638 * and we typecast, so this works as intended.
640 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
641 gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
642 nat_tot -= ind.nrecv[nzone + 1];
644 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
646 gmx::ArrayRef<real> sendBuffer;
647 if (cd->receiveInPlace)
649 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
653 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
655 for (int zone = 0; zone < nzone; zone++)
657 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
659 sendBuffer[j++] = v[i];
663 /* Communicate the forces */
664 ddSendrecv(dd, d, dddirForward,
665 sendBuffer, receiveBuffer);
666 /* Add the received forces */
668 for (int g : ind.index)
670 for (int j : atomGrouping.block(g))
672 v[j] += receiveBuffer[n];
681 real dd_cutoff_multibody(const gmx_domdec_t *dd)
683 gmx_domdec_comm_t *comm;
690 if (comm->bInterCGBondeds)
692 if (comm->cutoff_mbody > 0)
694 r = comm->cutoff_mbody;
698 /* cutoff_mbody=0 means we do not have DLB */
699 r = comm->cellsize_min[dd->dim[0]];
700 for (di = 1; di < dd->ndim; di++)
702 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
706 r = std::max(r, comm->cutoff_mbody);
710 r = std::min(r, comm->cutoff);
718 real dd_cutoff_twobody(const gmx_domdec_t *dd)
722 r_mb = dd_cutoff_multibody(dd);
724 return std::max(dd->comm->cutoff, r_mb);
728 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
733 nc = dd->nc[dd->comm->cartpmedim];
734 ntot = dd->comm->ntot[dd->comm->cartpmedim];
735 copy_ivec(coord, coord_pme);
736 coord_pme[dd->comm->cartpmedim] =
737 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
740 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
745 npme = dd->comm->npmenodes;
747 /* Here we assign a PME node to communicate with this DD node
748 * by assuming that the major index of both is x.
749 * We add cr->npmenodes/2 to obtain an even distribution.
751 return (ddindex*npme + npme/2)/npp;
754 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
759 snew(pme_rank, dd->comm->npmenodes);
761 for (i = 0; i < dd->nnodes; i++)
763 p0 = ddindex2pmeindex(dd, i);
764 p1 = ddindex2pmeindex(dd, i+1);
765 if (i+1 == dd->nnodes || p1 > p0)
769 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
771 pme_rank[n] = i + 1 + n;
779 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
787 if (dd->comm->bCartesian) {
788 gmx_ddindex2xyz(dd->nc,ddindex,coords);
789 dd_coords2pmecoords(dd,coords,coords_pme);
790 copy_ivec(dd->ntot,nc);
791 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
792 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
794 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
796 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
802 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
807 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
809 gmx_domdec_comm_t *comm;
811 int ddindex, nodeid = -1;
818 if (comm->bCartesianPP_PME)
821 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
826 ddindex = dd_index(cr->dd->nc, coords);
827 if (comm->bCartesianPP)
829 nodeid = comm->ddindex2simnodeid[ddindex];
835 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
847 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
848 const t_commrec gmx_unused *cr,
853 const gmx_domdec_comm_t *comm = dd->comm;
855 /* This assumes a uniform x domain decomposition grid cell size */
856 if (comm->bCartesianPP_PME)
859 ivec coord, coord_pme;
860 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
861 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
863 /* This is a PP node */
864 dd_cart_coord2pmecoord(dd, coord, coord_pme);
865 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
869 else if (comm->bCartesianPP)
871 if (sim_nodeid < dd->nnodes)
873 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
878 /* This assumes DD cells with identical x coordinates
879 * are numbered sequentially.
881 if (dd->comm->pmenodes == nullptr)
883 if (sim_nodeid < dd->nnodes)
885 /* The DD index equals the nodeid */
886 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
892 while (sim_nodeid > dd->comm->pmenodes[i])
896 if (sim_nodeid < dd->comm->pmenodes[i])
898 pmenode = dd->comm->pmenodes[i];
906 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
911 dd->comm->npmenodes_x, dd->comm->npmenodes_y
922 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
926 ivec coord, coord_pme;
930 std::vector<int> ddranks;
931 ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
933 for (x = 0; x < dd->nc[XX]; x++)
935 for (y = 0; y < dd->nc[YY]; y++)
937 for (z = 0; z < dd->nc[ZZ]; z++)
939 if (dd->comm->bCartesianPP_PME)
944 dd_cart_coord2pmecoord(dd, coord, coord_pme);
945 if (dd->ci[XX] == coord_pme[XX] &&
946 dd->ci[YY] == coord_pme[YY] &&
947 dd->ci[ZZ] == coord_pme[ZZ])
949 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
954 /* The slab corresponds to the nodeid in the PME group */
955 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
957 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
966 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
968 gmx_bool bReceive = TRUE;
970 if (cr->npmenodes < dd->nnodes)
972 gmx_domdec_comm_t *comm = dd->comm;
973 if (comm->bCartesianPP_PME)
976 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
978 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
979 coords[comm->cartpmedim]++;
980 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
983 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
984 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
986 /* This is not the last PP node for pmenode */
991 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
996 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
997 if (cr->sim_nodeid+1 < cr->nnodes &&
998 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
1000 /* This is not the last PP node for pmenode */
1009 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
1011 gmx_domdec_comm_t *comm;
1016 snew(*dim_f, dd->nc[dim]+1);
1018 for (i = 1; i < dd->nc[dim]; i++)
1020 if (comm->slb_frac[dim])
1022 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1026 (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1029 (*dim_f)[dd->nc[dim]] = 1;
1032 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1034 int pmeindex, slab, nso, i;
1037 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1043 ddpme->dim = dimind;
1045 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1047 ddpme->nslab = (ddpme->dim == 0 ?
1048 dd->comm->npmenodes_x :
1049 dd->comm->npmenodes_y);
1051 if (ddpme->nslab <= 1)
1056 nso = dd->comm->npmenodes/ddpme->nslab;
1057 /* Determine for each PME slab the PP location range for dimension dim */
1058 snew(ddpme->pp_min, ddpme->nslab);
1059 snew(ddpme->pp_max, ddpme->nslab);
1060 for (slab = 0; slab < ddpme->nslab; slab++)
1062 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1063 ddpme->pp_max[slab] = 0;
1065 for (i = 0; i < dd->nnodes; i++)
1067 ddindex2xyz(dd->nc, i, xyz);
1068 /* For y only use our y/z slab.
1069 * This assumes that the PME x grid size matches the DD grid size.
1071 if (dimind == 0 || xyz[XX] == dd->ci[XX])
1073 pmeindex = ddindex2pmeindex(dd, i);
1076 slab = pmeindex/nso;
1080 slab = pmeindex % ddpme->nslab;
1082 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1083 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1087 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1090 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1092 if (dd->comm->ddpme[0].dim == XX)
1094 return dd->comm->ddpme[0].maxshift;
1102 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1104 if (dd->comm->ddpme[0].dim == YY)
1106 return dd->comm->ddpme[0].maxshift;
1108 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1110 return dd->comm->ddpme[1].maxshift;
1118 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1120 /* Note that the cycles value can be incorrect, either 0 or some
1121 * extremely large value, when our thread migrated to another core
1122 * with an unsynchronized cycle counter. If this happens less often
1123 * that once per nstlist steps, this will not cause issues, since
1124 * we later subtract the maximum value from the sum over nstlist steps.
1125 * A zero count will slightly lower the total, but that's a small effect.
1126 * Note that the main purpose of the subtraction of the maximum value
1127 * is to avoid throwing off the load balancing when stalls occur due
1128 * e.g. system activity or network congestion.
1130 dd->comm->cycl[ddCycl] += cycles;
1131 dd->comm->cycl_n[ddCycl]++;
1132 if (cycles > dd->comm->cycl_max[ddCycl])
1134 dd->comm->cycl_max[ddCycl] = cycles;
1139 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1144 gmx_bool bPartOfGroup = FALSE;
1146 dim = dd->dim[dim_ind];
1147 copy_ivec(loc, loc_c);
1148 for (i = 0; i < dd->nc[dim]; i++)
1151 rank = dd_index(dd->nc, loc_c);
1152 if (rank == dd->rank)
1154 /* This process is part of the group */
1155 bPartOfGroup = TRUE;
1158 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1162 dd->comm->mpi_comm_load[dim_ind] = c_row;
1163 if (!isDlbDisabled(dd->comm))
1165 DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1167 if (dd->ci[dim] == dd->master_ci[dim])
1169 /* This is the root process of this row */
1170 cellsizes.rowMaster = gmx::compat::make_unique<RowMaster>();
1172 RowMaster &rowMaster = *cellsizes.rowMaster;
1173 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1174 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1175 rowMaster.isCellMin.resize(dd->nc[dim]);
1178 rowMaster.bounds.resize(dd->nc[dim]);
1180 rowMaster.buf_ncd.resize(dd->nc[dim]);
1184 /* This is not a root process, we only need to receive cell_f */
1185 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1188 if (dd->ci[dim] == dd->master_ci[dim])
1190 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1196 void dd_setup_dlb_resource_sharing(t_commrec *cr,
1200 int physicalnode_id_hash;
1202 MPI_Comm mpi_comm_pp_physicalnode;
1204 if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1206 /* Only ranks with short-ranged tasks (currently) use GPUs.
1207 * If we don't have GPUs assigned, there are no resources to share.
1212 physicalnode_id_hash = gmx_physicalnode_id_hash();
1218 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1219 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1220 dd->rank, physicalnode_id_hash, gpu_id);
1222 /* Split the PP communicator over the physical nodes */
1223 /* TODO: See if we should store this (before), as it's also used for
1224 * for the nodecomm summation.
1226 // TODO PhysicalNodeCommunicator could be extended/used to handle
1227 // the need for per-node per-group communicators.
1228 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1229 &mpi_comm_pp_physicalnode);
1230 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1231 &dd->comm->mpi_comm_gpu_shared);
1232 MPI_Comm_free(&mpi_comm_pp_physicalnode);
1233 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1237 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1240 /* Note that some ranks could share a GPU, while others don't */
1242 if (dd->comm->nrank_gpu_shared == 1)
1244 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1247 GMX_UNUSED_VALUE(cr);
1248 GMX_UNUSED_VALUE(gpu_id);
1252 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1255 int dim0, dim1, i, j;
1260 fprintf(debug, "Making load communicators\n");
1263 snew(dd->comm->load, std::max(dd->ndim, 1));
1264 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1272 make_load_communicator(dd, 0, loc);
1276 for (i = 0; i < dd->nc[dim0]; i++)
1279 make_load_communicator(dd, 1, loc);
1285 for (i = 0; i < dd->nc[dim0]; i++)
1289 for (j = 0; j < dd->nc[dim1]; j++)
1292 make_load_communicator(dd, 2, loc);
1299 fprintf(debug, "Finished making load communicators\n");
1304 /*! \brief Sets up the relation between neighboring domains and zones */
1305 static void setup_neighbor_relations(gmx_domdec_t *dd)
1307 int d, dim, i, j, m;
1309 gmx_domdec_zones_t *zones;
1310 gmx_domdec_ns_ranges_t *izone;
1312 for (d = 0; d < dd->ndim; d++)
1315 copy_ivec(dd->ci, tmp);
1316 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
1317 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1318 copy_ivec(dd->ci, tmp);
1319 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1320 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1323 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1326 dd->neighbor[d][1]);
1330 int nzone = (1 << dd->ndim);
1331 int nizone = (1 << std::max(dd->ndim - 1, 0));
1332 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1334 zones = &dd->comm->zones;
1336 for (i = 0; i < nzone; i++)
1339 clear_ivec(zones->shift[i]);
1340 for (d = 0; d < dd->ndim; d++)
1342 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1347 for (i = 0; i < nzone; i++)
1349 for (d = 0; d < DIM; d++)
1351 s[d] = dd->ci[d] - zones->shift[i][d];
1356 else if (s[d] >= dd->nc[d])
1362 zones->nizone = nizone;
1363 for (i = 0; i < zones->nizone; i++)
1365 assert(ddNonbondedZonePairRanges[i][0] == i);
1367 izone = &zones->izone[i];
1368 /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1369 * j-zones up to nzone.
1371 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1372 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1373 for (dim = 0; dim < DIM; dim++)
1375 if (dd->nc[dim] == 1)
1377 /* All shifts should be allowed */
1378 izone->shift0[dim] = -1;
1379 izone->shift1[dim] = 1;
1383 /* Determine the min/max j-zone shift wrt the i-zone */
1384 izone->shift0[dim] = 1;
1385 izone->shift1[dim] = -1;
1386 for (j = izone->j0; j < izone->j1; j++)
1388 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1389 if (shift_diff < izone->shift0[dim])
1391 izone->shift0[dim] = shift_diff;
1393 if (shift_diff > izone->shift1[dim])
1395 izone->shift1[dim] = shift_diff;
1402 if (!isDlbDisabled(dd->comm))
1404 dd->comm->cellsizesWithDlb.resize(dd->ndim);
1407 if (dd->comm->bRecordLoad)
1409 make_load_communicators(dd);
1413 static void make_pp_communicator(const gmx::MDLogger &mdlog,
1415 t_commrec gmx_unused *cr,
1416 bool gmx_unused reorder)
1419 gmx_domdec_comm_t *comm;
1426 if (comm->bCartesianPP)
1428 /* Set up cartesian communication for the particle-particle part */
1429 GMX_LOG(mdlog.info).appendTextFormatted(
1430 "Will use a Cartesian communicator: %d x %d x %d",
1431 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1433 for (int i = 0; i < DIM; i++)
1437 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1439 /* We overwrite the old communicator with the new cartesian one */
1440 cr->mpi_comm_mygroup = comm_cart;
1443 dd->mpi_comm_all = cr->mpi_comm_mygroup;
1444 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1446 if (comm->bCartesianPP_PME)
1448 /* Since we want to use the original cartesian setup for sim,
1449 * and not the one after split, we need to make an index.
1451 snew(comm->ddindex2ddnodeid, dd->nnodes);
1452 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1453 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1454 /* Get the rank of the DD master,
1455 * above we made sure that the master node is a PP node.
1465 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1467 else if (comm->bCartesianPP)
1469 if (cr->npmenodes == 0)
1471 /* The PP communicator is also
1472 * the communicator for this simulation
1474 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1476 cr->nodeid = dd->rank;
1478 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1480 /* We need to make an index to go from the coordinates
1481 * to the nodeid of this simulation.
1483 snew(comm->ddindex2simnodeid, dd->nnodes);
1484 snew(buf, dd->nnodes);
1485 if (thisRankHasDuty(cr, DUTY_PP))
1487 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1489 /* Communicate the ddindex to simulation nodeid index */
1490 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1491 cr->mpi_comm_mysim);
1494 /* Determine the master coordinates and rank.
1495 * The DD master should be the same node as the master of this sim.
1497 for (int i = 0; i < dd->nnodes; i++)
1499 if (comm->ddindex2simnodeid[i] == 0)
1501 ddindex2xyz(dd->nc, i, dd->master_ci);
1502 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1507 fprintf(debug, "The master rank is %d\n", dd->masterrank);
1512 /* No Cartesian communicators */
1513 /* We use the rank in dd->comm->all as DD index */
1514 ddindex2xyz(dd->nc, dd->rank, dd->ci);
1515 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1517 clear_ivec(dd->master_ci);
1521 GMX_LOG(mdlog.info).appendTextFormatted(
1522 "Domain decomposition rank %d, coordinates %d %d %d\n",
1523 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1527 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1528 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1532 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
1536 gmx_domdec_comm_t *comm = dd->comm;
1538 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1541 snew(comm->ddindex2simnodeid, dd->nnodes);
1542 snew(buf, dd->nnodes);
1543 if (thisRankHasDuty(cr, DUTY_PP))
1545 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1547 /* Communicate the ddindex to simulation nodeid index */
1548 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1549 cr->mpi_comm_mysim);
1553 GMX_UNUSED_VALUE(dd);
1554 GMX_UNUSED_VALUE(cr);
1558 static void split_communicator(const gmx::MDLogger &mdlog,
1559 t_commrec *cr, gmx_domdec_t *dd,
1560 DdRankOrder gmx_unused rankOrder,
1561 bool gmx_unused reorder)
1563 gmx_domdec_comm_t *comm;
1572 if (comm->bCartesianPP)
1574 for (i = 1; i < DIM; i++)
1576 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1578 if (bDiv[YY] || bDiv[ZZ])
1580 comm->bCartesianPP_PME = TRUE;
1581 /* If we have 2D PME decomposition, which is always in x+y,
1582 * we stack the PME only nodes in z.
1583 * Otherwise we choose the direction that provides the thinnest slab
1584 * of PME only nodes as this will have the least effect
1585 * on the PP communication.
1586 * But for the PME communication the opposite might be better.
1588 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1590 dd->nc[YY] > dd->nc[ZZ]))
1592 comm->cartpmedim = ZZ;
1596 comm->cartpmedim = YY;
1598 comm->ntot[comm->cartpmedim]
1599 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1603 GMX_LOG(mdlog.info).appendTextFormatted(
1604 "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1605 cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1606 GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1610 if (comm->bCartesianPP_PME)
1616 GMX_LOG(mdlog.info).appendTextFormatted(
1617 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1618 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1620 for (i = 0; i < DIM; i++)
1624 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1626 MPI_Comm_rank(comm_cart, &rank);
1627 if (MASTER(cr) && rank != 0)
1629 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1632 /* With this assigment we loose the link to the original communicator
1633 * which will usually be MPI_COMM_WORLD, unless have multisim.
1635 cr->mpi_comm_mysim = comm_cart;
1636 cr->sim_nodeid = rank;
1638 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1640 GMX_LOG(mdlog.info).appendTextFormatted(
1641 "Cartesian rank %d, coordinates %d %d %d\n",
1642 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1644 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1648 if (cr->npmenodes == 0 ||
1649 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1651 cr->duty = DUTY_PME;
1654 /* Split the sim communicator into PP and PME only nodes */
1655 MPI_Comm_split(cr->mpi_comm_mysim,
1656 getThisRankDuties(cr),
1657 dd_index(comm->ntot, dd->ci),
1658 &cr->mpi_comm_mygroup);
1665 case DdRankOrder::pp_pme:
1666 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1668 case DdRankOrder::interleave:
1669 /* Interleave the PP-only and PME-only ranks */
1670 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1671 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1673 case DdRankOrder::cartesian:
1676 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1679 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1681 cr->duty = DUTY_PME;
1688 /* Split the sim communicator into PP and PME only nodes */
1689 MPI_Comm_split(cr->mpi_comm_mysim,
1690 getThisRankDuties(cr),
1692 &cr->mpi_comm_mygroup);
1693 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1697 GMX_LOG(mdlog.info).appendTextFormatted(
1698 "This rank does only %s work.\n",
1699 thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1702 /*! \brief Generates the MPI communicators for domain decomposition */
1703 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1705 gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1707 gmx_domdec_comm_t *comm;
1712 copy_ivec(dd->nc, comm->ntot);
1714 comm->bCartesianPP = (ddRankOrder == DdRankOrder::cartesian);
1715 comm->bCartesianPP_PME = FALSE;
1717 /* Reorder the nodes by default. This might change the MPI ranks.
1718 * Real reordering is only supported on very few architectures,
1719 * Blue Gene is one of them.
1721 CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1723 if (cr->npmenodes > 0)
1725 /* Split the communicator into a PP and PME part */
1726 split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1727 if (comm->bCartesianPP_PME)
1729 /* We (possibly) reordered the nodes in split_communicator,
1730 * so it is no longer required in make_pp_communicator.
1732 CartReorder = FALSE;
1737 /* All nodes do PP and PME */
1739 /* We do not require separate communicators */
1740 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1744 if (thisRankHasDuty(cr, DUTY_PP))
1746 /* Copy or make a new PP communicator */
1747 make_pp_communicator(mdlog, dd, cr, CartReorder);
1751 receive_ddindex2simnodeid(dd, cr);
1754 if (!thisRankHasDuty(cr, DUTY_PME))
1756 /* Set up the commnuication to our PME node */
1757 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1758 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1761 fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1762 dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1767 dd->pme_nodeid = -1;
1770 /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1773 dd->ma = gmx::compat::make_unique<AtomDistribution>(dd->nc,
1775 comm->cgs_gl.index[comm->cgs_gl.nr]);
1779 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1780 const char *dir, int nc, const char *size_string)
1782 real *slb_frac, tot;
1787 if (nc > 1 && size_string != nullptr)
1789 GMX_LOG(mdlog.info).appendTextFormatted(
1790 "Using static load balancing for the %s direction", dir);
1793 for (i = 0; i < nc; i++)
1796 sscanf(size_string, "%20lf%n", &dbl, &n);
1799 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1806 std::string relativeCellSizes = "Relative cell sizes:";
1807 for (i = 0; i < nc; i++)
1810 relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1812 GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1818 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1821 gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1823 while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1825 for (auto &ilist : extractILists(*ilists, IF_BOND))
1827 if (NRAL(ilist.functionType) > 2)
1829 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1837 static int dd_getenv(const gmx::MDLogger &mdlog,
1838 const char *env_var, int def)
1844 val = getenv(env_var);
1847 if (sscanf(val, "%20d", &nst) <= 0)
1851 GMX_LOG(mdlog.info).appendTextFormatted(
1852 "Found env.var. %s = %s, using value %d",
1859 static void check_dd_restrictions(const gmx_domdec_t *dd,
1860 const t_inputrec *ir,
1861 const gmx::MDLogger &mdlog)
1863 if (ir->ePBC == epbcSCREW &&
1864 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1866 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1869 if (ir->ns_type == ensSIMPLE)
1871 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1874 if (ir->nstlist == 0)
1876 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1879 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1881 GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1885 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1890 r = ddbox->box_size[XX];
1891 for (di = 0; di < dd->ndim; di++)
1894 /* Check using the initial average cell size */
1895 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1901 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1903 static DlbState forceDlbOffOrBail(DlbState cmdlineDlbState,
1904 const std::string &reasonStr,
1905 const gmx::MDLogger &mdlog)
1907 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1908 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
1910 if (cmdlineDlbState == DlbState::onUser)
1912 gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1914 else if (cmdlineDlbState == DlbState::offCanTurnOn)
1916 GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1918 return DlbState::offForever;
1921 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1923 * This function parses the parameters of "-dlb" command line option setting
1924 * corresponding state values. Then it checks the consistency of the determined
1925 * state with other run parameters and settings. As a result, the initial state
1926 * may be altered or an error may be thrown if incompatibility of options is detected.
1928 * \param [in] mdlog Logger.
1929 * \param [in] dlbOption Enum value for the DLB option.
1930 * \param [in] bRecordLoad True if the load balancer is recording load information.
1931 * \param [in] mdrunOptions Options for mdrun.
1932 * \param [in] ir Pointer mdrun to input parameters.
1933 * \returns DLB initial/startup state.
1935 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1936 DlbOption dlbOption, gmx_bool bRecordLoad,
1937 const MdrunOptions &mdrunOptions,
1938 const t_inputrec *ir)
1940 DlbState dlbState = DlbState::offCanTurnOn;
1944 case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1945 case DlbOption::no: dlbState = DlbState::offUser; break;
1946 case DlbOption::yes: dlbState = DlbState::onUser; break;
1947 default: gmx_incons("Invalid dlbOption enum value");
1950 /* Reruns don't support DLB: bail or override auto mode */
1951 if (mdrunOptions.rerun)
1953 std::string reasonStr = "it is not supported in reruns.";
1954 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1957 /* Unsupported integrators */
1958 if (!EI_DYNAMICS(ir->eI))
1960 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1961 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1964 /* Without cycle counters we can't time work to balance on */
1967 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1968 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1971 if (mdrunOptions.reproducible)
1973 std::string reasonStr = "you started a reproducible run.";
1976 case DlbState::offUser:
1978 case DlbState::offForever:
1979 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1981 case DlbState::offCanTurnOn:
1982 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1983 case DlbState::onCanTurnOff:
1984 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1986 case DlbState::onUser:
1987 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
1989 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
1996 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
1999 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
2001 /* Decomposition order z,y,x */
2002 GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
2003 for (int dim = DIM-1; dim >= 0; dim--)
2005 if (dd->nc[dim] > 1)
2007 dd->dim[dd->ndim++] = dim;
2013 /* Decomposition order x,y,z */
2014 for (int dim = 0; dim < DIM; dim++)
2016 if (dd->nc[dim] > 1)
2018 dd->dim[dd->ndim++] = dim;
2025 /* Set dim[0] to avoid extra checks on ndim in several places */
2030 static gmx_domdec_comm_t *init_dd_comm()
2032 gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2034 comm->n_load_have = 0;
2035 comm->n_load_collect = 0;
2037 comm->haveTurnedOffDlb = false;
2039 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2041 comm->sum_nat[i] = 0;
2045 comm->load_step = 0;
2048 clear_ivec(comm->load_lim);
2052 /* This should be replaced by a unique pointer */
2053 comm->balanceRegion = ddBalanceRegionAllocate();
2058 /* Returns whether mtop contains constraints and/or vsites */
2059 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2061 auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2063 while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2065 if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2074 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2075 const gmx_mtop_t &mtop,
2076 const t_inputrec &inputrec,
2078 int numMpiRanksTotal,
2079 gmx_domdec_comm_t *comm)
2081 /* When we have constraints and/or vsites, it is beneficial to use
2082 * update groups (when possible) to allow independent update of groups.
2084 if (!systemHasConstraintsOrVsites(mtop))
2086 /* No constraints or vsites, atoms can be updated independently */
2090 comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2091 comm->useUpdateGroups =
2092 (!comm->updateGroupingPerMoleculetype.empty() &&
2093 getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2095 if (comm->useUpdateGroups)
2097 int numUpdateGroups = 0;
2098 for (const auto &molblock : mtop.molblock)
2100 numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2103 /* Note: We would like to use dd->nnodes for the atom count estimate,
2104 * but that is not yet available here. But this anyhow only
2105 * affect performance up to the second dd_partition_system call.
2107 int homeAtomCountEstimate = mtop.natoms/numMpiRanksTotal;
2108 comm->updateGroupsCog =
2109 gmx::compat::make_unique<gmx::UpdateGroupsCog>(mtop,
2110 comm->updateGroupingPerMoleculetype,
2111 maxReferenceTemperature(inputrec),
2112 homeAtomCountEstimate);
2114 /* To use update groups, the large domain-to-domain cutoff distance
2115 * should be compatible with the box size.
2117 comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2119 if (comm->useUpdateGroups)
2121 GMX_LOG(mdlog.info).appendTextFormatted(
2122 "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2124 mtop.natoms/static_cast<double>(numUpdateGroups),
2125 comm->updateGroupsCog->maxUpdateGroupRadius());
2129 GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2130 comm->updateGroupingPerMoleculetype.clear();
2131 comm->updateGroupsCog.reset(nullptr);
2136 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2137 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2138 t_commrec *cr, gmx_domdec_t *dd,
2139 const DomdecOptions &options,
2140 const MdrunOptions &mdrunOptions,
2141 const gmx_mtop_t *mtop,
2142 const t_inputrec *ir,
2144 gmx::ArrayRef<const gmx::RVec> xGlobal,
2148 real r_bonded_limit = -1;
2149 const real tenPercentMargin = 1.1;
2150 gmx_domdec_comm_t *comm = dd->comm;
2152 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
2153 dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2154 dd->haveDynamicBox = inputrecDynamicBox(ir);
2155 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
2157 dd->pme_recv_f_alloc = 0;
2158 dd->pme_recv_f_buf = nullptr;
2160 /* Initialize to GPU share count to 0, might change later */
2161 comm->nrank_gpu_shared = 0;
2163 comm->dlbState = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2164 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2165 /* To consider turning DLB on after 2*nstlist steps we need to check
2166 * at partitioning count 3. Thus we need to increase the first count by 2.
2168 comm->ddPartioningCountFirstDlbOff += 2;
2170 GMX_LOG(mdlog.info).appendTextFormatted(
2171 "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2173 comm->bPMELoadBalDLBLimits = FALSE;
2175 /* Allocate the charge group/atom sorting struct */
2176 comm->sort = gmx::compat::make_unique<gmx_domdec_sort_t>();
2178 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
2180 /* We need to decide on update groups early, as this affects communication distances */
2181 comm->useUpdateGroups = false;
2182 if (ir->cutoff_scheme == ecutsVERLET)
2184 real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2185 setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2188 comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
2189 mtop->bIntermolecularInteractions);
2190 if (comm->bInterCGBondeds)
2192 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
2196 comm->bInterCGMultiBody = FALSE;
2199 if (comm->useUpdateGroups)
2201 dd->splitConstraints = false;
2202 dd->splitSettles = false;
2206 dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2207 dd->splitSettles = gmx::inter_charge_group_settles(*mtop);
2212 /* Set the cut-off to some very large value,
2213 * so we don't need if statements everywhere in the code.
2214 * We use sqrt, since the cut-off is squared in some places.
2216 comm->cutoff = GMX_CUTOFF_INF;
2220 comm->cutoff = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2222 comm->cutoff_mbody = 0;
2224 /* Determine the minimum cell size limit, affected by many factors */
2225 comm->cellsize_limit = 0;
2226 comm->bBondComm = FALSE;
2228 /* We do not allow home atoms to move beyond the neighboring domain
2229 * between domain decomposition steps, which limits the cell size.
2230 * Get an estimate of cell size limit due to atom displacement.
2231 * In most cases this is a large overestimate, because it assumes
2232 * non-interaction atoms.
2233 * We set the chance to 1 in a trillion steps.
2235 constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2236 const real limitForAtomDisplacement =
2237 minCellSizeForAtomDisplacement(*mtop, *ir,
2238 comm->updateGroupingPerMoleculetype,
2239 c_chanceThatAtomMovesBeyondDomain);
2240 GMX_LOG(mdlog.info).appendTextFormatted(
2241 "Minimum cell size due to atom displacement: %.3f nm",
2242 limitForAtomDisplacement);
2244 comm->cellsize_limit = std::max(comm->cellsize_limit,
2245 limitForAtomDisplacement);
2247 /* TODO: PME decomposition currently requires atoms not to be more than
2248 * 2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2249 * In nearly all cases, limitForAtomDisplacement will be smaller
2250 * than 2/3*rlist, so the PME requirement is satisfied.
2251 * But it would be better for both correctness and performance
2252 * to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2253 * Note that we would need to improve the pairlist buffer case.
2256 if (comm->bInterCGBondeds)
2258 if (options.minimumCommunicationRange > 0)
2260 comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2261 if (options.useBondedCommunication)
2263 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2267 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2269 r_bonded_limit = comm->cutoff_mbody;
2271 else if (ir->bPeriodicMols)
2273 /* Can not easily determine the required cut-off */
2274 GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2275 comm->cutoff_mbody = comm->cutoff/2;
2276 r_bonded_limit = comm->cutoff_mbody;
2284 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2285 options.checkBondedInteractions,
2288 gmx_bcast(sizeof(r_2b), &r_2b, cr);
2289 gmx_bcast(sizeof(r_mb), &r_mb, cr);
2291 /* We use an initial margin of 10% for the minimum cell size,
2292 * except when we are just below the non-bonded cut-off.
2294 if (options.useBondedCommunication)
2296 if (std::max(r_2b, r_mb) > comm->cutoff)
2298 r_bonded = std::max(r_2b, r_mb);
2299 r_bonded_limit = tenPercentMargin*r_bonded;
2300 comm->bBondComm = TRUE;
2305 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2307 /* We determine cutoff_mbody later */
2311 /* No special bonded communication,
2312 * simply increase the DD cut-off.
2314 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
2315 comm->cutoff_mbody = r_bonded_limit;
2316 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2319 GMX_LOG(mdlog.info).appendTextFormatted(
2320 "Minimum cell size due to bonded interactions: %.3f nm",
2323 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2327 if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2329 /* There is a cell size limit due to the constraints (P-LINCS) */
2330 rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2331 GMX_LOG(mdlog.info).appendTextFormatted(
2332 "Estimated maximum distance required for P-LINCS: %.3f nm",
2334 if (rconstr > comm->cellsize_limit)
2336 GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2339 else if (options.constraintCommunicationRange > 0)
2341 /* Here we do not check for dd->splitConstraints.
2342 * because one can also set a cell size limit for virtual sites only
2343 * and at this point we don't know yet if there are intercg v-sites.
2345 GMX_LOG(mdlog.info).appendTextFormatted(
2346 "User supplied maximum distance required for P-LINCS: %.3f nm",
2347 options.constraintCommunicationRange);
2348 rconstr = options.constraintCommunicationRange;
2350 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2352 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2354 if (options.numCells[XX] > 0)
2356 copy_ivec(options.numCells, dd->nc);
2357 set_dd_dim(mdlog, dd);
2358 set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2360 if (options.numPmeRanks >= 0)
2362 cr->npmenodes = options.numPmeRanks;
2366 /* When the DD grid is set explicitly and -npme is set to auto,
2367 * don't use PME ranks. We check later if the DD grid is
2368 * compatible with the total number of ranks.
2373 real acs = average_cellsize_min(dd, ddbox);
2374 if (acs < comm->cellsize_limit)
2376 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2377 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2378 acs, comm->cellsize_limit);
2383 set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2385 /* We need to choose the optimal DD grid and possibly PME nodes */
2387 dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2388 options.numPmeRanks,
2389 !isDlbDisabled(comm),
2391 comm->cellsize_limit, comm->cutoff,
2392 comm->bInterCGBondeds);
2394 if (dd->nc[XX] == 0)
2397 gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2398 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2399 !bC ? "-rdd" : "-rcon",
2400 comm->dlbState != DlbState::offUser ? " or -dds" : "",
2401 bC ? " or your LINCS settings" : "");
2403 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2404 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2406 "Look in the log file for details on the domain decomposition",
2407 cr->nnodes-cr->npmenodes, limit, buf);
2409 set_dd_dim(mdlog, dd);
2412 GMX_LOG(mdlog.info).appendTextFormatted(
2413 "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2414 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2416 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2417 if (cr->nnodes - dd->nnodes != cr->npmenodes)
2419 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2420 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2421 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2423 if (cr->npmenodes > dd->nnodes)
2425 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2426 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2428 if (cr->npmenodes > 0)
2430 comm->npmenodes = cr->npmenodes;
2434 comm->npmenodes = dd->nnodes;
2437 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2439 /* The following choices should match those
2440 * in comm_cost_est in domdec_setup.c.
2441 * Note that here the checks have to take into account
2442 * that the decomposition might occur in a different order than xyz
2443 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2444 * in which case they will not match those in comm_cost_est,
2445 * but since that is mainly for testing purposes that's fine.
2447 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2448 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2449 getenv("GMX_PMEONEDD") == nullptr)
2451 comm->npmedecompdim = 2;
2452 comm->npmenodes_x = dd->nc[XX];
2453 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
2457 /* In case nc is 1 in both x and y we could still choose to
2458 * decompose pme in y instead of x, but we use x for simplicity.
2460 comm->npmedecompdim = 1;
2461 if (dd->dim[0] == YY)
2463 comm->npmenodes_x = 1;
2464 comm->npmenodes_y = comm->npmenodes;
2468 comm->npmenodes_x = comm->npmenodes;
2469 comm->npmenodes_y = 1;
2472 GMX_LOG(mdlog.info).appendTextFormatted(
2473 "PME domain decomposition: %d x %d x %d",
2474 comm->npmenodes_x, comm->npmenodes_y, 1);
2478 comm->npmedecompdim = 0;
2479 comm->npmenodes_x = 0;
2480 comm->npmenodes_y = 0;
2483 snew(comm->slb_frac, DIM);
2484 if (isDlbDisabled(comm))
2486 comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2487 comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2488 comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2491 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
2493 if (comm->bBondComm || !isDlbDisabled(comm))
2495 /* Set the bonded communication distance to halfway
2496 * the minimum and the maximum,
2497 * since the extra communication cost is nearly zero.
2499 real acs = average_cellsize_min(dd, ddbox);
2500 comm->cutoff_mbody = 0.5*(r_bonded + acs);
2501 if (!isDlbDisabled(comm))
2503 /* Check if this does not limit the scaling */
2504 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2505 options.dlbScaling*acs);
2507 if (!comm->bBondComm)
2509 /* Without bBondComm do not go beyond the n.b. cut-off */
2510 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2511 if (comm->cellsize_limit >= comm->cutoff)
2513 /* We don't loose a lot of efficieny
2514 * when increasing it to the n.b. cut-off.
2515 * It can even be slightly faster, because we need
2516 * less checks for the communication setup.
2518 comm->cutoff_mbody = comm->cutoff;
2521 /* Check if we did not end up below our original limit */
2522 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2524 if (comm->cutoff_mbody > comm->cellsize_limit)
2526 comm->cellsize_limit = comm->cutoff_mbody;
2529 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2534 fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2535 "cellsize limit %f\n",
2536 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2541 check_dd_restrictions(dd, ir, mdlog);
2545 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2550 ncg = ncg_mtop(mtop);
2551 snew(bLocalCG, ncg);
2552 for (cg = 0; cg < ncg; cg++)
2554 bLocalCG[cg] = FALSE;
2560 void dd_init_bondeds(FILE *fplog,
2562 const gmx_mtop_t *mtop,
2563 const gmx_vsite_t *vsite,
2564 const t_inputrec *ir,
2565 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2567 gmx_domdec_comm_t *comm;
2569 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2573 if (comm->bBondComm)
2575 /* Communicate atoms beyond the cut-off for bonded interactions */
2578 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2580 comm->bLocalCG = init_bLocalCG(mtop);
2584 /* Only communicate atoms based on cut-off */
2585 comm->cglink = nullptr;
2586 comm->bLocalCG = nullptr;
2590 static void writeSettings(gmx::TextWriter *log,
2592 const gmx_mtop_t *mtop,
2593 const t_inputrec *ir,
2594 gmx_bool bDynLoadBal,
2596 const gmx_ddbox_t *ddbox)
2598 gmx_domdec_comm_t *comm;
2607 log->writeString("The maximum number of communication pulses is:");
2608 for (d = 0; d < dd->ndim; d++)
2610 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2612 log->ensureLineBreak();
2613 log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2614 log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2615 log->writeString("The allowed shrink of domain decomposition cells is:");
2616 for (d = 0; d < DIM; d++)
2620 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2627 comm->cellsize_min_dlb[d]/
2628 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2630 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2633 log->ensureLineBreak();
2637 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2638 log->writeString("The initial number of communication pulses is:");
2639 for (d = 0; d < dd->ndim; d++)
2641 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2643 log->ensureLineBreak();
2644 log->writeString("The initial domain decomposition cell size is:");
2645 for (d = 0; d < DIM; d++)
2649 log->writeStringFormatted(" %c %.2f nm",
2650 dim2char(d), dd->comm->cellsize_min[d]);
2653 log->ensureLineBreak();
2657 gmx_bool bInterCGVsites = count_intercg_vsites(mtop) != 0;
2659 if (comm->bInterCGBondeds ||
2661 dd->splitConstraints || dd->splitSettles)
2663 std::string decompUnits;
2666 decompUnits = "charge groups";
2668 else if (comm->useUpdateGroups)
2670 decompUnits = "atom groups";
2674 decompUnits = "atoms";
2677 log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2678 log->writeLineFormatted("%40s %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2682 limit = dd->comm->cellsize_limit;
2686 if (dynamic_dd_box(*dd))
2688 log->writeLine("(the following are initial values, they could change due to box deformation)");
2690 limit = dd->comm->cellsize_min[XX];
2691 for (d = 1; d < DIM; d++)
2693 limit = std::min(limit, dd->comm->cellsize_min[d]);
2697 if (comm->bInterCGBondeds)
2699 log->writeLineFormatted("%40s %-7s %6.3f nm",
2700 "two-body bonded interactions", "(-rdd)",
2701 std::max(comm->cutoff, comm->cutoff_mbody));
2702 log->writeLineFormatted("%40s %-7s %6.3f nm",
2703 "multi-body bonded interactions", "(-rdd)",
2704 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2708 log->writeLineFormatted("%40s %-7s %6.3f nm",
2709 "virtual site constructions", "(-rcon)", limit);
2711 if (dd->splitConstraints || dd->splitSettles)
2713 std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2715 log->writeLineFormatted("%40s %-7s %6.3f nm\n",
2716 separation.c_str(), "(-rcon)", limit);
2718 log->ensureLineBreak();
2722 static void logSettings(const gmx::MDLogger &mdlog,
2724 const gmx_mtop_t *mtop,
2725 const t_inputrec *ir,
2727 const gmx_ddbox_t *ddbox)
2729 gmx::StringOutputStream stream;
2730 gmx::TextWriter log(&stream);
2731 writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2732 if (dd->comm->dlbState == DlbState::offCanTurnOn)
2735 log.ensureEmptyLine();
2736 log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2738 writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2740 GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2743 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2746 const t_inputrec *ir,
2747 const gmx_ddbox_t *ddbox)
2749 gmx_domdec_comm_t *comm;
2750 int d, dim, npulse, npulse_d_max, npulse_d;
2755 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2757 /* Determine the maximum number of comm. pulses in one dimension */
2759 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2761 /* Determine the maximum required number of grid pulses */
2762 if (comm->cellsize_limit >= comm->cutoff)
2764 /* Only a single pulse is required */
2767 else if (!bNoCutOff && comm->cellsize_limit > 0)
2769 /* We round down slightly here to avoid overhead due to the latency
2770 * of extra communication calls when the cut-off
2771 * would be only slightly longer than the cell size.
2772 * Later cellsize_limit is redetermined,
2773 * so we can not miss interactions due to this rounding.
2775 npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2779 /* There is no cell size limit */
2780 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2783 if (!bNoCutOff && npulse > 1)
2785 /* See if we can do with less pulses, based on dlb_scale */
2787 for (d = 0; d < dd->ndim; d++)
2790 npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2791 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2792 npulse_d_max = std::max(npulse_d_max, npulse_d);
2794 npulse = std::min(npulse, npulse_d_max);
2797 /* This env var can override npulse */
2798 d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2805 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2806 for (d = 0; d < dd->ndim; d++)
2808 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
2809 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2810 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2812 comm->bVacDLBNoLimit = FALSE;
2816 /* cellsize_limit is set for LINCS in init_domain_decomposition */
2817 if (!comm->bVacDLBNoLimit)
2819 comm->cellsize_limit = std::max(comm->cellsize_limit,
2820 comm->cutoff/comm->maxpulse);
2822 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2823 /* Set the minimum cell size for each DD dimension */
2824 for (d = 0; d < dd->ndim; d++)
2826 if (comm->bVacDLBNoLimit ||
2827 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2829 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2833 comm->cellsize_min_dlb[dd->dim[d]] =
2834 comm->cutoff/comm->cd[d].np_dlb;
2837 if (comm->cutoff_mbody <= 0)
2839 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2847 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2849 /* If each molecule is a single charge group
2850 * or we use domain decomposition for each periodic dimension,
2851 * we do not need to take pbc into account for the bonded interactions.
2853 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
2856 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2859 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2860 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2861 gmx_domdec_t *dd, real dlb_scale,
2862 const gmx_mtop_t *mtop, const t_inputrec *ir,
2863 const gmx_ddbox_t *ddbox)
2865 gmx_domdec_comm_t *comm;
2871 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2873 init_ddpme(dd, &comm->ddpme[0], 0);
2874 if (comm->npmedecompdim >= 2)
2876 init_ddpme(dd, &comm->ddpme[1], 1);
2881 comm->npmenodes = 0;
2882 if (dd->pme_nodeid >= 0)
2884 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2885 "Can not have separate PME ranks without PME electrostatics");
2891 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2893 if (!isDlbDisabled(comm))
2895 set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2898 logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2900 if (ir->ePBC == epbcNONE)
2902 vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2907 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2911 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2913 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2915 dd->ga2la = new gmx_ga2la_t(natoms_tot,
2916 static_cast<int>(vol_frac*natoms_tot));
2919 /*! \brief Set some important DD parameters that can be modified by env.vars */
2920 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2921 gmx_domdec_t *dd, int rank_mysim)
2923 gmx_domdec_comm_t *comm = dd->comm;
2925 dd->bSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2926 comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2927 comm->eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2928 int recload = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2929 comm->nstDDDump = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2930 comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2931 comm->DD_debug = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2935 GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2940 GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2941 if (comm->eFlop > 1)
2943 srand(1 + rank_mysim);
2945 comm->bRecordLoad = TRUE;
2949 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2953 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger &mdlog,
2955 const DomdecOptions &options,
2956 const MdrunOptions &mdrunOptions,
2957 const gmx_mtop_t *mtop,
2958 const t_inputrec *ir,
2960 gmx::ArrayRef<const gmx::RVec> xGlobal,
2961 gmx::LocalAtomSetManager *atomSets)
2965 GMX_LOG(mdlog.info).appendTextFormatted(
2966 "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2968 dd = new gmx_domdec_t;
2970 dd->comm = init_dd_comm();
2972 /* Initialize DD paritioning counters */
2973 dd->comm->partition_step = INT_MIN;
2976 set_dd_envvar_options(mdlog, dd, cr->nodeid);
2978 gmx_ddbox_t ddbox = {0};
2979 set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2984 make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2986 if (thisRankHasDuty(cr, DUTY_PP))
2988 set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
2990 setup_neighbor_relations(dd);
2993 /* Set overallocation to avoid frequent reallocation of arrays */
2994 set_over_alloc_dd(TRUE);
2996 clear_dd_cycle_counts(dd);
2998 dd->atomSets = atomSets;
3003 static gmx_bool test_dd_cutoff(t_commrec *cr,
3004 const t_state &state,
3005 real cutoffRequested)
3015 set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
3019 for (d = 0; d < dd->ndim; d++)
3023 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3024 if (dynamic_dd_box(*dd))
3026 inv_cell_size *= DD_PRES_SCALE_MARGIN;
3029 np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3031 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3033 if (np > dd->comm->cd[d].np_dlb)
3038 /* If a current local cell size is smaller than the requested
3039 * cut-off, we could still fix it, but this gets very complicated.
3040 * Without fixing here, we might actually need more checks.
3042 real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3043 if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3050 if (!isDlbDisabled(dd->comm))
3052 /* If DLB is not active yet, we don't need to check the grid jumps.
3053 * Actually we shouldn't, because then the grid jump data is not set.
3055 if (isDlbOn(dd->comm) &&
3056 check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3061 gmx_sumi(1, &LocallyLimited, cr);
3063 if (LocallyLimited > 0)
3072 gmx_bool change_dd_cutoff(t_commrec *cr,
3073 const t_state &state,
3074 real cutoffRequested)
3076 gmx_bool bCutoffAllowed;
3078 bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3082 cr->dd->comm->cutoff = cutoffRequested;
3085 return bCutoffAllowed;