2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
53 #include "gromacs/domdec/collect.h"
54 #include "gromacs/domdec/dlb.h"
55 #include "gromacs/domdec/dlbtiming.h"
56 #include "gromacs/domdec/domdec_network.h"
57 #include "gromacs/domdec/ga2la.h"
58 #include "gromacs/domdec/options.h"
59 #include "gromacs/domdec/partition.h"
60 #include "gromacs/gmxlib/network.h"
61 #include "gromacs/gmxlib/nrnb.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/hardware/hw_info.h"
64 #include "gromacs/listed_forces/manage_threading.h"
65 #include "gromacs/math/vec.h"
66 #include "gromacs/math/vectypes.h"
67 #include "gromacs/mdlib/calc_verletbuf.h"
68 #include "gromacs/mdlib/constr.h"
69 #include "gromacs/mdlib/constraintrange.h"
70 #include "gromacs/mdlib/updategroups.h"
71 #include "gromacs/mdlib/vsite.h"
72 #include "gromacs/mdtypes/commrec.h"
73 #include "gromacs/mdtypes/inputrec.h"
74 #include "gromacs/mdtypes/mdrunoptions.h"
75 #include "gromacs/mdtypes/state.h"
76 #include "gromacs/pbcutil/ishift.h"
77 #include "gromacs/pbcutil/pbc.h"
78 #include "gromacs/pulling/pull.h"
79 #include "gromacs/timing/wallcycle.h"
80 #include "gromacs/topology/block.h"
81 #include "gromacs/topology/idef.h"
82 #include "gromacs/topology/ifunc.h"
83 #include "gromacs/topology/mtop_lookup.h"
84 #include "gromacs/topology/mtop_util.h"
85 #include "gromacs/topology/topology.h"
86 #include "gromacs/utility/basedefinitions.h"
87 #include "gromacs/utility/basenetwork.h"
88 #include "gromacs/utility/cstringutil.h"
89 #include "gromacs/utility/exceptions.h"
90 #include "gromacs/utility/fatalerror.h"
91 #include "gromacs/utility/gmxmpi.h"
92 #include "gromacs/utility/logger.h"
93 #include "gromacs/utility/real.h"
94 #include "gromacs/utility/smalloc.h"
95 #include "gromacs/utility/strconvert.h"
96 #include "gromacs/utility/stringstream.h"
97 #include "gromacs/utility/stringutil.h"
98 #include "gromacs/utility/textwriter.h"
100 #include "atomdistribution.h"
102 #include "cellsizes.h"
103 #include "distribute.h"
104 #include "domdec_constraints.h"
105 #include "domdec_internal.h"
106 #include "domdec_vsite.h"
107 #include "redistribute.h"
110 // TODO remove this when moving domdec into gmx namespace
111 using gmx::DdRankOrder;
112 using gmx::DlbOption;
113 using gmx::DomdecOptions;
115 static const char *edlbs_names[int(DlbState::nr)] = { "off", "auto", "locked", "on", "on" };
117 /* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
120 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
121 #define DD_FLAG_NRCG 65535
122 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
123 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
125 /* The DD zone order */
126 static const ivec dd_zo[DD_MAXZONE] =
127 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
129 /* The non-bonded zone-pair setup for domain decomposition
130 * The first number is the i-zone, the second number the first j-zone seen by
131 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
132 * As is, this is for 3D decomposition, where there are 4 i-zones.
133 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
134 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
137 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
144 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
146 static void index2xyz(ivec nc,int ind,ivec xyz)
148 xyz[XX] = ind % nc[XX];
149 xyz[YY] = (ind / nc[XX]) % nc[YY];
150 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
154 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
156 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
157 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
158 xyz[ZZ] = ind % nc[ZZ];
161 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
166 ddindex = dd_index(dd->nc, c);
167 if (dd->comm->bCartesianPP_PME)
169 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
171 else if (dd->comm->bCartesianPP)
174 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
185 int ddglatnr(const gmx_domdec_t *dd, int i)
195 if (i >= dd->comm->atomRanges.numAtomsTotal())
197 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
199 atnr = dd->globalAtomIndices[i] + 1;
205 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
207 return &dd->comm->cgs_gl;
210 gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t &dd)
212 GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
213 return dd.comm->updateGroupingPerMoleculetype;
216 void dd_store_state(gmx_domdec_t *dd, t_state *state)
220 if (state->ddp_count != dd->ddp_count)
222 gmx_incons("The MD state does not match the domain decomposition state");
225 state->cg_gl.resize(dd->ncg_home);
226 for (i = 0; i < dd->ncg_home; i++)
228 state->cg_gl[i] = dd->globalAtomGroupIndices[i];
231 state->ddp_count_cg_gl = dd->ddp_count;
234 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
236 return &dd->comm->zones;
239 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
240 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
242 gmx_domdec_zones_t *zones;
245 zones = &dd->comm->zones;
248 while (icg >= zones->izone[izone].cg1)
257 else if (izone < zones->nizone)
259 *jcg0 = zones->izone[izone].jcg0;
263 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
264 icg, izone, zones->nizone);
267 *jcg1 = zones->izone[izone].jcg1;
269 for (d = 0; d < dd->ndim; d++)
272 shift0[dim] = zones->izone[izone].shift0[dim];
273 shift1[dim] = zones->izone[izone].shift1[dim];
274 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
276 /* A conservative approach, this can be optimized */
283 int dd_numHomeAtoms(const gmx_domdec_t &dd)
285 return dd.comm->atomRanges.numHomeAtoms();
288 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
290 /* We currently set mdatoms entries for all atoms:
291 * local + non-local + communicated for vsite + constraints
294 return dd->comm->atomRanges.numAtomsTotal();
297 int dd_natoms_vsite(const gmx_domdec_t *dd)
299 return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
302 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
304 *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
305 *at_end = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
308 void dd_move_x(gmx_domdec_t *dd,
310 gmx::ArrayRef<gmx::RVec> x,
311 gmx_wallcycle *wcycle)
313 wallcycle_start(wcycle, ewcMOVEX);
316 gmx_domdec_comm_t *comm;
317 gmx_domdec_comm_dim_t *cd;
318 rvec shift = {0, 0, 0};
319 gmx_bool bPBC, bScrew;
323 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
326 nat_tot = comm->atomRanges.numHomeAtoms();
327 for (int d = 0; d < dd->ndim; d++)
329 bPBC = (dd->ci[dd->dim[d]] == 0);
330 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
333 copy_rvec(box[dd->dim[d]], shift);
336 for (const gmx_domdec_ind_t &ind : cd->ind)
338 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
339 gmx::ArrayRef<gmx::RVec> &sendBuffer = sendBufferAccess.buffer;
343 for (int g : ind.index)
345 for (int j : atomGrouping.block(g))
347 sendBuffer[n] = x[j];
354 for (int g : ind.index)
356 for (int j : atomGrouping.block(g))
358 /* We need to shift the coordinates */
359 for (int d = 0; d < DIM; d++)
361 sendBuffer[n][d] = x[j][d] + shift[d];
369 for (int g : ind.index)
371 for (int j : atomGrouping.block(g))
374 sendBuffer[n][XX] = x[j][XX] + shift[XX];
376 * This operation requires a special shift force
377 * treatment, which is performed in calc_vir.
379 sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
380 sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
386 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
388 gmx::ArrayRef<gmx::RVec> receiveBuffer;
389 if (cd->receiveInPlace)
391 receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
395 receiveBuffer = receiveBufferAccess.buffer;
397 /* Send and receive the coordinates */
398 ddSendrecv(dd, d, dddirBackward,
399 sendBuffer, receiveBuffer);
401 if (!cd->receiveInPlace)
404 for (int zone = 0; zone < nzone; zone++)
406 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
408 x[i] = receiveBuffer[j++];
412 nat_tot += ind.nrecv[nzone+1];
417 wallcycle_stop(wcycle, ewcMOVEX);
420 void dd_move_f(gmx_domdec_t *dd,
421 gmx::ArrayRef<gmx::RVec> f,
423 gmx_wallcycle *wcycle)
425 wallcycle_start(wcycle, ewcMOVEF);
428 gmx_domdec_comm_t *comm;
429 gmx_domdec_comm_dim_t *cd;
432 gmx_bool bShiftForcesNeedPbc, bScrew;
436 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
438 nzone = comm->zones.n/2;
439 nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
440 for (int d = dd->ndim-1; d >= 0; d--)
442 /* Only forces in domains near the PBC boundaries need to
443 consider PBC in the treatment of fshift */
444 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
445 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
446 if (fshift == nullptr && !bScrew)
448 bShiftForcesNeedPbc = FALSE;
450 /* Determine which shift vector we need */
456 for (int p = cd->numPulses() - 1; p >= 0; p--)
458 const gmx_domdec_ind_t &ind = cd->ind[p];
459 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
460 gmx::ArrayRef<gmx::RVec> &receiveBuffer = receiveBufferAccess.buffer;
462 nat_tot -= ind.nrecv[nzone+1];
464 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
466 gmx::ArrayRef<gmx::RVec> sendBuffer;
467 if (cd->receiveInPlace)
469 sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
473 sendBuffer = sendBufferAccess.buffer;
475 for (int zone = 0; zone < nzone; zone++)
477 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
479 sendBuffer[j++] = f[i];
483 /* Communicate the forces */
484 ddSendrecv(dd, d, dddirForward,
485 sendBuffer, receiveBuffer);
486 /* Add the received forces */
488 if (!bShiftForcesNeedPbc)
490 for (int g : ind.index)
492 for (int j : atomGrouping.block(g))
494 for (int d = 0; d < DIM; d++)
496 f[j][d] += receiveBuffer[n][d];
504 /* fshift should always be defined if this function is
505 * called when bShiftForcesNeedPbc is true */
506 assert(nullptr != fshift);
507 for (int g : ind.index)
509 for (int j : atomGrouping.block(g))
511 for (int d = 0; d < DIM; d++)
513 f[j][d] += receiveBuffer[n][d];
515 /* Add this force to the shift force */
516 for (int d = 0; d < DIM; d++)
518 fshift[is][d] += receiveBuffer[n][d];
526 for (int g : ind.index)
528 for (int j : atomGrouping.block(g))
530 /* Rotate the force */
531 f[j][XX] += receiveBuffer[n][XX];
532 f[j][YY] -= receiveBuffer[n][YY];
533 f[j][ZZ] -= receiveBuffer[n][ZZ];
536 /* Add this force to the shift force */
537 for (int d = 0; d < DIM; d++)
539 fshift[is][d] += receiveBuffer[n][d];
549 wallcycle_stop(wcycle, ewcMOVEF);
552 /* Convenience function for extracting a real buffer from an rvec buffer
554 * To reduce the number of temporary communication buffers and avoid
555 * cache polution, we reuse gmx::RVec buffers for storing reals.
556 * This functions return a real buffer reference with the same number
557 * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
559 static gmx::ArrayRef<real>
560 realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
562 return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
566 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
569 gmx_domdec_comm_t *comm;
570 gmx_domdec_comm_dim_t *cd;
574 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
577 nat_tot = comm->atomRanges.numHomeAtoms();
578 for (int d = 0; d < dd->ndim; d++)
581 for (const gmx_domdec_ind_t &ind : cd->ind)
583 /* Note: We provision for RVec instead of real, so a factor of 3
584 * more than needed. The buffer actually already has this size
585 * and we pass a plain pointer below, so this does not matter.
587 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
588 gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
590 for (int g : ind.index)
592 for (int j : atomGrouping.block(g))
594 sendBuffer[n++] = v[j];
598 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
600 gmx::ArrayRef<real> receiveBuffer;
601 if (cd->receiveInPlace)
603 receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
607 receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
609 /* Send and receive the data */
610 ddSendrecv(dd, d, dddirBackward,
611 sendBuffer, receiveBuffer);
612 if (!cd->receiveInPlace)
615 for (int zone = 0; zone < nzone; zone++)
617 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
619 v[i] = receiveBuffer[j++];
623 nat_tot += ind.nrecv[nzone+1];
629 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
632 gmx_domdec_comm_t *comm;
633 gmx_domdec_comm_dim_t *cd;
637 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
639 nzone = comm->zones.n/2;
640 nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
641 for (int d = dd->ndim-1; d >= 0; d--)
644 for (int p = cd->numPulses() - 1; p >= 0; p--)
646 const gmx_domdec_ind_t &ind = cd->ind[p];
648 /* Note: We provision for RVec instead of real, so a factor of 3
649 * more than needed. The buffer actually already has this size
650 * and we typecast, so this works as intended.
652 DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
653 gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
654 nat_tot -= ind.nrecv[nzone + 1];
656 DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
658 gmx::ArrayRef<real> sendBuffer;
659 if (cd->receiveInPlace)
661 sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
665 sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
667 for (int zone = 0; zone < nzone; zone++)
669 for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
671 sendBuffer[j++] = v[i];
675 /* Communicate the forces */
676 ddSendrecv(dd, d, dddirForward,
677 sendBuffer, receiveBuffer);
678 /* Add the received forces */
680 for (int g : ind.index)
682 for (int j : atomGrouping.block(g))
684 v[j] += receiveBuffer[n];
693 real dd_cutoff_multibody(const gmx_domdec_t *dd)
695 gmx_domdec_comm_t *comm;
702 if (comm->bInterCGBondeds)
704 if (comm->cutoff_mbody > 0)
706 r = comm->cutoff_mbody;
710 /* cutoff_mbody=0 means we do not have DLB */
711 r = comm->cellsize_min[dd->dim[0]];
712 for (di = 1; di < dd->ndim; di++)
714 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
718 r = std::max(r, comm->cutoff_mbody);
722 r = std::min(r, comm->cutoff);
730 real dd_cutoff_twobody(const gmx_domdec_t *dd)
734 r_mb = dd_cutoff_multibody(dd);
736 return std::max(dd->comm->cutoff, r_mb);
740 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
745 nc = dd->nc[dd->comm->cartpmedim];
746 ntot = dd->comm->ntot[dd->comm->cartpmedim];
747 copy_ivec(coord, coord_pme);
748 coord_pme[dd->comm->cartpmedim] =
749 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
752 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
757 npme = dd->comm->npmenodes;
759 /* Here we assign a PME node to communicate with this DD node
760 * by assuming that the major index of both is x.
761 * We add cr->npmenodes/2 to obtain an even distribution.
763 return (ddindex*npme + npme/2)/npp;
766 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
771 snew(pme_rank, dd->comm->npmenodes);
773 for (i = 0; i < dd->nnodes; i++)
775 p0 = ddindex2pmeindex(dd, i);
776 p1 = ddindex2pmeindex(dd, i+1);
777 if (i+1 == dd->nnodes || p1 > p0)
781 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
783 pme_rank[n] = i + 1 + n;
791 static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
799 if (dd->comm->bCartesian) {
800 gmx_ddindex2xyz(dd->nc,ddindex,coords);
801 dd_coords2pmecoords(dd,coords,coords_pme);
802 copy_ivec(dd->ntot,nc);
803 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
804 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
806 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
808 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
814 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
819 static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
821 gmx_domdec_comm_t *comm;
823 int ddindex, nodeid = -1;
830 if (comm->bCartesianPP_PME)
833 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
838 ddindex = dd_index(cr->dd->nc, coords);
839 if (comm->bCartesianPP)
841 nodeid = comm->ddindex2simnodeid[ddindex];
847 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
859 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
860 const t_commrec gmx_unused *cr,
865 const gmx_domdec_comm_t *comm = dd->comm;
867 /* This assumes a uniform x domain decomposition grid cell size */
868 if (comm->bCartesianPP_PME)
871 ivec coord, coord_pme;
872 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
873 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
875 /* This is a PP node */
876 dd_cart_coord2pmecoord(dd, coord, coord_pme);
877 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
881 else if (comm->bCartesianPP)
883 if (sim_nodeid < dd->nnodes)
885 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
890 /* This assumes DD cells with identical x coordinates
891 * are numbered sequentially.
893 if (dd->comm->pmenodes == nullptr)
895 if (sim_nodeid < dd->nnodes)
897 /* The DD index equals the nodeid */
898 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
904 while (sim_nodeid > dd->comm->pmenodes[i])
908 if (sim_nodeid < dd->comm->pmenodes[i])
910 pmenode = dd->comm->pmenodes[i];
918 NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
923 dd->comm->npmenodes_x, dd->comm->npmenodes_y
934 std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
938 ivec coord, coord_pme;
942 std::vector<int> ddranks;
943 ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
945 for (x = 0; x < dd->nc[XX]; x++)
947 for (y = 0; y < dd->nc[YY]; y++)
949 for (z = 0; z < dd->nc[ZZ]; z++)
951 if (dd->comm->bCartesianPP_PME)
956 dd_cart_coord2pmecoord(dd, coord, coord_pme);
957 if (dd->ci[XX] == coord_pme[XX] &&
958 dd->ci[YY] == coord_pme[YY] &&
959 dd->ci[ZZ] == coord_pme[ZZ])
961 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
966 /* The slab corresponds to the nodeid in the PME group */
967 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
969 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
978 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
980 gmx_bool bReceive = TRUE;
982 if (cr->npmenodes < dd->nnodes)
984 gmx_domdec_comm_t *comm = dd->comm;
985 if (comm->bCartesianPP_PME)
988 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
990 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
991 coords[comm->cartpmedim]++;
992 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
995 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
996 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
998 /* This is not the last PP node for pmenode */
1003 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
1008 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1009 if (cr->sim_nodeid+1 < cr->nnodes &&
1010 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
1012 /* This is not the last PP node for pmenode */
1021 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
1023 gmx_domdec_comm_t *comm;
1028 snew(*dim_f, dd->nc[dim]+1);
1030 for (i = 1; i < dd->nc[dim]; i++)
1032 if (comm->slb_frac[dim])
1034 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
1038 (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
1041 (*dim_f)[dd->nc[dim]] = 1;
1044 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
1046 int pmeindex, slab, nso, i;
1049 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
1055 ddpme->dim = dimind;
1057 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
1059 ddpme->nslab = (ddpme->dim == 0 ?
1060 dd->comm->npmenodes_x :
1061 dd->comm->npmenodes_y);
1063 if (ddpme->nslab <= 1)
1068 nso = dd->comm->npmenodes/ddpme->nslab;
1069 /* Determine for each PME slab the PP location range for dimension dim */
1070 snew(ddpme->pp_min, ddpme->nslab);
1071 snew(ddpme->pp_max, ddpme->nslab);
1072 for (slab = 0; slab < ddpme->nslab; slab++)
1074 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
1075 ddpme->pp_max[slab] = 0;
1077 for (i = 0; i < dd->nnodes; i++)
1079 ddindex2xyz(dd->nc, i, xyz);
1080 /* For y only use our y/z slab.
1081 * This assumes that the PME x grid size matches the DD grid size.
1083 if (dimind == 0 || xyz[XX] == dd->ci[XX])
1085 pmeindex = ddindex2pmeindex(dd, i);
1088 slab = pmeindex/nso;
1092 slab = pmeindex % ddpme->nslab;
1094 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
1095 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
1099 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
1102 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
1104 if (dd->comm->ddpme[0].dim == XX)
1106 return dd->comm->ddpme[0].maxshift;
1114 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
1116 if (dd->comm->ddpme[0].dim == YY)
1118 return dd->comm->ddpme[0].maxshift;
1120 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
1122 return dd->comm->ddpme[1].maxshift;
1130 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
1132 /* Note that the cycles value can be incorrect, either 0 or some
1133 * extremely large value, when our thread migrated to another core
1134 * with an unsynchronized cycle counter. If this happens less often
1135 * that once per nstlist steps, this will not cause issues, since
1136 * we later subtract the maximum value from the sum over nstlist steps.
1137 * A zero count will slightly lower the total, but that's a small effect.
1138 * Note that the main purpose of the subtraction of the maximum value
1139 * is to avoid throwing off the load balancing when stalls occur due
1140 * e.g. system activity or network congestion.
1142 dd->comm->cycl[ddCycl] += cycles;
1143 dd->comm->cycl_n[ddCycl]++;
1144 if (cycles > dd->comm->cycl_max[ddCycl])
1146 dd->comm->cycl_max[ddCycl] = cycles;
1151 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
1156 gmx_bool bPartOfGroup = FALSE;
1158 dim = dd->dim[dim_ind];
1159 copy_ivec(loc, loc_c);
1160 for (i = 0; i < dd->nc[dim]; i++)
1163 rank = dd_index(dd->nc, loc_c);
1164 if (rank == dd->rank)
1166 /* This process is part of the group */
1167 bPartOfGroup = TRUE;
1170 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
1174 dd->comm->mpi_comm_load[dim_ind] = c_row;
1175 if (!isDlbDisabled(dd->comm))
1177 DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
1179 if (dd->ci[dim] == dd->master_ci[dim])
1181 /* This is the root process of this row */
1182 cellsizes.rowMaster = std::make_unique<RowMaster>();
1184 RowMaster &rowMaster = *cellsizes.rowMaster;
1185 rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
1186 rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
1187 rowMaster.isCellMin.resize(dd->nc[dim]);
1190 rowMaster.bounds.resize(dd->nc[dim]);
1192 rowMaster.buf_ncd.resize(dd->nc[dim]);
1196 /* This is not a root process, we only need to receive cell_f */
1197 cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
1200 if (dd->ci[dim] == dd->master_ci[dim])
1202 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
1208 void dd_setup_dlb_resource_sharing(t_commrec *cr,
1212 int physicalnode_id_hash;
1214 MPI_Comm mpi_comm_pp_physicalnode;
1216 if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
1218 /* Only ranks with short-ranged tasks (currently) use GPUs.
1219 * If we don't have GPUs assigned, there are no resources to share.
1224 physicalnode_id_hash = gmx_physicalnode_id_hash();
1230 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
1231 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
1232 dd->rank, physicalnode_id_hash, gpu_id);
1234 /* Split the PP communicator over the physical nodes */
1235 /* TODO: See if we should store this (before), as it's also used for
1236 * for the nodecomm summation.
1238 // TODO PhysicalNodeCommunicator could be extended/used to handle
1239 // the need for per-node per-group communicators.
1240 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
1241 &mpi_comm_pp_physicalnode);
1242 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
1243 &dd->comm->mpi_comm_gpu_shared);
1244 MPI_Comm_free(&mpi_comm_pp_physicalnode);
1245 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
1249 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
1252 /* Note that some ranks could share a GPU, while others don't */
1254 if (dd->comm->nrank_gpu_shared == 1)
1256 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
1259 GMX_UNUSED_VALUE(cr);
1260 GMX_UNUSED_VALUE(gpu_id);
1264 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
1267 int dim0, dim1, i, j;
1272 fprintf(debug, "Making load communicators\n");
1275 snew(dd->comm->load, std::max(dd->ndim, 1));
1276 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
1284 make_load_communicator(dd, 0, loc);
1288 for (i = 0; i < dd->nc[dim0]; i++)
1291 make_load_communicator(dd, 1, loc);
1297 for (i = 0; i < dd->nc[dim0]; i++)
1301 for (j = 0; j < dd->nc[dim1]; j++)
1304 make_load_communicator(dd, 2, loc);
1311 fprintf(debug, "Finished making load communicators\n");
1316 /*! \brief Sets up the relation between neighboring domains and zones */
1317 static void setup_neighbor_relations(gmx_domdec_t *dd)
1319 int d, dim, i, j, m;
1321 gmx_domdec_zones_t *zones;
1322 gmx_domdec_ns_ranges_t *izone;
1324 for (d = 0; d < dd->ndim; d++)
1327 copy_ivec(dd->ci, tmp);
1328 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
1329 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
1330 copy_ivec(dd->ci, tmp);
1331 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
1332 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
1335 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
1338 dd->neighbor[d][1]);
1342 int nzone = (1 << dd->ndim);
1343 int nizone = (1 << std::max(dd->ndim - 1, 0));
1344 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
1346 zones = &dd->comm->zones;
1348 for (i = 0; i < nzone; i++)
1351 clear_ivec(zones->shift[i]);
1352 for (d = 0; d < dd->ndim; d++)
1354 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
1359 for (i = 0; i < nzone; i++)
1361 for (d = 0; d < DIM; d++)
1363 s[d] = dd->ci[d] - zones->shift[i][d];
1368 else if (s[d] >= dd->nc[d])
1374 zones->nizone = nizone;
1375 for (i = 0; i < zones->nizone; i++)
1377 assert(ddNonbondedZonePairRanges[i][0] == i);
1379 izone = &zones->izone[i];
1380 /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
1381 * j-zones up to nzone.
1383 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
1384 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
1385 for (dim = 0; dim < DIM; dim++)
1387 if (dd->nc[dim] == 1)
1389 /* All shifts should be allowed */
1390 izone->shift0[dim] = -1;
1391 izone->shift1[dim] = 1;
1395 /* Determine the min/max j-zone shift wrt the i-zone */
1396 izone->shift0[dim] = 1;
1397 izone->shift1[dim] = -1;
1398 for (j = izone->j0; j < izone->j1; j++)
1400 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
1401 if (shift_diff < izone->shift0[dim])
1403 izone->shift0[dim] = shift_diff;
1405 if (shift_diff > izone->shift1[dim])
1407 izone->shift1[dim] = shift_diff;
1414 if (!isDlbDisabled(dd->comm))
1416 dd->comm->cellsizesWithDlb.resize(dd->ndim);
1419 if (dd->comm->bRecordLoad)
1421 make_load_communicators(dd);
1425 static void make_pp_communicator(const gmx::MDLogger &mdlog,
1427 t_commrec gmx_unused *cr,
1428 bool gmx_unused reorder)
1431 gmx_domdec_comm_t *comm;
1438 if (comm->bCartesianPP)
1440 /* Set up cartesian communication for the particle-particle part */
1441 GMX_LOG(mdlog.info).appendTextFormatted(
1442 "Will use a Cartesian communicator: %d x %d x %d",
1443 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
1445 for (int i = 0; i < DIM; i++)
1449 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
1451 /* We overwrite the old communicator with the new cartesian one */
1452 cr->mpi_comm_mygroup = comm_cart;
1455 dd->mpi_comm_all = cr->mpi_comm_mygroup;
1456 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1458 if (comm->bCartesianPP_PME)
1460 /* Since we want to use the original cartesian setup for sim,
1461 * and not the one after split, we need to make an index.
1463 snew(comm->ddindex2ddnodeid, dd->nnodes);
1464 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
1465 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
1466 /* Get the rank of the DD master,
1467 * above we made sure that the master node is a PP node.
1477 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1479 else if (comm->bCartesianPP)
1481 if (cr->npmenodes == 0)
1483 /* The PP communicator is also
1484 * the communicator for this simulation
1486 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1488 cr->nodeid = dd->rank;
1490 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1492 /* We need to make an index to go from the coordinates
1493 * to the nodeid of this simulation.
1495 snew(comm->ddindex2simnodeid, dd->nnodes);
1496 snew(buf, dd->nnodes);
1497 if (thisRankHasDuty(cr, DUTY_PP))
1499 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1501 /* Communicate the ddindex to simulation nodeid index */
1502 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1503 cr->mpi_comm_mysim);
1506 /* Determine the master coordinates and rank.
1507 * The DD master should be the same node as the master of this sim.
1509 for (int i = 0; i < dd->nnodes; i++)
1511 if (comm->ddindex2simnodeid[i] == 0)
1513 ddindex2xyz(dd->nc, i, dd->master_ci);
1514 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1519 fprintf(debug, "The master rank is %d\n", dd->masterrank);
1524 /* No Cartesian communicators */
1525 /* We use the rank in dd->comm->all as DD index */
1526 ddindex2xyz(dd->nc, dd->rank, dd->ci);
1527 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1529 clear_ivec(dd->master_ci);
1533 GMX_LOG(mdlog.info).appendTextFormatted(
1534 "Domain decomposition rank %d, coordinates %d %d %d\n",
1535 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1539 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
1540 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1544 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
1548 gmx_domdec_comm_t *comm = dd->comm;
1550 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
1553 snew(comm->ddindex2simnodeid, dd->nnodes);
1554 snew(buf, dd->nnodes);
1555 if (thisRankHasDuty(cr, DUTY_PP))
1557 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
1559 /* Communicate the ddindex to simulation nodeid index */
1560 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
1561 cr->mpi_comm_mysim);
1565 GMX_UNUSED_VALUE(dd);
1566 GMX_UNUSED_VALUE(cr);
1570 static void split_communicator(const gmx::MDLogger &mdlog,
1571 t_commrec *cr, gmx_domdec_t *dd,
1572 DdRankOrder gmx_unused rankOrder,
1573 bool gmx_unused reorder)
1575 gmx_domdec_comm_t *comm;
1584 if (comm->bCartesianPP)
1586 for (i = 1; i < DIM; i++)
1588 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
1590 if (bDiv[YY] || bDiv[ZZ])
1592 comm->bCartesianPP_PME = TRUE;
1593 /* If we have 2D PME decomposition, which is always in x+y,
1594 * we stack the PME only nodes in z.
1595 * Otherwise we choose the direction that provides the thinnest slab
1596 * of PME only nodes as this will have the least effect
1597 * on the PP communication.
1598 * But for the PME communication the opposite might be better.
1600 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
1602 dd->nc[YY] > dd->nc[ZZ]))
1604 comm->cartpmedim = ZZ;
1608 comm->cartpmedim = YY;
1610 comm->ntot[comm->cartpmedim]
1611 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
1615 GMX_LOG(mdlog.info).appendTextFormatted(
1616 "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)",
1617 cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
1618 GMX_LOG(mdlog.info).appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1622 if (comm->bCartesianPP_PME)
1628 GMX_LOG(mdlog.info).appendTextFormatted(
1629 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1630 comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
1632 for (i = 0; i < DIM; i++)
1636 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
1638 MPI_Comm_rank(comm_cart, &rank);
1639 if (MASTER(cr) && rank != 0)
1641 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1644 /* With this assigment we loose the link to the original communicator
1645 * which will usually be MPI_COMM_WORLD, unless have multisim.
1647 cr->mpi_comm_mysim = comm_cart;
1648 cr->sim_nodeid = rank;
1650 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
1652 GMX_LOG(mdlog.info).appendTextFormatted(
1653 "Cartesian rank %d, coordinates %d %d %d\n",
1654 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
1656 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1660 if (cr->npmenodes == 0 ||
1661 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
1663 cr->duty = DUTY_PME;
1666 /* Split the sim communicator into PP and PME only nodes */
1667 MPI_Comm_split(cr->mpi_comm_mysim,
1668 getThisRankDuties(cr),
1669 dd_index(comm->ntot, dd->ci),
1670 &cr->mpi_comm_mygroup);
1677 case DdRankOrder::pp_pme:
1678 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1680 case DdRankOrder::interleave:
1681 /* Interleave the PP-only and PME-only ranks */
1682 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1683 comm->pmenodes = dd_interleaved_pme_ranks(dd);
1685 case DdRankOrder::cartesian:
1688 gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
1691 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
1693 cr->duty = DUTY_PME;
1700 /* Split the sim communicator into PP and PME only nodes */
1701 MPI_Comm_split(cr->mpi_comm_mysim,
1702 getThisRankDuties(cr),
1704 &cr->mpi_comm_mygroup);
1705 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1709 GMX_LOG(mdlog.info).appendTextFormatted(
1710 "This rank does only %s work.\n",
1711 thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1714 /*! \brief Generates the MPI communicators for domain decomposition */
1715 static void make_dd_communicators(const gmx::MDLogger &mdlog,
1717 gmx_domdec_t *dd, DdRankOrder ddRankOrder)
1719 gmx_domdec_comm_t *comm;
1724 copy_ivec(dd->nc, comm->ntot);
1726 comm->bCartesianPP = (ddRankOrder == DdRankOrder::cartesian);
1727 comm->bCartesianPP_PME = FALSE;
1729 /* Reorder the nodes by default. This might change the MPI ranks.
1730 * Real reordering is only supported on very few architectures,
1731 * Blue Gene is one of them.
1733 CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
1735 if (cr->npmenodes > 0)
1737 /* Split the communicator into a PP and PME part */
1738 split_communicator(mdlog, cr, dd, ddRankOrder, CartReorder);
1739 if (comm->bCartesianPP_PME)
1741 /* We (possibly) reordered the nodes in split_communicator,
1742 * so it is no longer required in make_pp_communicator.
1744 CartReorder = FALSE;
1749 /* All nodes do PP and PME */
1751 /* We do not require separate communicators */
1752 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
1756 if (thisRankHasDuty(cr, DUTY_PP))
1758 /* Copy or make a new PP communicator */
1759 make_pp_communicator(mdlog, dd, cr, CartReorder);
1763 receive_ddindex2simnodeid(dd, cr);
1766 if (!thisRankHasDuty(cr, DUTY_PME))
1768 /* Set up the commnuication to our PME node */
1769 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
1770 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
1773 fprintf(debug, "My pme_nodeid %d receive ener %s\n",
1774 dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
1779 dd->pme_nodeid = -1;
1782 /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1785 dd->ma = std::make_unique<AtomDistribution>(dd->nc,
1787 comm->cgs_gl.index[comm->cgs_gl.nr]);
1791 static real *get_slb_frac(const gmx::MDLogger &mdlog,
1792 const char *dir, int nc, const char *size_string)
1794 real *slb_frac, tot;
1799 if (nc > 1 && size_string != nullptr)
1801 GMX_LOG(mdlog.info).appendTextFormatted(
1802 "Using static load balancing for the %s direction", dir);
1805 for (i = 0; i < nc; i++)
1808 sscanf(size_string, "%20lf%n", &dbl, &n);
1811 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
1818 std::string relativeCellSizes = "Relative cell sizes:";
1819 for (i = 0; i < nc; i++)
1822 relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1824 GMX_LOG(mdlog.info).appendText(relativeCellSizes);
1830 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
1833 gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
1835 while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
1837 for (auto &ilist : extractILists(*ilists, IF_BOND))
1839 if (NRAL(ilist.functionType) > 2)
1841 n += nmol*(ilist.iatoms.size()/ilistStride(ilist));
1849 static int dd_getenv(const gmx::MDLogger &mdlog,
1850 const char *env_var, int def)
1856 val = getenv(env_var);
1859 if (sscanf(val, "%20d", &nst) <= 0)
1863 GMX_LOG(mdlog.info).appendTextFormatted(
1864 "Found env.var. %s = %s, using value %d",
1871 static void check_dd_restrictions(const gmx_domdec_t *dd,
1872 const t_inputrec *ir,
1873 const gmx::MDLogger &mdlog)
1875 if (ir->ePBC == epbcSCREW &&
1876 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
1878 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
1881 if (ir->ns_type == ensSIMPLE)
1883 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
1886 if (ir->nstlist == 0)
1888 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1891 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
1893 GMX_LOG(mdlog.warning).appendText("comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
1897 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
1902 r = ddbox->box_size[XX];
1903 for (di = 0; di < dd->ndim; di++)
1906 /* Check using the initial average cell size */
1907 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
1913 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
1915 static DlbState forceDlbOffOrBail(DlbState cmdlineDlbState,
1916 const std::string &reasonStr,
1917 const gmx::MDLogger &mdlog)
1919 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1920 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
1922 if (cmdlineDlbState == DlbState::onUser)
1924 gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1926 else if (cmdlineDlbState == DlbState::offCanTurnOn)
1928 GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1930 return DlbState::offForever;
1933 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1935 * This function parses the parameters of "-dlb" command line option setting
1936 * corresponding state values. Then it checks the consistency of the determined
1937 * state with other run parameters and settings. As a result, the initial state
1938 * may be altered or an error may be thrown if incompatibility of options is detected.
1940 * \param [in] mdlog Logger.
1941 * \param [in] dlbOption Enum value for the DLB option.
1942 * \param [in] bRecordLoad True if the load balancer is recording load information.
1943 * \param [in] mdrunOptions Options for mdrun.
1944 * \param [in] ir Pointer mdrun to input parameters.
1945 * \returns DLB initial/startup state.
1947 static DlbState determineInitialDlbState(const gmx::MDLogger &mdlog,
1948 DlbOption dlbOption, gmx_bool bRecordLoad,
1949 const gmx::MdrunOptions &mdrunOptions,
1950 const t_inputrec *ir)
1952 DlbState dlbState = DlbState::offCanTurnOn;
1956 case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1957 case DlbOption::no: dlbState = DlbState::offUser; break;
1958 case DlbOption::yes: dlbState = DlbState::onUser; break;
1959 default: gmx_incons("Invalid dlbOption enum value");
1962 /* Reruns don't support DLB: bail or override auto mode */
1963 if (mdrunOptions.rerun)
1965 std::string reasonStr = "it is not supported in reruns.";
1966 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1969 /* Unsupported integrators */
1970 if (!EI_DYNAMICS(ir->eI))
1972 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
1973 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1976 /* Without cycle counters we can't time work to balance on */
1979 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
1980 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1983 if (mdrunOptions.reproducible)
1985 std::string reasonStr = "you started a reproducible run.";
1988 case DlbState::offUser:
1990 case DlbState::offForever:
1991 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1993 case DlbState::offCanTurnOn:
1994 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1995 case DlbState::onCanTurnOff:
1996 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1998 case DlbState::onUser:
1999 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", mdlog);
2001 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", static_cast<int>(dlbState));
2008 static void set_dd_dim(const gmx::MDLogger &mdlog, gmx_domdec_t *dd)
2011 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
2013 /* Decomposition order z,y,x */
2014 GMX_LOG(mdlog.info).appendText("Using domain decomposition order z, y, x");
2015 for (int dim = DIM-1; dim >= 0; dim--)
2017 if (dd->nc[dim] > 1)
2019 dd->dim[dd->ndim++] = dim;
2025 /* Decomposition order x,y,z */
2026 for (int dim = 0; dim < DIM; dim++)
2028 if (dd->nc[dim] > 1)
2030 dd->dim[dd->ndim++] = dim;
2037 /* Set dim[0] to avoid extra checks on ndim in several places */
2042 static gmx_domdec_comm_t *init_dd_comm()
2044 gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
2046 comm->n_load_have = 0;
2047 comm->n_load_collect = 0;
2049 comm->haveTurnedOffDlb = false;
2051 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2053 comm->sum_nat[i] = 0;
2057 comm->load_step = 0;
2060 clear_ivec(comm->load_lim);
2064 /* This should be replaced by a unique pointer */
2065 comm->balanceRegion = ddBalanceRegionAllocate();
2070 /* Returns whether mtop contains constraints and/or vsites */
2071 static bool systemHasConstraintsOrVsites(const gmx_mtop_t &mtop)
2073 auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
2075 while (const InteractionLists *ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
2077 if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
2086 static void setupUpdateGroups(const gmx::MDLogger &mdlog,
2087 const gmx_mtop_t &mtop,
2088 const t_inputrec &inputrec,
2090 int numMpiRanksTotal,
2091 gmx_domdec_comm_t *comm)
2093 /* When we have constraints and/or vsites, it is beneficial to use
2094 * update groups (when possible) to allow independent update of groups.
2096 if (!systemHasConstraintsOrVsites(mtop))
2098 /* No constraints or vsites, atoms can be updated independently */
2102 comm->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
2103 comm->useUpdateGroups =
2104 (!comm->updateGroupingPerMoleculetype.empty() &&
2105 getenv("GMX_NO_UPDATEGROUPS") == nullptr);
2107 if (comm->useUpdateGroups)
2109 int numUpdateGroups = 0;
2110 for (const auto &molblock : mtop.molblock)
2112 numUpdateGroups += molblock.nmol*comm->updateGroupingPerMoleculetype[molblock.type].numBlocks();
2115 /* Note: We would like to use dd->nnodes for the atom count estimate,
2116 * but that is not yet available here. But this anyhow only
2117 * affect performance up to the second dd_partition_system call.
2119 int homeAtomCountEstimate = mtop.natoms/numMpiRanksTotal;
2120 comm->updateGroupsCog =
2121 std::make_unique<gmx::UpdateGroupsCog>(mtop,
2122 comm->updateGroupingPerMoleculetype,
2123 maxReferenceTemperature(inputrec),
2124 homeAtomCountEstimate);
2126 /* To use update groups, the large domain-to-domain cutoff distance
2127 * should be compatible with the box size.
2129 comm->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*comm, 0) < cutoffMargin);
2131 if (comm->useUpdateGroups)
2133 GMX_LOG(mdlog.info).appendTextFormatted(
2134 "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f nm\n",
2136 mtop.natoms/static_cast<double>(numUpdateGroups),
2137 comm->updateGroupsCog->maxUpdateGroupRadius());
2141 GMX_LOG(mdlog.info).appendTextFormatted("The combination of rlist and box size prohibits the use of update groups\n");
2142 comm->updateGroupingPerMoleculetype.clear();
2143 comm->updateGroupsCog.reset(nullptr);
2148 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
2149 static void set_dd_limits_and_grid(const gmx::MDLogger &mdlog,
2150 t_commrec *cr, gmx_domdec_t *dd,
2151 const DomdecOptions &options,
2152 const gmx::MdrunOptions &mdrunOptions,
2153 const gmx_mtop_t *mtop,
2154 const t_inputrec *ir,
2156 gmx::ArrayRef<const gmx::RVec> xGlobal,
2160 real r_bonded_limit = -1;
2161 const real tenPercentMargin = 1.1;
2162 gmx_domdec_comm_t *comm = dd->comm;
2164 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
2165 dd->numBoundedDimensions = inputrec2nboundeddim(ir);
2166 dd->haveDynamicBox = inputrecDynamicBox(ir);
2167 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
2169 dd->pme_recv_f_alloc = 0;
2170 dd->pme_recv_f_buf = nullptr;
2172 /* Initialize to GPU share count to 0, might change later */
2173 comm->nrank_gpu_shared = 0;
2175 comm->dlbState = determineInitialDlbState(mdlog, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
2176 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2177 /* To consider turning DLB on after 2*nstlist steps we need to check
2178 * at partitioning count 3. Thus we need to increase the first count by 2.
2180 comm->ddPartioningCountFirstDlbOff += 2;
2182 GMX_LOG(mdlog.info).appendTextFormatted(
2183 "Dynamic load balancing: %s", edlbs_names[int(comm->dlbState)]);
2185 comm->bPMELoadBalDLBLimits = FALSE;
2187 /* Allocate the charge group/atom sorting struct */
2188 comm->sort = std::make_unique<gmx_domdec_sort_t>();
2190 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
2192 /* We need to decide on update groups early, as this affects communication distances */
2193 comm->useUpdateGroups = false;
2194 if (ir->cutoff_scheme == ecutsVERLET)
2196 real cutoffMargin = std::sqrt(max_cutoff2(ir->ePBC, box)) - ir->rlist;
2197 setupUpdateGroups(mdlog, *mtop, *ir, cutoffMargin, cr->nnodes, comm);
2200 comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
2201 mtop->bIntermolecularInteractions);
2202 if (comm->bInterCGBondeds)
2204 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
2208 comm->bInterCGMultiBody = FALSE;
2211 if (comm->useUpdateGroups)
2213 dd->splitConstraints = false;
2214 dd->splitSettles = false;
2218 dd->splitConstraints = gmx::inter_charge_group_constraints(*mtop);
2219 dd->splitSettles = gmx::inter_charge_group_settles(*mtop);
2224 /* Set the cut-off to some very large value,
2225 * so we don't need if statements everywhere in the code.
2226 * We use sqrt, since the cut-off is squared in some places.
2228 comm->cutoff = GMX_CUTOFF_INF;
2232 comm->cutoff = atomToAtomIntoDomainToDomainCutoff(*comm, ir->rlist);
2234 comm->cutoff_mbody = 0;
2236 /* Determine the minimum cell size limit, affected by many factors */
2237 comm->cellsize_limit = 0;
2238 comm->bBondComm = FALSE;
2240 /* We do not allow home atoms to move beyond the neighboring domain
2241 * between domain decomposition steps, which limits the cell size.
2242 * Get an estimate of cell size limit due to atom displacement.
2243 * In most cases this is a large overestimate, because it assumes
2244 * non-interaction atoms.
2245 * We set the chance to 1 in a trillion steps.
2247 constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2248 const real limitForAtomDisplacement =
2249 minCellSizeForAtomDisplacement(*mtop, *ir,
2250 comm->updateGroupingPerMoleculetype,
2251 c_chanceThatAtomMovesBeyondDomain);
2252 GMX_LOG(mdlog.info).appendTextFormatted(
2253 "Minimum cell size due to atom displacement: %.3f nm",
2254 limitForAtomDisplacement);
2256 comm->cellsize_limit = std::max(comm->cellsize_limit,
2257 limitForAtomDisplacement);
2259 /* TODO: PME decomposition currently requires atoms not to be more than
2260 * 2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2261 * In nearly all cases, limitForAtomDisplacement will be smaller
2262 * than 2/3*rlist, so the PME requirement is satisfied.
2263 * But it would be better for both correctness and performance
2264 * to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2265 * Note that we would need to improve the pairlist buffer case.
2268 if (comm->bInterCGBondeds)
2270 if (options.minimumCommunicationRange > 0)
2272 comm->cutoff_mbody = atomToAtomIntoDomainToDomainCutoff(*comm, options.minimumCommunicationRange);
2273 if (options.useBondedCommunication)
2275 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
2279 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2281 r_bonded_limit = comm->cutoff_mbody;
2283 else if (ir->bPeriodicMols)
2285 /* Can not easily determine the required cut-off */
2286 GMX_LOG(mdlog.warning).appendText("NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.");
2287 comm->cutoff_mbody = comm->cutoff/2;
2288 r_bonded_limit = comm->cutoff_mbody;
2296 dd_bonded_cg_distance(mdlog, mtop, ir, as_rvec_array(xGlobal.data()), box,
2297 options.checkBondedInteractions,
2300 gmx_bcast(sizeof(r_2b), &r_2b, cr);
2301 gmx_bcast(sizeof(r_mb), &r_mb, cr);
2303 /* We use an initial margin of 10% for the minimum cell size,
2304 * except when we are just below the non-bonded cut-off.
2306 if (options.useBondedCommunication)
2308 if (std::max(r_2b, r_mb) > comm->cutoff)
2310 r_bonded = std::max(r_2b, r_mb);
2311 r_bonded_limit = tenPercentMargin*r_bonded;
2312 comm->bBondComm = TRUE;
2317 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
2319 /* We determine cutoff_mbody later */
2323 /* No special bonded communication,
2324 * simply increase the DD cut-off.
2326 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
2327 comm->cutoff_mbody = r_bonded_limit;
2328 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
2331 GMX_LOG(mdlog.info).appendTextFormatted(
2332 "Minimum cell size due to bonded interactions: %.3f nm",
2335 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
2339 if (dd->splitConstraints && options.constraintCommunicationRange <= 0)
2341 /* There is a cell size limit due to the constraints (P-LINCS) */
2342 rconstr = gmx::constr_r_max(mdlog, mtop, ir);
2343 GMX_LOG(mdlog.info).appendTextFormatted(
2344 "Estimated maximum distance required for P-LINCS: %.3f nm",
2346 if (rconstr > comm->cellsize_limit)
2348 GMX_LOG(mdlog.info).appendText("This distance will limit the DD cell size, you can override this with -rcon");
2351 else if (options.constraintCommunicationRange > 0)
2353 /* Here we do not check for dd->splitConstraints.
2354 * because one can also set a cell size limit for virtual sites only
2355 * and at this point we don't know yet if there are intercg v-sites.
2357 GMX_LOG(mdlog.info).appendTextFormatted(
2358 "User supplied maximum distance required for P-LINCS: %.3f nm",
2359 options.constraintCommunicationRange);
2360 rconstr = options.constraintCommunicationRange;
2362 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
2364 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
2366 if (options.numCells[XX] > 0)
2368 copy_ivec(options.numCells, dd->nc);
2369 set_dd_dim(mdlog, dd);
2370 set_ddbox_cr(*cr, &dd->nc, *ir, box, xGlobal, ddbox);
2372 if (options.numPmeRanks >= 0)
2374 cr->npmenodes = options.numPmeRanks;
2378 /* When the DD grid is set explicitly and -npme is set to auto,
2379 * don't use PME ranks. We check later if the DD grid is
2380 * compatible with the total number of ranks.
2385 real acs = average_cellsize_min(dd, ddbox);
2386 if (acs < comm->cellsize_limit)
2388 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2389 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
2390 acs, comm->cellsize_limit);
2395 set_ddbox_cr(*cr, nullptr, *ir, box, xGlobal, ddbox);
2397 /* We need to choose the optimal DD grid and possibly PME nodes */
2399 dd_choose_grid(mdlog, cr, dd, ir, mtop, box, ddbox,
2400 options.numPmeRanks,
2401 !isDlbDisabled(comm),
2403 comm->cellsize_limit, comm->cutoff,
2404 comm->bInterCGBondeds);
2406 if (dd->nc[XX] == 0)
2409 gmx_bool bC = (dd->splitConstraints && rconstr > r_bonded_limit);
2410 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
2411 !bC ? "-rdd" : "-rcon",
2412 comm->dlbState != DlbState::offUser ? " or -dds" : "",
2413 bC ? " or your LINCS settings" : "");
2415 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2416 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
2418 "Look in the log file for details on the domain decomposition",
2419 cr->nnodes-cr->npmenodes, limit, buf);
2421 set_dd_dim(mdlog, dd);
2424 GMX_LOG(mdlog.info).appendTextFormatted(
2425 "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2426 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
2428 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
2429 if (cr->nnodes - dd->nnodes != cr->npmenodes)
2431 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2432 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
2433 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
2435 if (cr->npmenodes > dd->nnodes)
2437 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
2438 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
2440 if (cr->npmenodes > 0)
2442 comm->npmenodes = cr->npmenodes;
2446 comm->npmenodes = dd->nnodes;
2449 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2451 /* The following choices should match those
2452 * in comm_cost_est in domdec_setup.c.
2453 * Note that here the checks have to take into account
2454 * that the decomposition might occur in a different order than xyz
2455 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2456 * in which case they will not match those in comm_cost_est,
2457 * but since that is mainly for testing purposes that's fine.
2459 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
2460 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
2461 getenv("GMX_PMEONEDD") == nullptr)
2463 comm->npmedecompdim = 2;
2464 comm->npmenodes_x = dd->nc[XX];
2465 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
2469 /* In case nc is 1 in both x and y we could still choose to
2470 * decompose pme in y instead of x, but we use x for simplicity.
2472 comm->npmedecompdim = 1;
2473 if (dd->dim[0] == YY)
2475 comm->npmenodes_x = 1;
2476 comm->npmenodes_y = comm->npmenodes;
2480 comm->npmenodes_x = comm->npmenodes;
2481 comm->npmenodes_y = 1;
2484 GMX_LOG(mdlog.info).appendTextFormatted(
2485 "PME domain decomposition: %d x %d x %d",
2486 comm->npmenodes_x, comm->npmenodes_y, 1);
2490 comm->npmedecompdim = 0;
2491 comm->npmenodes_x = 0;
2492 comm->npmenodes_y = 0;
2495 snew(comm->slb_frac, DIM);
2496 if (isDlbDisabled(comm))
2498 comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->nc[XX], options.cellSizeX);
2499 comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->nc[YY], options.cellSizeY);
2500 comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->nc[ZZ], options.cellSizeZ);
2503 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
2505 if (comm->bBondComm || !isDlbDisabled(comm))
2507 /* Set the bonded communication distance to halfway
2508 * the minimum and the maximum,
2509 * since the extra communication cost is nearly zero.
2511 real acs = average_cellsize_min(dd, ddbox);
2512 comm->cutoff_mbody = 0.5*(r_bonded + acs);
2513 if (!isDlbDisabled(comm))
2515 /* Check if this does not limit the scaling */
2516 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
2517 options.dlbScaling*acs);
2519 if (!comm->bBondComm)
2521 /* Without bBondComm do not go beyond the n.b. cut-off */
2522 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
2523 if (comm->cellsize_limit >= comm->cutoff)
2525 /* We don't loose a lot of efficieny
2526 * when increasing it to the n.b. cut-off.
2527 * It can even be slightly faster, because we need
2528 * less checks for the communication setup.
2530 comm->cutoff_mbody = comm->cutoff;
2533 /* Check if we did not end up below our original limit */
2534 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
2536 if (comm->cutoff_mbody > comm->cellsize_limit)
2538 comm->cellsize_limit = comm->cutoff_mbody;
2541 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2546 fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
2547 "cellsize limit %f\n",
2548 gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
2553 check_dd_restrictions(dd, ir, mdlog);
2557 static char *init_bLocalCG(const gmx_mtop_t *mtop)
2562 ncg = ncg_mtop(mtop);
2563 snew(bLocalCG, ncg);
2564 for (cg = 0; cg < ncg; cg++)
2566 bLocalCG[cg] = FALSE;
2572 void dd_init_bondeds(FILE *fplog,
2574 const gmx_mtop_t *mtop,
2575 const gmx_vsite_t *vsite,
2576 const t_inputrec *ir,
2577 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
2579 gmx_domdec_comm_t *comm;
2581 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
2585 if (comm->bBondComm)
2587 /* Communicate atoms beyond the cut-off for bonded interactions */
2590 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
2592 comm->bLocalCG = init_bLocalCG(mtop);
2596 /* Only communicate atoms based on cut-off */
2597 comm->cglink = nullptr;
2598 comm->bLocalCG = nullptr;
2602 static void writeSettings(gmx::TextWriter *log,
2604 const gmx_mtop_t *mtop,
2605 const t_inputrec *ir,
2606 gmx_bool bDynLoadBal,
2608 const gmx_ddbox_t *ddbox)
2610 gmx_domdec_comm_t *comm;
2619 log->writeString("The maximum number of communication pulses is:");
2620 for (d = 0; d < dd->ndim; d++)
2622 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2624 log->ensureLineBreak();
2625 log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm", comm->cellsize_limit);
2626 log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2627 log->writeString("The allowed shrink of domain decomposition cells is:");
2628 for (d = 0; d < DIM; d++)
2632 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
2639 comm->cellsize_min_dlb[d]/
2640 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
2642 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2645 log->ensureLineBreak();
2649 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2650 log->writeString("The initial number of communication pulses is:");
2651 for (d = 0; d < dd->ndim; d++)
2653 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2655 log->ensureLineBreak();
2656 log->writeString("The initial domain decomposition cell size is:");
2657 for (d = 0; d < DIM; d++)
2661 log->writeStringFormatted(" %c %.2f nm",
2662 dim2char(d), dd->comm->cellsize_min[d]);
2665 log->ensureLineBreak();
2669 const bool haveInterDomainVsites =
2670 (countInterUpdategroupVsites(*mtop, comm->updateGroupingPerMoleculetype) != 0);
2672 if (comm->bInterCGBondeds ||
2673 haveInterDomainVsites ||
2674 dd->splitConstraints || dd->splitSettles)
2676 std::string decompUnits;
2679 decompUnits = "charge groups";
2681 else if (comm->useUpdateGroups)
2683 decompUnits = "atom groups";
2687 decompUnits = "atoms";
2690 log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:", decompUnits.c_str());
2691 log->writeLineFormatted("%40s %-7s %6.3f nm", "non-bonded interactions", "", comm->cutoff);
2695 limit = dd->comm->cellsize_limit;
2699 if (dynamic_dd_box(*dd))
2701 log->writeLine("(the following are initial values, they could change due to box deformation)");
2703 limit = dd->comm->cellsize_min[XX];
2704 for (d = 1; d < DIM; d++)
2706 limit = std::min(limit, dd->comm->cellsize_min[d]);
2710 if (comm->bInterCGBondeds)
2712 log->writeLineFormatted("%40s %-7s %6.3f nm",
2713 "two-body bonded interactions", "(-rdd)",
2714 std::max(comm->cutoff, comm->cutoff_mbody));
2715 log->writeLineFormatted("%40s %-7s %6.3f nm",
2716 "multi-body bonded interactions", "(-rdd)",
2717 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
2719 if (haveInterDomainVsites)
2721 log->writeLineFormatted("%40s %-7s %6.3f nm",
2722 "virtual site constructions", "(-rcon)", limit);
2724 if (dd->splitConstraints || dd->splitSettles)
2726 std::string separation = gmx::formatString("atoms separated by up to %d constraints",
2728 log->writeLineFormatted("%40s %-7s %6.3f nm\n",
2729 separation.c_str(), "(-rcon)", limit);
2731 log->ensureLineBreak();
2735 static void logSettings(const gmx::MDLogger &mdlog,
2737 const gmx_mtop_t *mtop,
2738 const t_inputrec *ir,
2740 const gmx_ddbox_t *ddbox)
2742 gmx::StringOutputStream stream;
2743 gmx::TextWriter log(&stream);
2744 writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2745 if (dd->comm->dlbState == DlbState::offCanTurnOn)
2748 log.ensureEmptyLine();
2749 log.writeLine("When dynamic load balancing gets turned on, these settings will change to:");
2751 writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
2753 GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
2756 static void set_cell_limits_dlb(const gmx::MDLogger &mdlog,
2759 const t_inputrec *ir,
2760 const gmx_ddbox_t *ddbox)
2762 gmx_domdec_comm_t *comm;
2763 int d, dim, npulse, npulse_d_max, npulse_d;
2768 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
2770 /* Determine the maximum number of comm. pulses in one dimension */
2772 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2774 /* Determine the maximum required number of grid pulses */
2775 if (comm->cellsize_limit >= comm->cutoff)
2777 /* Only a single pulse is required */
2780 else if (!bNoCutOff && comm->cellsize_limit > 0)
2782 /* We round down slightly here to avoid overhead due to the latency
2783 * of extra communication calls when the cut-off
2784 * would be only slightly longer than the cell size.
2785 * Later cellsize_limit is redetermined,
2786 * so we can not miss interactions due to this rounding.
2788 npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
2792 /* There is no cell size limit */
2793 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
2796 if (!bNoCutOff && npulse > 1)
2798 /* See if we can do with less pulses, based on dlb_scale */
2800 for (d = 0; d < dd->ndim; d++)
2803 npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
2804 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
2805 npulse_d_max = std::max(npulse_d_max, npulse_d);
2807 npulse = std::min(npulse, npulse_d_max);
2810 /* This env var can override npulse */
2811 d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2818 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
2819 for (d = 0; d < dd->ndim; d++)
2821 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
2822 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2823 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
2825 comm->bVacDLBNoLimit = FALSE;
2829 /* cellsize_limit is set for LINCS in init_domain_decomposition */
2830 if (!comm->bVacDLBNoLimit)
2832 comm->cellsize_limit = std::max(comm->cellsize_limit,
2833 comm->cutoff/comm->maxpulse);
2835 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2836 /* Set the minimum cell size for each DD dimension */
2837 for (d = 0; d < dd->ndim; d++)
2839 if (comm->bVacDLBNoLimit ||
2840 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
2842 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2846 comm->cellsize_min_dlb[dd->dim[d]] =
2847 comm->cutoff/comm->cd[d].np_dlb;
2850 if (comm->cutoff_mbody <= 0)
2852 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
2860 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
2862 /* If each molecule is a single charge group
2863 * or we use domain decomposition for each periodic dimension,
2864 * we do not need to take pbc into account for the bonded interactions.
2866 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
2869 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
2872 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2873 static void set_ddgrid_parameters(const gmx::MDLogger &mdlog,
2874 gmx_domdec_t *dd, real dlb_scale,
2875 const gmx_mtop_t *mtop, const t_inputrec *ir,
2876 const gmx_ddbox_t *ddbox)
2878 gmx_domdec_comm_t *comm;
2884 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
2886 init_ddpme(dd, &comm->ddpme[0], 0);
2887 if (comm->npmedecompdim >= 2)
2889 init_ddpme(dd, &comm->ddpme[1], 1);
2894 comm->npmenodes = 0;
2895 if (dd->pme_nodeid >= 0)
2897 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2898 "Can not have separate PME ranks without PME electrostatics");
2904 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
2906 if (!isDlbDisabled(comm))
2908 set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
2911 logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
2913 if (ir->ePBC == epbcNONE)
2915 vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
2920 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
2924 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2926 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
2928 dd->ga2la = new gmx_ga2la_t(natoms_tot,
2929 static_cast<int>(vol_frac*natoms_tot));
2932 /*! \brief Set some important DD parameters that can be modified by env.vars */
2933 static void set_dd_envvar_options(const gmx::MDLogger &mdlog,
2934 gmx_domdec_t *dd, int rank_mysim)
2936 gmx_domdec_comm_t *comm = dd->comm;
2938 dd->bSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2939 comm->dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2940 comm->eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2941 int recload = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2942 comm->nstDDDump = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2943 comm->nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2944 comm->DD_debug = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2948 GMX_LOG(mdlog.info).appendText("Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication");
2953 GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2954 if (comm->eFlop > 1)
2956 srand(1 + rank_mysim);
2958 comm->bRecordLoad = TRUE;
2962 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
2966 gmx_domdec_t *init_domain_decomposition(const gmx::MDLogger &mdlog,
2968 const DomdecOptions &options,
2969 const gmx::MdrunOptions &mdrunOptions,
2970 const gmx_mtop_t *mtop,
2971 const t_inputrec *ir,
2973 gmx::ArrayRef<const gmx::RVec> xGlobal,
2974 gmx::LocalAtomSetManager *atomSets)
2978 GMX_LOG(mdlog.info).appendTextFormatted(
2979 "\nInitializing Domain Decomposition on %d ranks", cr->nnodes);
2981 dd = new gmx_domdec_t;
2983 dd->comm = init_dd_comm();
2985 /* Initialize DD paritioning counters */
2986 dd->comm->partition_step = INT_MIN;
2989 set_dd_envvar_options(mdlog, dd, cr->nodeid);
2991 gmx_ddbox_t ddbox = {0};
2992 set_dd_limits_and_grid(mdlog, cr, dd, options, mdrunOptions,
2997 make_dd_communicators(mdlog, cr, dd, options.rankOrder);
2999 if (thisRankHasDuty(cr, DUTY_PP))
3001 set_ddgrid_parameters(mdlog, dd, options.dlbScaling, mtop, ir, &ddbox);
3003 setup_neighbor_relations(dd);
3006 /* Set overallocation to avoid frequent reallocation of arrays */
3007 set_over_alloc_dd(TRUE);
3009 clear_dd_cycle_counts(dd);
3011 dd->atomSets = atomSets;
3016 static gmx_bool test_dd_cutoff(t_commrec *cr,
3017 const t_state &state,
3018 real cutoffRequested)
3028 set_ddbox(*dd, false, state.box, true, state.x, &ddbox);
3032 for (d = 0; d < dd->ndim; d++)
3036 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
3037 if (dynamic_dd_box(*dd))
3039 inv_cell_size *= DD_PRES_SCALE_MARGIN;
3042 np = 1 + static_cast<int>(cutoffRequested*inv_cell_size*ddbox.skew_fac[dim]);
3044 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3046 if (np > dd->comm->cd[d].np_dlb)
3051 /* If a current local cell size is smaller than the requested
3052 * cut-off, we could still fix it, but this gets very complicated.
3053 * Without fixing here, we might actually need more checks.
3055 real cellSizeAlongDim = (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim];
3056 if (cellSizeAlongDim*dd->comm->cd[d].np_dlb < cutoffRequested)
3063 if (!isDlbDisabled(dd->comm))
3065 /* If DLB is not active yet, we don't need to check the grid jumps.
3066 * Actually we shouldn't, because then the grid jump data is not set.
3068 if (isDlbOn(dd->comm) &&
3069 check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
3074 gmx_sumi(1, &LocallyLimited, cr);
3076 if (LocallyLimited > 0)
3085 gmx_bool change_dd_cutoff(t_commrec *cr,
3086 const t_state &state,
3087 real cutoffRequested)
3089 gmx_bool bCutoffAllowed;
3091 bCutoffAllowed = test_dd_cutoff(cr, state, cutoffRequested);
3095 cr->dd->comm->cutoff = cutoffRequested;
3098 return bCutoffAllowed;