2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009 by the GROMACS development team.
5 * Copyright (c) 2010,2011,2012,2013,2014 by the GROMACS development team.
6 * Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
7 * Copyright (c) 2020,2021, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
#include "gromacs/domdec/builder.h"
#include "gromacs/domdec/collect.h"
#include "gromacs/domdec/computemultibodycutoffs.h"
#include "gromacs/domdec/dlb.h"
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/localtopologychecker.h"
#include "gromacs/domdec/options.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/domdec/reversetopology.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/calc_verletbuf.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/constraintrange.h"
#include "gromacs/mdlib/updategroups.h"
#include "gromacs/mdlib/vcm.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdrun/mdmodules.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/block.h"
#include "gromacs/topology/idef.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/mtop_lookup.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/mdmodulesnotifiers.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/stringstream.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/textwriter.h"

#include "atomdistribution.h"
#include "cellsizes.h"
#include "distribute.h"
#include "domdec_constraints.h"
#include "domdec_internal.h"
#include "domdec_setup.h"
#include "domdec_vsite.h"
#include "redistribute.h"
126 // TODO remove this when moving domdec into gmx namespace
128 using gmx::DdRankOrder;
129 using gmx::DlbOption;
130 using gmx::DomdecOptions;
131 using gmx::RangePartitioning;
133 static const char* enumValueToString(DlbState enumValue)
135 static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
136 "off", "off", "auto", "locked", "on", "on"
138 return dlbStateNames[enumValue];
/* The DD zone order */
// Cell shift (in units of domain cells, one entry per dimension) of each of
// the up-to-8 communication zones; zone 0 ({0,0,0}) is the home zone.
static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
                                        { 0, 1, 1 }, { 0, 0, 1 }, { 1, 0, 1 }, { 1, 1, 1 } };
/* The non-bonded zone-pair setup for domain decomposition
 * The first number is the i-zone, the second number the first j-zone seen by
 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 * As is, this is for 3D decomposition, where there are 4 i-zones.
 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 */
// NOTE(review): further initializer rows are elided in this view.
static const int ddNonbondedZonePairRanges[DD_MAXIZONE][3] = { { 0, 0, 8 },
157 static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
159 xyz[XX] = ind / (nc[YY] * nc[ZZ]);
160 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
161 xyz[ZZ] = ind % nc[ZZ];
/* Returns the DD rank id corresponding to the domain coordinates c */
static int ddcoord2ddnodeid(gmx_domdec_t* dd, ivec c)
    const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
    const int ddindex = dd_index(dd->numCells, c);
    if (cartSetup.bCartesianPP_PME)
        /* Combined Cartesian PP+PME setup: use the precomputed lookup table */
        ddnodeid = cartSetup.ddindex2ddnodeid[ddindex];
    else if (cartSetup.bCartesianPP)
        /* Pure-PP Cartesian communicator: MPI maps the coordinates to a rank */
        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
/* Returns the 1-based global atom number for local atom index i on this rank */
int ddglatnr(const gmx_domdec_t* dd, int i)
    if (i >= dd->comm->atomRanges.numAtomsTotal())
        /* Out-of-range local index: fatal error (call elided in this view) */
        "glatnr called with %d, which is larger than the local number of atoms (%d)",
        dd->comm->atomRanges.numAtomsTotal());
    /* +1 because the returned atom number is 1-based (for user output) */
    atnr = dd->globalAtomIndices[i] + 1;
/* Stores the global atom-group indices of the home atoms in state->cg_gl,
 * tagged with the DD partitioning count so they can be matched later. */
void dd_store_state(const gmx_domdec_t& dd, t_state* state)
    if (state->ddp_count != dd.ddp_count)
        gmx_incons("The MD state does not match the domain decomposition state");
    state->cg_gl.resize(dd.numHomeAtoms);
    for (int i = 0; i < dd.numHomeAtoms; i++)
        state->cg_gl[i] = dd.globalAtomGroupIndices[i];
    /* Record which partitioning the stored cg_gl list corresponds to */
    state->ddp_count_cg_gl = dd.ddp_count;
227 gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
229 return &dd->comm->zones;
232 int dd_numAtomsZones(const gmx_domdec_t& dd)
234 return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
237 int dd_numHomeAtoms(const gmx_domdec_t& dd)
239 return dd.comm->atomRanges.numHomeAtoms();
242 int dd_natoms_mdatoms(const gmx_domdec_t& dd)
244 /* We currently set mdatoms entries for all atoms:
245 * local + non-local + communicated for vsite + constraints
248 return dd.comm->atomRanges.numAtomsTotal();
251 int dd_natoms_vsite(const gmx_domdec_t& dd)
253 return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
256 void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
258 *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
259 *at_end = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
/* Communicates the coordinates x of the home atoms to the halo regions of the
 * neighboring domains, one decomposition dimension (and pulse) at a time.
 * Coordinates sent across a periodic boundary get the box shift applied;
 * with screw PBC along x the y/z components are mirrored instead.
 * Timed under WallCycleCounter::MoveX.
 */
void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
    wallcycle_start(wcycle, WallCycleCounter::MoveX);
    rvec shift = { 0, 0, 0 };
    gmx_domdec_comm_t* comm = dd->comm;
    /* Received halo atoms are appended to x after the atoms present so far */
    int nat_tot = comm->atomRanges.numHomeAtoms();
    for (int d = 0; d < dd->ndim; d++)
        /* Ranks in the first cell along this dimension send across the periodic boundary */
        const bool bPBC = (dd->ci[dd->dim[d]] == 0);
        const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
            /* The box vector to add to coordinates crossing the boundary */
            copy_rvec(box[dd->dim[d]], shift);
        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
        for (const gmx_domdec_ind_t& ind : cd->ind)
            DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
            gmx::ArrayRef<gmx::RVec>& sendBuffer = sendBufferAccess.buffer;
                /* No PBC crossing: plain copy of the coordinates */
                for (int j : ind.index)
                    sendBuffer[n] = x[j];
                for (int j : ind.index)
                    /* We need to shift the coordinates */
                    for (int d = 0; d < DIM; d++)
                        sendBuffer[n][d] = x[j][d] + shift[d];
                /* Screw PBC: shift x, mirror y and z */
                for (int j : ind.index)
                    sendBuffer[n][XX] = x[j][XX] + shift[XX];
                    /* Rotate y and z.
                     * This operation requires a special shift force
                     * treatment, which is performed in calc_vir.
                     */
                    sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
                    sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
            /* Only allocate a staging buffer when we cannot receive directly into x */
            DDBufferAccess<gmx::RVec> receiveBufferAccess(
                    comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
            gmx::ArrayRef<gmx::RVec> receiveBuffer;
            if (cd->receiveInPlace)
                receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
                receiveBuffer = receiveBufferAccess.buffer;
            /* Send and receive the coordinates */
            ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
            if (!cd->receiveInPlace)
                /* Scatter the staged coordinates to their per-zone locations in x */
                for (int zone = 0; zone < nzone; zone++)
                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                        x[i] = receiveBuffer[j++];
            nat_tot += ind.nrecv[nzone + 1];
    wallcycle_stop(wcycle, WallCycleCounter::MoveX);
/* Communicates the halo forces back to the home ranks and adds them to the
 * local forces; the reverse operation of dd_move_x (dimensions traversed in
 * reverse order, pulses in reverse order, communication direction forward).
 * When the virial is computed, forces crossing a periodic boundary are also
 * accumulated into the shift forces. Timed under WallCycleCounter::MoveF.
 */
void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
    wallcycle_start(wcycle, WallCycleCounter::MoveF);
    gmx::ArrayRef<gmx::RVec> f = forceWithShiftForces->force();
    gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
    gmx_domdec_comm_t& comm = *dd->comm;
    int nzone = comm.zones.n / 2;
    /* Start from the end of the zone atom range and peel pulses off the back */
    int nat_tot = comm.atomRanges.end(DDAtomRanges::Type::Zones);
    for (int d = dd->ndim - 1; d >= 0; d--)
        /* Only forces in domains near the PBC boundaries need to
           consider PBC in the treatment of fshift */
        const bool shiftForcesNeedPbc =
                (forceWithShiftForces->computeVirial() && dd->ci[dd->dim[d]] == 0);
        const bool applyScrewPbc =
                (shiftForcesNeedPbc && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
        /* Determine which shift vector we need */
        ivec vis = { 0, 0, 0 };
        const int is = gmx::ivecToShiftIndex(vis);
        /* Loop over the pulses */
        const gmx_domdec_comm_dim_t& cd = comm.cd[d];
        for (int p = cd.numPulses() - 1; p >= 0; p--)
            const gmx_domdec_ind_t& ind = cd.ind[p];
            /* Receive into the buffer that was the send side in dd_move_x */
            DDBufferAccess<gmx::RVec> receiveBufferAccess(comm.rvecBuffer, ind.nsend[nzone + 1]);
            gmx::ArrayRef<gmx::RVec>& receiveBuffer = receiveBufferAccess.buffer;
            nat_tot -= ind.nrecv[nzone + 1];
            DDBufferAccess<gmx::RVec> sendBufferAccess(
                    comm.rvecBuffer2, cd.receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
            gmx::ArrayRef<gmx::RVec> sendBuffer;
            if (cd.receiveInPlace)
                /* Send the halo forces straight out of f */
                sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
                sendBuffer = sendBufferAccess.buffer;
                /* Gather the per-zone halo forces into the staging buffer */
                for (int zone = 0; zone < nzone; zone++)
                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                        sendBuffer[j++] = f[i];
            /* Communicate the forces */
            ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
            /* Add the received forces */
            if (!shiftForcesNeedPbc)
                for (int j : ind.index)
                    for (int d = 0; d < DIM; d++)
                        f[j][d] += receiveBuffer[n][d];
            else if (!applyScrewPbc)
                /* Accumulate into both f and the shift force for this boundary */
                for (int j : ind.index)
                    for (int d = 0; d < DIM; d++)
                        f[j][d] += receiveBuffer[n][d];
                    /* Add this force to the shift force */
                    for (int d = 0; d < DIM; d++)
                        fshift[is][d] += receiveBuffer[n][d];
                /* Screw PBC case */
                for (int j : ind.index)
                    /* Rotate the force */
                    f[j][XX] += receiveBuffer[n][XX];
                    f[j][YY] -= receiveBuffer[n][YY];
                    f[j][ZZ] -= receiveBuffer[n][ZZ];
                    if (shiftForcesNeedPbc)
                        /* Add this force to the shift force */
                        for (int d = 0; d < DIM; d++)
                            fshift[is][d] += receiveBuffer[n][d];
    wallcycle_stop(wcycle, WallCycleCounter::MoveF);
465 /* Convenience function for extracting a real buffer from an rvec buffer
467 * To reduce the number of temporary communication buffers and avoid
468 * cache polution, we reuse gmx::RVec buffers for storing reals.
469 * This functions return a real buffer reference with the same number
470 * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
472 static gmx::ArrayRef<real> realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
474 return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0], arrayRef.size());
/* Spreads the per-atom real values in v from the home atoms to the halo
 * atoms, using the same communication pattern as dd_move_x (backward
 * direction, one dimension and pulse at a time). */
void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
    gmx_domdec_comm_t* comm = dd->comm;
    /* Received values are appended to v after the atoms present so far */
    int nat_tot = comm->atomRanges.numHomeAtoms();
    for (int d = 0; d < dd->ndim; d++)
        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
        for (const gmx_domdec_ind_t& ind : cd->ind)
            /* Note: We provision for RVec instead of real, so a factor of 3
             * more than needed. The buffer actually already has this size
             * and we pass a plain pointer below, so this does not matter.
             */
            DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
            gmx::ArrayRef<real> sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
            for (int j : ind.index)
                sendBuffer[n++] = v[j];
            /* Only stage through rvecBuffer2 when not receiving in place */
            DDBufferAccess<gmx::RVec> receiveBufferAccess(
                    comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
            gmx::ArrayRef<real> receiveBuffer;
            if (cd->receiveInPlace)
                receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
                receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
            /* Send and receive the data */
            ddSendrecv(dd, d, dddirBackward, sendBuffer, receiveBuffer);
            if (!cd->receiveInPlace)
                /* Scatter the staged values to their per-zone locations in v */
                for (int zone = 0; zone < nzone; zone++)
                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                        v[i] = receiveBuffer[j++];
            nat_tot += ind.nrecv[nzone + 1];
/* Sums the halo contributions of the per-atom real values in v back onto the
 * home atoms; the reverse of dd_atom_spread_real (forward direction,
 * dimensions and pulses traversed in reverse, like dd_move_f). */
void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
    gmx_domdec_comm_t* comm = dd->comm;
    int nzone = comm->zones.n / 2;
    int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
    for (int d = dd->ndim - 1; d >= 0; d--)
        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
        for (int p = cd->numPulses() - 1; p >= 0; p--)
            const gmx_domdec_ind_t& ind = cd->ind[p];
            /* Note: We provision for RVec instead of real, so a factor of 3
             * more than needed. The buffer actually already has this size
             * and we typecast, so this works as intended.
             */
            DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
            gmx::ArrayRef<real> receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
            nat_tot -= ind.nrecv[nzone + 1];
            DDBufferAccess<gmx::RVec> sendBufferAccess(
                    comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
            gmx::ArrayRef<real> sendBuffer;
            if (cd->receiveInPlace)
                /* Send directly out of v */
                sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
                sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
                /* Gather the per-zone values into the staging buffer */
                for (int zone = 0; zone < nzone; zone++)
                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                        sendBuffer[j++] = v[i];
            /* Communicate the forces */
            ddSendrecv(dd, d, dddirForward, sendBuffer, receiveBuffer);
            /* Add the received forces */
            for (int j : ind.index)
                v[j] += receiveBuffer[n];
/* Returns the cut-off radius usable for multi-body bonded interactions that
 * span domains: either the explicit cutoff_mbody, or the minimum DD cell
 * size over all decomposed dimensions; never more than the non-bonded
 * cut-off. */
real dd_cutoff_multibody(const gmx_domdec_t* dd)
    const gmx_domdec_comm_t& comm = *dd->comm;
    const DDSystemInfo& systemInfo = comm.systemInfo;
    if (systemInfo.haveInterDomainMultiBodyBondeds)
        if (comm.cutoff_mbody > 0)
            r = comm.cutoff_mbody;
            /* cutoff_mbody=0 means we do not have DLB */
            /* Fall back to the smallest cell size over the DD dimensions */
            r = comm.cellsize_min[dd->dim[0]];
            for (int di = 1; di < dd->ndim; di++)
                r = std::min(r, comm.cellsize_min[dd->dim[di]]);
            if (comm.systemInfo.filterBondedCommunication)
                r = std::max(r, comm.cutoff_mbody);
        /* The multi-body cut-off can not exceed the pair-interaction cut-off */
        r = std::min(r, systemInfo.cutoff);
620 real dd_cutoff_twobody(const gmx_domdec_t* dd)
622 const real r_mb = dd_cutoff_multibody(dd);
624 return std::max(dd->comm->systemInfo.cutoff, r_mb);
/* Maps Cartesian PP rank coordinates to the coordinates of the PME rank that
 * serves them: the PP cell index along the PME-decomposed dimension is scaled
 * onto the (ntot - nc) PME slots, placed after the nc PP cells; the
 * "+ (ntot - nc) / 2" term rounds to the nearest slot. */
static void dd_cart_coord2pmecoord(const DDRankSetup& ddRankSetup,
                                   const CartesianRankSetup& cartSetup,
    const int nc = ddRankSetup.numPPCells[cartSetup.cartpmedim];
    const int ntot = cartSetup.ntot[cartSetup.cartpmedim];
    copy_ivec(coord, coord_pme);
    coord_pme[cartSetup.cartpmedim] =
            nc + (coord[cartSetup.cartpmedim] * (ntot - nc) + (ntot - nc) / 2) / nc;
640 /* Returns the PME rank index in 0...npmenodes-1 for the PP cell with index ddCellIndex */
641 static int ddindex2pmeindex(const DDRankSetup& ddRankSetup, const int ddCellIndex)
643 const int npp = ddRankSetup.numPPRanks;
644 const int npme = ddRankSetup.numRanksDoingPme;
646 /* Here we assign a PME node to communicate with this DD node
647 * by assuming that the major index of both is x.
648 * We add npme/2 to obtain an even distribution.
650 return (ddCellIndex * npme + npme / 2) / npp;
/* Returns the simulation rank ids of the PME-only ranks, for the case where
 * PME ranks are interleaved between the PP ranks. */
static std::vector<int> dd_interleaved_pme_ranks(const DDRankSetup& ddRankSetup)
    std::vector<int> pmeRanks(ddRankSetup.numRanksDoingPme);
    for (int i = 0; i < ddRankSetup.numPPRanks; i++)
        const int p0 = ddindex2pmeindex(ddRankSetup, i);
        const int p1 = ddindex2pmeindex(ddRankSetup, i + 1);
        /* A PME rank is inserted after the last PP rank mapping to its index */
        if (i + 1 == ddRankSetup.numPPRanks || p1 > p0)
            fprintf(debug, "pme_rank[%d] = %d\n", n, i + 1 + n);
            /* The n PME ranks already placed shift the simulation rank id by n */
            pmeRanks[n] = i + 1 + n;
/* Returns the PME rank index for the DD cell with coordinates (x, y, z) */
static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
    const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
/* Returns the simulation rank id for the DD cell with coordinates (x, y, z) */
static int ddcoord2simnodeid(const t_commrec* cr, int x, int y, int z)
    const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
    ivec coords = { x, y, z };
    if (cartSetup.bCartesianPP_PME)
        /* Combined Cartesian PP+PME: MPI maps the coordinates to the rank */
        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
        int ddindex = dd_index(cr->dd->numCells, coords);
        if (cartSetup.bCartesianPP)
            nodeid = cartSetup.ddindex2simnodeid[ddindex];
            const DDRankSetup& rankSetup = cr->dd->comm->ddRankSetup;
            if (rankSetup.rankOrder != DdRankOrder::pp_pme && rankSetup.usePmeOnlyRanks)
                /* Interleaved ordering: account for PME ranks placed before this PP rank */
                nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
/* Returns the rank id of the PME-only rank that does the PME work for
 * PP rank sim_nodeid. */
static int dd_simnode2pmenode(const DDRankSetup& ddRankSetup,
                              const CartesianRankSetup& cartSetup,
                              gmx::ArrayRef<const int> pmeRanks,
                              const t_commrec gmx_unused* cr,
                              const int sim_nodeid)
    /* This assumes a uniform x domain decomposition grid cell size */
    if (cartSetup.bCartesianPP_PME)
        ivec coord, coord_pme;
        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord)
        if (coord[cartSetup.cartpmedim] < ddRankSetup.numPPCells[cartSetup.cartpmedim])
            /* This is a PP rank */
            dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
    else if (cartSetup.bCartesianPP)
        /* PP ranks come first; the matching PME rank follows after them */
        if (sim_nodeid < ddRankSetup.numPPRanks)
            pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (pmeRanks.empty())
            if (sim_nodeid < ddRankSetup.numPPRanks)
                /* The DD index equals the nodeid */
                pmenode = ddRankSetup.numPPRanks + ddindex2pmeindex(ddRankSetup, sim_nodeid);
            /* Interleaved: scan forward to the next PME rank after sim_nodeid */
            while (sim_nodeid > pmeRanks[i])
            if (sim_nodeid < pmeRanks[i])
                pmenode = pmeRanks[i];
/* Returns the number of PME domains along x and y */
NumPmeDomains getNumPmeDomains(const gmx_domdec_t* dd)
    return { dd->comm->ddRankSetup.npmenodes_x, dd->comm->ddRankSetup.npmenodes_y };
/* Returns the simulation rank ids of the PP ranks that communicate with
 * the PME-only rank pmenodeid; only valid when PME-only ranks are in use. */
std::vector<int> get_pme_ddranks(const t_commrec* cr, const int pmenodeid)
    const DDRankSetup& ddRankSetup = cr->dd->comm->ddRankSetup;
    const CartesianRankSetup& cartSetup = cr->dd->comm->cartesianRankSetup;
    GMX_RELEASE_ASSERT(ddRankSetup.usePmeOnlyRanks,
                       "This function should only be called when PME-only ranks are in use");
    std::vector<int> ddranks;
    /* Upper bound on the number of PP ranks per PME rank */
    ddranks.reserve((ddRankSetup.numPPRanks + ddRankSetup.numRanksDoingPme - 1) / ddRankSetup.numRanksDoingPme);
    /* Scan all PP cells and collect those that map to this PME rank */
    for (int x = 0; x < ddRankSetup.numPPCells[XX]; x++)
        for (int y = 0; y < ddRankSetup.numPPCells[YY]; y++)
            for (int z = 0; z < ddRankSetup.numPPCells[ZZ]; z++)
                if (cartSetup.bCartesianPP_PME)
                    ivec coord = { x, y, z };
                    dd_cart_coord2pmecoord(ddRankSetup, cartSetup, coord, coord_pme);
                    if (cr->dd->ci[XX] == coord_pme[XX] && cr->dd->ci[YY] == coord_pme[YY]
                        && cr->dd->ci[ZZ] == coord_pme[ZZ])
                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
/* Returns whether this PP rank should receive the virial and energy from its
 * PME rank: with PME-only ranks, only the last PP rank served by a given PME
 * rank receives them. */
static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
    bool bReceive = true;
    const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
    if (ddRankSetup.usePmeOnlyRanks)
        const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
        if (cartSetup.bCartesianPP_PME)
            /* Check whether the next cell along the PME dimension shares our PME rank */
            int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
            coords[cartSetup.cartpmedim]++;
            if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
                    /* This is not the last PP node for pmenode */
            "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
            /* Non-Cartesian: check whether the next simulation rank shares our PME rank */
            int pmenode = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
            if (cr->sim_nodeid + 1 < cr->nnodes
                && dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid + 1) == pmenode)
                /* This is not the last PP node for pmenode */
/* Allocates and fills *dim_f with the cumulative cell-boundary fractions
 * along dimension dim for static load balancing: either from the
 * user-provided slb_frac, or uniform; the last entry is always 1. */
static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
    gmx_domdec_comm_t* comm = dd->comm;
    snew(*dim_f, dd->numCells[dim] + 1);
    for (int i = 1; i < dd->numCells[dim]; i++)
        if (comm->slb_frac[dim])
            /* Accumulate the user-specified relative cell sizes */
            (*dim_f)[i] = (*dim_f)[i - 1] + comm->slb_frac[dim][i - 1];
            /* Uniform cells */
            (*dim_f)[i] = static_cast<real>(i) / static_cast<real>(dd->numCells[dim]);
    (*dim_f)[dd->numCells[dim]] = 1;
/* Initializes the PP-PME mapping data in ddpme for PME decomposition
 * dimension dimind: the number of PME slabs, the PP cell range covered by
 * each slab, and the static cell-boundary fractions. */
static void init_ddpme(gmx_domdec_t* dd, gmx_ddpme_t* ddpme, int dimind)
    const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
    if (dimind == 0 && dd->dim[0] == YY && ddRankSetup.npmenodes_x == 1)
    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
    ddpme->nslab = (ddpme->dim == 0 ? ddRankSetup.npmenodes_x : ddRankSetup.npmenodes_y);
    if (ddpme->nslab <= 1)
    /* Number of PME ranks per slab along this dimension */
    const int nso = ddRankSetup.numRanksDoingPme / ddpme->nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min, ddpme->nslab);
    snew(ddpme->pp_max, ddpme->nslab);
    for (int slab = 0; slab < ddpme->nslab; slab++)
        /* Start with an empty (inverted) range, narrowed below */
        ddpme->pp_min[slab] = dd->numCells[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    for (int i = 0; i < dd->nnodes; i++)
        ddindex2xyz(dd->numCells, i, xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX])
            const int pmeindex = ddindex2pmeindex(ddRankSetup, i);
            const int slab = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
/* Returns the maximum allowed PME grid-line shift along x
 * (from the first PME decomposition dimension when it is x). */
int dd_pme_maxshift_x(const gmx_domdec_t& dd)
    const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
    if (ddRankSetup.ddpme[0].dim == XX)
        return ddRankSetup.ddpme[0].maxshift;
/* Returns the maximum allowed PME grid-line shift along y
 * (y may be either the first or the second PME decomposition dimension). */
int dd_pme_maxshift_y(const gmx_domdec_t& dd)
    const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
    if (ddRankSetup.ddpme[0].dim == YY)
        return ddRankSetup.ddpme[0].maxshift;
    else if (ddRankSetup.npmedecompdim >= 2 && ddRankSetup.ddpme[1].dim == YY)
        return ddRankSetup.ddpme[1].maxshift;
982 bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
984 return dd.comm->systemInfo.useUpdateGroups;
987 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
989 /* Note that the cycles value can be incorrect, either 0 or some
990 * extremely large value, when our thread migrated to another core
991 * with an unsynchronized cycle counter. If this happens less often
992 * that once per nstlist steps, this will not cause issues, since
993 * we later subtract the maximum value from the sum over nstlist steps.
994 * A zero count will slightly lower the total, but that's a small effect.
995 * Note that the main purpose of the subtraction of the maximum value
996 * is to avoid throwing off the load balancing when stalls occur due
997 * e.g. system activity or network congestion.
999 dd->comm->cycl[ddCycl] += cycles;
1000 dd->comm->cycl_n[ddCycl]++;
1001 if (cycles > dd->comm->cycl_max[ddCycl])
1003 dd->comm->cycl_max[ddCycl] = cycles;
/* Creates an MPI communicator over the row of domains along decomposition
 * dimension dim_ind that passes through location loc, and allocates the
 * DLB cell-size data (row-master buffers on the row root) for that row. */
static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
    MPI_Comm c_row = MPI_COMM_NULL;
    bool bPartOfGroup = false;
    const int dim = dd->dim[dim_ind];
    copy_ivec(loc, loc_c);
    /* Walk along the row to find out whether this rank belongs to it */
    for (int i = 0; i < dd->numCells[dim]; i++)
        const int rank = dd_index(dd->numCells, loc_c);
        if (rank == dd->rank)
            /* This process is part of the group */
            bPartOfGroup = TRUE;
    /* Ranks not in the row pass MPI_UNDEFINED and get MPI_COMM_NULL back */
    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank, &c_row);
    dd->comm->mpi_comm_load[dim_ind] = c_row;
    if (!isDlbDisabled(dd->comm))
        DDCellsizesWithDlb& cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
        if (dd->ci[dim] == dd->master_ci[dim])
            /* This is the root process of this row */
            cellsizes.rowMaster = std::make_unique<RowMaster>();
            RowMaster& rowMaster = *cellsizes.rowMaster;
            rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
            rowMaster.oldCellFrac.resize(dd->numCells[dim] + 1);
            rowMaster.isCellMin.resize(dd->numCells[dim]);
            rowMaster.bounds.resize(dd->numCells[dim]);
            rowMaster.buf_ncd.resize(dd->numCells[dim]);
            /* This is not a root process, we only need to receive cell_f */
            cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
    if (dd->ci[dim] == dd->master_ci[dim])
        /* The row root also collects the load of all ranks in the row */
        snew(dd->comm->load[dim_ind].load, dd->numCells[dim] * DD_NLOAD_MAX);
/* Sets up comm->mpi_comm_gpu_shared: a communicator over the PP ranks on the
 * same physical node that were assigned the same GPU, used to coordinate DLB
 * when ranks share GPU resources. The communicator is freed again when no
 * sharing actually occurs (only one rank per GPU). */
void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
    MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
    if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
        /* Only ranks with short-ranged tasks (currently) use GPUs.
         * If we don't have GPUs assigned, there are no resources to share.
         */
    const int physicalnode_id_hash = gmx_physicalnode_id_hash();
    gmx_domdec_t* dd = cr->dd;
        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
    /* Split the PP communicator over the physical nodes */
    /* TODO: See if we should store this (before), as it's also used for
     * for the nodecomm summation.
     */
    // TODO PhysicalNodeCommunicator could be extended/used to handle
    // the need for per-node per-group communicators.
    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode);
    /* Within a node, split further by GPU id: ranks sharing a GPU end up together */
    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared);
    MPI_Comm_free(&mpi_comm_pp_physicalnode);
    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
    /* Note that some ranks could share a GPU, while others don't */
    if (dd->comm->nrank_gpu_shared == 1)
        /* No sharing on this rank's GPU: drop the communicator again */
        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(gpu_id);
/* Creates the per-dimension load communicators used for DLB load recording:
 * one row communicator per decomposed dimension, covering every row of
 * domains in the lower-dimensional slabs. */
static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
        fprintf(debug, "Making load communicators\n");
    dd->comm->load = new domdec_load_t[std::max(dd->ndim, 1)];
    snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
    /* Dimension 0: a single row through the origin */
    make_load_communicator(dd, 0, loc);
        /* Dimension 1: one row per cell along dimension 0 */
        const int dim0 = dd->dim[0];
        for (int i = 0; i < dd->numCells[dim0]; i++)
            make_load_communicator(dd, 1, loc);
        /* Dimension 2: one row per cell in the dim0 x dim1 plane */
        const int dim0 = dd->dim[0];
        for (int i = 0; i < dd->numCells[dim0]; i++)
            const int dim1 = dd->dim[1];
            for (int j = 0; j < dd->numCells[dim1]; j++)
                make_load_communicator(dd, 2, loc);
        fprintf(debug, "Finished making load communicators\n");
/*! \brief Sets up the relation between neighboring domains and zones */
static void setup_neighbor_relations(gmx_domdec_t* dd)
    GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
    /* Determine the forward and backward neighbor rank along each DD dimension */
    for (int d = 0; d < dd->ndim; d++)
        const int dim = dd->dim[d];
        copy_ivec(dd->ci, tmp);
        tmp[dim] = (tmp[dim] + 1) % dd->numCells[dim];
        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
        copy_ivec(dd->ci, tmp);
        /* Wrap around periodically in the backward direction as well */
        tmp[dim] = (tmp[dim] - 1 + dd->numCells[dim]) % dd->numCells[dim];
        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
                "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
                dd->neighbor[d][1]);
    /* 2^ndim zones; half of them are i-zones (at least 1) */
    int nzone = (1 << dd->ndim);
    int nizone = (1 << std::max(dd->ndim - 1, 0));
    assert(nizone >= 1 && nizone <= DD_MAXIZONE);
    gmx_domdec_zones_t* zones = &dd->comm->zones;
    /* Assign each zone its cell shift, taken from dd_zo for the used dims */
    for (int i = 0; i < nzone; i++)
        clear_ivec(zones->shift[i]);
        for (int d = 0; d < dd->ndim; d++)
            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
    /* Compute the (periodically wrapped) cell coordinates of each zone */
    for (int i = 0; i < nzone; i++)
        for (int d = 0; d < DIM; d++)
            s[d] = dd->ci[d] - zones->shift[i][d];
                s[d] += dd->numCells[d];
            else if (s[d] >= dd->numCells[d])
                s[d] -= dd->numCells[d];
    /* Set up the i-zone/j-zone pair ranges used for non-bonded pair search */
    for (int iZoneIndex = 0; iZoneIndex < nizone; iZoneIndex++)
            ddNonbondedZonePairRanges[iZoneIndex][0] == iZoneIndex,
            "The first element for each ddNonbondedZonePairRanges should match its index");
        DDPairInteractionRanges iZone;
        iZone.iZoneIndex = iZoneIndex;
        /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
         * j-zones up to nzone.
         */
        iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
                                           std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
        for (int dim = 0; dim < DIM; dim++)
            if (dd->numCells[dim] == 1)
                /* All shifts should be allowed */
                iZone.shift0[dim] = -1;
                iZone.shift1[dim] = 1;
                /* Determine the min/max j-zone shift wrt the i-zone */
                /* Start with an inverted range, narrowed by the loop below */
                iZone.shift0[dim] = 1;
                iZone.shift1[dim] = -1;
                for (int jZone : iZone.jZoneRange)
                    int shift_diff = zones->shift[jZone][dim] - zones->shift[iZoneIndex][dim];
                    if (shift_diff < iZone.shift0[dim])
                        iZone.shift0[dim] = shift_diff;
                    if (shift_diff > iZone.shift1[dim])
                        iZone.shift1[dim] = shift_diff;
        zones->iZones.push_back(iZone);
    if (!isDlbDisabled(dd->comm))
        dd->comm->cellsizesWithDlb.resize(dd->ndim);
    if (dd->comm->ddSettings.recordLoad)
        make_load_communicators(dd);
/*! \brief Creates the particle-particle (PP) communicator and determines DD ranks/coordinates.
 *
 * With a Cartesian PP setup, cr->mpi_comm_mygroup is replaced by a new MPI
 * Cartesian communicator (optionally letting MPI reorder ranks); the rank's
 * DD cell coordinates and the DD master rank are then derived from it.
 * Without Cartesian communicators, the plain group rank is used as DD index.
 *
 * NOTE(review): this view of the file omits interleaving lines (braces,
 * else-branches); comments below describe only the visible statements.
 */
1277 static void make_pp_communicator(const gmx::MDLogger& mdlog,
1279 t_commrec gmx_unused* cr,
1280 bool gmx_unused reorder)
1283 gmx_domdec_comm_t* comm = dd->comm;
1284 CartesianRankSetup& cartSetup = comm->cartesianRankSetup;
1286 if (cartSetup.bCartesianPP)
1288 /* Set up cartesian communication for the particle-particle part */
1290 .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
1296 for (int i = 0; i < DIM; i++)
1300 MPI_Comm comm_cart = MPI_COMM_NULL;
/* reorder is passed through to MPI, which may renumber ranks for topology locality */
1301 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
1302 /* We overwrite the old communicator with the new cartesian one */
1303 cr->mpi_comm_mygroup = comm_cart;
1306 dd->mpi_comm_all = cr->mpi_comm_mygroup;
1307 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
1309 if (cartSetup.bCartesianPP_PME)
1311 /* Since we want to use the original cartesian setup for sim,
1312 * and not the one after split, we need to make an index.
/* Each rank fills in only its own slot; gmx_sumi combines the slots across ranks */
1314 cartSetup.ddindex2ddnodeid.resize(dd->nnodes);
1315 cartSetup.ddindex2ddnodeid[dd_index(dd->numCells, dd->ci)] = dd->rank;
1316 gmx_sumi(dd->nnodes, cartSetup.ddindex2ddnodeid.data(), cr);
1317 /* Get the rank of the DD master,
1318 * above we made sure that the master node is a PP node.
/* Sum-allreduce of a single nonzero contribution broadcasts the master rank */
1320 int rank = MASTER(cr) ? dd->rank : 0;
1321 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
1323 else if (cartSetup.bCartesianPP)
1325 if (!comm->ddRankSetup.usePmeOnlyRanks)
1327 /* The PP communicator is also
1328 * the communicator for this simulation
1330 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
1332 cr->nodeid = dd->rank;
1334 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
1336 /* We need to make an index to go from the coordinates
1337 * to the nodeid of this simulation.
1339 cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1340 std::vector<int> buf(dd->nnodes);
1341 if (thisRankHasDuty(cr, DUTY_PP))
1343 buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1345 /* Communicate the ddindex to simulation nodeid index */
1346 MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1348 /* Determine the master coordinates and rank.
1349 * The DD master should be the same node as the master of this sim.
1351 for (int i = 0; i < dd->nnodes; i++)
/* sim_nodeid 0 is the simulation master; map its DD index back to coordinates */
1353 if (cartSetup.ddindex2simnodeid[i] == 0)
1355 ddindex2xyz(dd->numCells, i, dd->master_ci);
1356 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
1361 fprintf(debug, "The master rank is %d\n", dd->masterrank);
1366 /* No Cartesian communicators */
1367 /* We use the rank in dd->comm->all as DD index */
1368 ddindex2xyz(dd->numCells, dd->rank, dd->ci);
1369 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
1371 clear_ivec(dd->master_ci);
1376 .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
1384 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
/*! \brief Builds the DD-index -> simulation-nodeid lookup on ranks that did not
 * create it in make_pp_communicator (Cartesian PP without a PP/PME Cartesian split).
 *
 * Each PP rank writes its sim_nodeid into its own DD-index slot of a zero-filled
 * buffer; a sum-allreduce then assembles the complete table on every rank.
 * The GMX_UNUSED_VALUE calls cover the non-MPI build, where the parameters are unused.
 */
1392 static void receive_ddindex2simnodeid(gmx_domdec_t* dd, t_commrec* cr)
1395 CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1397 if (!cartSetup.bCartesianPP_PME && cartSetup.bCartesianPP)
1399 cartSetup.ddindex2simnodeid.resize(dd->nnodes);
1400 std::vector<int> buf(dd->nnodes);
1401 if (thisRankHasDuty(cr, DUTY_PP))
1403 buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
1405 /* Communicate the ddindex to simulation nodeid index */
1406 MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
1409 GMX_UNUSED_VALUE(dd);
1410 GMX_UNUSED_VALUE(cr);
/*! \brief Splits the simulation communicator into PP and PME-only groups.
 *
 * Decides whether a combined Cartesian PP+PME layout is possible (the PME-only
 * ranks must tile evenly onto a y- or z-slab of the PP grid); otherwise falls
 * back to a plain MPI_Comm_split by rank duty, with the PME ranks placed
 * according to \p ddRankOrder (pp_pme or interleave).
 * Returns the resulting Cartesian rank setup; assigns cr->duty and
 * cr->mpi_comm_mygroup as a side effect.
 *
 * NOTE(review): interleaving lines are missing from this view; only visible
 * statements are documented.
 */
1414 static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
1416 const DdRankOrder ddRankOrder,
1417 bool gmx_unused reorder,
1418 const DDRankSetup& ddRankSetup,
1420 std::vector<int>* pmeRanks)
1422 CartesianRankSetup cartSetup;
1424 cartSetup.bCartesianPP = (ddRankOrder == DdRankOrder::cartesian);
1425 cartSetup.bCartesianPP_PME = false;
1427 const ivec& numDDCells = ddRankSetup.numPPCells;
1428 /* Initially we set ntot to the number of PP cells */
1429 copy_ivec(numDDCells, cartSetup.ntot);
1431 if (cartSetup.bCartesianPP)
1433 const int numDDCellsTot = ddRankSetup.numPPRanks;
/* bDiv[d]: can the PME ranks be spread evenly over the PP grid along dimension d? */
1435 for (int i = 1; i < DIM; i++)
1437 bDiv[i] = ((ddRankSetup.numRanksDoingPme * numDDCells[i]) % numDDCellsTot == 0);
1439 if (bDiv[YY] || bDiv[ZZ])
1441 cartSetup.bCartesianPP_PME = TRUE;
1442 /* If we have 2D PME decomposition, which is always in x+y,
1443 * we stack the PME only nodes in z.
1444 * Otherwise we choose the direction that provides the thinnest slab
1445 * of PME only nodes as this will have the least effect
1446 * on the PP communication.
1447 * But for the PME communication the opposite might be better.
1449 if (bDiv[ZZ] && (ddRankSetup.npmenodes_y > 1 || !bDiv[YY] || numDDCells[YY] > numDDCells[ZZ]))
1451 cartSetup.cartpmedim = ZZ;
1455 cartSetup.cartpmedim = YY;
/* Extend the Cartesian grid along the chosen dimension to hold the PME-only ranks */
1457 cartSetup.ntot[cartSetup.cartpmedim] +=
1458 (ddRankSetup.numRanksDoingPme * numDDCells[cartSetup.cartpmedim]) / numDDCellsTot;
1463 .appendTextFormatted(
1464 "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
1466 ddRankSetup.numRanksDoingPme,
1472 .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
1476 if (cartSetup.bCartesianPP_PME)
1483 .appendTextFormatted(
1484 "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
1487 cartSetup.ntot[ZZ]);
1489 for (int i = 0; i < DIM; i++)
1493 MPI_Comm comm_cart = MPI_COMM_NULL;
1494 MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
1495 MPI_Comm_rank(comm_cart, &rank);
/* Later code relies on sim rank 0 being the master; MPI reordering must not move it */
1496 if (MASTER(cr) && rank != 0)
1498 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
1501 /* With this assigment we loose the link to the original communicator
1502 * which will usually be MPI_COMM_WORLD, unless have multisim.
1504 cr->mpi_comm_mysim = comm_cart;
1505 cr->sim_nodeid = rank;
1507 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
1510 .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
/* Ranks inside the PP part of the grid do PP work; those beyond it are PME-only */
1516 if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
1520 if (!ddRankSetup.usePmeOnlyRanks
1521 || ddCellIndex[cartSetup.cartpmedim] >= numDDCells[cartSetup.cartpmedim])
1523 cr->duty = DUTY_PME;
1526 /* Split the sim communicator into PP and PME only nodes */
1527 MPI_Comm_split(cr->mpi_comm_mysim,
1528 getThisRankDuties(cr),
1529 dd_index(cartSetup.ntot, ddCellIndex),
1530 &cr->mpi_comm_mygroup);
1532 GMX_UNUSED_VALUE(ddCellIndex);
/* Non-Cartesian path: choose PME rank placement by the requested rank order */
1537 switch (ddRankOrder)
1539 case DdRankOrder::pp_pme:
1540 GMX_LOG(mdlog.info).appendText("Order of the ranks: PP first, PME last");
1542 case DdRankOrder::interleave:
1543 /* Interleave the PP-only and PME-only ranks */
1544 GMX_LOG(mdlog.info).appendText("Interleaving PP and PME ranks");
1545 *pmeRanks = dd_interleaved_pme_ranks(ddRankSetup);
1547 case DdRankOrder::cartesian: break;
1548 default: gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(ddRankOrder));
/* A rank with no assigned PME partner is itself a PME-only rank */
1551 if (dd_simnode2pmenode(ddRankSetup, cartSetup, *pmeRanks, cr, cr->sim_nodeid) == -1)
1553 cr->duty = DUTY_PME;
1560 /* Split the sim communicator into PP and PME only nodes */
1561 MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup);
1562 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
1567 .appendTextFormatted("This rank does only %s work.\n",
1568 thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
1573 /*! \brief Makes the PP communicator and the PME communicator, when needed
1575 * Returns the Cartesian rank setup.
1576 * Sets \p cr->mpi_comm_mygroup
1577 * For PP ranks, sets the DD PP cell index in \p ddCellIndex.
1578 * With separate PME ranks in interleaved order, set the PME ranks in \p pmeRanks.
/* Initializes group/sim communicators from the default communicator, then
 * delegates to split_communicator() only when separate PME ranks are in use;
 * otherwise marks the setup as non-Cartesian with all ranks doing PP+PME.
 * (See the Doxygen comment above for the full contract.) */
1580 static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
1581 const DDSettings& ddSettings,
1582 const DdRankOrder ddRankOrder,
1583 const DDRankSetup& ddRankSetup,
1586 std::vector<int>* pmeRanks)
1588 CartesianRankSetup cartSetup;
1590 // As a default, both group and sim communicators are equal to the default communicator
1591 cr->mpi_comm_mygroup = cr->mpiDefaultCommunicator;
1592 cr->mpi_comm_mysim = cr->mpiDefaultCommunicator;
1593 cr->nnodes = cr->sizeOfDefaultCommunicator;
1594 cr->nodeid = cr->rankInDefaultCommunicator;
1595 cr->sim_nodeid = cr->rankInDefaultCommunicator;
1597 if (ddRankSetup.usePmeOnlyRanks)
1599 /* Split the communicator into a PP and PME part */
1600 cartSetup = split_communicator(
1601 mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
1605 /* All nodes do PP and PME */
1606 /* We do not require separate communicators */
1607 cartSetup.bCartesianPP = false;
1608 cartSetup.bCartesianPP_PME = false;
1614 /*! \brief For PP ranks, sets or makes the communicator
1616 * For PME ranks get the rank id.
1617 * For PP only ranks, sets the PME-only rank.
/* PP ranks: create/copy the PP communicator (skipping Cartesian reorder if the
 * ranks were already reordered by the PP/PME split) and, when PME is on separate
 * ranks, record which PME rank serves this PP rank. PME-only ranks instead
 * receive the ddindex->simnodeid table. The master additionally allocates the
 * AtomDistribution used to scatter/gather the global state. */
1619 static void setupGroupCommunication(const gmx::MDLogger& mdlog,
1620 const DDSettings& ddSettings,
1621 gmx::ArrayRef<const int> pmeRanks,
1623 const int numAtomsInSystem,
1626 const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
1627 const CartesianRankSetup& cartSetup = dd->comm->cartesianRankSetup;
1629 if (thisRankHasDuty(cr, DUTY_PP))
1631 /* Copy or make a new PP communicator */
1633 /* We (possibly) reordered the nodes in split_communicator,
1634 * so it is no longer required in make_pp_communicator.
1636 const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME);
1638 make_pp_communicator(mdlog, dd, cr, useCartesianReorder);
1642 receive_ddindex2simnodeid(dd, cr);
1645 if (!thisRankHasDuty(cr, DUTY_PME))
1647 /* Set up the commnuication to our PME node */
1648 dd->pme_nodeid = dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, cr->sim_nodeid);
1649 dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
1653 "My pme_nodeid %d receive ener %s\n",
1655 gmx::boolToString(dd->pme_receive_vir_ener));
/* -1 marks "no separate PME rank" for this rank */
1660 dd->pme_nodeid = -1;
1663 /* We can not use DDMASTER(dd), because dd->masterrank is set later */
1666 dd->ma = std::make_unique<AtomDistribution>(dd->numCells, numAtomsInSystem, numAtomsInSystem);
/*! \brief Parses user-supplied static load-balancing cell-size fractions.
 *
 * \param dir          Dimension label ("x", "y" or "z"), used only for messages.
 * \param nc           Number of cells in this dimension.
 * \param size_string  Whitespace-separated list of nc relative sizes, or nullptr.
 * \returns An snew-allocated array of nc normalized fractions, or nullptr when
 *          nc <= 1 or no string was given (then no static balancing applies).
 * NOTE(review): the normalization and allocation lines fall outside this view.
 */
1670 static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
1672 real* slb_frac = nullptr;
1673 if (nc > 1 && size_string != nullptr)
1675 GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
1678 for (int i = 0; i < nc; i++)
/* %n records how many chars were consumed so the scan can advance through the string */
1682 sscanf(size_string, "%20lf%n", &dbl, &n);
1686 "Incorrect or not enough DD cell size entries for direction %s: '%s'",
1695 std::string relativeCellSizes = "Relative cell sizes:";
1696 for (int i = 0; i < nc; i++)
1699 relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
1701 GMX_LOG(mdlog.info).appendText(relativeCellSizes);
/*! \brief Counts bonded interactions that involve more than two atoms.
 *
 * Iterates over all interaction lists with the IF_BOND flag and sums, over all
 * molecules, the number of interactions whose function type has > 2 atoms.
 * Used to decide whether multi-body bonded communication is needed.
 */
1707 static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
1710 for (const auto ilists : IListRange(mtop))
1712 for (auto& ilist : extractILists(ilists.list(), IF_BOND))
1714 if (NRAL(ilist.functionType) > 2)
/* iatoms stores type+atom indices per interaction; dividing by the stride gives the count */
1716 n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
/*! \brief Reads an integer from environment variable \p env_var.
 *
 * Returns \p def when the variable is unset; logs the value actually used.
 * NOTE(review): the fallback assignment on parse failure is outside this view.
 */
1724 static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
1727 char* val = getenv(env_var);
1730 if (sscanf(val, "%20d", &nst) <= 0)
1734 GMX_LOG(mdlog.info).appendTextFormatted("Found env.var. %s = %s, using value %d", env_var, val, nst);
/*! \brief Fatal-errors (or warns) on input combinations DD cannot handle.
 *
 * Screw PBC restricts decomposition to the x-direction only; nstlist=0 is
 * incompatible with DD; angular COM removal with PBC only gets a warning
 * because it may still be correct if the comm group stays whole.
 */
1740 static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
1742 if (inputrec.pbcType == PbcType::Screw
1743 && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
1746 "With pbc=%s can only do domain decomposition in the x-direction",
1747 c_pbcTypeNames[inputrec.pbcType].c_str());
1750 if (inputrec.nstlist == 0)
1752 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
1755 if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
1757 GMX_LOG(mdlog.warning)
1759 "comm-mode angular will give incorrect results when the comm group "
1760 "partially crosses a periodic boundary");
/*! \brief Returns the smallest average cell size over the decomposed dimensions.
 *
 * For each dimension with more than one domain, the average cell extent is the
 * (skew-corrected) box size divided by the domain count; the minimum over
 * dimensions is returned. Dimensions with a single domain are ignored.
 */
1764 static real average_cellsize_min(const gmx_ddbox_t& ddbox, const ivec numDomains)
1766 real r = ddbox.box_size[XX];
1767 for (int d = 0; d < DIM; d++)
1769 if (numDomains[d] > 1)
1771 /* Check using the initial average cell size */
1772 r = std::min(r, ddbox.box_size[d] * ddbox.skew_fac[d] / numDomains[d]);
1779 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
/* If the user explicitly requested DLB (onUser) this is a fatal error;
 * if DLB was merely auto (offCanTurnOn) it is disabled permanently with a note.
 * \p reasonStr is appended to the respective message. Returns the resulting
 * (off) DLB state for the non-fatal path. */
1781 static DlbState forceDlbOffOrBail(DlbState cmdlineDlbState,
1782 const std::string& reasonStr,
1783 const gmx::MDLogger& mdlog)
1785 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
1786 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
1788 if (cmdlineDlbState == DlbState::onUser)
1790 gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
1792 else if (cmdlineDlbState == DlbState::offCanTurnOn)
1794 GMX_LOG(mdlog.info).appendText(dlbDisableNote + reasonStr);
1796 return DlbState::offForever;
1799 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
1801 * This function parses the parameters of "-dlb" command line option setting
1802 * corresponding state values. Then it checks the consistency of the determined
1803 * state with other run parameters and settings. As a result, the initial state
1804 * may be altered or an error may be thrown if incompatibility of options is detected.
1806 * \param [in] mdlog Logger.
1807 * \param [in] dlbOption Enum value for the DLB option.
1808 * \param [in] bRecordLoad True if the load balancer is recording load information.
1809 * \param [in] mdrunOptions Options for mdrun.
1810 * \param [in] inputrec Pointer mdrun to input parameters.
1811 * \param [in] directGpuCommUsedWithGpuUpdate Direct GPU halo exchange and GPU update enabled
1812 * \returns DLB initial/startup state.
/* Maps the -dlb option to an initial state, then forces DLB off (or errors,
 * via forceDlbOffOrBail) for each unsupported condition: GPU direct comm with
 * GPU update, reruns, non-dynamical integrators, missing cycle counters, and
 * reproducible runs. See the Doxygen comment above for parameter docs. */
1814 static DlbState determineInitialDlbState(const gmx::MDLogger& mdlog,
1815 DlbOption dlbOption,
1816 gmx_bool bRecordLoad,
1817 const gmx::MdrunOptions& mdrunOptions,
1818 const t_inputrec& inputrec,
1819 const bool directGpuCommUsedWithGpuUpdate)
1821 DlbState dlbState = DlbState::offCanTurnOn;
1825 case DlbOption::turnOnWhenUseful: dlbState = DlbState::offCanTurnOn; break;
1826 case DlbOption::no: dlbState = DlbState::offUser; break;
1827 case DlbOption::yes: dlbState = DlbState::onUser; break;
1828 default: gmx_incons("Invalid dlbOption enum value");
1831 // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple timesteps
1832 // DLB needs to be disabled in that case
1833 if (directGpuCommUsedWithGpuUpdate)
1835 std::string reasonStr =
1836 "it is not supported with GPU direct communication + GPU update enabled.";
1837 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1840 /* Reruns don't support DLB: bail or override auto mode */
1841 if (mdrunOptions.rerun)
1843 std::string reasonStr = "it is not supported in reruns.";
1844 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1847 /* Unsupported integrators */
1848 if (!EI_DYNAMICS(inputrec.eI))
1851 gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
1852 enumValueToString(inputrec.eI));
1853 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1856 /* Without cycle counters we can't time work to balance on */
1859 std::string reasonStr =
1860 "cycle counters unsupported or not enabled in the operating system kernel.";
1861 return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1864 if (mdrunOptions.reproducible)
1866 std::string reasonStr = "you started a reproducible run.";
/* Only auto mode (offCanTurnOn) and explicit onUser need handling; the
 * remaining enum values cannot occur as an initial state. */
1869 case DlbState::offUser: break;
1870 case DlbState::offForever:
1871 GMX_RELEASE_ASSERT(false, "DlbState::offForever is not a valid initial state");
1873 case DlbState::offCanTurnOn: return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
1874 case DlbState::onCanTurnOff:
1875 GMX_RELEASE_ASSERT(false, "DlbState::offCanTurnOff is not a valid initial state");
1877 case DlbState::onUser:
1878 return forceDlbOffOrBail(
1881 + " In load balanced runs binary reproducibility cannot be "
1886 "Death horror: undefined case (%d) for load balancing choice",
1887 static_cast<int>(dlbState));
/*! \brief Allocates and zero-initializes the DD communication bookkeeping struct.
 *
 * Clears the load-measurement counters, the per-range atom-count sums and the
 * load limits, and allocates the DLB balance region. Caller owns the result.
 */
1894 static gmx_domdec_comm_t* init_dd_comm()
1896 gmx_domdec_comm_t* comm = new gmx_domdec_comm_t;
1898 comm->n_load_have = 0;
1899 comm->n_load_collect = 0;
1901 comm->haveTurnedOffDlb = false;
1903 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
1905 comm->sum_nat[i] = 0;
1909 comm->load_step = 0;
1912 clear_ivec(comm->load_lim);
1916 /* This should be replaced by a unique pointer */
1917 comm->balanceRegion = ddBalanceRegionAllocate();
/*! \brief Stores the update-group setup in \p systemInfo and logs a summary.
 *
 * Copies the per-moleculetype groupings, the use flag and the maximum group
 * radius into \p systemInfo; when update groups are in use, counts the total
 * number of groups over all molecule blocks for the log message.
 */
1922 static void setupUpdateGroups(const gmx::MDLogger& mdlog,
1923 const gmx_mtop_t& mtop,
1924 ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
1925 const bool useUpdateGroups,
1926 const real maxUpdateGroupRadius,
1927 DDSystemInfo* systemInfo)
1929 systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
1930 systemInfo->useUpdateGroups = useUpdateGroups;
1931 systemInfo->maxUpdateGroupRadius = maxUpdateGroupRadius;
1933 if (systemInfo->useUpdateGroups)
1935 int numUpdateGroups = 0;
1936 for (const auto& molblock : mtop.molblock)
1938 numUpdateGroups += molblock.nmol
1939 * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
1943 .appendTextFormatted(
1944 "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
1947 mtop.natoms / static_cast<double>(numUpdateGroups),
1948 systemInfo->maxUpdateGroupRadius);
/* Derives the unit-cell characteristics from the input record: the number of
 * PBC dimensions, the number of bounded dimensions, whether the DD box can
 * change during the run (unbounded dimension or dynamic box), and screw PBC. */
1952 UnitCellInfo::UnitCellInfo(const t_inputrec& ir) :
1953 npbcdim(numPbcDimensions(ir.pbcType)),
1954 numBoundedDimensions(inputrec2nboundeddim(&ir)),
1955 ddBoxIsDynamic(numBoundedDimensions < DIM || inputrecDynamicBox(&ir)),
1956 haveScrewPBC(ir.pbcType == PbcType::Screw)
1960 /* Returns whether molecules are always whole, i.e. not broken by PBC */
/* With update groups: molecules are whole iff every molecule type forms a
 * single update group. Without update groups, any multi-atom molecule type can
 * be broken by PBC. NOTE(review): the return statements inside the loops are
 * outside this view of the file. */
1961 static bool moleculesAreAlwaysWhole(const gmx_mtop_t& mtop,
1962 const bool useUpdateGroups,
1963 gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
1965 if (useUpdateGroups)
1967 GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
1968 "Need one grouping per moltype")
1971 if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
1979 for (const auto& moltype : mtop.moltype)
1981 if (moltype.atoms.nr > 1)
1991 /*! \brief Generate the simulation system information */
/* Collects everything DD needs to know about the simulated system: update
 * groups, whether molecules stay whole, inter-domain (multi-body) bondeds,
 * split constraints/settles, the effective cut-off, and the minimum cell size
 * limit (from atom displacement, bonded distances and P-LINCS).
 * On the master the bonded distances are measured from xGlobal and broadcast.
 * NOTE(review): interleaving lines are missing from this view; only visible
 * statements are documented. */
1992 static DDSystemInfo getSystemInfo(const gmx::MDLogger& mdlog,
1994 MPI_Comm communicator,
1995 const DomdecOptions& options,
1996 const gmx_mtop_t& mtop,
1997 const t_inputrec& ir,
1999 ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2000 const bool useUpdateGroups,
2001 const real maxUpdateGroupRadius,
2002 gmx::ArrayRef<const gmx::RVec> xGlobal)
2004 const real tenPercentMargin = 1.1;
2006 DDSystemInfo systemInfo;
2009 mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
2011 systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
2012 mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
2013 systemInfo.haveInterDomainBondeds =
2014 (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
2015 systemInfo.haveInterDomainMultiBodyBondeds =
2016 (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
/* Update groups keep constrained/settled atoms together, so no splitting then */
2018 if (systemInfo.useUpdateGroups)
2020 systemInfo.mayHaveSplitConstraints = false;
2021 systemInfo.mayHaveSplitSettles = false;
2025 systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
2026 || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
2027 systemInfo.mayHaveSplitSettles = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
2032 /* Set the cut-off to some very large value,
2033 * so we don't need if statements everywhere in the code.
2034 * We use sqrt, since the cut-off is squared in some places.
2036 systemInfo.cutoff = GMX_CUTOFF_INF;
2040 systemInfo.cutoff = atomToAtomIntoDomainToDomainCutoff(systemInfo, ir.rlist);
2042 systemInfo.minCutoffForMultiBody = 0;
2044 /* Determine the minimum cell size limit, affected by many factors */
2045 systemInfo.cellsizeLimit = 0;
2046 systemInfo.filterBondedCommunication = false;
2048 /* We do not allow home atoms to move beyond the neighboring domain
2049 * between domain decomposition steps, which limits the cell size.
2050 * Get an estimate of cell size limit due to atom displacement.
2051 * In most cases this is a large overestimate, because it assumes
2052 * non-interaction atoms.
2053 * We set the chance to 1 in a trillion steps.
2055 constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
2056 const real limitForAtomDisplacement = minCellSizeForAtomDisplacement(
2057 mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
2058 GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
2060 systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
2062 /* TODO: PME decomposition currently requires atoms not to be more than
2063 * 2/3 of comm->cutoff, which is >=rlist, outside of their domain.
2064 * In nearly all cases, limitForAtomDisplacement will be smaller
2065 * than 2/3*rlist, so the PME requirement is satisfied.
2066 * But it would be better for both correctness and performance
2067 * to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
2068 * Note that we would need to improve the pairlist buffer case.
2071 if (systemInfo.haveInterDomainBondeds)
2073 if (options.minimumCommunicationRange > 0)
/* User-supplied -rdd: take it as the multi-body cut-off and, with bonded
 * communication enabled, turn on filtering when it exceeds the nb cut-off */
2075 systemInfo.minCutoffForMultiBody =
2076 atomToAtomIntoDomainToDomainCutoff(systemInfo, options.minimumCommunicationRange);
2077 if (options.useBondedCommunication)
2079 systemInfo.filterBondedCommunication =
2080 (systemInfo.minCutoffForMultiBody > systemInfo.cutoff);
2084 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2087 else if (ir.bPeriodicMols)
2089 /* Can not easily determine the required cut-off */
2090 GMX_LOG(mdlog.warning)
2092 "NOTE: Periodic molecules are present in this system. Because of this, "
2093 "the domain decomposition algorithm cannot easily determine the "
2094 "minimum cell size that it requires for treating bonded interactions. "
2095 "Instead, domain decomposition will assume that half the non-bonded "
2096 "cut-off will be a suitable lower bound.");
2097 systemInfo.minCutoffForMultiBody = systemInfo.cutoff / 2;
/* Otherwise measure the actual bonded distances (two-body r_2b, multi-body r_mb) */
2104 if (ddRole == DDRole::Master)
2106 dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
2108 gmx_bcast(sizeof(r_2b), &r_2b, communicator);
2109 gmx_bcast(sizeof(r_mb), &r_mb, communicator);
2111 /* We use an initial margin of 10% for the minimum cell size,
2112 * except when we are just below the non-bonded cut-off.
2114 if (options.useBondedCommunication)
2116 if (std::max(r_2b, r_mb) > systemInfo.cutoff)
2118 const real r_bonded = std::max(r_2b, r_mb);
2119 systemInfo.minCutoffForMultiBody = tenPercentMargin * r_bonded;
2120 /* This is the (only) place where we turn on the filtering */
2121 systemInfo.filterBondedCommunication = true;
2125 const real r_bonded = r_mb;
2126 systemInfo.minCutoffForMultiBody =
2127 std::min(tenPercentMargin * r_bonded, systemInfo.cutoff);
2129 /* We determine cutoff_mbody later */
2130 systemInfo.increaseMultiBodyCutoff = true;
2134 /* No special bonded communication,
2135 * simply increase the DD cut-off.
2137 systemInfo.minCutoffForMultiBody = tenPercentMargin * std::max(r_2b, r_mb);
2138 systemInfo.cutoff = std::max(systemInfo.cutoff, systemInfo.minCutoffForMultiBody);
2142 .appendTextFormatted("Minimum cell size due to bonded interactions: %.3f nm",
2143 systemInfo.minCutoffForMultiBody);
2145 systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.minCutoffForMultiBody);
2148 systemInfo.constraintCommunicationRange = 0;
2149 if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
2151 /* There is a cell size limit due to the constraints (P-LINCS) */
2152 systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
2154 .appendTextFormatted("Estimated maximum distance required for P-LINCS: %.3f nm",
2155 systemInfo.constraintCommunicationRange);
2156 if (systemInfo.constraintCommunicationRange > systemInfo.cellsizeLimit)
2160 "This distance will limit the DD cell size, you can override this with "
2164 else if (options.constraintCommunicationRange > 0)
2166 /* Here we do not check for dd->splitConstraints.
2167 * because one can also set a cell size limit for virtual sites only
2168 * and at this point we don't know yet if there are intercg v-sites.
2171 .appendTextFormatted("User supplied maximum distance required for P-LINCS: %.3f nm",
2172 options.constraintCommunicationRange);
2173 systemInfo.constraintCommunicationRange = options.constraintCommunicationRange;
2175 systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, systemInfo.constraintCommunicationRange);
2180 /*! \brief Exit with a fatal error if the DDGridSetup cannot be
/* Validates the chosen DD grid against the box and cell-size limit, issuing
 * collective fatal errors when: no compatible decomposition exists (suggesting
 * the mdrun option -rdd/-rcon/-dds to adjust), the initial cell size is below
 * the limit, the PP/PME rank split is inconsistent, or there are more PME-only
 * ranks than PP ranks. */
2182 static void checkDDGridSetup(const DDGridSetup& ddGridSetup,
2184 MPI_Comm communicator,
2186 const DomdecOptions& options,
2187 const DDSettings& ddSettings,
2188 const DDSystemInfo& systemInfo,
2189 const real cellsizeLimit,
2190 const gmx_ddbox_t& ddbox)
/* numDomains[XX] == 0 means the grid search failed to find any decomposition */
2192 if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
2194 const bool bC = (systemInfo.mayHaveSplitConstraints
2195 && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody)
2196 std::string message =
2197 gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
2198 !bC ? "-rdd" : "-rcon",
2199 ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
2200 bC ? " or your LINCS settings" : "");
2202 gmx_fatal_collective(FARGS,
2204 ddRole == DDRole::Master,
2205 "There is no domain decomposition for %d ranks that is compatible "
2206 "with the given box and a minimum cell size of %g nm\n"
2208 "Look in the log file for details on the domain decomposition",
2209 numNodes - ddGridSetup.numPmeOnlyRanks,
2214 const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
2215 if (acs < cellsizeLimit)
/* With an automatically chosen grid this would indicate an internal bug */
2217 if (options.numCells[XX] <= 0)
2221 "dd_choose_grid() should return a grid that satisfies the cell size limits");
2225 gmx_fatal_collective(
2228 ddRole == DDRole::Master,
2229 "The initial cell size (%f) is smaller than the cell size limit (%f), change "
2230 "options -dd, -rdd or -rcon, see the log file for details",
2236 const int numPPRanks =
2237 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2238 if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
2240 gmx_fatal_collective(FARGS,
2242 ddRole == DDRole::Master,
2243 "The size of the domain decomposition grid (%d) does not match the "
2244 "number of PP ranks (%d). The total number of ranks is %d",
2246 numNodes - ddGridSetup.numPmeOnlyRanks,
2249 if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
2251 gmx_fatal_collective(FARGS,
2253 ddRole == DDRole::Master,
2254 "The number of separate PME ranks (%d) is larger than the number of "
2255 "PP ranks (%d), this is not supported.",
2256 ddGridSetup.numPmeOnlyRanks,
2261 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
/* Builds the DDRankSetup from the chosen grid: number of PP ranks, number of
 * ranks doing PME (separate ranks if any, otherwise all PP ranks), and the
 * PME x/y decomposition. 2D PME decomposition is used only when DD decomposes
 * x then y, the PME rank count tiles numDomains[XX], and GMX_PMEONEDD is unset;
 * otherwise PME is decomposed in 1D along the first DD dimension. */
2262 static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
2264 const DdRankOrder rankOrder,
2265 const DDGridSetup& ddGridSetup,
2266 const t_inputrec& ir)
2269 .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
2270 ddGridSetup.numDomains[XX],
2271 ddGridSetup.numDomains[YY],
2272 ddGridSetup.numDomains[ZZ],
2273 ddGridSetup.numPmeOnlyRanks);
2275 DDRankSetup ddRankSetup;
2277 ddRankSetup.rankOrder = rankOrder;
2279 ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
2280 copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
2282 ddRankSetup.usePmeOnlyRanks = (ddGridSetup.numPmeOnlyRanks > 0);
2283 if (ddRankSetup.usePmeOnlyRanks)
2285 ddRankSetup.numRanksDoingPme = ddGridSetup.numPmeOnlyRanks;
2289 ddRankSetup.numRanksDoingPme =
2290 ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
2293 if (EEL_PME(ir.coulombtype) || EVDW_PME(ir.vdwtype))
2295 /* The following choices should match those
2296 * in comm_cost_est in domdec_setup.c.
2297 * Note that here the checks have to take into account
2298 * that the decomposition might occur in a different order than xyz
2299 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
2300 * in which case they will not match those in comm_cost_est,
2301 * but since that is mainly for testing purposes that's fine.
2303 if (ddGridSetup.numDDDimensions >= 2 && ddGridSetup.ddDimensions[0] == XX
2304 && ddGridSetup.ddDimensions[1] == YY
2305 && ddRankSetup.numRanksDoingPme > ddGridSetup.numDomains[XX]
2306 && ddRankSetup.numRanksDoingPme % ddGridSetup.numDomains[XX] == 0
2307 && getenv("GMX_PMEONEDD") == nullptr)
2309 ddRankSetup.npmedecompdim = 2;
2310 ddRankSetup.npmenodes_x = ddGridSetup.numDomains[XX];
2311 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme / ddRankSetup.npmenodes_x;
2315 /* In case nc is 1 in both x and y we could still choose to
2316 * decompose pme in y instead of x, but we use x for simplicity.
2318 ddRankSetup.npmedecompdim = 1;
2319 if (ddGridSetup.ddDimensions[0] == YY)
2321 ddRankSetup.npmenodes_x = 1;
2322 ddRankSetup.npmenodes_y = ddRankSetup.numRanksDoingPme;
2326 ddRankSetup.npmenodes_x = ddRankSetup.numRanksDoingPme;
2327 ddRankSetup.npmenodes_y = 1;
2331 .appendTextFormatted("PME domain decomposition: %d x %d x %d",
2332 ddRankSetup.npmenodes_x,
2333 ddRankSetup.npmenodes_y,
/* No PME electrostatics/VdW: zero out the PME decomposition fields */
2338 ddRankSetup.npmedecompdim = 0;
2339 ddRankSetup.npmenodes_x = 0;
2340 ddRankSetup.npmenodes_y = 0;
2346 /*! \brief Set the cell size and interaction limits */
2347 static void set_dd_limits(const gmx::MDLogger& mdlog,
2350 const DomdecOptions& options,
2351 const DDSettings& ddSettings,
2352 const DDSystemInfo& systemInfo,
2353 const DDGridSetup& ddGridSetup,
2354 const int numPPRanks,
2355 const gmx_mtop_t& mtop,
2356 const t_inputrec& ir,
2357 const gmx_ddbox_t& ddbox)
2359 gmx_domdec_comm_t* comm = dd->comm;
2360 comm->ddSettings = ddSettings;
2362 /* Initialize to GPU share count to 0, might change later */
2363 comm->nrank_gpu_shared = 0;
2365 comm->dlbState = comm->ddSettings.initialDlbState;
2366 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
2367 /* To consider turning DLB on after 2*nstlist steps we need to check
2368 * at partitioning count 3. Thus we need to increase the first count by 2.
2370 comm->ddPartioningCountFirstDlbOff += 2;
2372 comm->bPMELoadBalDLBLimits = FALSE;
2374 /* Allocate the charge group/atom sorting struct */
2375 comm->sort = std::make_unique<gmx_domdec_sort_t>();
2377 comm->systemInfo = systemInfo;
2379 if (systemInfo.useUpdateGroups)
2381 /* Note: We would like to use dd->nnodes for the atom count estimate,
2382 * but that is not yet available here. But this anyhow only
2383 * affect performance up to the second dd_partition_system call.
2385 const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
2386 comm->updateGroupsCog = std::make_unique<gmx::UpdateGroupsCog>(
2387 mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
2390 /* Set the DD setup given by ddGridSetup */
2391 copy_ivec(ddGridSetup.numDomains, dd->numCells);
2392 dd->ndim = ddGridSetup.numDDDimensions;
2393 copy_ivec(ddGridSetup.ddDimensions, dd->dim);
2395 dd->nnodes = dd->numCells[XX] * dd->numCells[YY] * dd->numCells[ZZ];
2397 snew(comm->slb_frac, DIM);
2398 if (isDlbDisabled(comm))
2400 comm->slb_frac[XX] = get_slb_frac(mdlog, "x", dd->numCells[XX], options.cellSizeX);
2401 comm->slb_frac[YY] = get_slb_frac(mdlog, "y", dd->numCells[YY], options.cellSizeY);
2402 comm->slb_frac[ZZ] = get_slb_frac(mdlog, "z", dd->numCells[ZZ], options.cellSizeZ);
2405 /* Set the multi-body cut-off and cellsize limit for DLB */
2406 comm->cutoff_mbody = systemInfo.minCutoffForMultiBody;
2407 comm->cellsize_limit = systemInfo.cellsizeLimit;
2408 if (systemInfo.haveInterDomainBondeds && systemInfo.increaseMultiBodyCutoff)
2410 if (systemInfo.filterBondedCommunication || !isDlbDisabled(comm))
2412 /* Set the bonded communication distance to halfway
2413 * the minimum and the maximum,
2414 * since the extra communication cost is nearly zero.
2416 real acs = average_cellsize_min(ddbox, dd->numCells);
2417 comm->cutoff_mbody = 0.5 * (systemInfo.minCutoffForMultiBody + acs);
2418 if (!isDlbDisabled(comm))
2420 /* Check if this does not limit the scaling */
2421 comm->cutoff_mbody = std::min(comm->cutoff_mbody, options.dlbScaling * acs);
2423 if (!systemInfo.filterBondedCommunication)
2425 /* Without bBondComm do not go beyond the n.b. cut-off */
2426 comm->cutoff_mbody = std::min(comm->cutoff_mbody, systemInfo.cutoff);
2427 if (comm->cellsize_limit >= systemInfo.cutoff)
2429 /* We don't lose a lot of efficiency
2430 * when increasing it to the n.b. cut-off.
2431 * It can even be slightly faster, because we need
2432 * less checks for the communication setup.
2434 comm->cutoff_mbody = systemInfo.cutoff;
2437 /* Check if we did not end up below our original limit */
2438 comm->cutoff_mbody = std::max(comm->cutoff_mbody, systemInfo.minCutoffForMultiBody);
2440 if (comm->cutoff_mbody > comm->cellsize_limit)
2442 comm->cellsize_limit = comm->cutoff_mbody;
2445 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
2451 "Bonded atom communication beyond the cut-off: %s\n"
2452 "cellsize limit %f\n",
2453 gmx::boolToString(systemInfo.filterBondedCommunication),
2454 comm->cellsize_limit);
2457 if (ddRole == DDRole::Master)
2459 check_dd_restrictions(dd, ir, mdlog);
/*! \brief Writes a human-readable report of the current DD settings to \p log.
 *
 * Prints, per DD dimension, the communication pulse counts, the minimum
 * cell size, the allowed DLB cell shrink, and the maximum allowed distance
 * for each interaction class (non-bonded, two-body/multi-body bonded,
 * virtual sites, split constraints/settles).
 *
 * NOTE(review): several lines of this function (braces, the bDynLoadBal
 * branch, the declaration of `np` and `limit`) appear to have been elided
 * by extraction; the code lines below are left byte-identical.
 */
2463 static void writeSettings(gmx::TextWriter* log,
2465                           const gmx_mtop_t& mtop,
2466                           const t_inputrec& ir,
2467                           gmx_bool bDynLoadBal,
2469                           const gmx_ddbox_t* ddbox)
2471 gmx_domdec_comm_t* comm = dd->comm;
/* DLB branch: report the DLB pulse counts and cell-size limits */
2475 log->writeString("The maximum number of communication pulses is:");
2476 for (int d = 0; d < dd->ndim; d++)
2478 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
2480 log->ensureLineBreak();
2481 log->writeLineFormatted("The minimum size for domain decomposition cells is %.3f nm",
2482 comm->cellsize_limit);
2483 log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
2484 log->writeString("The allowed shrink of domain decomposition cells is:");
2485 for (int d = 0; d < DIM; d++)
2487 if (dd->numCells[d] > 1)
/* A 2-cell non-PBC dimension cannot shrink (special-cased below) */
2490 (d >= ddbox->npbcdim && dd->numCells[d] == 2)
2492 : comm->cellsize_min_dlb[d]
2493 / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
2494 log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
2497 log->ensureLineBreak();
/* Static (no-DLB) branch: compute and report the initial pulse counts */
2502 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
2503 log->writeString("The initial number of communication pulses is:");
2504 for (int d = 0; d < dd->ndim; d++)
2506 log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
2508 log->ensureLineBreak();
2509 log->writeString("The initial domain decomposition cell size is:");
2510 for (int d = 0; d < DIM; d++)
2512 if (dd->numCells[d] > 1)
2514 log->writeStringFormatted(" %c %.2f nm", dim2char(d), dd->comm->cellsize_min[d]);
2517 log->ensureLineBreak();
/* Report the maximum interaction distances, only when anything can cross
 * a domain boundary (bondeds, vsites, or split constraints/settles). */
2521 const bool haveInterDomainVsites =
2522 (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
2524 if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
2525 || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2527 std::string decompUnits;
2528 if (comm->systemInfo.useUpdateGroups)
2530 decompUnits = "atom groups";
2534 decompUnits = "atoms";
2537 log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
2538 decompUnits.c_str());
2539 log->writeLineFormatted(
2540 "%40s %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
/* `limit` is the distance limit used for vsites/constraints below;
 * with a dynamic box the initial minimum cell size is used instead. */
2545 limit = dd->comm->cellsize_limit;
2549 if (dd->unitCellInfo.ddBoxIsDynamic)
2552 "(the following are initial values, they could change due to box "
2555 limit = dd->comm->cellsize_min[XX];
2556 for (int d = 1; d < DIM; d++)
2558 limit = std::min(limit, dd->comm->cellsize_min[d]);
2562 if (comm->systemInfo.haveInterDomainBondeds)
2564 log->writeLineFormatted("%40s %-7s %6.3f nm",
2565 "two-body bonded interactions",
2567 std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
2568 log->writeLineFormatted("%40s %-7s %6.3f nm",
2569 "multi-body bonded interactions",
2571 (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
2572 ? comm->cutoff_mbody
2573 : std::min(comm->systemInfo.cutoff, limit));
2575 if (haveInterDomainVsites)
2577 log->writeLineFormatted("%40s %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
2579 if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
2581 std::string separation =
2582 gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
2583 log->writeLineFormatted("%40s %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
2585 log->ensureLineBreak();
/*! \brief Assembles the DD settings report and emits it through \p mdlog.
 *
 * Writes the settings for the current DLB state; when DLB is off but may
 * still be turned on, additionally writes the settings that would apply
 * once DLB activates (second writeSettings() call with DLB forced true).
 */
2589 static void logSettings(const gmx::MDLogger& mdlog,
2591 const gmx_mtop_t& mtop,
2592 const t_inputrec& ir,
2594 const gmx_ddbox_t* ddbox)
2596 gmx::StringOutputStream stream;
2597 gmx::TextWriter log(&stream);
2598 writeSettings(&log, dd, mtop, ir, isDlbOn(dd->comm), dlb_scale, ddbox);
2599 if (dd->comm->dlbState == DlbState::offCanTurnOn)
2602 log.ensureEmptyLine();
2604 "When dynamic load balancing gets turned on, these settings will change to:");
2606 writeSettings(&log, dd, mtop, ir, true, dlb_scale, ddbox);
/* Flush the buffered text as a single paragraph to the log */
2608 GMX_LOG(mdlog.info).asParagraph().appendText(stream.toString());
/*! \brief Determines the per-dimension DLB communication pulse counts and
 * the resulting cell-size limits.
 *
 * Computes the maximum number of halo-communication pulses needed to cover
 * the cut-off, optionally reduced based on \c dlb_scale or overridden by
 * the GMX_DD_NPULSE environment variable, then derives
 * cellsize_limit / cellsize_min_dlb per dimension.
 *
 * NOTE(review): the declaration of `npulse` and several braces appear to
 * have been elided by extraction; code lines below are left byte-identical.
 */
2611 static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
2614 const t_inputrec& inputrec,
2615 const gmx_ddbox_t* ddbox)
2618 int npulse_d_max = 0;
2621 gmx_domdec_comm_t* comm = dd->comm;
/* rvdw/rcoulomb of 0 means there is effectively no cut-off */
2623 bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
2625 /* Determine the maximum number of comm. pulses in one dimension */
2627 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2629 /* Determine the maximum required number of grid pulses */
2630 if (comm->cellsize_limit >= comm->systemInfo.cutoff)
2632 /* Only a single pulse is required */
2635 else if (!bNoCutOff && comm->cellsize_limit > 0)
2637 /* We round down slightly here to avoid overhead due to the latency
2638 * of extra communication calls when the cut-off
2639 * would be only slightly longer than the cell size.
2640 * Later cellsize_limit is redetermined,
2641 * so we can not miss interactions due to this rounding.
2643 npulse = static_cast<int>(0.96 + comm->systemInfo.cutoff / comm->cellsize_limit);
2647 /* There is no cell size limit */
2648 npulse = std::max(dd->numCells[XX] - 1, std::max(dd->numCells[YY] - 1, dd->numCells[ZZ] - 1));
2651 if (!bNoCutOff && npulse > 1)
2653 /* See if we can do with less pulses, based on dlb_scale */
2655 for (int d = 0; d < dd->ndim; d++)
2657 int dim = dd->dim[d];
2658 npulse_d = static_cast<int>(
2660 + dd->numCells[dim] * comm->systemInfo.cutoff
2661 / (ddbox->box_size[dim] * ddbox->skew_fac[dim] * dlb_scale));
2662 npulse_d_max = std::max(npulse_d_max, npulse_d);
2664 npulse = std::min(npulse, npulse_d_max);
2667 /* This env var can override npulse */
2668 const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
2671 npulse = ddPulseEnv;
/* With no PBC there is no strict limit from the pulse count, unless
 * some dimension gets capped below (bVacDLBNoLimit cleared). */
2675 comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
2676 for (int d = 0; d < dd->ndim; d++)
2678 comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
2679 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
2680 if (comm->cd[d].np_dlb < dd->numCells[dd->dim[d]] - 1)
2682 comm->bVacDLBNoLimit = FALSE;
2686 /* cellsize_limit is set for LINCS in init_domain_decomposition */
2687 if (!comm->bVacDLBNoLimit)
2689 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->systemInfo.cutoff / comm->maxpulse);
2691 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
2692 /* Set the minimum cell size for each DD dimension */
2693 for (int d = 0; d < dd->ndim; d++)
2695 if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
2697 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
2701 comm->cellsize_min_dlb[dd->dim[d]] = comm->systemInfo.cutoff / comm->cd[d].np_dlb;
2704 if (comm->cutoff_mbody <= 0)
2706 comm->cutoff_mbody = std::min(comm->systemInfo.cutoff, comm->cellsize_limit);
2714 bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd)
2716 return dd.comm->systemInfo.moleculesAreAlwaysWhole;
2719 bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
2721 /* If each molecule is a single charge group
2722 * or we use domain decomposition for each periodic dimension,
2723 * we do not need to take pbc into account for the bonded interactions.
2725 return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
2726 && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
2727 && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
2730 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
2731 static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
2734 const gmx_mtop_t& mtop,
2735 const t_inputrec& inputrec,
2736 const gmx_ddbox_t* ddbox)
2738 gmx_domdec_comm_t* comm = dd->comm;
2739 DDRankSetup& ddRankSetup = comm->ddRankSetup;
/* Initialize the PP-PME redistribution setup only when PME is in use */
2741 if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
2743 init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
2744 if (ddRankSetup.npmedecompdim >= 2)
2746 init_ddpme(dd, &ddRankSetup.ddpme[1], 1);
/* No PME: having separate PME ranks is a user error */
2751 ddRankSetup.numRanksDoingPme = 0;
2752 if (dd->pme_nodeid >= 0)
2754 gmx_fatal_collective(FARGS,
2757 "Can not have separate PME ranks without PME electrostatics");
/* NOTE(review): this fprintf is presumably guarded by `if (debug)` in an
 * elided line — confirm against the full source. */
2763 fprintf(debug, "The DD cut-off is %f\n", comm->systemInfo.cutoff);
2765 if (!isDlbDisabled(comm))
2767 set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
2770 logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
/* Estimate the fraction of the system volume this rank communicates,
 * used to size the global-to-local atom lookup below. */
2772 const real vol_frac = (inputrec.pbcType == PbcType::No)
2773 ? (1 - 1 / static_cast<double>(dd->nnodes))
2774 : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
2775 / static_cast<double>(dd->nnodes));
2778 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
2780 int natoms_tot = mtop.natoms;
2782 dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
2785 /*! \brief Get some important DD parameters which can be modified by env.vars */
2786 static DDSettings getDDSettings(const gmx::MDLogger& mdlog,
2787 const DomdecOptions& options,
2788 const gmx::MdrunOptions& mdrunOptions,
2789 const t_inputrec& ir,
2790 const bool directGpuCommUsedWithGpuUpdate)
2792 DDSettings ddSettings;
/* Read debugging/tuning switches from environment variables */
2794 ddSettings.useSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
2795 ddSettings.dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
2796 ddSettings.useDDOrderZYX = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
2797 ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
2798 ddSettings.eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
2799 const int recload = dd_getenv(mdlog, "GMX_DD_RECORD_LOAD", 1);
2800 ddSettings.nstDDDump = dd_getenv(mdlog, "GMX_DD_NST_DUMP", 0);
2801 ddSettings.nstDDDumpGrid = dd_getenv(mdlog, "GMX_DD_NST_DUMP_GRID", 0);
2802 ddSettings.DD_debug = dd_getenv(mdlog, "GMX_DD_DEBUG", 0);
2804 if (ddSettings.useSendRecv2)
2808 "Will use two sequential MPI_Sendrecv calls instead of two simultaneous "
2809 "non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite "
/* FLOP-based load balancing is a debugging aid; it forces load recording */
2813 if (ddSettings.eFlop)
2815 GMX_LOG(mdlog.info).appendText("Will load balance based on FLOP count");
2816 ddSettings.recordLoad = true;
2820 ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
2823 ddSettings.initialDlbState = determineInitialDlbState(
2824 mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir, directGpuCommUsedWithGpuUpdate);
2826 .appendTextFormatted("Dynamic load balancing: %s",
2827 enumValueToString(ddSettings.initialDlbState));
//! Constructor: only the unit-cell info depends on the input record here.
2832 gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
//! Out-of-line destructor so members with incomplete types in the header can be destroyed.
2834 gmx_domdec_t::~gmx_domdec_t() = default;
2839 // TODO once the functionality stablizes, move this class and
2840 // supporting functionality into builder.cpp
2841 /*! \brief Impl class for DD builder */
2842 class DomainDecompositionBuilder::Impl
//! Constructor: gathers settings, system info, grid and rank setup (see definition below).
2846 Impl(const MDLogger& mdlog,
2848 const DomdecOptions& options,
2849 const MdrunOptions& mdrunOptions,
2850 const gmx_mtop_t& mtop,
2851 const t_inputrec& ir,
2852 const MDModulesNotifiers& notifiers,
2854 ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2855 bool useUpdateGroups,
2856 real maxUpdateGroupRadius,
2857 ArrayRef<const RVec> xGlobal,
2858 bool useGpuForNonbonded,
2860 bool directGpuCommUsedWithGpuUpdate);
2862 //! Build the resulting DD manager
2863 gmx_domdec_t* build(LocalAtomSetManager* atomSets,
2864 const gmx_localtop_t& localTopology,
2865 const t_state& localState,
2866 ObservablesReducerBuilder* observablesReducerBuilder);
2868 //! Objects used in constructing and configuring DD
2871 const MDLogger& mdlog_;
2872 //! Communication object
2874 //! User-supplied options configuring DD behavior
2875 const DomdecOptions options_;
2876 //! Global system topology
2877 const gmx_mtop_t& mtop_;
2878 //! User input values from the tpr file
2879 const t_inputrec& ir_;
2880 //! MdModules object
2881 const MDModulesNotifiers& notifiers_;
2884 //! Internal objects used in constructing DD
2886 //! Settings combined from the user input
2887 DDSettings ddSettings_;
2888 //! Information derived from the simulation system
2889 DDSystemInfo systemInfo_;
//! Box geometry used during setup (zero-initialized until computed)
2891 gmx_ddbox_t ddbox_ = { 0 };
2892 //! Organization of the DD grids
2893 DDGridSetup ddGridSetup_;
2894 //! Organization of the DD ranks
2895 DDRankSetup ddRankSetup_;
2896 //! Number of DD cells in each dimension
2897 ivec ddCellIndex_ = { 0, 0, 0 };
2898 //! IDs of PME-only ranks
2899 std::vector<int> pmeRanks_;
2900 //! Contains a valid Cartesian-communicator-based setup, or defaults.
2901 CartesianRankSetup cartSetup_;
/*! \brief Collects all setup information needed to later build the DD manager.
 *
 * Order of operations: read env-var settings, derive system info, validate
 * the requested rank counts, choose the DD grid, derive the rank setup and
 * create the group communicators (which also assigns PP/PME duties).
 *
 * NOTE(review): some parameter and argument lines appear elided by
 * extraction (e.g. cr, useGpuForPme); code lines below are unchanged.
 */
2905 DomainDecompositionBuilder::Impl::Impl(const MDLogger& mdlog,
2907 const DomdecOptions& options,
2908 const MdrunOptions& mdrunOptions,
2909 const gmx_mtop_t& mtop,
2910 const t_inputrec& ir,
2911 const MDModulesNotifiers& notifiers,
2913 ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
2914 const bool useUpdateGroups,
2915 const real maxUpdateGroupRadius,
2916 ArrayRef<const RVec> xGlobal,
2917 bool useGpuForNonbonded,
2919 bool directGpuCommUsedWithGpuUpdate) :
2920 mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
2922 GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator)
2924 ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_, directGpuCommUsedWithGpuUpdate);
2926 if (ddSettings_.eFlop > 1)
2928 /* Ensure that we have different random flop counts on different ranks */
2929 srand(1 + cr_->rankInDefaultCommunicator);
2932 systemInfo_ = getSystemInfo(mdlog_,
2933 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2934 cr->mpiDefaultCommunicator,
2939 updateGroupingPerMoleculeType,
2941 maxUpdateGroupRadius,
2944 const int numRanksRequested = cr_->sizeOfDefaultCommunicator;
/* Only auto-chosen grids need the large-prime-factor check */
2945 const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
2948 /* Checks for ability to use PME-only ranks */
2949 auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
2950 notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
2952 /* Checks for validity of requested Ranks setup */
2953 checkForValidRankCountRequests(numRanksRequested,
2954 EEL_PME(ir_.coulombtype) | EVDW_PME(ir_.vdwtype),
2955 options_.numPmeRanks,
2956 separatePmeRanksPermitted,
2957 checkForLargePrimeFactors);
2959 // DD grid setup uses a different cell size limit for
2960 // automated setup than the one in systemInfo_. The latter is used
2961 // in set_dd_limits() to configure DLB, for example.
2962 const real gridSetupCellsizeLimit =
2963 getDDGridSetupCellSizeLimit(mdlog_,
2964 !isDlbDisabled(ddSettings_.initialDlbState),
2965 options_.dlbScaling,
2967 systemInfo_.cellsizeLimit);
2968 ddGridSetup_ = getDDGridSetup(mdlog_,
2969 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2970 cr->mpiDefaultCommunicator,
2975 gridSetupCellsizeLimit,
2978 separatePmeRanksPermitted,
2982 checkDDGridSetup(ddGridSetup_,
2983 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
2984 cr->mpiDefaultCommunicator,
2985 cr->sizeOfDefaultCommunicator,
2989 gridSetupCellsizeLimit,
2992 cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
2994 ddRankSetup_ = getDDRankSetup(
2995 mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder, ddGridSetup_, ir_);
2997 /* Generate the group communicator, also decides the duty of each rank */
2998 cartSetup_ = makeGroupCommunicators(
2999 mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
/*! \brief Builds and returns the DD manager from the state gathered in the
 * constructor: allocates gmx_domdec_t, applies limits, sets up group
 * communication and (on PP ranks) the grid parameters and neighbor relations.
 *
 * NOTE(review): the trailing `return dd;` appears to have been elided by
 * extraction — confirm against the full source.
 */
3002 gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets,
3003 const gmx_localtop_t& localTopology,
3004 const t_state& localState,
3005 ObservablesReducerBuilder* observablesReducerBuilder)
3007 gmx_domdec_t* dd = new gmx_domdec_t(ir_);
3009 copy_ivec(ddCellIndex_, dd->ci);
3011 dd->comm = init_dd_comm();
/* Hand over the setup computed in the Impl constructor */
3013 dd->comm->ddRankSetup = ddRankSetup_;
3014 dd->comm->cartesianRankSetup = cartSetup_;
3016 set_dd_limits(mdlog_,
3017 MASTER(cr_) ? DDRole::Master : DDRole::Agent,
3023 ddRankSetup_.numPPRanks,
3028 setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
/* Grid parameters and neighbor relations only matter on PP ranks */
3030 if (thisRankHasDuty(cr_, DUTY_PP))
3032 set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
3034 setup_neighbor_relations(dd);
3037 /* Set overallocation to avoid frequent reallocation of arrays */
3038 set_over_alloc_dd(true);
3040 dd->atomSets = atomSets;
3042 dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>(
3043 mdlog_, cr_, mtop_, localTopology, localState, dd->comm->systemInfo.useUpdateGroups, observablesReducerBuilder);
/*! \brief Public constructor: forwards all arguments to the pimpl Impl,
 * which performs the actual DD setup work.
 *
 * NOTE(review): some forwarded-argument lines appear elided by extraction;
 * code lines below are unchanged.
 */
3048 DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger& mdlog,
3050 const DomdecOptions& options,
3051 const MdrunOptions& mdrunOptions,
3052 const gmx_mtop_t& mtop,
3053 const t_inputrec& ir,
3054 const MDModulesNotifiers& notifiers,
3056 ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
3057 const bool useUpdateGroups,
3058 const real maxUpdateGroupRadius,
3059 ArrayRef<const RVec> xGlobal,
3060 const bool useGpuForNonbonded,
3061 const bool useGpuForPme,
3062 const bool directGpuCommUsedWithGpuUpdate) :
3063 impl_(new Impl(mdlog,
3071 updateGroupingPerMoleculeType,
3073 maxUpdateGroupRadius,
3077 directGpuCommUsedWithGpuUpdate))
3081 gmx_domdec_t* DomainDecompositionBuilder::build(LocalAtomSetManager* atomSets,
3082 const gmx_localtop_t& localTopology,
3083 const t_state& localState,
3084 ObservablesReducerBuilder* observablesReducerBuilder)
3086 return impl_->build(atomSets, localTopology, localState, observablesReducerBuilder);
//! Out-of-line destructor required by the pimpl idiom (Impl is incomplete in the header).
3089 DomainDecompositionBuilder::~DomainDecompositionBuilder() = default;
/*! \brief Returns whether the DD grid on all ranks can support
 * \p cutoffRequested.
 *
 * Checks, per DD dimension, that the required number of communication
 * pulses does not exceed what DLB allows and that current cell sizes can
 * cover the cut-off; the per-rank verdict is then combined over all ranks
 * with a global sum.
 *
 * NOTE(review): braces and early-return lines appear elided by extraction;
 * code lines below are unchanged.
 */
3093 static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3096 int LocallyLimited = 0;
3098 const auto* dd = cr->dd;
3100 set_ddbox(*dd, false, box, true, x, &ddbox);
3104 for (int d = 0; d < dd->ndim; d++)
3106 const int dim = dd->dim[d];
3108 real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
3109 if (dd->unitCellInfo.ddBoxIsDynamic)
/* With pressure coupling cells can shrink, so apply an extra margin */
3111 inv_cell_size *= DD_PRES_SCALE_MARGIN;
3114 const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
3116 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
3118 if (np > dd->comm->cd[d].np_dlb)
3123 /* If a current local cell size is smaller than the requested
3124 * cut-off, we could still fix it, but this gets very complicated.
3125 * Without fixing here, we might actually need more checks.
3127 real cellSizeAlongDim =
3128 (dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim]) * ddbox.skew_fac[dim];
3129 if (cellSizeAlongDim * dd->comm->cd[d].np_dlb < cutoffRequested)
3136 if (!isDlbDisabled(dd->comm))
3138 /* If DLB is not active yet, we don't need to check the grid jumps.
3139 * Actually we shouldn't, because then the grid jump data is not set.
3141 if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
/* Combine the local verdicts across all ranks */
3146 gmx_sumi(1, &LocallyLimited, cr);
3148 if (LocallyLimited > 0)
3157 bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
3159 bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
3163 cr->dd->comm->systemInfo.cutoff = cutoffRequested;
3166 return bCutoffAllowed;
/*! \brief Creates the GpuHaloExchange objects for every DD dimension/pulse
 * that does not have one yet, using the device streams from
 * \p deviceStreamManager.
 *
 * NOTE(review): the first assert message lacks a space between "using" and
 * "GPU" ("usingGPU halo exchange." at runtime); the second message has it.
 * Left unchanged here since a doc-only edit may not alter string literals.
 * NOTE(review): some make_unique arguments and braces appear elided by
 * extraction; code lines below are unchanged.
 */
3169 void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
3170 const t_commrec& cr,
3171 const gmx::DeviceStreamManager& deviceStreamManager,
3172 gmx_wallcycle* wcycle)
3174 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
3175 "Local non-bonded stream should be valid when using"
3176 "GPU halo exchange.");
3177 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
3178 "Non-local non-bonded stream should be valid when using "
3179 "GPU halo exchange.");
/* Only announce the feature the first time any exchange object is made */
3181 if (cr.dd->gpuHaloExchange[0].empty())
3183 GMX_LOG(mdlog.warning)
3185 .appendTextFormatted(
3186 "NOTE: Activating the 'GPU halo exchange' feature, enabled "
3188 "GMX_GPU_DD_COMMS environment variable.");
/* One GpuHaloExchange object per (dimension, pulse); only pulses that
 * are missing are appended, so repeated calls are incremental. */
3191 for (int d = 0; d < cr.dd->ndim; d++)
3193 for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3195 cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
3198 cr.mpi_comm_mygroup,
3199 deviceStreamManager.context(),
3200 deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
3201 deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
3208 void reinitGpuHaloExchange(const t_commrec& cr,
3209 const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
3210 const DeviceBuffer<gmx::RVec> d_forcesBuffer)
3212 for (int d = 0; d < cr.dd->ndim; d++)
3214 for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3216 cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
3221 void communicateGpuHaloCoordinates(const t_commrec& cr,
3223 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
3225 for (int d = 0; d < cr.dd->ndim; d++)
3227 for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
3229 cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
3234 void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
3236 for (int d = cr.dd->ndim - 1; d >= 0; d--)
3238 for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
3240 cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
3245 void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
3247 std::array<int, 5> buf;
3251 buf[0] = state_global->flags;
3252 buf[1] = state_global->ngtc;
3253 buf[2] = state_global->nnhpres;
3254 buf[3] = state_global->nhchainlength;
3255 buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
3257 dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
3259 init_gtc_state(state_local, buf[1], buf[2], buf[3]);
3260 init_dfhist_state(state_local, buf[4]);
3261 state_local->flags = buf[0];
3264 void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t& dd,
3265 const gmx_mtop_t& mtop,
3267 gmx::ArrayRef<gmx::RVec> positions)
3270 for (const gmx_molblock_t& molblock : mtop.molblock)
3272 const auto& updateGrouping = dd.comm->systemInfo.updateGroupingsPerMoleculeType[molblock.type];
3274 for (int mol = 0; mol < molblock.nmol; mol++)
3276 for (int g = 0; g < updateGrouping.numBlocks(); g++)
3278 const auto& block = updateGrouping.block(g);
3279 const int atomBegin = atomOffset + block.begin();
3280 const int atomEnd = atomOffset + block.end();
3281 for (int a = atomBegin + 1; a < atomEnd; a++)
3283 // Make sure that atoms in the same update group
3284 // are in the same periodic image after restarts.
3285 for (int d = DIM - 1; d >= 0; d--)
3287 while (positions[a][d] - positions[atomBegin][d] > 0.5_real * box[d][d])
3289 positions[a] -= box[d];
3291 while (positions[a][d] - positions[atomBegin][d] < -0.5_real * box[d][d])
3293 positions[a] += box[d];
3298 atomOffset += updateGrouping.fullRange().end();