2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
/*! \internal \file
 *
 * \brief This file defines functions for mdrun to call to make a new
 * domain decomposition, and check it.
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_domdec
 */
46 #include "gromacs/utility/arrayref.h"
47 #include "partition.h"
56 #include "gromacs/domdec/collect.h"
57 #include "gromacs/domdec/dlb.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/domdec/localtopology.h"
63 #include "gromacs/domdec/localtopologychecker.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/nsgrid.h"
66 #include "gromacs/ewald/pme_pp.h"
67 #include "gromacs/gmxlib/network.h"
68 #include "gromacs/gmxlib/nrnb.h"
69 #include "gromacs/imd/imd.h"
70 #include "gromacs/math/functions.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/mdlib/forcerec.h"
73 #include "gromacs/mdlib/gmx_omp_nthreads.h"
74 #include "gromacs/mdlib/mdatoms.h"
75 #include "gromacs/mdlib/vsite.h"
76 #include "gromacs/mdtypes/commrec.h"
77 #include "gromacs/mdtypes/forcerec.h"
78 #include "gromacs/mdtypes/inputrec.h"
79 #include "gromacs/mdtypes/md_enums.h"
80 #include "gromacs/mdtypes/mdatom.h"
81 #include "gromacs/mdtypes/nblist.h"
82 #include "gromacs/mdtypes/state.h"
83 #include "gromacs/nbnxm/nbnxm.h"
84 #include "gromacs/pulling/pull.h"
85 #include "gromacs/timing/wallcycle.h"
86 #include "gromacs/topology/mtop_util.h"
87 #include "gromacs/topology/topology.h"
88 #include "gromacs/utility/cstringutil.h"
89 #include "gromacs/utility/fatalerror.h"
90 #include "gromacs/utility/logger.h"
91 #include "gromacs/utility/real.h"
92 #include "gromacs/utility/smalloc.h"
93 #include "gromacs/utility/strconvert.h"
94 #include "gromacs/utility/stringstream.h"
95 #include "gromacs/utility/stringutil.h"
96 #include "gromacs/utility/textwriter.h"
99 #include "cellsizes.h"
100 #include "distribute.h"
101 #include "domdec_constraints.h"
102 #include "domdec_internal.h"
103 #include "domdec_vsite.h"
105 #include "redistribute.h"
/*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
 *
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
113 #define DD_PERF_LOSS_DLB_ON 0.02
115 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
116 #define DD_PERF_LOSS_WARN 0.05
119 //! Debug helper printing a DD zone
/* Writes one line to fp with the zone's boundary fields (min0/max1/mch0/mch1/p1_0/p1_1)
 * for the zone identified by decomposition-dimension index d and zone indices i, j.
 * NOTE(review): only the signature and part of the format string are visible here;
 * the remainder of the body is not shown in this listing. */
120 static void print_ddzone(FILE* fp, int d, int i, int j, gmx_ddzone_t* zone)
123 "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 "
136 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
137  * takes the extremes over all home and remote zones in the halo
138  * and returns the results in cell_ns_x0 and cell_ns_x1.
139  * Note: only used with the group cut-off scheme.
141 static void dd_move_cellx(gmx_domdec_t* dd, const gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1)
/* Send/receive/extreme buffers for the zone data; 5 zones are enough here */
143 constexpr int c_ddZoneCommMaxNumZones = 5;
144 gmx_ddzone_t buf_s[c_ddZoneCommMaxNumZones];
145 gmx_ddzone_t buf_r[c_ddZoneCommMaxNumZones];
146 gmx_ddzone_t buf_e[c_ddZoneCommMaxNumZones];
147 gmx_domdec_comm_t* comm = dd->comm;
/* Seed the per-dimension zone structs with the home-zone boundaries */
151 for (int d = 1; d < dd->ndim; d++)
153 int dim = dd->dim[d];
154 gmx_ddzone_t& zp = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
156 /* Copy the base sizes of the home zone */
157 zp.min0 = cell_ns_x0[dim];
158 zp.max1 = cell_ns_x1[dim];
159 zp.min1 = cell_ns_x1[dim];
160 zp.mch0 = cell_ns_x0[dim];
161 zp.mch1 = cell_ns_x1[dim];
162 zp.p1_0 = cell_ns_x0[dim];
163 zp.p1_1 = cell_ns_x1[dim];
167 gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
169 /* Loop backward over the dimensions and aggregate the extremes
172 for (int d = dd->ndim - 2; d >= 0; d--)
174 const int dim = dd->dim[d];
175 const bool applyPbc = (dim < ddbox->npbcdim);
177 /* Use an rvec to store two reals */
178 extr_s[d][0] = cellsizes[d + 1].fracLower;
179 extr_s[d][1] = cellsizes[d + 1].fracUpper;
180 extr_s[d][2] = cellsizes[d + 1].fracUpper;
183 GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
184 /* Store the extremes in the backward sending buffer,
185  * so they get updated separately from the forward communication.
187 for (int d1 = d; d1 < dd->ndim - 1; d1++)
189 gmx_ddzone_t& buf = buf_s[pos];
191 /* We invert the order to be able to use the same loop for buf_e */
192 buf.min0 = extr_s[d1][1];
193 buf.max1 = extr_s[d1][0];
194 buf.min1 = extr_s[d1][2];
197 /* Store the cell corner of the dimension we communicate along */
198 buf.p1_0 = comm->cell_x0[dim];
204 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
207 if (dd->ndim == 3 && d == 0)
209 buf_s[pos] = comm->zone_d2[0][1];
211 buf_s[pos] = comm->zone_d1[0];
215 /* We only need to communicate the extremes
216  * in the forward direction
218 int numPulses = comm->cd[d].numPulses();
222 /* Take the minimum to avoid double communication */
223 numPulsesMin = std::min(numPulses, dd->numCells[dim] - 1 - numPulses);
227 /* Without PBC we should really not communicate over
228  * the boundaries, but implementing that complicates
229  * the communication setup and therefore we simply
230  * do all communication, but ignore some data.
232 numPulsesMin = numPulses;
/* Forward pulses: only the cell-fraction extremes are exchanged */
234 for (int pulse = 0; pulse < numPulsesMin; pulse++)
236 /* Communicate the extremes forward */
237 bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
239 int numElements = dd->ndim - d - 1;
240 ddSendrecv(dd, d, dddirForward, extr_s + d, numElements, extr_r + d, numElements);
242 if (receiveValidData)
244 for (int d1 = d; d1 < dd->ndim - 1; d1++)
/* Merge received extremes: max of lower fractions, min of upper fractions */
246 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
247 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
248 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
/* Backward pulses: the full zone structs are exchanged as raw reals */
253 const int numElementsInBuffer = pos;
254 for (int pulse = 0; pulse < numPulses; pulse++)
256 /* Communicate all the zone information backward */
257 bool receiveValidData = (applyPbc || dd->ci[dim] < dd->numCells[dim] - 1);
260 sizeof(gmx_ddzone_t) == c_ddzoneNumReals * sizeof(real),
261 "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
263 int numReals = numElementsInBuffer * c_ddzoneNumReals;
267 gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
268 gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
273 for (int d1 = d + 1; d1 < dd->ndim; d1++)
275 /* Determine the decrease of maximum required
276  * communication height along d1 due to the distance along d,
277  * this avoids a lot of useless atom communication.
279 real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
282 if (ddbox->tric_dir[dim])
284 /* c is the off-diagonal coupling between the cell planes
285  * along directions d and d1.
287 c = ddbox->v[dim][dd->dim[d1]][dim];
293 real det = (1 + c * c) * gmx::square(comm->systemInfo.cutoff) - dist_d * dist_d;
296 dh[d1] = comm->systemInfo.cutoff - (c * dist_d + std::sqrt(det)) / (1 + c * c);
300 /* A negative value signals out of range */
306 /* Accumulate the extremes over all pulses */
307 for (int i = 0; i < numElementsInBuffer; i++)
315 if (receiveValidData)
317 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
318 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
319 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
323 if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
331 if (receiveValidData && dh[d1] >= 0)
/* Reduce the required communication height by the distance-dependent decrease dh */
333 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0 - dh[d1]);
334 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1 - dh[d1]);
337 /* Copy the received buffer to the send buffer,
338  * to pass the data through with the next pulse.
342 if (((applyPbc || dd->ci[dim] + numPulses < dd->numCells[dim]) && pulse == numPulses - 1)
343 || (!applyPbc && dd->ci[dim] + 1 + pulse == dd->numCells[dim] - 1))
345 /* Store the extremes */
348 for (int d1 = d; d1 < dd->ndim - 1; d1++)
350 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
351 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
352 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
356 if (d == 1 || (d == 0 && dd->ndim == 3))
358 for (int i = d; i < 2; i++)
360 comm->zone_d2[1 - d][i] = buf_e[pos];
366 comm->zone_d1[1] = buf_e[pos];
/* Mark zones whose data was not (validly) received as unset */
372 if (d == 1 || (d == 0 && dd->ndim == 3))
374 for (int i = d; i < 2; i++)
376 comm->zone_d2[1 - d][i].dataSet = 0;
381 comm->zone_d1[1].dataSet = 0;
/* Fold the received remote-zone extents into the search-grid bounds along dim[1] */
389 int dim = dd->dim[1];
390 for (int i = 0; i < 2; i++)
392 if (comm->zone_d1[i].dataSet != 0)
396 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
398 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
399 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
/* Same for the (up to four) zones of the third decomposition dimension */
405 int dim = dd->dim[2];
406 for (int i = 0; i < 2; i++)
408 for (int j = 0; j < 2; j++)
410 if (comm->zone_d2[i][j].dataSet != 0)
414 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
416 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
417 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
/* Publish the aggregated cell-fraction extremes for the DLB cell-size limits */
422 for (int d = 1; d < dd->ndim; d++)
424 cellsizes[d].fracLowerMax = extr_s[d - 1][0];
425 cellsizes[d].fracUpperMin = extr_s[d - 1][1];
429 "Cell fraction d %d, max0 %f, min1 %f\n",
431 cellsizes[d].fracLowerMax,
432 cellsizes[d].fracUpperMin);
437 //! Sets the charge-group zones to be equal to the home zone.
438 static void set_zones_numHomeAtoms(gmx_domdec_t* dd)
440 gmx_domdec_zones_t* zones;
443 zones = &dd->comm->zones;
445 zones->cg_range[0] = 0;
446 for (i = 1; i < zones->n + 1; i++)
448 zones->cg_range[i] = dd->numHomeAtoms;
450 /* zone_ncg1[0] should always be equal to numHomeAtoms */
451 dd->comm->zone_ncg1[0] = dd->numHomeAtoms;
454 //! Restore atom groups for the charge groups.
455 static void restoreAtomGroups(gmx_domdec_t* dd, const t_state* state)
457 gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
459 std::vector<int>& globalAtomGroupIndices = dd->globalAtomGroupIndices;
461 globalAtomGroupIndices.resize(atomGroupsState.size());
463 /* Copy back the global charge group indices from state
464 * and rebuild the local charge group to atom index.
466 for (gmx::index i = 0; i < atomGroupsState.ssize(); i++)
468 globalAtomGroupIndices[i] = atomGroupsState[i];
471 dd->numHomeAtoms = atomGroupsState.size();
472 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGroupsState.ssize());
474 set_zones_numHomeAtoms(dd);
477 //! Sets the atom info structures.
478 static void dd_set_atominfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1, t_forcerec* fr)
482 gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock =
483 fr->atomInfoForEachMoleculeBlock;
484 gmx::ArrayRef<int64_t> atomInfo = fr->atomInfo;
486 for (int cg = cg0; cg < cg1; cg++)
488 atomInfo[cg] = ddGetAtomInfo(atomInfoForEachMoleculeBlock, index_gl[cg]);
493 //! Makes the mappings between global and local atom indices during DD repartioning.
494 static void make_dd_indices(gmx_domdec_t* dd, const int atomStart)
496 const int numZones = dd->comm->zones.n;
497 gmx::ArrayRef<const int> zone2cg = dd->comm->zones.cg_range;
498 gmx::ArrayRef<const int> zone_ncg1 = dd->comm->zone_ncg1;
499 gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
501 std::vector<int>& globalAtomIndices = dd->globalAtomIndices;
502 gmx_ga2la_t& ga2la = *dd->ga2la;
/* Sanity check: the end of zone 0 must match the home atom count */
504 if (zone2cg[1] != dd->numHomeAtoms)
506 gmx_incons("dd->ncg_zone is not up to date");
509 /* Make the local to global and global to local atom index */
511 globalAtomIndices.resize(a)
512 for (int zone = 0; zone < numZones; zone++)
/* cg1 is the end of this zone; cg1_p1 is the end of its first-pulse part */
523 int cg1 = zone2cg[zone + 1];
524 int cg1_p1 = cg0 + zone_ncg1[zone];
526 for (int cg = cg0; cg < cg1; cg++)
531 /* Signal that this cg is from more than one pulse away */
/* Record the local->global mapping and insert the inverse into the ga2la table */
534 int cg_gl = globalAtomGroupIndices[cg];
535 globalAtomIndices.push_back(cg_gl);
536 ga2la.insert(cg_gl, { a, zone1 });
542 //! Checks whether global and local atom indices are consistent.
543 static void check_index_consistency(const gmx_domdec_t* dd, int natoms_sys, const char* where)
547 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
/* Expensive duplicate check, only at high DD debug levels: every global atom
 * may appear at most once among the local zone atoms */
549 if (dd->comm->ddSettings.DD_debug > 1)
551 std::vector<int> have(natoms_sys);
552 for (int a = 0; a < numAtomsInZones; a++)
554 int globalAtomIndex = dd->globalAtomIndices[a];
555 if (have[globalAtomIndex] > 0)
558 "DD rank %d: global atom %d occurs twice: index %d and %d\n",
561 have[globalAtomIndex],
/* Store local index + 1 so 0 means "not seen yet" */
566 have[globalAtomIndex] = a + 1;
/* Check that every ga2la entry points at a valid local atom that maps back
 * to the same global index */
571 std::vector<int> have(numAtomsInZones);
574 for (int i = 0; i < natoms_sys; i++)
576 if (const auto* entry = dd->ga2la->find(i))
578 const int a = entry->la;
579 if (a >= numAtomsInZones)
582 "DD rank %d: global atom %d marked as local atom %d, which is larger than "
593 if (dd->globalAtomIndices[a] != i)
596 "DD rank %d: global atom %d marked as local atom %d, which has global "
601 dd->globalAtomIndices[a] + 1);
/* The number of ga2la entries must equal the number of local zone atoms */
608 if (ngl != numAtomsInZones)
610 fprintf(stderr, "DD rank %d, %s: %d global atom indices, %d local atoms\n", dd->rank, where, ngl, numAtomsInZones);
612 for (int a = 0; a < numAtomsInZones; a++)
617 "DD rank %d, %s: local atom %d, global %d has no global index\n",
621 dd->globalAtomIndices[a] + 1);
/* Abort the run when any inconsistency was found */
627 gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies", dd->rank, where, nerr);
631 //! Clear all DD global state indices
/* \param[in,out] dd                    The domain decomposition struct
 * \param[in]     keepLocalAtomIndices  When true, only erase the ga2la entries
 *                                      of the current local atoms instead of
 *                                      clearing the whole table */
632 static void clearDDStateIndices(gmx_domdec_t* dd, const bool keepLocalAtomIndices)
634 gmx_ga2la_t& ga2la = *dd->ga2la;
636 if (!keepLocalAtomIndices)
638 /* Clear the whole list without the overhead of searching */
/* Otherwise erase only the entries for atoms currently in the zones */
643 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
644 for (int i = 0; i < numAtomsInZones; i++)
646 ga2la.erase(dd->globalAtomIndices[i]);
/* Also reset the local virtual-site and constraint index bookkeeping */
650 dd_clear_local_vsite_indices(dd);
654 dd_clear_local_constraint_indices(dd);
658 //! Return the duration of force calculations on this rank.
659 static float dd_force_load(gmx_domdec_comm_t* comm)
/* With flop counting enabled, the load is flop based instead of cycle based */
663 if (comm->ddSettings.eFlop)
666 if (comm->ddSettings.eFlop > 1)
/* eFlop > 1 adds up to +-5% random noise to the load, for DLB debugging */
668 load *= 1.0 + (comm->ddSettings.eFlop - 1) * (0.1 * rand() / RAND_MAX - 0.05);
/* Default: use the measured force-calculation cycle count */
673 load = comm->cycl[ddCyclF];
674 if (comm->cycl_n[ddCyclF] > 1)
676 /* Subtract the maximum of the last n cycle counts
677  * to get rid of possible high counts due to other sources,
678  * for instance system activity, that would otherwise
679  * affect the dynamic load balancing.
681 load -= comm->cycl_max[ddCyclF];
/* Correct for GPU wait time when several ranks share a GPU */
685 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
687 float gpu_wait, gpu_wait_sum;
689 gpu_wait = comm->cycl[ddCyclWaitGPU];
690 if (comm->cycl_n[ddCyclF] > 1)
692 /* We should remove the WaitGPU time of the same MD step
693  * as the one with the maximum F time, since the F time
694  * and the wait time are not independent.
695  * Furthermore, the step for the max F time should be chosen
696  * the same on all ranks that share the same GPU.
697  * But to keep the code simple, we remove the average instead.
698  * The main reason for artificially long times at some steps
699  * is spurious CPU activity or MPI time, so we don't expect
700  * that changes in the GPU wait time matter a lot here.
702 gpu_wait *= (comm->cycl_n[ddCyclF] - 1) / static_cast<float>(comm->cycl_n[ddCyclF]);
704 /* Sum the wait times over the ranks that share the same GPU */
705 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM, comm->mpi_comm_gpu_shared);
706 /* Replace the wait time by the average over the ranks */
707 load += -gpu_wait + gpu_wait_sum / comm->nrank_gpu_shared;
715 //! Runs cell size checks and communicates the boundaries.
/* Fatal-errors out when any domain became smaller than the minimum allowed
 * cell size; otherwise gathers the zone boundaries needed for pair search. */
716 static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cell_ns_x0, rvec cell_ns_x1, int64_t step)
718 gmx_domdec_comm_t* comm;
723 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
725 dim = dd->dim[dim_ind];
727 /* Without PBC we don't have restrictions on the outer cells */
728 if (!(dim >= ddbox->npbcdim && (dd->ci[dim] == 0 || dd->ci[dim] == dd->numCells[dim] - 1))
/* The effective (skew-corrected) cell size must stay above the minimum */
730 && (comm->cell_x1[dim] - comm->cell_x0[dim]) * ddbox->skew_fac[dim] < comm->cellsize_min[dim])
734 "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller "
735 "than the smallest allowed cell size (%f) for domain decomposition grid cell "
737 gmx_step_str(step, buf),
739 comm->cell_x1[dim] - comm->cell_x0[dim],
740 ddbox->skew_fac[dim],
741 dd->comm->cellsize_min[dim],
/* Boundary communication is needed with DLB or with unbounded dimensions */
748 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
750 /* Communicate the boundaries and update cell_ns_x0/1 */
751 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
752 if (isDlbOn(dd->comm) && dd->ndim > 1)
754 gmx::check_grid_jump(step, dd, dd->comm->systemInfo.cutoff, ddbox, TRUE);
759 //! Compute and communicate to determine the load distribution across PP ranks.
760 static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
762 gmx_domdec_comm_t* comm;
764 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
769 fprintf(debug, "get_load_distribution start\n");
772 wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
/* bSepPME: this run uses separate PME-only ranks */
776 bSepPME = (dd->pme_nodeid >= 0);
778 if (dd->ndim == 0 && bSepPME)
780 /* Without decomposition, but with PME nodes, we need the load */
781 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
782 comm->load[0].pme = comm->cycl[ddCyclPME];
785 // Either we have DLB off, or we have it on and the array is large enough
786 GMX_ASSERT(!isDlbOn(dd->comm) || static_cast<int>(dd->comm->cellsizesWithDlb.size()) == dd->ndim,
787 "DLB cell sizes data not set up properly ")
/* Gather loads hierarchically: innermost dimension first, reducing along rows */
788 for (int d = dd->ndim - 1; d >= 0; d--)
790 const int dim = dd->dim[d];
791 /* Check if we participate in the communication in this dimension */
792 if (d == dd->ndim - 1 || (dd->ci[dd->dim[d + 1]] == 0 && dd->ci[dd->dim[dd->ndim - 1]] == 0))
794 load = &comm->load[d];
795 if (isDlbOn(dd->comm))
/* Fraction of the row this cell occupies, used to scale volumes below */
797 cell_frac = comm->cellsizesWithDlb[d].fracUpper - comm->cellsizesWithDlb[d].fracLower;
/* Pack this rank's contribution into the send buffer sbuf */
800 if (d == dd->ndim - 1)
802 sbuf[pos++] = dd_force_load(comm);
803 sbuf[pos++] = sbuf[0];
804 if (isDlbOn(dd->comm))
806 sbuf[pos++] = sbuf[0];
807 sbuf[pos++] = cell_frac;
810 sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
811 sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
816 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
817 sbuf[pos++] = comm->cycl[ddCyclPME];
/* Inner dimensions forward their already-reduced row loads */
822 sbuf[pos++] = comm->load[d + 1].sum;
823 sbuf[pos++] = comm->load[d + 1].max;
824 if (isDlbOn(dd->comm))
826 sbuf[pos++] = comm->load[d + 1].sum_m;
827 sbuf[pos++] = comm->load[d + 1].cvol_min * cell_frac;
828 sbuf[pos++] = comm->load[d + 1].flags;
831 sbuf[pos++] = comm->cellsizesWithDlb[d].fracLowerMax;
832 sbuf[pos++] = comm->cellsizesWithDlb[d].fracUpperMin;
837 sbuf[pos++] = comm->load[d + 1].mdf;
838 sbuf[pos++] = comm->load[d + 1].pme;
842 /* Communicate a row in DD direction d.
843  * The communicators are setup such that the root always has rank 0.
847 load->nload * sizeof(float),
850 load->nload * sizeof(float),
853 comm->mpi_comm_load[d]);
855 if (dd->ci[dim] == dd->master_ci[dim])
857 /* We are the master along this row, process this row */
858 RowMaster* rowMaster = nullptr;
862 rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
/* Reduce the gathered per-cell loads of this row */
872 for (int i = 0; i < dd->numCells[dim]; i++)
874 load->sum += load->load[pos++];
875 load->max = std::max(load->max, load->load[pos]);
877 if (isDlbOn(dd->comm))
879 if (rowMaster->dlbIsLimited)
881 /* This direction could not be load balanced properly,
882  * therefore we need to use the maximum iso the average load.
884 load->sum_m = std::max(load->sum_m, load->load[pos]);
888 load->sum_m += load->load[pos];
891 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
893 if (d < dd->ndim - 1)
895 load->flags = gmx::roundToInt(load->load[pos++]);
899 rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
900 rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
905 load->mdf = std::max(load->mdf, load->load[pos]);
907 load->pme = std::max(load->pme, load->load[pos]);
/* When balancing was limited, flag this dimension in the load flags */
911 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
913 load->sum_m *= dd->numCells[dim];
914 load->flags |= (1 << d);
/* Accumulate run totals (used for the final load report) */
922 comm->nload += dd_load_count(comm);
923 comm->load_step += comm->cycl[ddCyclStep];
924 comm->load_sum += comm->load[0].sum;
925 comm->load_max += comm->load[0].max;
928 for (int d = 0; d < dd->ndim; d++)
930 if (comm->load[0].flags & (1 << d))
938 comm->load_mdf += comm->load[0].mdf;
939 comm->load_pme += comm->load[0].pme;
943 wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
947 fprintf(debug, "get_load_distribution finished\n");
951 /*! \brief Return the relative performance loss on the total run time
952 * due to the force calculation load imbalance. */
953 static float dd_force_load_fraction(gmx_domdec_t* dd)
955 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
957 return dd->comm->load_sum / (dd->comm->load_step * dd->nnodes);
965 /*! \brief Return the relative performance loss on the total run time
966 * due to the force calculation load imbalance. */
967 static float dd_force_imb_perf_loss(gmx_domdec_t* dd)
969 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
971 return (dd->comm->load_max * dd->nnodes - dd->comm->load_sum) / (dd->comm->load_step * dd->nnodes);
979 //! Print load-balance report e.g. at the end of a run.
980 static void print_dd_load_av(FILE* fplog, gmx_domdec_t* dd)
982 gmx_domdec_comm_t* comm = dd->comm;
984 /* Only the master rank prints loads and only if we measured loads */
985 if (!DDMASTER(dd) || comm->nload == 0)
991 int numPpRanks = dd->nnodes;
992 int numPmeRanks = (comm->ddRankSetup.usePmeOnlyRanks ? comm->ddRankSetup.numRanksDoingPme : 0);
993 int numRanks = numPpRanks + numPmeRanks;
994 float lossFraction = 0;
996 /* Print the average load imbalance and performance loss */
997 if (dd->nnodes > 1 && comm->load_sum > 0)
/* imbalance = (max load) / (average load) - 1 over the whole run */
999 float imbalance = comm->load_max * numPpRanks / comm->load_sum - 1;
1000 lossFraction = dd_force_imb_perf_loss(dd);
1002 std::string msg = "\nDynamic load balancing report:\n";
1003 std::string dlbStateStr;
/* Translate the final DLB state into a user-facing sentence */
1005 switch (dd->comm->dlbState)
1007 case DlbState::offUser:
1008 dlbStateStr = "DLB was off during the run per user request.";
1010 case DlbState::offForever:
1011 /* Currectly this can happen due to performance loss observed, cell size
1012  * limitations or incompatibility with other settings observed during
1013  * determineInitialDlbState(). */
1014 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1016 case DlbState::offCanTurnOn:
1017 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1019 case DlbState::offTemporarilyLocked:
1021 "DLB was locked at the end of the run due to unfinished PP-PME "
1024 case DlbState::onCanTurnOff:
1025 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1027 case DlbState::onUser:
1028 dlbStateStr = "DLB was permanently on during the run per user request.";
1030 default: GMX_ASSERT(false, "Undocumented DLB state");
1033 msg += " " + dlbStateStr + "\n";
1034 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance * 100);
1035 msg += gmx::formatString(
1036 " The balanceable part of the MD step is %d%%, load imbalance is computed from "
1038 gmx::roundToInt(dd_force_load_fraction(dd) * 100));
1039 msg += gmx::formatString(
1040 " Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1041 lossFraction * 100);
1042 fprintf(fplog, "%s", msg.c_str());
1043 fprintf(stderr, "\n%s", msg.c_str());
1046 /* Print during what percentage of steps the load balancing was limited */
1047 bool dlbWasLimited = false;
1050 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1051 for (int d = 0; d < dd->ndim; d++)
/* Rounded percentage of steps where dimension d hit its size limit */
1053 int limitPercentage = (200 * comm->load_lim[d] + 1) / (2 * comm->nload);
1054 sprintf(buf + strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limitPercentage);
1055 if (limitPercentage >= 50)
1057 dlbWasLimited = true;
1060 sprintf(buf + strlen(buf), "\n");
1061 fprintf(fplog, "%s", buf);
1062 fprintf(stderr, "%s", buf);
1065 /* Print the performance loss due to separate PME - PP rank imbalance */
1066 float lossFractionPme = 0;
1067 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1069 float pmeForceRatio = comm->load_pme / comm->load_mdf;
1070 lossFractionPme = (comm->load_pme - comm->load_mdf) / comm->load_step;
/* Negative loss: PME ranks are idle; scale by the waiting side's rank share */
1071 if (lossFractionPme <= 0)
1073 lossFractionPme *= numPmeRanks / static_cast<float>(numRanks);
1077 lossFractionPme *= numPpRanks / static_cast<float>(numRanks);
1079 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1080 fprintf(fplog, "%s", buf);
1081 fprintf(stderr, "%s", buf);
1083 " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",
1084 std::fabs(lossFractionPme) * 100);
1085 fprintf(fplog, "%s", buf);
1086 fprintf(stderr, "%s", buf);
1088 fprintf(fplog, "\n");
1089 fprintf(stderr, "\n");
/* Warn when the PP imbalance loss exceeds the warning threshold */
1091 if ((lossFraction >= DD_PERF_LOSS_WARN) && (dd->comm->dlbState != DlbState::offTemporarilyLocked))
1093 std::string message = gmx::formatString(
1094 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1095 "      in the domain decomposition.\n",
1096 lossFraction * 100);
1098 bool hadSuggestion = false;
1099 if (dd->comm->dlbState == DlbState::offUser)
1101 message += "      You might want to allow dynamic load balancing (option -dlb auto.)\n";
1102 hadSuggestion = true;
1104 else if (dd->comm->dlbState == DlbState::offCanTurnOn)
1107 "      Dynamic load balancing was automatically disabled, but it might be "
1108 "beneficial to manually tuning it on (option -dlb on.)\n";
1109 hadSuggestion = true;
1111 else if (dlbWasLimited)
1114 "      You might want to decrease the cell size limit (options -rdd, -rcon "
1116 hadSuggestion = true;
1118 message += gmx::formatString(
1119 "      You can %sconsider manually changing the decomposition (option -dd);\n"
1120 "      e.g. by using fewer domains along the box dimension in which there is\n"
1121 "      considerable inhomogeneity in the simulated system.",
1122 hadSuggestion ? "also " : "");
1124 fprintf(fplog, "%s\n", message.c_str());
1125 fprintf(stderr, "%s\n", message.c_str());
/* Warn about PP/PME imbalance in either direction */
1127 if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1130 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1131 "      had %s work to do than the PP ranks.\n"
1132 "      You might want to %s the number of PME ranks\n"
1133 "      or %s the cut-off and the grid spacing.\n",
1134 std::fabs(lossFractionPme * 100),
1135 (lossFractionPme < 0) ? "less" : "more",
1136 (lossFractionPme < 0) ? "decrease" : "increase",
1137 (lossFractionPme < 0) ? "decrease" : "increase");
1138 fprintf(fplog, "%s\n", buf);
1139 fprintf(stderr, "%s\n", buf);
1143 //! Return the minimum communication volume.
1144 static float dd_vol_min(gmx_domdec_t* dd)
1146 return dd->comm->load[0].cvol_min * dd->nnodes;
1149 //! Return the DD load flags.
1150 static int dd_load_flags(gmx_domdec_t* dd)
1152 return dd->comm->load[0].flags;
1155 //! Return the reported load imbalance in force calculations.
1156 static float dd_f_imbal(gmx_domdec_t* dd)
1158 if (dd->comm->load[0].sum > 0)
1160 return dd->comm->load[0].max * dd->nnodes / dd->comm->load[0].sum - 1.0F;
1164 /* Something is wrong in the cycle counting, report no load imbalance */
1169 //! Returns DD load balance report.
/* Builds the per-step load report string written to the log file. */
1170 static std::string dd_print_load(gmx_domdec_t* dd, int64_t step)
1172 gmx::StringOutputStream stream;
1173 gmx::TextWriter log(&stream);
1175 int flags = dd_load_flags(dd);
/* List the dimensions in which DLB hit the minimum cell size */
1178 log.writeString("DD load balancing is limited by minimum cell size in dimension");
1179 for (int d = 0; d < dd->ndim; d++)
1181 if (flags & (1 << d))
1183 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1186 log.ensureLineBreak();
1188 log.writeString("DD  step " + gmx::toString(step));
1189 if (isDlbOn(dd->comm))
/* '!' marks that load balancing was limited in some dimension */
1191 log.writeStringFormatted("  vol min/aver %5.3f%c", dd_vol_min(dd), flags ? '!' : ' ');
1195 log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd) * 100);
1197 if (dd->comm->cycl_n[ddCyclPME])
1199 log.writeStringFormatted("  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1201 log.ensureLineBreak();
1202 return stream.toString();
1205 //! Prints DD load balance report in mdrun verbose mode.
/* Writes a compact one-line status to stderr: volume ratio (with DLB),
 * force imbalance, and PME/force ratio when PME cycles were counted. */
1206 static void dd_print_load_verbose(gmx_domdec_t* dd)
1208 if (isDlbOn(dd->comm))
/* '!' marks that load balancing was limited in some dimension */
1210 fprintf(stderr, "vol %4.2f%c ", dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1214 fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd) * 100));
1216 if (dd->comm->cycl_n[ddCyclPME])
1218 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1222 //! Turns on dynamic load balancing if possible and needed.
1223 static void turn_on_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1225 gmx_domdec_comm_t* comm = dd->comm;
/* Find the smallest current cell size over all decomposition dimensions */
1227 real cellsize_min = comm->cellsize_min[dd->dim[0]];
1228 for (int d = 1; d < dd->ndim; d++)
1230 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1233 /* Turn off DLB if we're too close to the cell size limit. */
1234 if (cellsize_min < comm->cellsize_limit * 1.05)
1237 .appendTextFormatted(
1238 "step %s Measured %.1f %% performance loss due to load imbalance, "
1239 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1240 "Will no longer try dynamic load balancing.",
1241 gmx::toString(step).c_str(),
1242 dd_force_imb_perf_loss(dd) * 100);
1244 comm->dlbState = DlbState::offForever;
/* Otherwise: enable DLB and log why */
1249 .appendTextFormatted(
1250 "step %s Turning on dynamic load balancing, because the performance loss due "
1251 "to load imbalance is %.1f %%.",
1252 gmx::toString(step).c_str(),
1253 dd_force_imb_perf_loss(dd) * 100);
1254 comm->dlbState = DlbState::onCanTurnOff;
1256 /* Store the non-DLB performance, so we can check if DLB actually
1257  * improves performance.
1259 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0,
1260 "When we turned on DLB, we should have measured cycles");
1261 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
1265 /* We can set the required cell size info here,
1266  * so we do not need to communicate this.
1267  * The grid is completely uniform.
1269 for (int d = 0; d < dd->ndim; d++)
1271 RowMaster* rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1275 comm->load[d].sum_m = comm->load[d].sum;
/* Initialize uniform cell fractions i/nc along this dimension */
1277 int nc = dd->numCells[dd->dim[d]];
1278 for (int i = 0; i < nc; i++)
1280 rowMaster->cellFrac[i] = i / static_cast<real>(nc);
1283 rowMaster->bounds[i].cellFracLowerMax = i / static_cast<real>(nc);
1284 rowMaster->bounds[i].cellFracUpperMin = (i + 1) / static_cast<real>(nc);
1287 rowMaster->cellFrac[nc] = 1.0;
1292 //! Turns off dynamic load balancing (but leave it able to turn back on).
1293 static void turn_off_dlb(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
1297 "step " + gmx::toString(step)
1298 + " Turning off dynamic load balancing, because it is degrading performance.");
1299 dd->comm->dlbState = DlbState::offCanTurnOn;
1300 dd->comm->haveTurnedOffDlb = true;
// Record at which partitioning count DLB was first switched off, so later code
// can compare DLB vs non-DLB performance over a comparable interval.
1301 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1304 //! Turns off dynamic load balancing permanently.
1305 static void turn_off_dlb_forever(const gmx::MDLogger& mdlog, gmx_domdec_t* dd, int64_t step)
// Only valid from the off-can-turn-on state; a direct on->offForever transition
// is handled elsewhere (see turn_on_dlb).
1307 GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn,
1308 "Can only turn off DLB forever when it was in the can-turn-on state");
1311 "step " + gmx::toString(step)
1312 + " Will no longer try dynamic load balancing, as it degraded performance.");
1313 dd->comm->dlbState = DlbState::offForever;
// Sets an upper bound on the cut-off so PME load balancing and DLB staggering
// stay compatible: DLB cell staggering is limited such that this cut-off fits.
1316 void set_dd_dlb_max_cutoff(t_commrec* cr, real cutoff)
1318 gmx_domdec_comm_t* comm;
1320 comm = cr->dd->comm;
1322 /* Turn on the DLB limiting (might have been on already) */
1323 comm->bPMELoadBalDLBLimits = TRUE;
1325 /* Change the cut-off limit */
1326 comm->PMELoadBal_max_cutoff = cutoff;
1331 "PME load balancing set a limit to the DLB staggering such that a %f cut-off will "
1332 "continue to fit\n",
1333 comm->PMELoadBal_max_cutoff);
1337 //! Merge atom buffers.
// Merges the atoms received in the current pulse into the zone-ordered local
// arrays (global indices, coordinates, atom info), shifting data from earlier
// pulses to make room so each zone's atoms stay contiguous.
1338 static void merge_cg_buffers(int ncell,
1339 gmx_domdec_comm_dim_t* cd,
1342 gmx::ArrayRef<int> index_gl,
1344 gmx::ArrayRef<gmx::RVec> x,
1345 gmx::ArrayRef<const gmx::RVec> recv_vr,
1346 gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock,
1347 gmx::ArrayRef<int64_t> atomInfo)
1349 gmx_domdec_ind_t *ind, *ind_p;
1350 int p, cell, c, cg, cg0, cg1, cg_gl;
1353 ind = &cd->ind[pulse];
1355 /* First correct the already stored data */
1356 shift = ind->nrecv[ncell];
// Walk the cells backwards so data can be moved up in place without overwrite.
1357 for (cell = ncell - 1; cell >= 0; cell--)
1359 shift -= ind->nrecv[cell];
1362 /* Move the cg's present from previous grid pulses */
1363 cg0 = ncg_cell[ncell + cell];
1364 cg1 = ncg_cell[ncell + cell + 1];
1365 for (cg = cg1 - 1; cg >= cg0; cg--)
1367 index_gl[cg + shift] = index_gl[cg];
1368 x[cg + shift] = x[cg];
1369 atomInfo[cg + shift] = atomInfo[cg];
1371 /* Correct the already stored send indices for the shift */
1372 for (p = 1; p <= pulse; p++)
1374 ind_p = &cd->ind[p];
1376 for (c = 0; c < cell; c++)
1378 cg0 += ind_p->nsend[c];
1380 cg1 = cg0 + ind_p->nsend[cell];
1381 for (cg = cg0; cg < cg1; cg++)
1383 ind_p->index[cg] += shift;
1389 /* Merge in the communicated buffers */
1392 for (cell = 0; cell < ncell; cell++)
1394 cg1 = ncg_cell[ncell + cell + 1] + shift;
1395 for (cg = 0; cg < ind->nrecv[cell]; cg++)
1397 /* Copy this atom from the buffer */
1398 index_gl[cg1] = recv_i[cg0];
1399 x[cg1] = recv_vr[cg0];
1400 /* Copy information */
// Look up per-atom info from the molecule-block table via the global index.
1401 cg_gl = index_gl[cg1];
1402 atomInfo[cg1] = ddGetAtomInfo(atomInfoForEachMoleculeBlock, cg_gl);
1406 shift += ind->nrecv[cell];
// Update the zone range end to include the newly merged atoms.
1407 ncg_cell[ncell + cell + 1] = cg1;
1411 //! Makes a range partitioning for the atom groups within a cell
1412 static void make_cell2at_index(gmx_domdec_comm_dim_t* cd, int nzone, int atomGroupStart)
1414 /* Store the atom block boundaries for easy copying of communication buffers
// Running start index; each pulse's received atoms for a zone form the
// half-open block [cell2at0[zone], cell2at1[zone]).
1416 int g = atomGroupStart;
1417 for (int zone = 0; zone < nzone; zone++)
1419 for (gmx_domdec_ind_t& ind : cd->ind)
1421 ind.cell2at0[zone] = g;
1422 g += ind.nrecv[zone];
1423 ind.cell2at1[zone] = g;
1428 //! Returns whether a link is missing.
// Returns TRUE when at least one atom linked (bonded) to globalAtomIndex has no
// home entry in the global-to-local lookup, i.e. a linked partner is not local.
1429 static gmx_bool missing_link(const t_blocka& link, const int globalAtomIndex, const gmx_ga2la_t& ga2la)
1431 for (int i = link.index[globalAtomIndex]; i < link.index[globalAtomIndex + 1]; i++)
1433 if (!ga2la.findHome(link.a[i]))
1442 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1445 //! The corners for the non-bonded communication.
1447 //! Corner for rounding.
1449 //! Corners for rounding.
1451 //! Corners for bonded communication.
1453 //! Corner for rounding for bonded communication.
1457 //! Determine the corners of the domain(s) we are communicating with.
1458 static void set_dd_corners(const gmx_domdec_t* dd, int dim0, int dim1, int dim2, gmx_bool bDistMB, dd_corners_t* c)
1460 const gmx_domdec_comm_t* comm;
1461 const gmx_domdec_zones_t* zones;
1465 zones = &comm->zones;
1467 /* Keep the compiler happy */
1471 /* The first dimension is equal for all cells */
1472 c->c[0][0] = comm->cell_x0[dim0];
1475 c->bc[0] = c->c[0][0];
1480 /* This cell row is only seen from the first row */
1481 c->c[1][0] = comm->cell_x0[dim1];
1482 /* All rows can see this row */
1483 c->c[1][1] = comm->cell_x0[dim1];
// With DLB the staggered neighbor rows can extend further; take the maximum.
1484 if (isDlbOn(dd->comm))
1486 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1489 /* For the multi-body distance we need the maximum */
1490 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1493 /* Set the upper-right corner for rounding */
1494 c->cr0 = comm->cell_x1[dim0];
1499 for (int j = 0; j < 4; j++)
1501 c->c[2][j] = comm->cell_x0[dim2];
1503 if (isDlbOn(dd->comm))
1505 /* Use the maximum of the i-cells that see a j-cell */
1506 for (const auto& iZone : zones->iZones)
1508 const int iZoneIndex = iZone.iZoneIndex;
1509 for (int jZone : iZone.jZoneRange)
1513 c->c[2][jZone - 4] = std::max(
1515 comm->zone_d2[zones->shift[iZoneIndex][dim0]][zones->shift[iZoneIndex][dim1]]
1522 /* For the multi-body distance we need the maximum */
1523 c->bc[2] = comm->cell_x0[dim2];
1524 for (int i = 0; i < 2; i++)
1526 for (int j = 0; j < 2; j++)
1528 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1534 /* Set the upper-right corner for rounding */
1535 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1536 * Only cell (0,0,0) can see cell 7 (1,1,1)
1538 c->cr1[0] = comm->cell_x1[dim1];
1539 c->cr1[3] = comm->cell_x1[dim1];
1540 if (isDlbOn(dd->comm))
1542 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1545 /* For the multi-body distance we need the maximum */
1546 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1553 /*! \brief Add the atom groups and coordinates we need to send in this
1554 * pulse from this zone to \p localAtomGroups and \p work. */
1555 static void get_zone_pulse_groups(gmx_domdec_t* dd,
1560 gmx::ArrayRef<const int> globalAtomGroupIndices,
1569 bool distanceIsTriclinic,
1576 const dd_corners_t* c,
1577 const rvec sf2_round,
1578 gmx_bool bDistBonded,
1582 gmx::ArrayRef<const gmx::RVec> coordinates,
1583 gmx::ArrayRef<const int64_t> atomInfo,
1584 std::vector<int>* localAtomGroups,
1585 dd_comm_setup_work_t* work)
1587 gmx_domdec_comm_t* comm;
1589 gmx_bool bDistMB_pulse;
1591 real r2, rb2, r, tric_sh;
// Screw PBC requires mirroring coordinates when communicating across x.
1598 bScrew = (dd->unitCellInfo.haveScrewPBC && dim == XX);
1600 bDistMB_pulse = (bDistMB && bDistBonded);
1602 /* Unpack the work data */
1603 std::vector<int>& ibuf = work->atomGroupBuffer;
1604 std::vector<gmx::RVec>& vbuf = work->positionBuffer;
// Loop over this thread's slice of atom groups and select those within the
// communication distance of the receiving zone.
1608 for (cg = cg0; cg < cg1; cg++)
1612 if (!distanceIsTriclinic)
1614 /* Rectangular direction, easy */
1615 r = coordinates[cg][dim] - c->c[dim_ind][zone];
1622 r = coordinates[cg][dim] - c->bc[dim_ind];
1628 /* Rounding gives at most a 16% reduction
1629 * in communicated atoms
1631 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1633 r = coordinates[cg][dim0] - c->cr0;
1634 /* This is the first dimension, so always r >= 0 */
1641 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1643 r = coordinates[cg][dim1] - c->cr1[zone];
1650 r = coordinates[cg][dim1] - c->bcr1;
1660 /* Triclinic direction, more complicated */
1663 /* Rounding, conservative as the skew_fac multiplication
1664 * will slightly underestimate the distance.
1666 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1668 rn[dim0] = coordinates[cg][dim0] - c->cr0;
// Project out the triclinic box-vector components along dim0.
1669 for (i = dim0 + 1; i < DIM; i++)
1671 rn[dim0] -= coordinates[cg][i] * v_0[i][dim0];
1673 r2 = rn[dim0] * rn[dim0] * sf2_round[dim0];
1676 rb[dim0] = rn[dim0];
1679 /* Take care that the cell planes along dim0 might not
1680 * be orthogonal to those along dim1 and dim2.
1682 for (i = 1; i <= dim_ind; i++)
1685 if (normal[dim0][dimd] > 0)
1687 rn[dimd] -= rn[dim0] * normal[dim0][dimd];
1690 rb[dimd] -= rb[dim0] * normal[dim0][dimd];
1695 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1697 GMX_ASSERT(dim1 >= 0 && dim1 < DIM, "Must have a valid dimension index");
1698 rn[dim1] += coordinates[cg][dim1] - c->cr1[zone];
1700 for (i = dim1 + 1; i < DIM; i++)
1702 tric_sh -= coordinates[cg][i] * v_1[i][dim1];
1704 rn[dim1] += tric_sh;
1707 r2 += rn[dim1] * rn[dim1] * sf2_round[dim1];
1708 /* Take care of coupling of the distances
1709 * to the planes along dim0 and dim1 through dim2.
1711 r2 -= rn[dim0] * rn[dim1] * skew_fac_01;
1712 /* Take care that the cell planes along dim1
1713 * might not be orthogonal to that along dim2.
1715 if (normal[dim1][dim2] > 0)
1717 rn[dim2] -= rn[dim1] * normal[dim1][dim2];
1722 rb[dim1] += coordinates[cg][dim1] - c->bcr1 + tric_sh;
1725 rb2 += rb[dim1] * rb[dim1] * sf2_round[dim1];
1726 /* Take care of coupling of the distances
1727 * to the planes along dim0 and dim1 through dim2.
1729 rb2 -= rb[dim0] * rb[dim1] * skew_fac_01;
1730 /* Take care that the cell planes along dim1
1731 * might not be orthogonal to that along dim2.
1733 if (normal[dim1][dim2] > 0)
1735 rb[dim2] -= rb[dim1] * normal[dim1][dim2];
1740 /* The distance along the communication direction */
1741 rn[dim] += coordinates[cg][dim] - c->c[dim_ind][zone];
1743 for (i = dim + 1; i < DIM; i++)
1745 tric_sh -= coordinates[cg][i] * v_d[i][dim];
1750 r2 += rn[dim] * rn[dim] * skew_fac2_d;
1751 /* Take care of coupling of the distances
1752 * to the planes along dim0 and dim1 through dim2.
1754 if (dim_ind == 1 && zonei == 1)
1756 r2 -= rn[dim0] * rn[dim] * skew_fac_01;
1762 GMX_ASSERT(dim >= 0 && dim < DIM, "Must have a valid dimension index");
1763 rb[dim] += coordinates[cg][dim] - c->bc[dim_ind] + tric_sh;
1766 rb2 += rb[dim] * rb[dim] * skew_fac2_d;
1767 /* Take care of coupling of the distances
1768 * to the planes along dim0 and dim1 through dim2.
1770 if (dim_ind == 1 && zonei == 1)
1772 rb2 -= rb[dim0] * rb[dim] * skew_fac_01;
// Selection: within the non-bonded cut-off, or (when requested) within the
// bonded cut-off, or flagged for bonded-filtered communication with a
// bonded link partner still missing locally.
1779 || (bDistBonded && ((bDistMB && rb2 < r_bcomm2) || (bDist2B && r2 < r_bcomm2))
1781 || ((atomInfo[cg] & gmx::sc_atomInfo_BondCommunication)
1782 && missing_link(*comm->bondedLinks, globalAtomGroupIndices[cg], *dd->ga2la)))))
1784 /* Store the local and global atom group indices and position */
1785 localAtomGroups->push_back(cg);
1786 ibuf.push_back(globalAtomGroupIndices[cg]);
1790 if (dd->ci[dim] == 0)
1792 /* Correct coordinates for pbc */
1793 rvec_add(coordinates[cg], box[dim], posPbc)
// Screw PBC: mirror y and z when shifting over the periodic x boundary.
1796 posPbc[YY] = box[YY][YY] - posPbc[YY];
1797 posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1802 copy_rvec(coordinates[cg], posPbc);
1804 vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1811 work->nsend_zone = nsend_z;
// Resets the per-thread communication setup buffers so a new pulse/zone pass
// starts from empty send lists.
1815 static void clearCommSetupData(dd_comm_setup_work_t* work)
1817 work->localAtomGroupBuffer.clear();
1818 work->atomGroupBuffer.clear();
1819 work->positionBuffer.clear();
1821 work->nsend_zone = 0;
1824 //! Prepare DD communication.
// Builds, for every decomposed dimension and pulse, the lists of atoms to send
// to neighboring domains, communicates the counts, global indices and
// coordinates, and sets up the zone atom ranges.
1825 static void setup_dd_communication(gmx_domdec_t* dd, matrix box, gmx_ddbox_t* ddbox, t_forcerec* fr, t_state* state)
1827 int dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1828 int nzone, nzone_send, zone, zonei, cg0, cg1;
1830 int * zone_cg_range, pos_cg;
1831 gmx_domdec_comm_t* comm;
1832 gmx_domdec_zones_t* zones;
1833 gmx_domdec_comm_dim_t* cd;
1834 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
1835 dd_corners_t corners;
1836 rvec * normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1837 real skew_fac2_d, skew_fac_01;
1842 fprintf(debug, "Setting up DD communication\n");
1847 if (comm->dth.empty())
1849 /* Initialize the thread data.
1850 * This can not be done in init_domain_decomposition,
1851 * as the number of threads is determined later.
1853 int numThreads = gmx_omp_nthreads_get(ModuleMultiThread::Domdec);
1854 comm->dth.resize(numThreads);
1857 bBondComm = comm->systemInfo.filterBondedCommunication;
1859 /* Do we need to determine extra distances for multi-body bondeds? */
1860 bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
1862 /* Do we need to determine extra distances for only two-body bondeds? */
1863 bDist2B = (bBondComm && !bDistMB);
// Squared cut-offs for non-bonded and (multi-body) bonded communication.
1865 const real r_comm2 =
1866 gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->systemInfo.cutoff));
1867 const real r_bcomm2 =
1868 gmx::square(domainToDomainIntoAtomToDomainCutoff(comm->systemInfo, comm->cutoff_mbody));
1872 fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1875 zones = &comm->zones;
1878 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1879 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1881 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1883 /* Triclinic stuff */
1884 normal = ddbox->normal;
1888 v_0 = ddbox->v[dim0];
1889 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1891 /* Determine the coupling coefficient for the distances
1892 * to the cell planes along dim0 and dim1 through dim2.
1893 * This is required for correct rounding.
1895 skew_fac_01 = ddbox->v[dim0][dim1 + 1][dim0] * ddbox->v[dim1][dim1 + 1][dim1];
1898 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
1904 v_1 = ddbox->v[dim1];
1907 zone_cg_range = zones->cg_range.data();
1908 gmx::ArrayRef<gmx::AtomInfoWithinMoleculeBlock> atomInfoForEachMoleculeBlock =
1909 fr->atomInfoForEachMoleculeBlock;
// The home zone covers the local atoms; further zones are filled below.
1911 zone_cg_range[0] = 0;
1912 zone_cg_range[1] = dd->numHomeAtoms;
1913 comm->zone_ncg1[0] = dd->numHomeAtoms;
1914 pos_cg = dd->numHomeAtoms;
1916 nat_tot = comm->atomRanges.numHomeAtoms();
1918 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
1920 dim = dd->dim[dim_ind];
1921 cd = &comm->cd[dim_ind];
1923 /* Check if we need to compute triclinic distances along this dim */
1924 bool distanceIsTriclinic = false;
1925 for (int i = 0; i <= dim_ind; i++)
1927 if (ddbox->tric_dir[dd->dim[i]])
1929 distanceIsTriclinic = true;
1933 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
1935 /* No pbc in this dimension, the first node should not comm. */
1943 v_d = ddbox->v[dim];
1944 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
1946 cd->receiveInPlace = true;
1947 for (int p = 0; p < cd->numPulses(); p++)
1949 /* Only atoms communicated in the first pulse are used
1950 * for multi-body bonded interactions or for bBondComm.
1952 bDistBonded = ((bDistMB || bDist2B) && p == 0);
1954 gmx_domdec_ind_t* ind = &cd->ind[p];
1956 /* Thread 0 writes in the global index array */
1958 clearCommSetupData(&comm->dth[0]);
1960 for (zone = 0; zone < nzone_send; zone++)
1962 if (dim_ind > 0 && distanceIsTriclinic)
1964 /* Determine slightly more optimized skew_fac's
1966 * This reduces the number of communicated atoms
1967 * by about 10% for 3D DD of rhombic dodecahedra.
1969 for (dimd = 0; dimd < dim; dimd++)
1971 sf2_round[dimd] = 1;
1972 if (ddbox->tric_dir[dimd])
1974 for (int i = dd->dim[dimd] + 1; i < DIM; i++)
1976 /* If we are shifted in dimension i
1977 * and the cell plane is tilted forward
1978 * in dimension i, skip this coupling.
1980 if (!(zones->shift[nzone + zone][i] && ddbox->v[dimd][i][dimd] >= 0))
1982 sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
1985 sf2_round[dimd] = 1 / sf2_round[dimd];
1990 zonei = zone_perm[dim_ind][zone];
1993 /* Here we permute the zones to obtain a convenient order
1994 * for neighbor searching
1996 cg0 = zone_cg_range[zonei];
1997 cg1 = zone_cg_range[zonei + 1];
2001 /* Look only at the cg's received in the previous grid pulse
2003 cg1 = zone_cg_range[nzone + zone + 1];
2004 cg0 = cg1 - cd->ind[p - 1].nrecv[zone];
// The atom-group range [cg0, cg1) is split statically over the OpenMP threads.
2007 const int numThreads = gmx::ssize(comm->dth);
2008 #pragma omp parallel for num_threads(numThreads) schedule(static)
2009 for (int th = 0; th < numThreads; th++)
2013 dd_comm_setup_work_t& work = comm->dth[th];
2015 /* Retain data accumulated into buffers of thread 0 */
2018 clearCommSetupData(&work);
2021 int cg0_th = cg0 + ((cg1 - cg0) * th) / numThreads;
2022 int cg1_th = cg0 + ((cg1 - cg0) * (th + 1)) / numThreads;
2024 /* Get the atom groups and coordinates for this pulse in this zone */
2025 get_zone_pulse_groups(dd,
2030 dd->globalAtomGroupIndices,
2039 distanceIsTriclinic,
2054 th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2057 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
2060 std::vector<int>& atomGroups = comm->dth[0].atomGroupBuffer;
2061 std::vector<gmx::RVec>& positions = comm->dth[0].positionBuffer;
2062 ind->nsend[zone] = comm->dth[0].nsend_zone;
2063 /* Append data of threads>=1 to the communication buffers */
2064 for (int th = 1; th < numThreads; th++)
2066 const dd_comm_setup_work_t& dth = comm->dth[th];
2068 ind->index.insert(ind->index.end(),
2069 dth.localAtomGroupBuffer.begin(),
2070 dth.localAtomGroupBuffer.end());
2072 atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2074 positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2075 comm->dth[0].nat += dth.nat;
2076 ind->nsend[zone] += dth.nsend_zone;
2079 /* Clear the counts in case we do not have pbc */
2080 for (zone = nzone_send; zone < nzone; zone++)
2082 ind->nsend[zone] = 0;
2084 ind->nsend[nzone] = ind->index.size();
2085 ind->nsend[nzone + 1] = comm->dth[0].nat;
2086 /* Communicate the number of cg's and atoms to receive */
2087 ddSendrecv(dd, dim_ind, dddirBackward, ind->nsend, nzone + 2, ind->nrecv, nzone + 2);
2091 /* We can receive in place if only the last zone is not empty */
2092 for (zone = 0; zone < nzone - 1; zone++)
2094 if (ind->nrecv[zone] > 0)
2096 cd->receiveInPlace = false;
2101 int receiveBufferSize = 0;
2102 if (!cd->receiveInPlace)
2104 receiveBufferSize = ind->nrecv[nzone];
2106 /* These buffers are only needed when not receiving in place */
2107 DDBufferAccess<int> globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2108 DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2110 dd_comm_setup_work_t& work = comm->dth[0];
2112 /* Make space for the global cg indices */
2113 int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2114 dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2115 /* Communicate the global cg indices */
2116 gmx::ArrayRef<int> integerBufferRef;
2117 if (cd->receiveInPlace)
2119 integerBufferRef = gmx::arrayRefFromArray(
2120 dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2124 integerBufferRef = globalAtomGroupBuffer.buffer;
2126 ddSendrecv<int>(dd, dim_ind, dddirBackward, work.atomGroupBuffer, integerBufferRef);
2128 /* Make space for atominfo */
2129 dd_resize_atominfo_and_state(fr, state, pos_cg + ind->nrecv[nzone]);
2131 /* Communicate the coordinates */
2132 gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2133 if (cd->receiveInPlace)
2135 rvecBufferRef = gmx::makeArrayRef(state->x).subArray(pos_cg, ind->nrecv[nzone]);
2139 rvecBufferRef = rvecBuffer.buffer;
2141 ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward, work.positionBuffer, rvecBufferRef);
2143 /* Make the charge group index */
2144 if (cd->receiveInPlace)
2146 zone = (p == 0 ? 0 : nzone - 1);
2147 while (zone < nzone)
2149 for (int i = 0; i < ind->nrecv[zone]; i++)
2151 int globalAtomIndex = dd->globalAtomGroupIndices[pos_cg];
2152 fr->atomInfo[pos_cg] =
2153 ddGetAtomInfo(atomInfoForEachMoleculeBlock, globalAtomIndex);
2158 comm->zone_ncg1[nzone + zone] = ind->nrecv[zone];
2161 zone_cg_range[nzone + zone] = pos_cg;
2166 /* This part of the code is never executed with bBondComm. */
2167 merge_cg_buffers(nzone,
2171 dd->globalAtomGroupIndices,
2172 integerBufferRef.data(),
2175 fr->atomInfoForEachMoleculeBlock,
2177 pos_cg += ind->nrecv[nzone];
2179 nat_tot += ind->nrecv[nzone + 1];
2181 if (!cd->receiveInPlace)
2183 /* Store the atom block for easy copying of communication buffers */
2184 make_cell2at_index(cd, nzone, zone_cg_range[nzone]);
2189 comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2193 /* We don't need to update atominfo, since that was already done above.
2194 * So we pass NULL for the forcerec.
2197 dd->globalAtomGroupIndices, dd->numHomeAtoms, dd->globalAtomGroupIndices.size(), nullptr);
2202 fprintf(debug, "Finished setting up DD communication, zones:");
2203 for (c = 0; c < zones->n; c++)
2205 fprintf(debug, " %d", zones->cg_range[c + 1] - zones->cg_range[c]);
2207 fprintf(debug, "\n");
2211 //! Set boundaries for the charge group range.
// For each i-zone, the i-range runs from atom 0 up to the end of that zone and
// the j-range spans the contiguous block of its interacting j-zones.
2212 static void set_cg_boundaries(gmx_domdec_zones_t* zones)
2214 for (auto& iZone : zones->iZones)
2216 iZone.iAtomRange = gmx::Range<int>(0, zones->cg_range[iZone.iZoneIndex + 1]);
2217 iZone.jAtomRange = gmx::Range<int>(zones->cg_range[iZone.jZoneRange.begin()],
2218 zones->cg_range[iZone.jZoneRange.end()]);
2222 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2224 * Also sets the atom density for the home zone when \p zone_start=0.
2225 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2226 * how many charge groups will move but are still part of the current range.
2227 * \todo When converting domdec to use proper classes, all these variables
2228 * should be private and a method should return the correct count
2229 * depending on an internal state.
2231 * \param[in,out] dd The domain decomposition struct
2232 * \param[in] box The box
2233 * \param[in] ddbox The domain decomposition box struct
2234 * \param[in] zone_start The start of the zone range to set sizes for
2235 * \param[in] zone_end The end of the zone range to set sizes for
2236 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should move but are still present in dd->comm->zones.cg_range
2238 static void set_zones_size(gmx_domdec_t* dd,
2240 const gmx_ddbox_t* ddbox,
2243 int numMovedChargeGroupsInHomeZone)
2245 gmx_domdec_comm_t* comm;
2246 gmx_domdec_zones_t* zones;
2255 zones = &comm->zones;
2257 /* Do we need to determine extra distances for multi-body bondeds? */
2258 bDistMB = (comm->systemInfo.haveInterDomainMultiBodyBondeds && isDlbOn(dd->comm) && dd->ndim > 1);
2260 for (z = zone_start; z < zone_end; z++)
2262 /* Copy cell limits to zone limits.
2263 * Valid for non-DD dims and non-shifted dims.
2265 copy_rvec(comm->cell_x0, zones->size[z].x0);
2266 copy_rvec(comm->cell_x1, zones->size[z].x1);
2269 for (d = 0; d < dd->ndim; d++)
2273 for (z = 0; z < zones->n; z++)
2275 /* With a staggered grid we have different sizes
2276 * for non-shifted dimensions.
2278 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2282 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min0;
2283 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].max1;
2287 zones->size[z].x0[dim] =
2288 comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2290 zones->size[z].x1[dim] =
2291 comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
// Communication distances; divide by skew_fac for triclinic dims so the
// distance is measured perpendicular to the (tilted) cell planes.
2297 rcs = comm->systemInfo.cutoff;
2298 rcmbs = comm->cutoff_mbody;
2299 if (ddbox->tric_dir[dim])
2301 rcs /= ddbox->skew_fac[dim];
2302 rcmbs /= ddbox->skew_fac[dim];
2305 /* Set the lower limit for the shifted zone dimensions */
2306 for (z = zone_start; z < zone_end; z++)
2308 if (zones->shift[z][dim] > 0)
2311 if (!isDlbOn(dd->comm) || d == 0)
2313 zones->size[z].x0[dim] = comm->cell_x1[dim];
2314 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2318 /* Here we take the lower limit of the zone from
2319 * the lowest domain of the zone below.
2323 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d - 1]]].min1;
2329 zones->size[z].x0[dim] = zones->size[zone_perm[2][z - 4]].x0[dim];
2333 zones->size[z].x0[dim] =
2334 comm->zone_d2[zones->shift[z][dd->dim[d - 2]]][zones->shift[z][dd->dim[d - 1]]]
2338 /* A temporary limit, is updated below */
2339 zones->size[z].x1[dim] = zones->size[z].x0[dim];
2343 for (size_t zi = 0; zi < zones->iZones.size(); zi++)
2345 if (zones->shift[zi][dim] == 0)
2347 /* This takes the whole zone into account.
2348 * With multiple pulses this will lead
2349 * to a larger zone than strictly necessary.
2351 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2352 zones->size[zi].x1[dim] + rcmbs);
2360 /* Loop over the i-zones to set the upper limit of each
2363 for (const auto& iZone : zones->iZones)
2365 const int zi = iZone.iZoneIndex;
2366 if (zones->shift[zi][dim] == 0)
2368 /* We should only use zones up to zone_end */
2369 const auto& jZoneRangeFull = iZone.jZoneRange;
2370 if (zone_end <= *jZoneRangeFull.begin())
2374 const gmx::Range<int> jZoneRange(*jZoneRangeFull.begin(),
2375 std::min(*jZoneRangeFull.end(), zone_end));
2376 for (int jZone : jZoneRange)
2378 if (zones->shift[jZone][dim] > 0)
2380 zones->size[jZone].x1[dim] =
2381 std::max(zones->size[jZone].x1[dim], zones->size[zi].x1[dim] + rcs);
2388 for (z = zone_start; z < zone_end; z++)
2390 /* Initialization only required to keep the compiler happy */
2391 rvec corner_min = { 0, 0, 0 }, corner_max = { 0, 0, 0 }, corner;
2394 /* To determine the bounding box for a zone we need to find
2395 * the extreme corners of 4, 2 or 1 corners.
2397 nc = 1 << (ddbox->nboundeddim - 1);
2399 for (c = 0; c < nc; c++)
2401 /* Set up a zone corner at x=0, ignoring triclinic couplings */
2405 corner[YY] = zones->size[z].x0[YY];
2409 corner[YY] = zones->size[z].x1[YY];
2413 corner[ZZ] = zones->size[z].x0[ZZ];
2417 corner[ZZ] = zones->size[z].x1[ZZ];
2419 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->unitCellInfo.npbcdim
2420 && box[ZZ][1 - dd->dim[0]] != 0)
2422 /* With 1D domain decomposition the cg's are not in
2423 * the triclinic box, but triclinic x-y and rectangular y/x-z.
2424 * Shift the corner of the z-vector back to along the box
2425 * vector of dimension d, so it will later end up at 0 along d.
2426 * This can affect the location of this corner along dd->dim[0]
2427 * through the matrix operation below if box[d][dd->dim[0]]!=0.
2429 int d = 1 - dd->dim[0];
2431 corner[d] -= corner[ZZ] * box[ZZ][d] / box[ZZ][ZZ];
2433 /* Apply the triclinic couplings */
2434 for (i = YY; i < ddbox->npbcdim && i < DIM; i++)
2436 for (j = XX; j < i; j++)
2438 corner[j] += corner[i] * box[i][j] / box[i][i];
2443 copy_rvec(corner, corner_min);
2444 copy_rvec(corner, corner_max);
2448 for (i = 0; i < DIM; i++)
2450 corner_min[i] = std::min(corner_min[i], corner[i]);
2451 corner_max[i] = std::max(corner_max[i], corner[i]);
2455 /* Copy the extreme corners without offset along x */
2456 for (i = 0; i < DIM; i++)
2458 zones->size[z].bb_x0[i] = corner_min[i];
2459 zones->size[z].bb_x1[i] = corner_max[i];
2461 /* Add the offset along x */
2462 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2463 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2466 if (zone_start == 0)
// Home-zone atom density = (atoms still present minus those about to move)
// divided by the home zone volume.
2469 for (dim = 0; dim < DIM; dim++)
2471 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2474 (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone) / vol;
2479 for (z = zone_start; z < zone_end; z++)
2482 "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2484 zones->size[z].x0[XX],
2485 zones->size[z].x1[XX],
2486 zones->size[z].x0[YY],
2487 zones->size[z].x1[YY],
2488 zones->size[z].x0[ZZ],
2489 zones->size[z].x1[ZZ]);
2491 "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2493 zones->size[z].bb_x0[XX],
2494 zones->size[z].bb_x1[XX],
2495 zones->size[z].bb_x0[YY],
2496 zones->size[z].bb_x1[YY],
2497 zones->size[z].bb_x0[ZZ],
2498 zones->size[z].bb_x1[ZZ]);
2503 /*! \brief Order data in \p dataToSort according to \p sort
2505 * Note: both buffers should have at least \p sort.size() elements.
2507 template<typename T>
2508 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2509 gmx::ArrayRef<T> dataToSort,
2510 gmx::ArrayRef<T> sortBuffer)
2512 GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large")
2513 GMX_ASSERT(sortBuffer.size() >= sort.size(),
2514 "The sorting buffer needs to be sufficiently large");
2516 /* Order the data into the temporary buffer */
// Gather: element entry.ind of the input goes to the next output slot.
2518 for (const gmx_cgsort_t& entry : sort)
2520 sortBuffer[i++] = dataToSort[entry.ind];
2523 /* Copy back to the original array */
2524 std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(), dataToSort.begin());
2527 /*! \brief Order data in \p dataToSort according to \p sort
2529 * Note: \p vectorToSort should have at least \p sort.size() elements,
2530 * \p workVector is resized when it is too small.
2532 template<typename T>
2533 static void orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2534 gmx::ArrayRef<T> vectorToSort,
2535 std::vector<T>* workVector)
// Grow the scratch buffer on demand, then delegate to the ArrayRef overload.
2537 if (gmx::index(workVector->size()) < sort.ssize())
2539 workVector->resize(sort.size());
2541 orderVector<T>(sort, vectorToSort, *workVector);
2544 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2545 static void dd_sort_order_nbnxn(const t_forcerec* fr, std::vector<gmx_cgsort_t>* sort)
2547 gmx::ArrayRef<const int> atomOrder = fr->nbv->getLocalAtomOrder();
2549 /* Using push_back() instead of this resize results in much slower code */
2550 sort->resize(atomOrder.size());
2551 gmx::ArrayRef<gmx_cgsort_t> buffer = *sort;
2552 size_t numSorted = 0;
2553 for (int i : atomOrder)
2557 /* The values of nsc and ind_gl are not used in this case */
2558 buffer[numSorted++].ind = i;
// Shrink to the number of entries actually kept from the grid order.
2561 sort->resize(numSorted);
2564 //! Returns the sorting state for DD.
// Reorders the local state (x, v, cg_p), the global atom indices and the atom
// info into nbnxn grid order, and updates the home atom count/range.
2565 static void dd_sort_state(gmx_domdec_t* dd, t_forcerec* fr, t_state* state)
2567 gmx_domdec_sort_t* sort = dd->comm->sort.get();
2569 dd_sort_order_nbnxn(fr, &sort->sorted);
2571 /* We alloc with the old size, since cgindex is still old */
2572 DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, dd->numHomeAtoms);
2574 /* Set the new home atom/charge group count */
2575 dd->numHomeAtoms = sort->sorted.size();
2578 fprintf(debug, "Set the new home atom count to %d\n", dd->numHomeAtoms);
2581 /* Reorder the state */
2582 gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2583 GMX_RELEASE_ASSERT(cgsort.ssize() == dd->numHomeAtoms,
2584 "We should sort all the home atom groups");
// Only reorder the state vectors that are actually present in this state.
2586 if (state->flags & enumValueToBitMask(StateEntry::X))
2588 orderVector(cgsort, makeArrayRef(state->x), rvecBuffer.buffer);
2590 if (state->flags & enumValueToBitMask(StateEntry::V))
2592 orderVector(cgsort, makeArrayRef(state->v), rvecBuffer.buffer);
2594 if (state->flags & enumValueToBitMask(StateEntry::Cgp))
2596 orderVector(cgsort, makeArrayRef(state->cg_p), rvecBuffer.buffer);
2599 /* Reorder the global cg index */
2600 orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2601 /* Reorder the atom info */
2602 orderVector<int64_t>(cgsort, fr->atomInfo, &sort->int64Buffer);
2603 /* Set the home atom number */
2604 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->numHomeAtoms);
2606 /* The atoms are now exactly in grid order, update the grid order */
2607 fr->nbv->setLocalAtomOrder();
2610 //! Accumulates load statistics.
// Adds the current size of each atom range to the running per-range totals.
2611 static void add_dd_statistics(gmx_domdec_t* dd)
2613 gmx_domdec_comm_t* comm = dd->comm;
2615 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2617 auto range = static_cast<DDAtomRanges::Type>(i);
2618 comm->sum_nat[i] += comm->atomRanges.end(range) - comm->atomRanges.start(range);
// Zeroes all accumulated DD statistics so counting restarts from this point
// (used when resetting counters mid-run).
2623 void reset_dd_statistics_counters(gmx_domdec_t* dd)
2625 gmx_domdec_comm_t* comm = dd->comm;
2627 /* Reset all the statistics and counters for total run counting */
2628 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2630 comm->sum_nat[i] = 0;
2634 comm->load_step = 0;
2637 clear_ivec(comm->load_lim);
// Checks whether the DLB-staggered grid has shifted so much between neighboring
// cells that the communication distance no longer covers the cut-off; with
// bFatal this is a hard error, otherwise the caller can re-partition.
2645 bool check_grid_jump(int64_t step, const gmx_domdec_t* dd, real cutoff, const gmx_ddbox_t* ddbox, bool bFatal)
2647 gmx_domdec_comm_t* comm = dd->comm;
2648 bool invalid = false;
2650 for (int d = 1; d < dd->ndim; d++)
2652 const DDCellsizesWithDlb& cellsizes = comm->cellsizesWithDlb[d];
2653 const int dim = dd->dim[d];
2654 const real limit = grid_jump_limit(comm, cutoff, d);
2655 real bfac = ddbox->box_size[dim];
2656 if (ddbox->tric_dir[dim])
2658 bfac *= ddbox->skew_fac[dim];
// Convert the cell boundary fractions to distances and compare to the limit.
2660 if ((cellsizes.fracUpper - cellsizes.fracLowerMax) * bfac < limit
2661 || (cellsizes.fracLower - cellsizes.fracUpperMin) * bfac > -limit)
2669 /* This error should never be triggered under normal
2670 * circumstances, but you never know ...
2673 "step %s: The domain decomposition grid has shifted too much in the "
2674 "%c-direction around cell %d %d %d. This should not have happened. "
2675 "Running with fewer ranks might avoid this issue.",
2676 gmx_step_str(step, buf),
/* Sums the accumulated DD communication statistics over all ranks and, on
 * the rank that owns fplog, prints the average number of atoms communicated
 * per step for forces, vsites and constraints, followed by the load report. */
2688 void print_dd_statistics(const t_commrec* cr, const t_inputrec& inputrec, FILE* fplog)
2690 gmx_domdec_comm_t* comm = cr->dd->comm;
2692 const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2693 gmx_sumd(numRanges, comm->sum_nat, cr);
2695 if (fplog == nullptr)
2700 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
2702 for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2704 auto range = static_cast<DDAtomRanges::Type>(i);
2705 double av = comm->sum_nat[i] / comm->ndecomp;
2708 case DDAtomRanges::Type::Zones:
2709 /* Forces are communicated both ways, hence the factor 2 */
2709 fprintf(fplog, " av. #atoms communicated per step for force: %d x %.1f\n", 2, av);
2711 case DDAtomRanges::Type::Vsites:
2712 if (cr->dd->vsite_comm)
2715 " av. #atoms communicated per step for vsites: %d x %.1f\n",
2716 (EEL_PME(inputrec.coulombtype)
2717 || inputrec.coulombtype == CoulombInteractionType::Ewald)
2723 case DDAtomRanges::Type::Constraints:
2724 if (cr->dd->constraint_comm)
2727 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
2728 1 + inputrec.nLincsIter,
2732 default: gmx_incons(" Unknown type for DD statistics");
2735 fprintf(fplog, "\n");
2737 if (comm->ddSettings.recordLoad && EI_DYNAMICS(inputrec.eI))
2739 print_dd_load_av(fplog, cr->dd);
2743 //!\brief TODO Remove fplog when group scheme and charge groups are gone
//! \brief Repartitions the system over the DD ranks.
//!
//! Decides whether to (re)distribute from the master state or only
//! redistribute moved atoms, optionally turns DLB on/off based on measured
//! load, sorts the local state on grid order, rebuilds local indices and
//! the local topology, and sets up halo/special-atom communication.
2744 void dd_partition_system(FILE* fplog,
2745 const gmx::MDLogger& mdlog,
2747 const t_commrec* cr,
2750 t_state* state_global,
2751 const gmx_mtop_t& top_global,
2752 const t_inputrec& inputrec,
2753 gmx::ImdSession* imdSession,
2755 t_state* state_local,
2756 gmx::ForceBuffers* f,
2757 gmx::MDAtoms* mdAtoms,
2758 gmx_localtop_t* top_local,
2760 gmx::VirtualSitesHandler* vsite,
2761 gmx::Constraints* constr,
2763 gmx_wallcycle* wcycle,
2766 gmx_ddbox_t ddbox = { 0 };
2770 wallcycle_start(wcycle, WallCycleCounter::Domdec);
2772 gmx_domdec_t* dd = cr->dd;
2773 gmx_domdec_comm_t* comm = dd->comm;
2775 // TODO if the update code becomes accessible here, use
2776 // upd->deform for this logic.
2777 bool bBoxChanged = (bMasterState || inputrecDeform(&inputrec));
2778 if (inputrec.epc != PressureCoupling::No)
2780 /* With nstpcouple > 1 pressure coupling happens
2781 * one step after calculating the pressure.
2782 * Box scaling happens at the end of the MD step,
2783 * after the DD partitioning.
2784 * We therefore have to do DLB in the first partitioning
2785 * after an MD step where P-coupling occurred.
2786 * We need to determine the last step in which p-coupling occurred.
2787 * MRS -- need to validate this for vv?
2789 int n = inputrec.nstpcouple;
2790 int64_t step_pcoupl;
2793 step_pcoupl = step - 1;
2797 step_pcoupl = ((step - 1) / n) * n + 1;
2799 if (step_pcoupl >= comm->partition_step)
2805 bool bNStGlobalComm = (step % nstglobalcomm == 0);
2813 /* Should we do dynamic load balancing this step?
2814 * Since it requires (possibly expensive) global communication,
2815 * we might want to do DLB less frequently.
2817 if (bBoxChanged || inputrec.epc != PressureCoupling::No)
2819 bDoDLB = bBoxChanged;
2823 bDoDLB = bNStGlobalComm;
2827 /* Check if we have recorded loads on the nodes */
2828 if (comm->ddSettings.recordLoad && dd_load_count(comm) > 0)
2830 bool bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
2832 /* Print load every nstlog, first and last step to the log file */
2833 bool bLogLoad = ((inputrec.nstlog > 0 && step % inputrec.nstlog == 0) || comm->n_load_collect == 0
2834 || (inputrec.nsteps >= 0
2835 && (step + inputrec.nstlist > inputrec.init_step + inputrec.nsteps)));
2837 /* Avoid extra communication due to verbose screen output
2838 * when nstglobalcomm is set.
2840 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn
2841 || (bVerbose && (inputrec.nstlist == 0 || nstglobalcomm <= inputrec.nstlist)))
2843 get_load_distribution(dd, wcycle);
2848 GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step - 1));
2852 dd_print_load_verbose(dd);
2855 comm->n_load_collect++;
2861 /* Add the measured cycles to the running average */
2862 const float averageFactor = 0.1F;
2863 comm->cyclesPerStepDlbExpAverage =
2864 (1 - averageFactor) * comm->cyclesPerStepDlbExpAverage
2865 + averageFactor * comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep];
2867 if (comm->dlbState == DlbState::onCanTurnOff
2868 && dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
2873 /* If the running averaged cycles with DLB are more
2874 * than before we turned on DLB, turn off DLB.
2875 * We will again run and check the cycles without DLB
2876 * and we can then decide if to turn off DLB forever.
2878 turnOffDlb = (comm->cyclesPerStepDlbExpAverage > comm->cyclesPerStepBeforeDLB);
2880 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
2883 /* To turn off DLB, we need to redistribute the atoms */
2884 dd_collect_state(dd, state_local, state_global);
2885 bMasterState = true;
2886 turn_off_dlb(mdlog, dd, step);
2890 else if (bCheckWhetherToTurnDlbOn)
2892 bool turnOffDlbForever = false;
2893 bool turnOnDlb = false;
2895 /* Since the timings are node dependent, the master decides */
2898 /* If we recently turned off DLB, we want to check if
2899 * performance is better without DLB. We want to do this
2900 * ASAP to minimize the chance that external factors
2901 * that slowed down the DLB step have disappeared and we
2902 * incorrectly conclude that DLB was causing the slowdown.
2903 * So we measure one nstlist block, no running average.
2905 if (comm->haveTurnedOffDlb
2906 && comm->cycl[ddCyclStep] / comm->cycl_n[ddCyclStep] < comm->cyclesPerStepDlbExpAverage)
2908 /* After turning off DLB we ran nstlist steps in fewer
2909 * cycles than with DLB. This likely means that DLB
2910 * is not beneficial, but this could be due to a one
2911 * time unlucky fluctuation, so we require two such
2912 * observations in close succession to turn off DLB
2915 if (comm->dlbSlowerPartitioningCount > 0
2916 && dd->ddp_count < comm->dlbSlowerPartitioningCount + 10 * c_checkTurnDlbOnInterval)
2918 turnOffDlbForever = true;
2920 comm->haveTurnedOffDlb = false;
2921 /* Register when we last measured DLB slowdown */
2922 comm->dlbSlowerPartitioningCount = dd->ddp_count;
2926 /* Here we check if the max PME rank load is more than 0.98
2927 * the max PP force load. If so, PP DLB will not help,
2928 * since we are (almost) limited by PME. Furthermore,
2929 * DLB will cause a significant extra x/f redistribution
2930 * cost on the PME ranks, which will then surely result
2931 * in lower total performance.
2933 if (comm->ddRankSetup.usePmeOnlyRanks && dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
2939 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
2945 bool turnOffDlbForever;
2947 } bools{ turnOffDlbForever, turnOnDlb };
2948 dd_bcast(dd, sizeof(bools), &bools);
2949 if (bools.turnOffDlbForever)
2951 turn_off_dlb_forever(mdlog, dd, step);
2953 else if (bools.turnOnDlb)
2955 turn_on_dlb(mdlog, dd, step);
2960 comm->n_load_have++;
2963 bool bRedist = false;
2966 /* Clear the old state */
2967 clearDDStateIndices(dd, false);
2970 auto xGlobal = positionsFromStatePointer(state_global);
2972 set_ddbox(*dd, true, DDMASTER(dd) ? state_global->box : nullptr, true, xGlobal, &ddbox);
2974 distributeState(mdlog, dd, top_global, state_global, ddbox, state_local);
2976 /* Ensure that we have space for the new distribution */
2977 dd_resize_atominfo_and_state(fr, state_local, dd->numHomeAtoms);
2979 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
2981 dd_set_atominfo(dd->globalAtomGroupIndices, 0, dd->numHomeAtoms, fr);
2983 else if (state_local->ddp_count != dd->ddp_count)
2985 if (state_local->ddp_count > dd->ddp_count)
2988 "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64
2990 state_local->ddp_count,
2994 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
2997 "Internal inconsistency state_local->ddp_count_cg_gl (%d) != "
2998 "state_local->ddp_count (%d)",
2999 state_local->ddp_count_cg_gl,
3000 state_local->ddp_count);
3003 /* Clear the old state */
3004 clearDDStateIndices(dd, false);
3006 /* Restore the atom group indices from state_local */
3007 restoreAtomGroups(dd, state_local);
3008 make_dd_indices(dd, 0);
3009 ncgindex_set = dd->numHomeAtoms;
3011 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3013 dd_set_atominfo(dd->globalAtomGroupIndices, 0, dd->numHomeAtoms, fr);
3015 set_ddbox(*dd, bMasterState, state_local->box, true, state_local->x, &ddbox);
3017 bRedist = isDlbOn(comm);
3021 /* We have the full state, only redistribute the cgs */
3023 /* Clear the non-home indices */
3024 clearDDStateIndices(dd, true);
3027 /* To avoid global communication, we do not recompute the extent
3028 * of the system for dims without pbc. Therefore we need to copy
3029 * the previously computed values when we do not communicate.
3031 if (!bNStGlobalComm)
3033 copy_rvec(comm->box0, ddbox.box0);
3034 copy_rvec(comm->box_size, ddbox.box_size);
3036 set_ddbox(*dd, bMasterState, state_local->box, bNStGlobalComm, state_local->x, &ddbox);
3041 /* Copy needed for dims without pbc when avoiding communication */
3042 copy_rvec(ddbox.box0, comm->box0);
3043 copy_rvec(ddbox.box_size, comm->box_size);
3045 set_dd_cell_sizes(dd, &ddbox, dd->unitCellInfo.ddBoxIsDynamic, bMasterState, bDoDLB, step, wcycle);
3047 if (comm->ddSettings.nstDDDumpGrid > 0 && step % comm->ddSettings.nstDDDumpGrid == 0)
3049 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3052 if (comm->systemInfo.useUpdateGroups)
3054 comm->updateGroupsCog->addCogs(
3055 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->numHomeAtoms),
3059 /* Check if we should sort the charge groups */
3060 const bool bSortCG = (bMasterState || bRedist);
3062 /* When repartitioning we mark atom groups that will move to neighboring
3063 * DD cells, but we do not move them right away for performance reasons.
3064 * Thus we need to keep track of how many charge groups will move for
3065 * obtaining correct local charge group / atom counts.
3070 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
3072 ncgindex_set = dd->numHomeAtoms;
3073 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
3075 GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3077 if (comm->systemInfo.useUpdateGroups)
3079 comm->updateGroupsCog->addCogs(
3080 gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->numHomeAtoms),
3084 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
3087 RVec cell_ns_x0, cell_ns_x1;
3088 get_nsgrid_boundaries(ddbox.nboundeddim,
3095 as_rvec_array(state_local->x.data()),
3101 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3106 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
3108 /* Sort the state on charge group position.
3109 * This enables exact restarts from this step.
3110 * It also improves performance by about 15% with larger numbers
3111 * of atoms per node.
3114 /* Fill the ns grid with the home cell,
3115 * so we can sort with the indices.
3117 set_zones_numHomeAtoms(dd);
3119 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3121 nbnxn_put_on_grid(fr->nbv.get(),
3124 comm->zones.size[0].bb_x0,
3125 comm->zones.size[0].bb_x1,
3126 comm->updateGroupsCog.get(),
3127 { 0, dd->numHomeAtoms },
3128 comm->zones.dens_zone0,
3132 bRedist ? comm->movedBuffer.data() : nullptr);
3136 fprintf(debug, "Step %s, sorting the %d home charge groups\n", gmx_step_str(step, sbuf), dd->numHomeAtoms);
3138 dd_sort_state(dd, fr, state_local);
3140 /* After sorting and compacting we set the correct size */
3141 state_change_natoms(state_local, comm->atomRanges.numHomeAtoms());
3143 /* Rebuild all the indices */
3144 dd->ga2la->clear(false);
3147 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
3151 /* With the group scheme the sorting array is part of the DD state,
3152 * but it just got out of sync, so mark as invalid by emptying it.
3154 if (inputrec.cutoff_scheme == CutoffScheme::Group)
3156 comm->sort->sorted.clear();
3160 if (comm->systemInfo.useUpdateGroups)
3162 /* The update groups cog's are invalid after sorting
3163 * and need to be cleared before the next partitioning anyhow.
3165 comm->updateGroupsCog->clear();
3168 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
3170 /* Set the indices for the home atoms */
3171 set_zones_numHomeAtoms(dd);
3172 make_dd_indices(dd, ncgindex_set);
3174 /* Set up the communication and communicate the coordinates */
3175 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local);
3177 /* Set the indices for the halo atoms */
3178 make_dd_indices(dd, dd->numHomeAtoms);
3180 /* Set the charge group boundaries for neighbor searching */
3181 set_cg_boundaries(&comm->zones);
3183 /* When bSortCG=true, we have already set the size for zone 0 */
3184 set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
3186 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
3189 write_dd_pdb("dd_home",step,"dump",top_global,cr,
3190 -1,state_local->x.rvec_array(),state_local->box);
3193 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
3195 /* Extract a local topology from the global topology */
3197 for (int i = 0; i < dd->ndim; i++)
3199 numPulses[dd->dim[i]] = comm->cd[i].numPulses();
3201 int numBondedInteractionsToReduce = dd_make_local_top(*dd,
3203 dd->unitCellInfo.npbcdim,
3212 dd->localTopologyChecker->scheduleCheckOfLocalTopology(numBondedInteractionsToReduce);
3214 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
3216 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
3218 /* Set up the special atom communication */
3219 int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3220 for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1;
3221 i < static_cast<int>(DDAtomRanges::Type::Number);
3224 auto range = static_cast<DDAtomRanges::Type>(i);
3227 case DDAtomRanges::Type::Vsites:
3228 if (vsite && vsite->numInterUpdategroupVirtualSites())
3230 n = dd_make_local_vsites(dd, n, top_local->idef.il);
3233 case DDAtomRanges::Type::Constraints:
3234 if (dd->comm->systemInfo.mayHaveSplitConstraints || dd->comm->systemInfo.mayHaveSplitSettles)
3236 /* Only for inter-cg constraints we need special code */
3237 n = dd_make_local_constraints(dd,
3242 inputrec.nProjOrder,
3243 top_local->idef.il);
3246 default: gmx_incons("Unknown special atom type setup");
3248 comm->atomRanges.setEnd(range, n);
3251 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
3253 wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
3255 /* Make space for the extra coordinates for virtual site
3256 * or constraint communication.
3258 state_local->natoms = comm->atomRanges.numAtomsTotal();
3260 state_change_natoms(state_local, state_local->natoms);
3263 if (vsite && vsite->numInterUpdategroupVirtualSites())
3265 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3269 if (EEL_FULL(inputrec.coulombtype) && dd->haveExclusions)
3271 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3275 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3279 /* Set the number of atoms required for the force calculation.
3280 * Forces need to be constrained when doing energy
3281 * minimization. For simple simulations we could avoid some
3282 * allocation, zeroing and copying, but this is probably not worth
3283 * the complications and checking.
3285 forcerec_set_ranges(fr,
3286 comm->atomRanges.end(DDAtomRanges::Type::Zones),
3287 comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3290 /* Update atom data for mdatoms and several algorithms */
3291 mdAlgorithmsSetupAtomData(cr, inputrec, top_global, top_local, fr, f, mdAtoms, constr, vsite, nullptr);
3293 auto* mdatoms = mdAtoms->mdatoms();
3294 if (!thisRankHasDuty(cr, DUTY_PME))
3296 /* Send the charges and/or c6/sigmas to our PME only node */
3297 gmx_pme_send_parameters(
3300 mdatoms->nChargePerturbed != 0,
3301 mdatoms->nTypePerturbed != 0,
3302 mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
3303 : gmx::ArrayRef<real>{},
3304 mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
3305 : gmx::ArrayRef<real>{},
3306 mdatoms->sqrt_c6A ? gmx::arrayRefFromArray(mdatoms->sqrt_c6A, mdatoms->nr)
3307 : gmx::ArrayRef<real>{},
3308 mdatoms->sqrt_c6B ? gmx::arrayRefFromArray(mdatoms->sqrt_c6B, mdatoms->nr)
3309 : gmx::ArrayRef<real>{},
3310 mdatoms->sigmaA ? gmx::arrayRefFromArray(mdatoms->sigmaA, mdatoms->nr)
3311 : gmx::ArrayRef<real>{},
3312 mdatoms->sigmaB ? gmx::arrayRefFromArray(mdatoms->sigmaB, mdatoms->nr)
3313 : gmx::ArrayRef<real>{},
3314 dd_pme_maxshift_x(*dd),
3315 dd_pme_maxshift_y(*dd));
3318 if (dd->atomSets != nullptr)
3320 /* Update the local atom sets */
3321 dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3324 // The pull group construction can need the atom sets updated above
3327 /* Update the local pull groups */
3328 dd_make_local_pull_groups(cr, pull_work);
3331 /* Update the local atoms to be communicated via the IMD protocol if bIMD is true. */
3332 imdSession->dd_make_local_IMD_atoms(dd);
3334 add_dd_statistics(dd);
3336 /* Make sure we only count the cycles for this DD partitioning */
3337 clear_dd_cycle_counts(dd);
3339 /* Because the order of the atoms might have changed since
3340 * the last vsite construction, we need to communicate the constructing
3341 * atom coordinates again (for spreading the forces this MD step).
3343 dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
3345 wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
3347 if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
3349 dd_move_x(dd, state_local->box, state_local->x, nullptr);
3350 write_dd_pdb("dd_dump",
3356 state_local->x.rvec_array(),
3360 /* Store the partitioning step */
3361 comm->partition_step = step;
3363 /* Increase the DD partitioning counter */
3365 /* The state currently matches this DD partitioning count, store it */
3366 state_local->ddp_count = dd->ddp_count;
3369 /* The DD master node knows the complete cg distribution,
3370 * store the count so we can possibly skip the cg info communication.
3372 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3375 if (comm->ddSettings.DD_debug > 0)
3377 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3378 check_index_consistency(dd, top_global.natoms, "after partitioning");
3381 wallcycle_stop(wcycle, WallCycleCounter::Domdec);