2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file defines functions for mdrun to call to make a new
38 * domain decomposition, and check it.
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_domdec
46 #include "partition.h"
55 #include "gromacs/domdec/collect.h"
56 #include "gromacs/domdec/dlb.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/ga2la.h"
61 #include "gromacs/domdec/localatomsetmanager.h"
62 #include "gromacs/ewald/pme.h"
63 #include "gromacs/gmxlib/chargegroup.h"
64 #include "gromacs/gmxlib/network.h"
65 #include "gromacs/gmxlib/nrnb.h"
66 #include "gromacs/imd/imd.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/mdlib/forcerec.h"
70 #include "gromacs/mdlib/gmx_omp_nthreads.h"
71 #include "gromacs/mdlib/mdatoms.h"
72 #include "gromacs/mdlib/mdsetup.h"
73 #include "gromacs/mdlib/nb_verlet.h"
74 #include "gromacs/mdlib/nbnxn_grid.h"
75 #include "gromacs/mdlib/nsgrid.h"
76 #include "gromacs/mdlib/vsite.h"
77 #include "gromacs/mdtypes/commrec.h"
78 #include "gromacs/mdtypes/forcerec.h"
79 #include "gromacs/mdtypes/inputrec.h"
80 #include "gromacs/mdtypes/md_enums.h"
81 #include "gromacs/mdtypes/nblist.h"
82 #include "gromacs/mdtypes/state.h"
83 #include "gromacs/pulling/pull.h"
84 #include "gromacs/timing/wallcycle.h"
85 #include "gromacs/topology/mtop_util.h"
86 #include "gromacs/topology/topology.h"
87 #include "gromacs/utility/cstringutil.h"
88 #include "gromacs/utility/fatalerror.h"
89 #include "gromacs/utility/logger.h"
90 #include "gromacs/utility/real.h"
91 #include "gromacs/utility/smalloc.h"
92 #include "gromacs/utility/strconvert.h"
93 #include "gromacs/utility/stringstream.h"
94 #include "gromacs/utility/stringutil.h"
95 #include "gromacs/utility/textwriter.h"
98 #include "cellsizes.h"
99 #include "distribute.h"
100 #include "domdec_constraints.h"
101 #include "domdec_internal.h"
102 #include "domdec_vsite.h"
104 #include "redistribute.h"
107 /*! \brief Turn on DLB when the load imbalance causes this amount of total loss.
109 * There is a bit of overhead with DLB and it's difficult to achieve
110 * a load imbalance of less than 2% with DLB.
112 #define DD_PERF_LOSS_DLB_ON 0.02
114 //! Warn about imbalance due to PP or PP/PME load imbalance at this loss.
115 #define DD_PERF_LOSS_WARN 0.05
118 //! Debug helper printing a DD zone
119 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
121 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
123 zone->min0, zone->max1,
124 zone->mch0, zone->mch0,
125 zone->p1_0, zone->p1_1);
128 /*! \brief Using the home grid size as input in cell_ns_x0 and cell_ns_x1
129 * takes the extremes over all home and remote zones in the halo
130 * and returns the results in cell_ns_x0 and cell_ns_x1.
131 * Note: only used with the group cut-off scheme.
133 static void dd_move_cellx(gmx_domdec_t *dd,
134 const gmx_ddbox_t *ddbox,
138 constexpr int c_ddZoneCommMaxNumZones = 5;
139 gmx_ddzone_t buf_s[c_ddZoneCommMaxNumZones];
140 gmx_ddzone_t buf_r[c_ddZoneCommMaxNumZones];
141 gmx_ddzone_t buf_e[c_ddZoneCommMaxNumZones];
142 gmx_domdec_comm_t *comm = dd->comm;
146 for (int d = 1; d < dd->ndim; d++)
148 int dim = dd->dim[d];
149 gmx_ddzone_t &zp = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
151 /* Copy the base sizes of the home zone */
152 zp.min0 = cell_ns_x0[dim];
153 zp.max1 = cell_ns_x1[dim];
154 zp.min1 = cell_ns_x1[dim];
155 zp.mch0 = cell_ns_x0[dim];
156 zp.mch1 = cell_ns_x1[dim];
157 zp.p1_0 = cell_ns_x0[dim];
158 zp.p1_1 = cell_ns_x1[dim];
162 gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
164 /* Loop backward over the dimensions and aggregate the extremes
167 for (int d = dd->ndim - 2; d >= 0; d--)
169 const int dim = dd->dim[d];
170 const bool applyPbc = (dim < ddbox->npbcdim);
172 /* Use an rvec to store two reals */
173 extr_s[d][0] = cellsizes[d + 1].fracLower;
174 extr_s[d][1] = cellsizes[d + 1].fracUpper;
175 extr_s[d][2] = cellsizes[d + 1].fracUpper;
178 GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
179 /* Store the extremes in the backward sending buffer,
180 * so they get updated separately from the forward communication.
182 for (int d1 = d; d1 < dd->ndim-1; d1++)
184 gmx_ddzone_t &buf = buf_s[pos];
186 /* We invert the order to be able to use the same loop for buf_e */
187 buf.min0 = extr_s[d1][1];
188 buf.max1 = extr_s[d1][0];
189 buf.min1 = extr_s[d1][2];
192 /* Store the cell corner of the dimension we communicate along */
193 buf.p1_0 = comm->cell_x0[dim];
199 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
202 if (dd->ndim == 3 && d == 0)
204 buf_s[pos] = comm->zone_d2[0][1];
206 buf_s[pos] = comm->zone_d1[0];
210 /* We only need to communicate the extremes
211 * in the forward direction
213 int numPulses = comm->cd[d].numPulses();
217 /* Take the minimum to avoid double communication */
218 numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
222 /* Without PBC we should really not communicate over
223 * the boundaries, but implementing that complicates
224 * the communication setup and therefore we simply
225 * do all communication, but ignore some data.
227 numPulsesMin = numPulses;
229 for (int pulse = 0; pulse < numPulsesMin; pulse++)
231 /* Communicate the extremes forward */
232 bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
234 int numElements = dd->ndim - d - 1;
235 ddSendrecv(dd, d, dddirForward,
236 extr_s + d, numElements,
237 extr_r + d, numElements);
239 if (receiveValidData)
241 for (int d1 = d; d1 < dd->ndim - 1; d1++)
243 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
244 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
245 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
250 const int numElementsInBuffer = pos;
251 for (int pulse = 0; pulse < numPulses; pulse++)
253 /* Communicate all the zone information backward */
254 bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
256 static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
258 int numReals = numElementsInBuffer*c_ddzoneNumReals;
259 ddSendrecv(dd, d, dddirBackward,
260 gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
261 gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
266 for (int d1 = d + 1; d1 < dd->ndim; d1++)
268 /* Determine the decrease of maximum required
269 * communication height along d1 due to the distance along d,
270 * this avoids a lot of useless atom communication.
272 real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
275 if (ddbox->tric_dir[dim])
277 /* c is the off-diagonal coupling between the cell planes
278 * along directions d and d1.
280 c = ddbox->v[dim][dd->dim[d1]][dim];
286 real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
289 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
293 /* A negative value signals out of range */
299 /* Accumulate the extremes over all pulses */
300 for (int i = 0; i < numElementsInBuffer; i++)
308 if (receiveValidData)
310 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
311 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
312 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
316 if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
324 if (receiveValidData && dh[d1] >= 0)
326 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
327 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
330 /* Copy the received buffer to the send buffer,
331 * to pass the data through with the next pulse.
335 if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
336 (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
338 /* Store the extremes */
341 for (int d1 = d; d1 < dd->ndim-1; d1++)
343 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
344 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
345 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
349 if (d == 1 || (d == 0 && dd->ndim == 3))
351 for (int i = d; i < 2; i++)
353 comm->zone_d2[1-d][i] = buf_e[pos];
359 comm->zone_d1[1] = buf_e[pos];
365 if (d == 1 || (d == 0 && dd->ndim == 3))
367 for (int i = d; i < 2; i++)
369 comm->zone_d2[1 - d][i].dataSet = 0;
374 comm->zone_d1[1].dataSet = 0;
382 int dim = dd->dim[1];
383 for (int i = 0; i < 2; i++)
385 if (comm->zone_d1[i].dataSet != 0)
389 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
391 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
392 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
398 int dim = dd->dim[2];
399 for (int i = 0; i < 2; i++)
401 for (int j = 0; j < 2; j++)
403 if (comm->zone_d2[i][j].dataSet != 0)
407 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
409 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
410 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
415 for (int d = 1; d < dd->ndim; d++)
417 cellsizes[d].fracLowerMax = extr_s[d-1][0];
418 cellsizes[d].fracUpperMin = extr_s[d-1][1];
421 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
422 d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
427 //! Sets the charge-group zones to be equal to the home zone.
428 static void set_zones_ncg_home(gmx_domdec_t *dd)
430 gmx_domdec_zones_t *zones;
433 zones = &dd->comm->zones;
435 zones->cg_range[0] = 0;
436 for (i = 1; i < zones->n+1; i++)
438 zones->cg_range[i] = dd->ncg_home;
440 /* zone_ncg1[0] should always be equal to ncg_home */
441 dd->comm->zone_ncg1[0] = dd->ncg_home;
444 //! Restore atom groups for the charge groups.
445 static void restoreAtomGroups(gmx_domdec_t *dd,
446 const int *gcgs_index, const t_state *state)
448 gmx::ArrayRef<const int> atomGroupsState = state->cg_gl;
450 std::vector<int> &globalAtomGroupIndices = dd->globalAtomGroupIndices;
451 gmx::RangePartitioning &atomGrouping = dd->atomGrouping_;
453 globalAtomGroupIndices.resize(atomGroupsState.size());
454 atomGrouping.clear();
456 /* Copy back the global charge group indices from state
457 * and rebuild the local charge group to atom index.
459 for (gmx::index i = 0; i < atomGroupsState.size(); i++)
461 const int atomGroupGlobal = atomGroupsState[i];
462 const int groupSize = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
463 globalAtomGroupIndices[i] = atomGroupGlobal;
464 atomGrouping.appendBlock(groupSize);
467 dd->ncg_home = atomGroupsState.size();
468 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGrouping.fullRange().end());
470 set_zones_ncg_home(dd);
473 //! Sets the cginfo structures.
474 static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
475 t_forcerec *fr, char *bLocalCG)
477 cginfo_mb_t *cginfo_mb;
483 cginfo_mb = fr->cginfo_mb;
486 for (cg = cg0; cg < cg1; cg++)
488 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
492 if (bLocalCG != nullptr)
494 for (cg = cg0; cg < cg1; cg++)
496 bLocalCG[index_gl[cg]] = TRUE;
501 //! Makes the mappings between global and local atom indices during DD repartioning.
502 static void make_dd_indices(gmx_domdec_t *dd,
503 const int *gcgs_index, int cg_start)
505 const int numZones = dd->comm->zones.n;
506 const int *zone2cg = dd->comm->zones.cg_range;
507 const int *zone_ncg1 = dd->comm->zone_ncg1;
508 gmx::ArrayRef<const int> globalAtomGroupIndices = dd->globalAtomGroupIndices;
509 const gmx_bool bCGs = dd->comm->bCGs;
511 std::vector<int> &globalAtomIndices = dd->globalAtomIndices;
512 gmx_ga2la_t &ga2la = *dd->ga2la;
514 if (zone2cg[1] != dd->ncg_home)
516 gmx_incons("dd->ncg_zone is not up to date");
519 /* Make the local to global and global to local atom index */
520 int a = dd->atomGrouping().subRange(cg_start, cg_start).begin();
521 globalAtomIndices.resize(a);
522 for (int zone = 0; zone < numZones; zone++)
533 int cg1 = zone2cg[zone+1];
534 int cg1_p1 = cg0 + zone_ncg1[zone];
536 for (int cg = cg0; cg < cg1; cg++)
541 /* Signal that this cg is from more than one pulse away */
544 int cg_gl = globalAtomGroupIndices[cg];
547 for (int a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
549 globalAtomIndices.push_back(a_gl);
550 ga2la.insert(a_gl, {a, zone1});
556 globalAtomIndices.push_back(cg_gl);
557 ga2la.insert(cg_gl, {a, zone1});
564 //! Checks the charge-group assignements.
565 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
569 if (bLocalCG == nullptr)
573 for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
575 if (!bLocalCG[dd->globalAtomGroupIndices[i]])
578 "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
583 for (int i = 0; i < ncg_sys; i++)
590 if (ngl != dd->globalAtomGroupIndices.size())
592 fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
599 //! Checks whether global and local atom indices are consistent.
600 static void check_index_consistency(gmx_domdec_t *dd,
601 int natoms_sys, int ncg_sys,
606 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
608 if (dd->comm->DD_debug > 1)
610 std::vector<int> have(natoms_sys);
611 for (int a = 0; a < numAtomsInZones; a++)
613 int globalAtomIndex = dd->globalAtomIndices[a];
614 if (have[globalAtomIndex] > 0)
616 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
620 have[globalAtomIndex] = a + 1;
625 std::vector<int> have(numAtomsInZones);
628 for (int i = 0; i < natoms_sys; i++)
630 if (const auto entry = dd->ga2la->find(i))
632 const int a = entry->la;
633 if (a >= numAtomsInZones)
635 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
641 if (dd->globalAtomIndices[a] != i)
643 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
650 if (ngl != numAtomsInZones)
653 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
654 dd->rank, where, ngl, numAtomsInZones);
656 for (int a = 0; a < numAtomsInZones; a++)
661 "DD rank %d, %s: local atom %d, global %d has no global index\n",
662 dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
666 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
670 gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
671 dd->rank, where, nerr);
675 //! Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart
676 static void clearDDStateIndices(gmx_domdec_t *dd,
680 gmx_ga2la_t &ga2la = *dd->ga2la;
684 /* Clear the whole list without the overhead of searching */
689 const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
690 for (int i = 0; i < numAtomsInZones; i++)
692 ga2la.erase(dd->globalAtomIndices[i]);
696 char *bLocalCG = dd->comm->bLocalCG;
699 for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
701 bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
705 dd_clear_local_vsite_indices(dd);
709 dd_clear_local_constraint_indices(dd);
713 bool check_grid_jump(int64_t step,
714 const gmx_domdec_t *dd,
716 const gmx_ddbox_t *ddbox,
719 gmx_domdec_comm_t *comm = dd->comm;
720 bool invalid = false;
722 for (int d = 1; d < dd->ndim; d++)
724 const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
725 const int dim = dd->dim[d];
726 const real limit = grid_jump_limit(comm, cutoff, d);
727 real bfac = ddbox->box_size[dim];
728 if (ddbox->tric_dir[dim])
730 bfac *= ddbox->skew_fac[dim];
732 if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac < limit ||
733 (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
741 /* This error should never be triggered under normal
742 * circumstances, but you never know ...
744 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
745 gmx_step_str(step, buf),
746 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
753 //! Return the duration of force calculations on this rank.
754 static float dd_force_load(gmx_domdec_comm_t *comm)
763 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
768 load = comm->cycl[ddCyclF];
769 if (comm->cycl_n[ddCyclF] > 1)
771 /* Subtract the maximum of the last n cycle counts
772 * to get rid of possible high counts due to other sources,
773 * for instance system activity, that would otherwise
774 * affect the dynamic load balancing.
776 load -= comm->cycl_max[ddCyclF];
780 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
782 float gpu_wait, gpu_wait_sum;
784 gpu_wait = comm->cycl[ddCyclWaitGPU];
785 if (comm->cycl_n[ddCyclF] > 1)
787 /* We should remove the WaitGPU time of the same MD step
788 * as the one with the maximum F time, since the F time
789 * and the wait time are not independent.
790 * Furthermore, the step for the max F time should be chosen
791 * the same on all ranks that share the same GPU.
792 * But to keep the code simple, we remove the average instead.
793 * The main reason for artificially long times at some steps
794 * is spurious CPU activity or MPI time, so we don't expect
795 * that changes in the GPU wait time matter a lot here.
797 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
799 /* Sum the wait times over the ranks that share the same GPU */
800 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
801 comm->mpi_comm_gpu_shared);
802 /* Replace the wait time by the average over the ranks */
803 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
811 //! Runs cell size checks and communicates the boundaries.
812 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
814 rvec cell_ns_x0, rvec cell_ns_x1,
817 gmx_domdec_comm_t *comm;
822 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
824 dim = dd->dim[dim_ind];
826 /* Without PBC we don't have restrictions on the outer cells */
827 if (!(dim >= ddbox->npbcdim &&
828 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
830 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
831 comm->cellsize_min[dim])
834 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
835 gmx_step_str(step, buf), dim2char(dim),
836 comm->cell_x1[dim] - comm->cell_x0[dim],
837 ddbox->skew_fac[dim],
838 dd->comm->cellsize_min[dim],
839 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
843 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
845 /* Communicate the boundaries and update cell_ns_x0/1 */
846 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
847 if (isDlbOn(dd->comm) && dd->ndim > 1)
849 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
854 //! Compute and communicate to determine the load distribution across PP ranks.
855 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
857 gmx_domdec_comm_t *comm;
859 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
864 fprintf(debug, "get_load_distribution start\n");
867 wallcycle_start(wcycle, ewcDDCOMMLOAD);
871 bSepPME = (dd->pme_nodeid >= 0);
873 if (dd->ndim == 0 && bSepPME)
875 /* Without decomposition, but with PME nodes, we need the load */
876 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
877 comm->load[0].pme = comm->cycl[ddCyclPME];
880 for (int d = dd->ndim - 1; d >= 0; d--)
882 const DDCellsizesWithDlb *cellsizes = (isDlbOn(dd->comm) ? &comm->cellsizesWithDlb[d] : nullptr);
883 const int dim = dd->dim[d];
884 /* Check if we participate in the communication in this dimension */
885 if (d == dd->ndim-1 ||
886 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
888 load = &comm->load[d];
889 if (isDlbOn(dd->comm))
891 cell_frac = cellsizes->fracUpper - cellsizes->fracLower;
896 sbuf[pos++] = dd_force_load(comm);
897 sbuf[pos++] = sbuf[0];
898 if (isDlbOn(dd->comm))
900 sbuf[pos++] = sbuf[0];
901 sbuf[pos++] = cell_frac;
904 sbuf[pos++] = cellsizes->fracLowerMax;
905 sbuf[pos++] = cellsizes->fracUpperMin;
910 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
911 sbuf[pos++] = comm->cycl[ddCyclPME];
916 sbuf[pos++] = comm->load[d+1].sum;
917 sbuf[pos++] = comm->load[d+1].max;
918 if (isDlbOn(dd->comm))
920 sbuf[pos++] = comm->load[d+1].sum_m;
921 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
922 sbuf[pos++] = comm->load[d+1].flags;
925 sbuf[pos++] = cellsizes->fracLowerMax;
926 sbuf[pos++] = cellsizes->fracUpperMin;
931 sbuf[pos++] = comm->load[d+1].mdf;
932 sbuf[pos++] = comm->load[d+1].pme;
936 /* Communicate a row in DD direction d.
937 * The communicators are setup such that the root always has rank 0.
940 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
941 load->load, load->nload*sizeof(float), MPI_BYTE,
942 0, comm->mpi_comm_load[d]);
944 if (dd->ci[dim] == dd->master_ci[dim])
946 /* We are the master along this row, process this row */
947 RowMaster *rowMaster = nullptr;
951 rowMaster = cellsizes->rowMaster.get();
961 for (int i = 0; i < dd->nc[dim]; i++)
963 load->sum += load->load[pos++];
964 load->max = std::max(load->max, load->load[pos]);
966 if (isDlbOn(dd->comm))
968 if (rowMaster->dlbIsLimited)
970 /* This direction could not be load balanced properly,
971 * therefore we need to use the maximum iso the average load.
973 load->sum_m = std::max(load->sum_m, load->load[pos]);
977 load->sum_m += load->load[pos];
980 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
984 load->flags = gmx::roundToInt(load->load[pos++]);
988 rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
989 rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
994 load->mdf = std::max(load->mdf, load->load[pos]);
996 load->pme = std::max(load->pme, load->load[pos]);
1000 if (isDlbOn(comm) && rowMaster->dlbIsLimited)
1002 load->sum_m *= dd->nc[dim];
1003 load->flags |= (1<<d);
1011 comm->nload += dd_load_count(comm);
1012 comm->load_step += comm->cycl[ddCyclStep];
1013 comm->load_sum += comm->load[0].sum;
1014 comm->load_max += comm->load[0].max;
1017 for (int d = 0; d < dd->ndim; d++)
1019 if (comm->load[0].flags & (1<<d))
1021 comm->load_lim[d]++;
1027 comm->load_mdf += comm->load[0].mdf;
1028 comm->load_pme += comm->load[0].pme;
1032 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
1036 fprintf(debug, "get_load_distribution finished\n");
1040 /*! \brief Return the relative performance loss on the total run time
1041 * due to the force calculation load imbalance. */
1042 static float dd_force_load_fraction(gmx_domdec_t *dd)
1044 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1046 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
1054 /*! \brief Return the relative performance loss on the total run time
1055 * due to the force calculation load imbalance. */
1056 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
1058 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
1061 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
1062 (dd->comm->load_step*dd->nnodes);
1070 //! Print load-balance report e.g. at the end of a run.
1071 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
1073 gmx_domdec_comm_t *comm = dd->comm;
1075 /* Only the master rank prints loads and only if we measured loads */
1076 if (!DDMASTER(dd) || comm->nload == 0)
1082 int numPpRanks = dd->nnodes;
1083 int numPmeRanks = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
1084 int numRanks = numPpRanks + numPmeRanks;
1085 float lossFraction = 0;
1087 /* Print the average load imbalance and performance loss */
1088 if (dd->nnodes > 1 && comm->load_sum > 0)
1090 float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
1091 lossFraction = dd_force_imb_perf_loss(dd);
1093 std::string msg = "\nDynamic load balancing report:\n";
1094 std::string dlbStateStr;
1096 switch (dd->comm->dlbState)
1098 case DlbState::offUser:
1099 dlbStateStr = "DLB was off during the run per user request.";
1101 case DlbState::offForever:
1102 /* Currectly this can happen due to performance loss observed, cell size
1103 * limitations or incompatibility with other settings observed during
1104 * determineInitialDlbState(). */
1105 dlbStateStr = "DLB got disabled because it was unsuitable to use.";
1107 case DlbState::offCanTurnOn:
1108 dlbStateStr = "DLB was off during the run due to low measured imbalance.";
1110 case DlbState::offTemporarilyLocked:
1111 dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
1113 case DlbState::onCanTurnOff:
1114 dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
1116 case DlbState::onUser:
1117 dlbStateStr = "DLB was permanently on during the run per user request.";
1120 GMX_ASSERT(false, "Undocumented DLB state");
1123 msg += " " + dlbStateStr + "\n";
1124 msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
1125 msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
1126 gmx::roundToInt(dd_force_load_fraction(dd)*100));
1127 msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
1129 fprintf(fplog, "%s", msg.c_str());
1130 fprintf(stderr, "\n%s", msg.c_str());
1133 /* Print during what percentage of steps the load balancing was limited */
1134 bool dlbWasLimited = false;
1137 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
1138 for (int d = 0; d < dd->ndim; d++)
1140 int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
1141 sprintf(buf+strlen(buf), " %c %d %%",
1142 dim2char(dd->dim[d]), limitPercentage);
1143 if (limitPercentage >= 50)
1145 dlbWasLimited = true;
1148 sprintf(buf + strlen(buf), "\n");
1149 fprintf(fplog, "%s", buf);
1150 fprintf(stderr, "%s", buf);
1153 /* Print the performance loss due to separate PME - PP rank imbalance */
1154 float lossFractionPme = 0;
1155 if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
1157 float pmeForceRatio = comm->load_pme/comm->load_mdf;
1158 lossFractionPme = (comm->load_pme - comm->load_mdf)/comm->load_step;
1159 if (lossFractionPme <= 0)
1161 lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
1165 lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
1167 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
1168 fprintf(fplog, "%s", buf);
1169 fprintf(stderr, "%s", buf);
1170 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
1171 fprintf(fplog, "%s", buf);
1172 fprintf(stderr, "%s", buf);
1174 fprintf(fplog, "\n");
1175 fprintf(stderr, "\n");
1177 if (lossFraction >= DD_PERF_LOSS_WARN)
1179 std::string message = gmx::formatString(
1180 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
1181 " in the domain decomposition.\n", lossFraction*100);
1183 bool hadSuggestion = false;
1186 message += " You might want to use dynamic load balancing (option -dlb.)\n";
1187 hadSuggestion = true;
1189 else if (dlbWasLimited)
1191 message += " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n";
1192 hadSuggestion = true;
1194 message += gmx::formatString(
1195 " You can %sconsider manually changing the decomposition (option -dd);\n"
1196 " e.g. by using fewer domains along the box dimension in which there is\n"
1197 " considerable inhomogeneity in the simulated system.",
1198 hadSuggestion ? "also " : "");
1201 fprintf(fplog, "%s\n", message.c_str());
1202 fprintf(stderr, "%s\n", message.c_str());
1204 if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
1207 "NOTE: %.1f %% performance was lost because the PME ranks\n"
1208 " had %s work to do than the PP ranks.\n"
1209 " You might want to %s the number of PME ranks\n"
1210 " or %s the cut-off and the grid spacing.\n",
1211 std::fabs(lossFractionPme*100),
1212 (lossFractionPme < 0) ? "less" : "more",
1213 (lossFractionPme < 0) ? "decrease" : "increase",
1214 (lossFractionPme < 0) ? "decrease" : "increase");
1215 fprintf(fplog, "%s\n", buf);
1216 fprintf(stderr, "%s\n", buf);
1220 //! Return the minimum communication volume.
1221 static float dd_vol_min(gmx_domdec_t *dd)
1223 return dd->comm->load[0].cvol_min*dd->nnodes;
1226 //! Return the DD load flags.
1227 static int dd_load_flags(gmx_domdec_t *dd)
1229 return dd->comm->load[0].flags;
1232 //! Return the reported load imbalance in force calculations.
1233 static float dd_f_imbal(gmx_domdec_t *dd)
1235 if (dd->comm->load[0].sum > 0)
1237 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
1241 /* Something is wrong in the cycle counting, report no load imbalance */
1246 //! Returns DD load balance report.
1248 dd_print_load(gmx_domdec_t *dd,
1251 gmx::StringOutputStream stream;
1252 gmx::TextWriter log(&stream);
1254 int flags = dd_load_flags(dd);
1257 log.writeString("DD load balancing is limited by minimum cell size in dimension");
1258 for (int d = 0; d < dd->ndim; d++)
1262 log.writeStringFormatted(" %c", dim2char(dd->dim[d]));
1265 log.ensureLineBreak();
1267 log.writeString("DD step " + gmx::toString(step));
1268 if (isDlbOn(dd->comm))
1270 log.writeStringFormatted(" vol min/aver %5.3f%c",
1271 dd_vol_min(dd), flags ? '!' : ' ');
1275 log.writeStringFormatted(" load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
1277 if (dd->comm->cycl_n[ddCyclPME])
1279 log.writeStringFormatted(" pme mesh/force %5.3f", dd_pme_f_ratio(dd));
1281 log.ensureLineBreak();
1282 return stream.toString();
1285 //! Prints DD load balance report in mdrun verbose mode.
1286 static void dd_print_load_verbose(gmx_domdec_t *dd)
1288 if (isDlbOn(dd->comm))
1290 fprintf(stderr, "vol %4.2f%c ",
1291 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
1295 fprintf(stderr, "imb F %2d%% ", gmx::roundToInt(dd_f_imbal(dd)*100));
1297 if (dd->comm->cycl_n[ddCyclPME])
1299 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
1303 //! Turns on dynamic load balancing if possible and needed.
1304 static void turn_on_dlb(const gmx::MDLogger &mdlog,
1308 gmx_domdec_comm_t *comm = dd->comm;
1310 real cellsize_min = comm->cellsize_min[dd->dim[0]];
1311 for (int d = 1; d < dd->ndim; d++)
1313 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
1316 /* Turn off DLB if we're too close to the cell size limit. */
1317 if (cellsize_min < comm->cellsize_limit*1.05)
1319 GMX_LOG(mdlog.info).appendTextFormatted(
1320 "step %s Measured %.1f %% performance loss due to load imbalance, "
1321 "but the minimum cell size is smaller than 1.05 times the cell size limit. "
1322 "Will no longer try dynamic load balancing.",
1323 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1325 comm->dlbState = DlbState::offForever;
1329 GMX_LOG(mdlog.info).appendTextFormatted(
1330 "step %s Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.",
1331 gmx::toString(step).c_str(), dd_force_imb_perf_loss(dd)*100);
1332 comm->dlbState = DlbState::onCanTurnOff;
1334 /* Store the non-DLB performance, so we can check if DLB actually
1335 * improves performance.
1337 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
1338 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
1342 /* We can set the required cell size info here,
1343 * so we do not need to communicate this.
1344 * The grid is completely uniform.
1346 for (int d = 0; d < dd->ndim; d++)
1348 RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
1352 comm->load[d].sum_m = comm->load[d].sum;
1354 int nc = dd->nc[dd->dim[d]];
1355 for (int i = 0; i < nc; i++)
1357 rowMaster->cellFrac[i] = i/static_cast<real>(nc);
1360 rowMaster->bounds[i].cellFracLowerMax = i /static_cast<real>(nc);
1361 rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
1364 rowMaster->cellFrac[nc] = 1.0;
1369 //! Turns off dynamic load balancing (but leave it able to turn back on).
1370 static void turn_off_dlb(const gmx::MDLogger &mdlog,
1374 GMX_LOG(mdlog.info).appendText(
1375 "step " + gmx::toString(step) + " Turning off dynamic load balancing, because it is degrading performance.");
1376 dd->comm->dlbState = DlbState::offCanTurnOn;
1377 dd->comm->haveTurnedOffDlb = true;
1378 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
1381 //! Turns off dynamic load balancing permanently.
1382 static void turn_off_dlb_forever(const gmx::MDLogger &mdlog,
1386 GMX_RELEASE_ASSERT(dd->comm->dlbState == DlbState::offCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
1387 GMX_LOG(mdlog.info).appendText(
1388 "step " + gmx::toString(step) + " Will no longer try dynamic load balancing, as it degraded performance.");
1389 dd->comm->dlbState = DlbState::offForever;
1392 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
1394 gmx_domdec_comm_t *comm;
1396 comm = cr->dd->comm;
1398 /* Turn on the DLB limiting (might have been on already) */
1399 comm->bPMELoadBalDLBLimits = TRUE;
1401 /* Change the cut-off limit */
1402 comm->PMELoadBal_max_cutoff = cutoff;
1406 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
1407 comm->PMELoadBal_max_cutoff);
1411 //! Merges charge-group buffers.
1412 static void merge_cg_buffers(int ncell,
1413 gmx_domdec_comm_dim_t *cd, int pulse,
1415 gmx::ArrayRef<int> index_gl,
1417 rvec *cg_cm, rvec *recv_vr,
1418 gmx::ArrayRef<int> cgindex,
1419 cginfo_mb_t *cginfo_mb, int *cginfo)
1421 gmx_domdec_ind_t *ind, *ind_p;
1422 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
1423 int shift, shift_at;
1425 ind = &cd->ind[pulse];
1427 /* First correct the already stored data */
1428 shift = ind->nrecv[ncell];
1429 for (cell = ncell-1; cell >= 0; cell--)
1431 shift -= ind->nrecv[cell];
1434 /* Move the cg's present from previous grid pulses */
1435 cg0 = ncg_cell[ncell+cell];
1436 cg1 = ncg_cell[ncell+cell+1];
1437 cgindex[cg1+shift] = cgindex[cg1];
1438 for (cg = cg1-1; cg >= cg0; cg--)
1440 index_gl[cg+shift] = index_gl[cg];
1441 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
1442 cgindex[cg+shift] = cgindex[cg];
1443 cginfo[cg+shift] = cginfo[cg];
1445 /* Correct the already stored send indices for the shift */
1446 for (p = 1; p <= pulse; p++)
1448 ind_p = &cd->ind[p];
1450 for (c = 0; c < cell; c++)
1452 cg0 += ind_p->nsend[c];
1454 cg1 = cg0 + ind_p->nsend[cell];
1455 for (cg = cg0; cg < cg1; cg++)
1457 ind_p->index[cg] += shift;
1463 /* Merge in the communicated buffers */
1467 for (cell = 0; cell < ncell; cell++)
1469 cg1 = ncg_cell[ncell+cell+1] + shift;
1472 /* Correct the old cg indices */
1473 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
1475 cgindex[cg+1] += shift_at;
1478 for (cg = 0; cg < ind->nrecv[cell]; cg++)
1480 /* Copy this charge group from the buffer */
1481 index_gl[cg1] = recv_i[cg0];
1482 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
1483 /* Add it to the cgindex */
1484 cg_gl = index_gl[cg1];
1485 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
1486 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
1487 cgindex[cg1+1] = cgindex[cg1] + nat;
1492 shift += ind->nrecv[cell];
1493 ncg_cell[ncell+cell+1] = cg1;
1497 //! Makes a range partitioning for the atom groups wthin a cell
1498 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
1501 const gmx::RangePartitioning &atomGroups)
1503 /* Store the atom block boundaries for easy copying of communication buffers
1505 int g = atomGroupStart;
1506 for (int zone = 0; zone < nzone; zone++)
1508 for (gmx_domdec_ind_t &ind : cd->ind)
1510 const auto range = atomGroups.subRange(g, g + ind.nrecv[zone]);
1511 ind.cell2at0[zone] = range.begin();
1512 ind.cell2at1[zone] = range.end();
1513 g += ind.nrecv[zone];
1518 //! Returns whether a link is missing.
1519 static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
1525 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
1527 if (!bLocalCG[link->a[i]])
1536 //! Domain corners for communication, a maximum of 4 i-zones see a j domain
1538 //! The corners for the non-bonded communication.
1540 //! Corner for rounding.
1542 //! Corners for rounding.
1544 //! Corners for bounded communication.
1546 //! Corner for rounding for bonded communication.
1550 //! Determine the corners of the domain(s) we are communicating with.
1552 set_dd_corners(const gmx_domdec_t *dd,
1553 int dim0, int dim1, int dim2,
1557 const gmx_domdec_comm_t *comm;
1558 const gmx_domdec_zones_t *zones;
1563 zones = &comm->zones;
1565 /* Keep the compiler happy */
1569 /* The first dimension is equal for all cells */
1570 c->c[0][0] = comm->cell_x0[dim0];
1573 c->bc[0] = c->c[0][0];
1578 /* This cell row is only seen from the first row */
1579 c->c[1][0] = comm->cell_x0[dim1];
1580 /* All rows can see this row */
1581 c->c[1][1] = comm->cell_x0[dim1];
1582 if (isDlbOn(dd->comm))
1584 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
1587 /* For the multi-body distance we need the maximum */
1588 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
1591 /* Set the upper-right corner for rounding */
1592 c->cr0 = comm->cell_x1[dim0];
1597 for (j = 0; j < 4; j++)
1599 c->c[2][j] = comm->cell_x0[dim2];
1601 if (isDlbOn(dd->comm))
1603 /* Use the maximum of the i-cells that see a j-cell */
1604 for (i = 0; i < zones->nizone; i++)
1606 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
1611 std::max(c->c[2][j-4],
1612 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
1618 /* For the multi-body distance we need the maximum */
1619 c->bc[2] = comm->cell_x0[dim2];
1620 for (i = 0; i < 2; i++)
1622 for (j = 0; j < 2; j++)
1624 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
1630 /* Set the upper-right corner for rounding */
1631 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
1632 * Only cell (0,0,0) can see cell 7 (1,1,1)
1634 c->cr1[0] = comm->cell_x1[dim1];
1635 c->cr1[3] = comm->cell_x1[dim1];
1636 if (isDlbOn(dd->comm))
1638 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
1641 /* For the multi-body distance we need the maximum */
1642 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
1649 /*! \brief Add the atom groups we need to send in this pulse from this
1650 * zone to \p localAtomGroups and \p work. */
1652 get_zone_pulse_cgs(gmx_domdec_t *dd,
1653 int zonei, int zone,
1655 gmx::ArrayRef<const int> globalAtomGroupIndices,
1656 const gmx::RangePartitioning &atomGroups,
1657 int dim, int dim_ind,
1658 int dim0, int dim1, int dim2,
1659 real r_comm2, real r_bcomm2,
1661 bool distanceIsTriclinic,
1663 real skew_fac2_d, real skew_fac_01,
1664 rvec *v_d, rvec *v_0, rvec *v_1,
1665 const dd_corners_t *c,
1666 const rvec sf2_round,
1667 gmx_bool bDistBonded,
1673 std::vector<int> *localAtomGroups,
1674 dd_comm_setup_work_t *work)
1676 gmx_domdec_comm_t *comm;
1678 gmx_bool bDistMB_pulse;
1680 real r2, rb2, r, tric_sh;
1687 bScrew = (dd->bScrewPBC && dim == XX);
1689 bDistMB_pulse = (bDistMB && bDistBonded);
1691 /* Unpack the work data */
1692 std::vector<int> &ibuf = work->atomGroupBuffer;
1693 std::vector<gmx::RVec> &vbuf = work->positionBuffer;
1697 for (cg = cg0; cg < cg1; cg++)
1701 if (!distanceIsTriclinic)
1703 /* Rectangular direction, easy */
1704 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
1711 r = cg_cm[cg][dim] - c->bc[dim_ind];
1717 /* Rounding gives at most a 16% reduction
1718 * in communicated atoms
1720 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1722 r = cg_cm[cg][dim0] - c->cr0;
1723 /* This is the first dimension, so always r >= 0 */
1730 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1732 r = cg_cm[cg][dim1] - c->cr1[zone];
1739 r = cg_cm[cg][dim1] - c->bcr1;
1749 /* Triclinic direction, more complicated */
1752 /* Rounding, conservative as the skew_fac multiplication
1753 * will slightly underestimate the distance.
1755 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
1757 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
1758 for (i = dim0+1; i < DIM; i++)
1760 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
1762 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
1765 rb[dim0] = rn[dim0];
1768 /* Take care that the cell planes along dim0 might not
1769 * be orthogonal to those along dim1 and dim2.
1771 for (i = 1; i <= dim_ind; i++)
1774 if (normal[dim0][dimd] > 0)
1776 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
1779 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
1784 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
1786 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
1788 for (i = dim1+1; i < DIM; i++)
1790 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
1792 rn[dim1] += tric_sh;
1795 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
1796 /* Take care of coupling of the distances
1797 * to the planes along dim0 and dim1 through dim2.
1799 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
1800 /* Take care that the cell planes along dim1
1801 * might not be orthogonal to that along dim2.
1803 if (normal[dim1][dim2] > 0)
1805 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
1811 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
1814 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
1815 /* Take care of coupling of the distances
1816 * to the planes along dim0 and dim1 through dim2.
1818 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
1819 /* Take care that the cell planes along dim1
1820 * might not be orthogonal to that along dim2.
1822 if (normal[dim1][dim2] > 0)
1824 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
1829 /* The distance along the communication direction */
1830 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
1832 for (i = dim+1; i < DIM; i++)
1834 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
1839 r2 += rn[dim]*rn[dim]*skew_fac2_d;
1840 /* Take care of coupling of the distances
1841 * to the planes along dim0 and dim1 through dim2.
1843 if (dim_ind == 1 && zonei == 1)
1845 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
1851 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
1854 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
1855 /* Take care of coupling of the distances
1856 * to the planes along dim0 and dim1 through dim2.
1858 if (dim_ind == 1 && zonei == 1)
1860 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
1868 ((bDistMB && rb2 < r_bcomm2) ||
1869 (bDist2B && r2 < r_bcomm2)) &&
1871 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
1872 missing_link(comm->cglink, globalAtomGroupIndices[cg],
1875 /* Store the local and global atom group indices and position */
1876 localAtomGroups->push_back(cg);
1877 ibuf.push_back(globalAtomGroupIndices[cg]);
1881 if (dd->ci[dim] == 0)
1883 /* Correct cg_cm for pbc */
1884 rvec_add(cg_cm[cg], box[dim], posPbc);
1887 posPbc[YY] = box[YY][YY] - posPbc[YY];
1888 posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
1893 copy_rvec(cg_cm[cg], posPbc);
1895 vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
1897 nat += atomGroups.block(cg).size();
1902 work->nsend_zone = nsend_z;
1906 static void clearCommSetupData(dd_comm_setup_work_t *work)
1908 work->localAtomGroupBuffer.clear();
1909 work->atomGroupBuffer.clear();
1910 work->positionBuffer.clear();
1912 work->nsend_zone = 0;
1915 //! Prepare DD communication.
1916 static void setup_dd_communication(gmx_domdec_t *dd,
1917 matrix box, gmx_ddbox_t *ddbox,
1920 PaddedVector<gmx::RVec> *f)
1922 int dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
1923 int nzone, nzone_send, zone, zonei, cg0, cg1;
1924 int c, i, cg, cg_gl, nrcg;
1925 int *zone_cg_range, pos_cg;
1926 gmx_domdec_comm_t *comm;
1927 gmx_domdec_zones_t *zones;
1928 gmx_domdec_comm_dim_t *cd;
1929 cginfo_mb_t *cginfo_mb;
1930 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
1931 dd_corners_t corners;
1932 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
1933 real skew_fac2_d, skew_fac_01;
1938 fprintf(debug, "Setting up DD communication\n");
1943 if (comm->dth.empty())
1945 /* Initialize the thread data.
1946 * This can not be done in init_domain_decomposition,
1947 * as the numbers of threads is determined later.
1949 int numThreads = gmx_omp_nthreads_get(emntDomdec);
1950 comm->dth.resize(numThreads);
1953 switch (fr->cutoff_scheme)
1959 cg_cm = state->x.rvec_array();
1962 gmx_incons("unimplemented");
1965 bBondComm = comm->bBondComm;
1967 /* Do we need to determine extra distances for multi-body bondeds? */
1968 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
1970 /* Do we need to determine extra distances for only two-body bondeds? */
1971 bDist2B = (bBondComm && !bDistMB);
1973 const real r_comm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff));
1974 const real r_bcomm2 = gmx::square(domainToDomainIntoAtomToDomainCutoff(*comm, comm->cutoff_mbody));
1978 fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
1981 zones = &comm->zones;
1984 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
1985 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
1987 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
1989 /* Triclinic stuff */
1990 normal = ddbox->normal;
1994 v_0 = ddbox->v[dim0];
1995 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
1997 /* Determine the coupling coefficient for the distances
1998 * to the cell planes along dim0 and dim1 through dim2.
1999 * This is required for correct rounding.
2002 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
2005 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
2011 v_1 = ddbox->v[dim1];
2014 zone_cg_range = zones->cg_range;
2015 cginfo_mb = fr->cginfo_mb;
2017 zone_cg_range[0] = 0;
2018 zone_cg_range[1] = dd->ncg_home;
2019 comm->zone_ncg1[0] = dd->ncg_home;
2020 pos_cg = dd->ncg_home;
2022 nat_tot = comm->atomRanges.numHomeAtoms();
2024 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
2026 dim = dd->dim[dim_ind];
2027 cd = &comm->cd[dim_ind];
2029 /* Check if we need to compute triclinic distances along this dim */
2030 bool distanceIsTriclinic = false;
2031 for (i = 0; i <= dim_ind; i++)
2033 if (ddbox->tric_dir[dd->dim[i]])
2035 distanceIsTriclinic = true;
2039 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
2041 /* No pbc in this dimension, the first node should not comm. */
2049 v_d = ddbox->v[dim];
2050 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
2052 cd->receiveInPlace = true;
2053 for (int p = 0; p < cd->numPulses(); p++)
2055 /* Only atoms communicated in the first pulse are used
2056 * for multi-body bonded interactions or for bBondComm.
2058 bDistBonded = ((bDistMB || bDist2B) && p == 0);
2060 gmx_domdec_ind_t *ind = &cd->ind[p];
2062 /* Thread 0 writes in the global index array */
2064 clearCommSetupData(&comm->dth[0]);
2066 for (zone = 0; zone < nzone_send; zone++)
2068 if (dim_ind > 0 && distanceIsTriclinic)
2070 /* Determine slightly more optimized skew_fac's
2072 * This reduces the number of communicated atoms
2073 * by about 10% for 3D DD of rhombic dodecahedra.
2075 for (dimd = 0; dimd < dim; dimd++)
2077 sf2_round[dimd] = 1;
2078 if (ddbox->tric_dir[dimd])
2080 for (i = dd->dim[dimd]+1; i < DIM; i++)
2082 /* If we are shifted in dimension i
2083 * and the cell plane is tilted forward
2084 * in dimension i, skip this coupling.
2086 if (!(zones->shift[nzone+zone][i] &&
2087 ddbox->v[dimd][i][dimd] >= 0))
2090 gmx::square(ddbox->v[dimd][i][dimd]);
2093 sf2_round[dimd] = 1/sf2_round[dimd];
2098 zonei = zone_perm[dim_ind][zone];
2101 /* Here we permutate the zones to obtain a convenient order
2102 * for neighbor searching
2104 cg0 = zone_cg_range[zonei];
2105 cg1 = zone_cg_range[zonei+1];
2109 /* Look only at the cg's received in the previous grid pulse
2111 cg1 = zone_cg_range[nzone+zone+1];
2112 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
2115 const int numThreads = static_cast<int>(comm->dth.size());
2116 #pragma omp parallel for num_threads(numThreads) schedule(static)
2117 for (int th = 0; th < numThreads; th++)
2121 dd_comm_setup_work_t &work = comm->dth[th];
2123 /* Retain data accumulated into buffers of thread 0 */
2126 clearCommSetupData(&work);
2129 int cg0_th = cg0 + ((cg1 - cg0)* th )/numThreads;
2130 int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
2132 /* Get the cg's for this pulse in this zone */
2133 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
2134 dd->globalAtomGroupIndices,
2136 dim, dim_ind, dim0, dim1, dim2,
2138 box, distanceIsTriclinic,
2139 normal, skew_fac2_d, skew_fac_01,
2140 v_d, v_0, v_1, &corners, sf2_round,
2141 bDistBonded, bBondComm,
2144 th == 0 ? &ind->index : &work.localAtomGroupBuffer,
2147 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2150 std::vector<int> &atomGroups = comm->dth[0].atomGroupBuffer;
2151 std::vector<gmx::RVec> &positions = comm->dth[0].positionBuffer;
2152 ind->nsend[zone] = comm->dth[0].nsend_zone;
2153 /* Append data of threads>=1 to the communication buffers */
2154 for (int th = 1; th < numThreads; th++)
2156 const dd_comm_setup_work_t &dth = comm->dth[th];
2158 ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
2159 atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
2160 positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
2161 comm->dth[0].nat += dth.nat;
2162 ind->nsend[zone] += dth.nsend_zone;
2165 /* Clear the counts in case we do not have pbc */
2166 for (zone = nzone_send; zone < nzone; zone++)
2168 ind->nsend[zone] = 0;
2170 ind->nsend[nzone] = ind->index.size();
2171 ind->nsend[nzone + 1] = comm->dth[0].nat;
2172 /* Communicate the number of cg's and atoms to receive */
2173 ddSendrecv(dd, dim_ind, dddirBackward,
2174 ind->nsend, nzone+2,
2175 ind->nrecv, nzone+2);
2179 /* We can receive in place if only the last zone is not empty */
2180 for (zone = 0; zone < nzone-1; zone++)
2182 if (ind->nrecv[zone] > 0)
2184 cd->receiveInPlace = false;
2189 int receiveBufferSize = 0;
2190 if (!cd->receiveInPlace)
2192 receiveBufferSize = ind->nrecv[nzone];
2194 /* These buffer are actually only needed with in-place */
2195 DDBufferAccess<int> globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
2196 DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
2198 dd_comm_setup_work_t &work = comm->dth[0];
2200 /* Make space for the global cg indices */
2201 int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
2202 dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
2203 /* Communicate the global cg indices */
2204 gmx::ArrayRef<int> integerBufferRef;
2205 if (cd->receiveInPlace)
2207 integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
2211 integerBufferRef = globalAtomGroupBuffer.buffer;
2213 ddSendrecv<int>(dd, dim_ind, dddirBackward,
2214 work.atomGroupBuffer, integerBufferRef);
2216 /* Make space for cg_cm */
2217 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
2218 if (fr->cutoff_scheme == ecutsGROUP)
2224 cg_cm = state->x.rvec_array();
2226 /* Communicate cg_cm */
2227 gmx::ArrayRef<gmx::RVec> rvecBufferRef;
2228 if (cd->receiveInPlace)
2230 rvecBufferRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cg_cm + pos_cg), ind->nrecv[nzone]);
2234 rvecBufferRef = rvecBuffer.buffer;
2236 ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
2237 work.positionBuffer, rvecBufferRef);
2239 /* Make the charge group index */
2240 if (cd->receiveInPlace)
2242 zone = (p == 0 ? 0 : nzone - 1);
2243 while (zone < nzone)
2245 for (cg = 0; cg < ind->nrecv[zone]; cg++)
2247 cg_gl = dd->globalAtomGroupIndices[pos_cg];
2248 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
2249 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
2250 dd->atomGrouping_.appendBlock(nrcg);
2253 /* Update the charge group presence,
2254 * so we can use it in the next pass of the loop.
2256 comm->bLocalCG[cg_gl] = TRUE;
2262 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
2265 zone_cg_range[nzone+zone] = pos_cg;
2270 /* This part of the code is never executed with bBondComm. */
2271 std::vector<int> &atomGroupsIndex = dd->atomGrouping_.rawIndex();
2272 atomGroupsIndex.resize(numAtomGroupsNew + 1);
2274 merge_cg_buffers(nzone, cd, p, zone_cg_range,
2275 dd->globalAtomGroupIndices, integerBufferRef.data(),
2276 cg_cm, as_rvec_array(rvecBufferRef.data()),
2278 fr->cginfo_mb, fr->cginfo);
2279 pos_cg += ind->nrecv[nzone];
2281 nat_tot += ind->nrecv[nzone+1];
2283 if (!cd->receiveInPlace)
2285 /* Store the atom block for easy copying of communication buffers */
2286 make_cell2at_index(cd, nzone, zone_cg_range[nzone], dd->atomGrouping());
2291 comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
2295 /* We don't need to update cginfo, since that was alrady done above.
2296 * So we pass NULL for the forcerec.
2298 dd_set_cginfo(dd->globalAtomGroupIndices,
2299 dd->ncg_home, dd->globalAtomGroupIndices.size(),
2300 nullptr, comm->bLocalCG);
2305 fprintf(debug, "Finished setting up DD communication, zones:");
2306 for (c = 0; c < zones->n; c++)
2308 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
2310 fprintf(debug, "\n");
2314 //! Set boundaries for the charge group range.
2315 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
2319 for (c = 0; c < zones->nizone; c++)
2321 zones->izone[c].cg1 = zones->cg_range[c+1];
2322 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
2323 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
2327 /*! \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
2329 * Also sets the atom density for the home zone when \p zone_start=0.
2330 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
2331 * how many charge groups will move but are still part of the current range.
2332 * \todo When converting domdec to use proper classes, all these variables
2333 * should be private and a method should return the correct count
2334 * depending on an internal state.
2336 * \param[in,out] dd The domain decomposition struct
2337 * \param[in] box The box
2338 * \param[in] ddbox The domain decomposition box struct
2339 * \param[in] zone_start The start of the zone range to set sizes for
2340 * \param[in] zone_end The end of the zone range to set sizes for
2341 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
2343 static void set_zones_size(gmx_domdec_t *dd,
2344 matrix box, const gmx_ddbox_t *ddbox,
2345 int zone_start, int zone_end,
2346 int numMovedChargeGroupsInHomeZone)
2348 gmx_domdec_comm_t *comm;
2349 gmx_domdec_zones_t *zones;
2358 zones = &comm->zones;
2360 /* Do we need to determine extra distances for multi-body bondeds? */
2361 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
2363 for (z = zone_start; z < zone_end; z++)
2365 /* Copy cell limits to zone limits.
2366 * Valid for non-DD dims and non-shifted dims.
2368 copy_rvec(comm->cell_x0, zones->size[z].x0);
2369 copy_rvec(comm->cell_x1, zones->size[z].x1);
2372 for (d = 0; d < dd->ndim; d++)
2376 for (z = 0; z < zones->n; z++)
2378 /* With a staggered grid we have different sizes
2379 * for non-shifted dimensions.
2381 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
2385 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
2386 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
2390 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
2391 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
2397 rcmbs = comm->cutoff_mbody;
2398 if (ddbox->tric_dir[dim])
2400 rcs /= ddbox->skew_fac[dim];
2401 rcmbs /= ddbox->skew_fac[dim];
2404 /* Set the lower limit for the shifted zone dimensions */
2405 for (z = zone_start; z < zone_end; z++)
2407 if (zones->shift[z][dim] > 0)
2410 if (!isDlbOn(dd->comm) || d == 0)
2412 zones->size[z].x0[dim] = comm->cell_x1[dim];
2413 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
2417 /* Here we take the lower limit of the zone from
2418 * the lowest domain of the zone below.
2422 zones->size[z].x0[dim] =
2423 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
2429 zones->size[z].x0[dim] =
2430 zones->size[zone_perm[2][z-4]].x0[dim];
2434 zones->size[z].x0[dim] =
2435 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
2438 /* A temporary limit, is updated below */
2439 zones->size[z].x1[dim] = zones->size[z].x0[dim];
2443 for (zi = 0; zi < zones->nizone; zi++)
2445 if (zones->shift[zi][dim] == 0)
2447 /* This takes the whole zone into account.
2448 * With multiple pulses this will lead
2449 * to a larger zone then strictly necessary.
2451 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2452 zones->size[zi].x1[dim]+rcmbs);
2460 /* Loop over the i-zones to set the upper limit of each
2463 for (zi = 0; zi < zones->nizone; zi++)
2465 if (zones->shift[zi][dim] == 0)
2467 /* We should only use zones up to zone_end */
2468 int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
2469 for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
2471 if (zones->shift[z][dim] > 0)
2473 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
2474 zones->size[zi].x1[dim]+rcs);
2481 for (z = zone_start; z < zone_end; z++)
2483 /* Initialization only required to keep the compiler happy */
2484 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
2487 /* To determine the bounding box for a zone we need to find
2488 * the extreme corners of 4, 2 or 1 corners.
2490 nc = 1 << (ddbox->nboundeddim - 1);
2492 for (c = 0; c < nc; c++)
2494 /* Set up a zone corner at x=0, ignoring trilinic couplings */
2498 corner[YY] = zones->size[z].x0[YY];
2502 corner[YY] = zones->size[z].x1[YY];
2506 corner[ZZ] = zones->size[z].x0[ZZ];
2510 corner[ZZ] = zones->size[z].x1[ZZ];
2512 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
2513 box[ZZ][1 - dd->dim[0]] != 0)
2515 /* With 1D domain decomposition the cg's are not in
2516 * the triclinic box, but triclinic x-y and rectangular y/x-z.
2517 * Shift the corner of the z-vector back to along the box
2518 * vector of dimension d, so it will later end up at 0 along d.
2519 * This can affect the location of this corner along dd->dim[0]
2520 * through the matrix operation below if box[d][dd->dim[0]]!=0.
2522 int d = 1 - dd->dim[0];
2524 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
2526 /* Apply the triclinic couplings */
2527 assert(ddbox->npbcdim <= DIM);
2528 for (i = YY; i < ddbox->npbcdim; i++)
2530 for (j = XX; j < i; j++)
2532 corner[j] += corner[i]*box[i][j]/box[i][i];
2537 copy_rvec(corner, corner_min);
2538 copy_rvec(corner, corner_max);
2542 for (i = 0; i < DIM; i++)
2544 corner_min[i] = std::min(corner_min[i], corner[i]);
2545 corner_max[i] = std::max(corner_max[i], corner[i]);
2549 /* Copy the extreme cornes without offset along x */
2550 for (i = 0; i < DIM; i++)
2552 zones->size[z].bb_x0[i] = corner_min[i];
2553 zones->size[z].bb_x1[i] = corner_max[i];
2555 /* Add the offset along x */
2556 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
2557 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
2560 if (zone_start == 0)
2563 for (dim = 0; dim < DIM; dim++)
2565 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
2567 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
2572 for (z = zone_start; z < zone_end; z++)
2574 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2576 zones->size[z].x0[XX], zones->size[z].x1[XX],
2577 zones->size[z].x0[YY], zones->size[z].x1[YY],
2578 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
2579 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
2581 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
2582 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
2583 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
2588 //! Comparator for sorting charge groups.
2589 static bool comp_cgsort(const gmx_cgsort_t &a, const gmx_cgsort_t &b)
2594 return a.ind_gl < b.ind_gl;
2596 return a.nsc < b.nsc;
2599 /*! \brief Order data in \p dataToSort according to \p sort
2601 * Note: both buffers should have at least \p sort.size() elements.
2603 template <typename T>
2605 orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2606 gmx::ArrayRef<T> dataToSort,
2607 gmx::ArrayRef<T> sortBuffer)
2609 GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
2610 GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
2612 /* Order the data into the temporary buffer */
2614 for (const gmx_cgsort_t &entry : sort)
2616 sortBuffer[i++] = dataToSort[entry.ind];
2619 /* Copy back to the original array */
2620 std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
2621 dataToSort.begin());
2624 /*! \brief Order data in \p dataToSort according to \p sort
2626 * Note: \p vectorToSort should have at least \p sort.size() elements,
2627 * \p workVector is resized when it is too small.
2629 template <typename T>
2631 orderVector(gmx::ArrayRef<const gmx_cgsort_t> sort,
2632 gmx::ArrayRef<T> vectorToSort,
2633 std::vector<T> *workVector)
2635 if (gmx::index(workVector->size()) < sort.size())
2637 workVector->resize(sort.size());
2639 orderVector<T>(sort, vectorToSort, *workVector);
2642 //! Order vectors of atoms.
2643 static void order_vec_atom(const gmx::RangePartitioning *atomGroups,
2644 gmx::ArrayRef<const gmx_cgsort_t> sort,
2645 gmx::ArrayRef<gmx::RVec> v,
2646 gmx::ArrayRef<gmx::RVec> buf)
2648 if (atomGroups == nullptr)
2650 /* Avoid the useless loop of the atoms within a cg */
2651 orderVector(sort, v, buf);
2656 /* Order the data */
2658 for (const gmx_cgsort_t &entry : sort)
2660 for (int i : atomGroups->block(entry.ind))
2662 copy_rvec(v[i], buf[a]);
2668 /* Copy back to the original array */
2669 for (int a = 0; a < atot; a++)
2671 copy_rvec(buf[a], v[a]);
2675 //! Returns whether a < b */
2676 static bool compareCgsort(const gmx_cgsort_t &a,
2677 const gmx_cgsort_t &b)
2679 return (a.nsc < b.nsc ||
2680 (a.nsc == b.nsc && a.ind_gl < b.ind_gl));
2683 //! Do sorting of charge groups.
2684 static void orderedSort(gmx::ArrayRef<const gmx_cgsort_t> stationary,
2685 gmx::ArrayRef<gmx_cgsort_t> moved,
2686 std::vector<gmx_cgsort_t> *sort1)
2688 /* The new indices are not very ordered, so we qsort them */
2689 std::sort(moved.begin(), moved.end(), comp_cgsort);
2691 /* stationary is already ordered, so now we can merge the two arrays */
2692 sort1->resize(stationary.size() + moved.size());
2693 std::merge(stationary.begin(), stationary.end(),
2694 moved.begin(), moved.end(),
2699 /*! \brief Set the sorting order for systems with charge groups, returned in sort->sort.
2701 * The order is according to the global charge group index.
2702 * This adds and removes charge groups that moved between domains.
2704 static void dd_sort_order(const gmx_domdec_t *dd,
2705 const t_forcerec *fr,
2707 gmx_domdec_sort_t *sort)
2709 const int *a = fr->ns->grid->cell_index;
2711 const int movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
2713 if (ncg_home_old >= 0)
2715 std::vector<gmx_cgsort_t> &stationary = sort->stationary;
2716 std::vector<gmx_cgsort_t> &moved = sort->moved;
2718 /* The charge groups that remained in the same ns grid cell
2719 * are completely ordered. So we can sort efficiently by sorting
2720 * the charge groups that did move into the stationary list.
2721 * Note: push_back() seems to be slightly slower than direct access.
2725 for (int i = 0; i < dd->ncg_home; i++)
2727 /* Check if this cg did not move to another node */
2728 if (a[i] < movedValue)
2732 entry.ind_gl = dd->globalAtomGroupIndices[i];
2735 if (i >= ncg_home_old || a[i] != sort->sorted[i].nsc)
2737 /* This cg is new on this node or moved ns grid cell */
2738 moved.push_back(entry);
2742 /* This cg did not move */
2743 stationary.push_back(entry);
2750 fprintf(debug, "ordered sort cgs: stationary %zu moved %zu\n",
2751 stationary.size(), moved.size());
2753 /* Sort efficiently */
2754 orderedSort(stationary, moved, &sort->sorted);
2758 std::vector<gmx_cgsort_t> &cgsort = sort->sorted;
2760 cgsort.reserve(dd->ncg_home);
2762 for (int i = 0; i < dd->ncg_home; i++)
2764 /* Sort on the ns grid cell indices
2765 * and the global topology index
2769 entry.ind_gl = dd->globalAtomGroupIndices[i];
2771 cgsort.push_back(entry);
2772 if (cgsort[i].nsc < movedValue)
2779 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, numCGNew);
2781 /* Determine the order of the charge groups using qsort */
2782 std::sort(cgsort.begin(), cgsort.end(), comp_cgsort);
2784 /* Remove the charge groups which are no longer at home here */
2785 cgsort.resize(numCGNew);
2789 //! Returns the sorting order for atoms based on the nbnxn grid order in sort
2790 static void dd_sort_order_nbnxn(const t_forcerec *fr,
2791 std::vector<gmx_cgsort_t> *sort)
2793 gmx::ArrayRef<const int> atomOrder = nbnxn_get_atomorder(fr->nbv->nbs.get());
2795 /* Using push_back() instead of this resize results in much slower code */
2796 sort->resize(atomOrder.size());
2797 gmx::ArrayRef<gmx_cgsort_t> buffer = *sort;
2798 size_t numSorted = 0;
2799 for (int i : atomOrder)
2803 /* The values of nsc and ind_gl are not used in this case */
2804 buffer[numSorted++].ind = i;
2807 sort->resize(numSorted);
2810 //! Returns the sorting state for DD.
2811 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
2814 gmx_domdec_sort_t *sort = dd->comm->sort.get();
2816 switch (fr->cutoff_scheme)
2819 dd_sort_order(dd, fr, ncg_home_old, sort);
2822 dd_sort_order_nbnxn(fr, &sort->sorted);
2825 gmx_incons("unimplemented");
2828 const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
2830 /* We alloc with the old size, since cgindex is still old */
2831 GMX_ASSERT(atomGrouping.numBlocks() == dd->ncg_home, "atomGroups and dd should be consistent");
2832 DDBufferAccess<gmx::RVec> rvecBuffer(dd->comm->rvecBuffer, atomGrouping.fullRange().end());
2834 const gmx::RangePartitioning *atomGroupsPtr = (dd->comm->bCGs ? &atomGrouping : nullptr);
2836 /* Set the new home atom/charge group count */
2837 dd->ncg_home = sort->sorted.size();
2840 fprintf(debug, "Set the new home %s count to %d\n",
2841 dd->comm->bCGs ? "charge group" : "atom",
2845 /* Reorder the state */
2846 gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
2847 GMX_RELEASE_ASSERT(cgsort.size() == dd->ncg_home, "We should sort all the home atom groups");
2849 if (state->flags & (1 << estX))
2851 order_vec_atom(atomGroupsPtr, cgsort, state->x, rvecBuffer.buffer);
2853 if (state->flags & (1 << estV))
2855 order_vec_atom(atomGroupsPtr, cgsort, state->v, rvecBuffer.buffer);
2857 if (state->flags & (1 << estCGP))
2859 order_vec_atom(atomGroupsPtr, cgsort, state->cg_p, rvecBuffer.buffer);
2862 if (fr->cutoff_scheme == ecutsGROUP)
2865 gmx::ArrayRef<gmx::RVec> cgcmRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cgcm[0]), cgsort.size());
2866 orderVector(cgsort, cgcmRef, rvecBuffer.buffer);
2869 /* Reorder the global cg index */
2870 orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
2871 /* Reorder the cginfo */
2872 orderVector<int>(cgsort, gmx::arrayRefFromArray(fr->cginfo, cgsort.size()), &sort->intBuffer);
2873 /* Rebuild the local cg index */
2876 /* We make a new, ordered atomGroups object and assign it to
2877 * the old one. This causes some allocation overhead, but saves
2878 * a copy back of the whole index.
2880 gmx::RangePartitioning ordered;
2881 for (const gmx_cgsort_t &entry : cgsort)
2883 ordered.appendBlock(atomGrouping.block(entry.ind).size());
2885 dd->atomGrouping_ = ordered;
2889 dd->atomGrouping_.setAllBlocksSizeOne(dd->ncg_home);
2891 /* Set the home atom number */
2892 dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->atomGrouping().fullRange().end());
2894 if (fr->cutoff_scheme == ecutsVERLET)
2896 /* The atoms are now exactly in grid order, update the grid order */
2897 nbnxn_set_atomorder(fr->nbv->nbs.get());
2901 /* Copy the sorted ns cell indices back to the ns grid struct */
2902 for (gmx::index i = 0; i < cgsort.size(); i++)
2904 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
2906 fr->ns->grid->nr = cgsort.size();
2910 //! Accumulates load statistics.
2911 static void add_dd_statistics(gmx_domdec_t *dd)
2913 gmx_domdec_comm_t *comm = dd->comm;
2915 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2917 auto range = static_cast<DDAtomRanges::Type>(i);
2919 comm->atomRanges.end(range) - comm->atomRanges.start(range);
2924 void reset_dd_statistics_counters(gmx_domdec_t *dd)
2926 gmx_domdec_comm_t *comm = dd->comm;
2928 /* Reset all the statistics and counters for total run counting */
2929 for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
2931 comm->sum_nat[i] = 0;
2935 comm->load_step = 0;
2938 clear_ivec(comm->load_lim);
2943 void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
2945 gmx_domdec_comm_t *comm = cr->dd->comm;
2947 const int numRanges = static_cast<int>(DDAtomRanges::Type::Number);
2948 gmx_sumd(numRanges, comm->sum_nat, cr);
2950 if (fplog == nullptr)
2955 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
2957 for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
2959 auto range = static_cast<DDAtomRanges::Type>(i);
2960 double av = comm->sum_nat[i]/comm->ndecomp;
2963 case DDAtomRanges::Type::Zones:
2965 " av. #atoms communicated per step for force: %d x %.1f\n",
2968 case DDAtomRanges::Type::Vsites:
2969 if (cr->dd->vsite_comm)
2972 " av. #atoms communicated per step for vsites: %d x %.1f\n",
2973 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
2977 case DDAtomRanges::Type::Constraints:
2978 if (cr->dd->constraint_comm)
2981 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
2982 1 + ir->nLincsIter, av);
2986 gmx_incons(" Unknown type for DD statistics");
2989 fprintf(fplog, "\n");
2991 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
2993 print_dd_load_av(fplog, cr->dd);
2997 // TODO Remove fplog when group scheme and charge groups are gone
2998 void dd_partition_system(FILE *fplog,
2999 const gmx::MDLogger &mdlog,
3001 const t_commrec *cr,
3002 gmx_bool bMasterState,
3004 t_state *state_global,
3005 const gmx_mtop_t &top_global,
3006 const t_inputrec *ir,
3007 t_state *state_local,
3008 PaddedVector<gmx::RVec> *f,
3009 gmx::MDAtoms *mdAtoms,
3010 gmx_localtop_t *top_local,
3013 gmx::Constraints *constr,
3015 gmx_wallcycle *wcycle,
3019 gmx_domdec_comm_t *comm;
3020 gmx_ddbox_t ddbox = {0};
3022 int64_t step_pcoupl;
3023 rvec cell_ns_x0, cell_ns_x1;
3024 int ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
3025 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
3026 gmx_bool bRedist, bResortAll;
3027 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
3031 wallcycle_start(wcycle, ewcDOMDEC);
3036 // TODO if the update code becomes accessible here, use
3037 // upd->deform for this logic.
3038 bBoxChanged = (bMasterState || inputrecDeform(ir));
3039 if (ir->epc != epcNO)
3041 /* With nstpcouple > 1 pressure coupling happens.
3042 * one step after calculating the pressure.
3043 * Box scaling happens at the end of the MD step,
3044 * after the DD partitioning.
3045 * We therefore have to do DLB in the first partitioning
3046 * after an MD step where P-coupling occurred.
3047 * We need to determine the last step in which p-coupling occurred.
3048 * MRS -- need to validate this for vv?
3050 int n = ir->nstpcouple;
3053 step_pcoupl = step - 1;
3057 step_pcoupl = ((step - 1)/n)*n + 1;
3059 if (step_pcoupl >= comm->partition_step)
3065 bNStGlobalComm = (step % nstglobalcomm == 0);
3073 /* Should we do dynamic load balacing this step?
3074 * Since it requires (possibly expensive) global communication,
3075 * we might want to do DLB less frequently.
3077 if (bBoxChanged || ir->epc != epcNO)
3079 bDoDLB = bBoxChanged;
3083 bDoDLB = bNStGlobalComm;
3087 /* Check if we have recorded loads on the nodes */
3088 if (comm->bRecordLoad && dd_load_count(comm) > 0)
3090 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
3092 /* Print load every nstlog, first and last step to the log file */
3093 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
3094 comm->n_load_collect == 0 ||
3096 (step + ir->nstlist > ir->init_step + ir->nsteps)));
3098 /* Avoid extra communication due to verbose screen output
3099 * when nstglobalcomm is set.
3101 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
3102 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
3104 get_load_distribution(dd, wcycle);
3109 GMX_LOG(mdlog.info).asParagraph().appendText(dd_print_load(dd, step-1));
3113 dd_print_load_verbose(dd);
3116 comm->n_load_collect++;
3122 /* Add the measured cycles to the running average */
3123 const float averageFactor = 0.1f;
3124 comm->cyclesPerStepDlbExpAverage =
3125 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
3126 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
3128 if (comm->dlbState == DlbState::onCanTurnOff &&
3129 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
3131 gmx_bool turnOffDlb;
3134 /* If the running averaged cycles with DLB are more
3135 * than before we turned on DLB, turn off DLB.
3136 * We will again run and check the cycles without DLB
3137 * and we can then decide if to turn off DLB forever.
3139 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
3140 comm->cyclesPerStepBeforeDLB);
3142 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
3145 /* To turn off DLB, we need to redistribute the atoms */
3146 dd_collect_state(dd, state_local, state_global);
3147 bMasterState = TRUE;
3148 turn_off_dlb(mdlog, dd, step);
3152 else if (bCheckWhetherToTurnDlbOn)
3154 gmx_bool turnOffDlbForever = FALSE;
3155 gmx_bool turnOnDlb = FALSE;
3157 /* Since the timings are node dependent, the master decides */
3160 /* If we recently turned off DLB, we want to check if
3161 * performance is better without DLB. We want to do this
3162 * ASAP to minimize the chance that external factors
3163 * slowed down the DLB step are gone here and we
3164 * incorrectly conclude that DLB was causing the slowdown.
3165 * So we measure one nstlist block, no running average.
3167 if (comm->haveTurnedOffDlb &&
3168 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
3169 comm->cyclesPerStepDlbExpAverage)
3171 /* After turning off DLB we ran nstlist steps in fewer
3172 * cycles than with DLB. This likely means that DLB
3173 * in not benefical, but this could be due to a one
3174 * time unlucky fluctuation, so we require two such
3175 * observations in close succession to turn off DLB
3178 if (comm->dlbSlowerPartitioningCount > 0 &&
3179 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
3181 turnOffDlbForever = TRUE;
3183 comm->haveTurnedOffDlb = false;
3184 /* Register when we last measured DLB slowdown */
3185 comm->dlbSlowerPartitioningCount = dd->ddp_count;
3189 /* Here we check if the max PME rank load is more than 0.98
3190 * the max PP force load. If so, PP DLB will not help,
3191 * since we are (almost) limited by PME. Furthermore,
3192 * DLB will cause a significant extra x/f redistribution
3193 * cost on the PME ranks, which will then surely result
3194 * in lower total performance.
3196 if (cr->npmenodes > 0 &&
3197 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
3203 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
3209 gmx_bool turnOffDlbForever;
3213 turnOffDlbForever, turnOnDlb
3215 dd_bcast(dd, sizeof(bools), &bools);
3216 if (bools.turnOffDlbForever)
3218 turn_off_dlb_forever(mdlog, dd, step);
3220 else if (bools.turnOnDlb)
3222 turn_on_dlb(mdlog, dd, step);
3227 comm->n_load_have++;
3230 cgs_gl = &comm->cgs_gl;
3235 /* Clear the old state */
3236 clearDDStateIndices(dd, 0, 0);
3239 auto xGlobal = positionsFromStatePointer(state_global);
3241 set_ddbox(*dd, true,
3242 DDMASTER(dd) ? state_global->box : nullptr,
3246 distributeState(mdlog, dd, top_global, state_global, ddbox, state_local, f);
3248 dd_make_local_cgs(dd, &top_local->cgs);
3250 /* Ensure that we have space for the new distribution */
3251 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
3253 if (fr->cutoff_scheme == ecutsGROUP)
3255 calc_cgcm(fplog, 0, dd->ncg_home,
3256 &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3259 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3261 dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3263 else if (state_local->ddp_count != dd->ddp_count)
3265 if (state_local->ddp_count > dd->ddp_count)
3267 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%" PRId64 ")", state_local->ddp_count, dd->ddp_count);
3270 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
3272 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
3275 /* Clear the old state */
3276 clearDDStateIndices(dd, 0, 0);
3278 /* Restore the atom group indices from state_local */
3279 restoreAtomGroups(dd, cgs_gl->index, state_local);
3280 make_dd_indices(dd, cgs_gl->index, 0);
3281 ncgindex_set = dd->ncg_home;
3283 if (fr->cutoff_scheme == ecutsGROUP)
3285 /* Redetermine the cg COMs */
3286 calc_cgcm(fplog, 0, dd->ncg_home,
3287 &top_local->cgs, state_local->x.rvec_array(), fr->cg_cm);
3290 inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
3292 dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
3294 set_ddbox(*dd, bMasterState, state_local->box,
3295 true, state_local->x, &ddbox);
3297 bRedist = isDlbOn(comm);
3301 /* We have the full state, only redistribute the cgs */
3303 /* Clear the non-home indices */
3304 clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
3307 /* Avoid global communication for dim's without pbc and -gcom */
3308 if (!bNStGlobalComm)
3310 copy_rvec(comm->box0, ddbox.box0 );
3311 copy_rvec(comm->box_size, ddbox.box_size);
3313 set_ddbox(*dd, bMasterState, state_local->box,
3314 bNStGlobalComm, state_local->x, &ddbox);
3319 /* For dim's without pbc and -gcom */
3320 copy_rvec(ddbox.box0, comm->box0 );
3321 copy_rvec(ddbox.box_size, comm->box_size);
3323 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(*dd), bMasterState, bDoDLB,
3326 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
3328 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
3331 if (comm->useUpdateGroups)
3333 comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3337 /* Check if we should sort the charge groups */
3338 const bool bSortCG = (bMasterState || bRedist);
3340 ncg_home_old = dd->ncg_home;
3342 /* When repartitioning we mark atom groups that will move to neighboring
3343 * DD cells, but we do not move them right away for performance reasons.
3344 * Thus we need to keep track of how many charge groups will move for
3345 * obtaining correct local charge group / atom counts.
3350 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
3352 ncgindex_set = dd->ncg_home;
3353 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
3357 GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
3359 if (comm->useUpdateGroups)
3361 comm->updateGroupsCog->addCogs(gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home),
3365 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
3368 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
3370 &comm->cell_x0, &comm->cell_x1,
3371 dd->ncg_home, fr->cg_cm,
3372 cell_ns_x0, cell_ns_x1, &grid_density);
3376 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
3379 switch (fr->cutoff_scheme)
3382 copy_ivec(fr->ns->grid->n, ncells_old);
3383 grid_first(fplog, fr->ns->grid, dd, &ddbox,
3384 state_local->box, cell_ns_x0, cell_ns_x1,
3385 fr->rlist, grid_density);
3388 nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_old[XX], &ncells_old[YY]);
3391 gmx_incons("unimplemented");
3393 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
3394 copy_ivec(ddbox.tric_dir, comm->tric_dir);
3398 wallcycle_sub_start(wcycle, ewcsDD_GRID);
3400 /* Sort the state on charge group position.
3401 * This enables exact restarts from this step.
3402 * It also improves performance by about 15% with larger numbers
3403 * of atoms per node.
3406 /* Fill the ns grid with the home cell,
3407 * so we can sort with the indices.
3409 set_zones_ncg_home(dd);
3411 switch (fr->cutoff_scheme)
3414 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
3416 nbnxn_put_on_grid(fr->nbv->nbs.get(), fr->ePBC, state_local->box,
3418 comm->zones.size[0].bb_x0,
3419 comm->zones.size[0].bb_x1,
3420 comm->updateGroupsCog.get(),
3422 comm->zones.dens_zone0,
3425 ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
3426 fr->nbv->grp[eintLocal].kernel_type,
3429 nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
3432 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
3433 0, dd->ncg_home, fr->cg_cm);
3435 copy_ivec(fr->ns->grid->n, ncells_new);
3438 gmx_incons("unimplemented");
3441 bResortAll = bMasterState;
3443 /* Check if we can user the old order and ns grid cell indices
3444 * of the charge groups to sort the charge groups efficiently.
3446 if (ncells_new[XX] != ncells_old[XX] ||
3447 ncells_new[YY] != ncells_old[YY] ||
3448 ncells_new[ZZ] != ncells_old[ZZ])
3455 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
3456 gmx_step_str(step, sbuf), dd->ncg_home);
3458 dd_sort_state(dd, fr->cg_cm, fr, state_local,
3459 bResortAll ? -1 : ncg_home_old);
3461 /* After sorting and compacting we set the correct size */
3462 dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
3464 /* Rebuild all the indices */
3468 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
3471 if (comm->useUpdateGroups)
3473 /* The update groups cog's are invalid after sorting
3474 * and need to be cleared before the next partitioning anyhow.
3476 comm->updateGroupsCog->clear();
3479 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
3481 /* Setup up the communication and communicate the coordinates */
3482 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
3484 /* Set the indices */
3485 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
3487 /* Set the charge group boundaries for neighbor searching */
3488 set_cg_boundaries(&comm->zones);
3490 if (fr->cutoff_scheme == ecutsVERLET)
3492 /* When bSortCG=true, we have already set the size for zone 0 */
3493 set_zones_size(dd, state_local->box, &ddbox,
3494 bSortCG ? 1 : 0, comm->zones.n,
3498 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
3501 write_dd_pdb("dd_home",step,"dump",top_global,cr,
3502 -1,state_local->x.rvec_array(),state_local->box);
3505 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
3507 /* Extract a local topology from the global topology */
3508 for (int i = 0; i < dd->ndim; i++)
3510 np[dd->dim[i]] = comm->cd[i].numPulses();
3512 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
3513 comm->cellsize_min, np,
3515 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x.rvec_array(),
3516 vsite, top_global, top_local);
3518 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
3520 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
3522 /* Set up the special atom communication */
3523 int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3524 for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
3526 auto range = static_cast<DDAtomRanges::Type>(i);
3529 case DDAtomRanges::Type::Vsites:
3530 if (vsite && vsite->n_intercg_vsite)
3532 n = dd_make_local_vsites(dd, n, top_local->idef.il);
3535 case DDAtomRanges::Type::Constraints:
3536 if (dd->splitConstraints || dd->splitSettles)
3538 /* Only for inter-cg constraints we need special code */
3539 n = dd_make_local_constraints(dd, n, &top_global, fr->cginfo,
3540 constr, ir->nProjOrder,
3541 top_local->idef.il);
3545 gmx_incons("Unknown special atom type setup");
3547 comm->atomRanges.setEnd(range, n);
3550 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
3552 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
3554 /* Make space for the extra coordinates for virtual site
3555 * or constraint communication.
3557 state_local->natoms = comm->atomRanges.numAtomsTotal();
3559 dd_resize_state(state_local, f, state_local->natoms);
3561 if (fr->haveDirectVirialContributions)
3563 if (vsite && vsite->n_intercg_vsite)
3565 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
3569 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
3571 nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
3575 nat_f_novirsum = comm->atomRanges.numHomeAtoms();
3584 /* Set the number of atoms required for the force calculation.
3585 * Forces need to be constrained when doing energy
3586 * minimization. For simple simulations we could avoid some
3587 * allocation, zeroing and copying, but this is probably not worth
3588 * the complications and checking.
3590 forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
3591 comm->atomRanges.end(DDAtomRanges::Type::Zones),
3592 comm->atomRanges.end(DDAtomRanges::Type::Constraints),
3595 /* Update atom data for mdatoms and several algorithms */
3596 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
3597 nullptr, mdAtoms, constr, vsite, nullptr);
3599 auto mdatoms = mdAtoms->mdatoms();
3600 if (!thisRankHasDuty(cr, DUTY_PME))
3602 /* Send the charges and/or c6/sigmas to our PME only node */
3603 gmx_pme_send_parameters(cr,
3605 mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
3606 mdatoms->chargeA, mdatoms->chargeB,
3607 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
3608 mdatoms->sigmaA, mdatoms->sigmaB,
3609 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
3614 /* Update the local pull groups */
3615 dd_make_local_pull_groups(cr, ir->pull_work);
3618 if (dd->atomSets != nullptr)
3620 /* Update the local atom sets */
3621 dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
3624 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
3625 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
3627 add_dd_statistics(dd);
3629 /* Make sure we only count the cycles for this DD partitioning */
3630 clear_dd_cycle_counts(dd);
3632 /* Because the order of the atoms might have changed since
3633 * the last vsite construction, we need to communicate the constructing
3634 * atom coordinates again (for spreading the forces this MD step).
3636 dd_move_x_vsites(dd, state_local->box, state_local->x.rvec_array());
3638 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
3640 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
3642 dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
3643 write_dd_pdb("dd_dump", step, "dump", &top_global, cr,
3644 -1, state_local->x.rvec_array(), state_local->box);
3647 /* Store the partitioning step */
3648 comm->partition_step = step;
3650 /* Increase the DD partitioning counter */
3652 /* The state currently matches this DD partitioning count, store it */
3653 state_local->ddp_count = dd->ddp_count;
3656 /* The DD master node knows the complete cg distribution,
3657 * store the count so we can possibly skip the cg info communication.
3659 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
3662 if (comm->DD_debug > 0)
3664 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
3665 check_index_consistency(dd, top_global.natoms, ncg_mtop(&top_global),
3666 "after partitioning");
3669 wallcycle_stop(wcycle, ewcDOMDEC);
3672 /*! \brief Check whether bonded interactions are missing, if appropriate */
3673 void checkNumberOfBondedInteractions(const gmx::MDLogger &mdlog,
3675 int totalNumberOfBondedInteractions,
3676 const gmx_mtop_t *top_global,
3677 const gmx_localtop_t *top_local,
3678 const t_state *state,
3679 bool *shouldCheckNumberOfBondedInteractions)
3681 if (*shouldCheckNumberOfBondedInteractions)
3683 if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
3685 dd_print_missing_interactions(mdlog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
3687 *shouldCheckNumberOfBondedInteractions = false;