2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/domdec/domdec_network.h"
53 #include "gromacs/domdec/ga2la.h"
54 #include "gromacs/ewald/pme.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gpu_utils/gpu_utils.h"
61 #include "gromacs/hardware/hw_info.h"
62 #include "gromacs/imd/imd.h"
63 #include "gromacs/listed-forces/manage-threading.h"
64 #include "gromacs/math/functions.h"
65 #include "gromacs/math/vec.h"
66 #include "gromacs/math/vectypes.h"
67 #include "gromacs/mdlib/calc_verletbuf.h"
68 #include "gromacs/mdlib/constr.h"
69 #include "gromacs/mdlib/force.h"
70 #include "gromacs/mdlib/forcerec.h"
71 #include "gromacs/mdlib/genborn.h"
72 #include "gromacs/mdlib/gmx_omp_nthreads.h"
73 #include "gromacs/mdlib/mdatoms.h"
74 #include "gromacs/mdlib/mdrun.h"
75 #include "gromacs/mdlib/mdsetup.h"
76 #include "gromacs/mdlib/nb_verlet.h"
77 #include "gromacs/mdlib/nbnxn_grid.h"
78 #include "gromacs/mdlib/nsgrid.h"
79 #include "gromacs/mdlib/vsite.h"
80 #include "gromacs/mdtypes/commrec.h"
81 #include "gromacs/mdtypes/df_history.h"
82 #include "gromacs/mdtypes/forcerec.h"
83 #include "gromacs/mdtypes/inputrec.h"
84 #include "gromacs/mdtypes/md_enums.h"
85 #include "gromacs/mdtypes/mdatom.h"
86 #include "gromacs/mdtypes/nblist.h"
87 #include "gromacs/mdtypes/state.h"
88 #include "gromacs/pbcutil/ishift.h"
89 #include "gromacs/pbcutil/pbc.h"
90 #include "gromacs/pulling/pull.h"
91 #include "gromacs/pulling/pull_rotation.h"
92 #include "gromacs/swap/swapcoords.h"
93 #include "gromacs/timing/wallcycle.h"
94 #include "gromacs/topology/block.h"
95 #include "gromacs/topology/idef.h"
96 #include "gromacs/topology/ifunc.h"
97 #include "gromacs/topology/mtop_lookup.h"
98 #include "gromacs/topology/mtop_util.h"
99 #include "gromacs/topology/topology.h"
100 #include "gromacs/utility/basedefinitions.h"
101 #include "gromacs/utility/basenetwork.h"
102 #include "gromacs/utility/cstringutil.h"
103 #include "gromacs/utility/exceptions.h"
104 #include "gromacs/utility/fatalerror.h"
105 #include "gromacs/utility/gmxmpi.h"
106 #include "gromacs/utility/qsort_threadsafe.h"
107 #include "gromacs/utility/real.h"
108 #include "gromacs/utility/smalloc.h"
109 #include "gromacs/utility/stringutil.h"
111 #include "domdec_constraints.h"
112 #include "domdec_internal.h"
113 #include "domdec_vsite.h"
/* Map a DD node index to an MPI rank. With the current definitions DDRANK is
 * the identity and DDMASTERRANK reads the stored master rank. */
115 #define DDRANK(dd, rank) (rank)
116 #define DDMASTERRANK(dd) (dd->masterrank)
/* Master-rank-only bookkeeping of the global charge-group distribution,
 * used when scattering/gathering the global state (see dd_collect_* and
 * dd_distribute_* below).
 * NOTE(review): some struct lines are elided in this excerpt. */
118 struct gmx_domdec_master_t
120 /* The cell boundaries */
122 /* The global charge group division */
123 int *ncg; /* Number of home charge groups for each node */
124 int *index; /* Index of nnodes+1 into cg */
125 int *cg; /* Global charge group index */
126 int *nat; /* Number of home atoms for each node. */
127 int *ibuf; /* Buffer for communication */
128 rvec *vbuf; /* Buffer for state scattering and gathering */
/* Size constant for DD load measurement buffers (usage not visible here). */
131 #define DD_NLOAD_MAX 9
/* Human-readable names for the DLB states; must stay in sync with the
 * edlbs* enum (note "on" appears twice, presumably for two distinct
 * on-states — confirm against the enum definition). */
133 const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
135 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
138 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
/* Low 16 bits hold the charge-group atom count; per-dimension forward/backward
 * move flags live in the bits above. */
139 #define DD_FLAG_NRCG 65535
140 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
141 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
143 /* The DD zone order */
144 static const ivec dd_zo[DD_MAXZONE] =
145 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
147 /* The non-bonded zone-pair setup for domain decomposition
148 * The first number is the i-zone, the second number the first j-zone seen by
149 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
150 * As is, this is for 3D decomposition, where there are 4 i-zones.
151 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
152 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
/* NOTE(review): the declaration type and the remaining initializer rows of
 * this table are elided in this excerpt. */
155 ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
160 /* Factors used to avoid problems due to rounding issues */
161 #define DD_CELL_MARGIN 1.0001
162 #define DD_CELL_MARGIN2 1.00005
163 /* Factor to account for pressure scaling during nstlist steps */
164 #define DD_PRES_SCALE_MARGIN 1.02
166 /* Turn on DLB when the load imbalance causes this amount of total loss.
167 * There is a bit of overhead with DLB and it's difficult to achieve
168 * a load imbalance of less than 2% with DLB.
170 #define DD_PERF_LOSS_DLB_ON 0.02
172 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
173 #define DD_PERF_LOSS_WARN 0.05
/* Size of the cell-fraction communication array for DD dimension index di. */
175 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
177 /* Use separate MPI send and receive commands
178 * when nnodes <= GMX_DD_NNODES_SENDRECV.
179 * This saves memory (and some copying for small nnodes).
180 * For high parallelization scatter and gather calls are used.
182 #define GMX_DD_NNODES_SENDRECV 4
185 /* We check if to turn on DLB at the first and every 100 DD partitionings.
186 * With large imbalance DLB will turn on at the first step, so we can
187 * make the interval so large that the MPI overhead of the check is negligible.
189 static const int c_checkTurnDlbOnInterval = 100;
190 /* We need to check if DLB results in worse performance and then turn it off.
191 * We check this more often then for turning DLB on, because the DLB can scale
192 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
193 * and furthermore, we are already synchronizing often with DLB, so
194 * the overhead of the MPI Bcast is not that high.
196 static const int c_checkTurnDlbOffInterval = 20;
198 /* Forward declaration */
199 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
/* NOTE(review): two conflicting definitions of dd_index appear below; the
 * first (x-fastest ordering) together with index2xyz is presumably inside a
 * comment block in the full source and superseded by the second (x-major)
 * definition — confirm against the unelided file before touching this. */
203 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
205 static void index2xyz(ivec nc,int ind,ivec xyz)
207 xyz[XX] = ind % nc[XX];
208 xyz[YY] = (ind / nc[XX]) % nc[YY];
209 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
213 /* This order is required to minimize the coordinate communication in PME
214 * which uses decomposition in the x direction.
/* Live mapping: x varies slowest, z fastest; ddindex2xyz is its inverse. */
216 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
218 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
220 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
221 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
222 xyz[ZZ] = ind % nc[ZZ];
/* Translate DD cell coordinates c to the DD node id, honoring the MPI
 * Cartesian layout when one is in use.
 * NOTE(review): several lines (including the plain, non-Cartesian fallback
 * and the return) are elided in this excerpt. */
225 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
230 ddindex = dd_index(dd->nc, c);
231 if (dd->comm->bCartesianPP_PME)
233 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
235 else if (dd->comm->bCartesianPP)
238 MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
/* True when the simulation box can change shape/size: either a dimension is
 * not bounded or the input record implies a dynamic box. */
249 static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
251 return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
/* Return the 1-based global atom number for local atom index i.
 * Fails hard if i is outside the locally known atom range. */
254 int ddglatnr(const gmx_domdec_t *dd, int i)
264 if (i >= dd->comm->nat[ddnatNR-1])
266 gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
268 atnr = dd->gatindex[i] + 1;
/* Accessor for the global charge-group block stored in the DD comm struct. */
274 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
276 return &dd->comm->cgs_gl;
279 /*! \brief Returns true if the DLB state indicates that the balancer is on. */
280 static bool isDlbOn(const gmx_domdec_comm_t *comm)
282 return (comm->dlbState == edlbsOnCanTurnOff ||
283 comm->dlbState == edlbsOnUser);
285 /*! \brief Returns true if the DLB state indicates that the balancer is off/disabled.
287 static bool isDlbDisabled(const gmx_domdec_comm_t *comm)
289 return (comm->dlbState == edlbsOffUser ||
290 comm->dlbState == edlbsOffForever);
/* vec_rvec_t helpers: init (body elided here) and grow-on-demand allocation
 * using the DD over-allocation factor. */
293 static void vec_rvec_init(vec_rvec_t *v)
299 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
303 v->nalloc = over_alloc_dd(n);
304 srenew(v->v, v->nalloc);
/* Record the current global charge-group indices of this domain into the
 * local state, so the distribution can be recovered later (see
 * dd_collect_cg); requires state and DD to be in sync. */
308 void dd_store_state(gmx_domdec_t *dd, t_state *state)
312 if (state->ddp_count != dd->ddp_count)
314 gmx_incons("The MD state does not match the domain decomposition state");
317 state->cg_gl.resize(dd->ncg_home);
318 for (i = 0; i < dd->ncg_home; i++)
320 state->cg_gl[i] = dd->index_gl[i];
/* Remember with which partitioning these cg indices were stored. */
323 state->ddp_count_cg_gl = dd->ddp_count;
/* Accessor for the DD zones. */
326 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
328 return &dd->comm->zones;
/* For charge group icg, return the j-charge-group range [*jcg0, *jcg1) it
 * interacts with and the corresponding shift ranges per dimension.
 * NOTE(review): several lines (izone loop body, bounds handling) are elided
 * in this excerpt. */
331 void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
332 int *jcg0, int *jcg1, ivec shift0, ivec shift1)
334 gmx_domdec_zones_t *zones;
337 zones = &dd->comm->zones;
/* Find the i-zone containing icg. */
340 while (icg >= zones->izone[izone].cg1)
349 else if (izone < zones->nizone)
351 *jcg0 = zones->izone[izone].jcg0;
355 gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
356 icg, izone, zones->nizone);
359 *jcg1 = zones->izone[izone].jcg1;
361 for (d = 0; d < dd->ndim; d++)
364 shift0[dim] = zones->izone[izone].shift0[dim];
365 shift1[dim] = zones->izone[izone].shift1[dim];
366 if (dd->comm->tric_dir[dim] || (isDlbOn(dd->comm) && d > 0))
368 /* A conservative approach, this can be optimized */
/* Total number of atoms for which mdatoms entries are set on this rank. */
375 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
377 /* We currently set mdatoms entries for all atoms:
378 * local + non-local + communicated for vsite + constraints
381 return dd->comm->nat[ddnatNR - 1];
/* Number of atoms up to and including those communicated for vsites. */
384 int dd_natoms_vsite(const gmx_domdec_t *dd)
386 return dd->comm->nat[ddnatVSITE];
/* Atom index range [*at_start, *at_end) of atoms communicated for constraints. */
389 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
391 *at_start = dd->comm->nat[ddnatCON-1];
392 *at_end = dd->comm->nat[ddnatCON];
/* Communicate the halo coordinates x forward along all DD dimensions and
 * pulses: pack send buffers per charge group, exchange with the neighbor,
 * and append the received coordinates after the home atoms.
 * Applies the periodic shift (and the screw-PBC rotation for dim x) where
 * the sending domain sits at a PBC boundary.
 * NOTE(review): many lines (buffer setup, loop braces, some else branches)
 * are elided in this excerpt. */
395 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
397 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
398 int *index, *cgindex;
399 gmx_domdec_comm_t *comm;
400 gmx_domdec_comm_dim_t *cd;
401 gmx_domdec_ind_t *ind;
402 rvec shift = {0, 0, 0}, *buf, *rbuf;
403 gmx_bool bPBC, bScrew;
407 cgindex = dd->cgindex;
412 nat_tot = dd->nat_home;
413 for (d = 0; d < dd->ndim; d++)
/* Only domains at the lower PBC edge of this dimension need shifting. */
415 bPBC = (dd->ci[dd->dim[d]] == 0);
416 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
419 copy_rvec(box[dd->dim[d]], shift);
422 for (p = 0; p < cd->np; p++)
/* Case 1: no PBC shift needed — plain copy into the send buffer. */
429 for (i = 0; i < ind->nsend[nzone]; i++)
431 at0 = cgindex[index[i]];
432 at1 = cgindex[index[i]+1];
433 for (j = at0; j < at1; j++)
435 copy_rvec(x[j], buf[n]);
/* Case 2: ordinary PBC — add the box-vector shift while packing. */
442 for (i = 0; i < ind->nsend[nzone]; i++)
444 at0 = cgindex[index[i]];
445 at1 = cgindex[index[i]+1];
446 for (j = at0; j < at1; j++)
448 /* We need to shift the coordinates */
449 rvec_add(x[j], shift, buf[n]);
/* Case 3: screw PBC — shift x and mirror y/z across the box. */
456 for (i = 0; i < ind->nsend[nzone]; i++)
458 at0 = cgindex[index[i]];
459 at1 = cgindex[index[i]+1];
460 for (j = at0; j < at1; j++)
463 buf[n][XX] = x[j][XX] + shift[XX];
465 * This operation requires a special shift force
466 * treatment, which is performed in calc_vir.
468 buf[n][YY] = box[YY][YY] - x[j][YY];
469 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
481 rbuf = comm->vbuf2.v;
483 /* Send and receive the coordinates */
484 dd_sendrecv_rvec(dd, d, dddirBackward,
485 buf, ind->nsend[nzone+1],
486 rbuf, ind->nrecv[nzone+1]);
/* Unpack received coordinates into the zone atom ranges. */
490 for (zone = 0; zone < nzone; zone++)
492 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
494 copy_rvec(rbuf[j], x[i]);
499 nat_tot += ind->nrecv[nzone+1];
/* Reverse of dd_move_x: communicate halo forces f backward along the DD
 * dimensions (highest dimension first, pulses in reverse) and accumulate the
 * received contributions into the home atoms; optionally also accumulates
 * shift forces for the virial when fshift is non-null.
 * NOTE(review): many lines (buffer setup, shift-index computation 'is',
 * loop braces) are elided in this excerpt. */
505 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
507 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
508 int *index, *cgindex;
509 gmx_domdec_comm_t *comm;
510 gmx_domdec_comm_dim_t *cd;
511 gmx_domdec_ind_t *ind;
515 gmx_bool bShiftForcesNeedPbc, bScrew;
519 cgindex = dd->cgindex;
523 nzone = comm->zones.n/2;
524 nat_tot = dd->nat_tot;
525 for (d = dd->ndim-1; d >= 0; d--)
527 /* Only forces in domains near the PBC boundaries need to
528 consider PBC in the treatment of fshift */
529 bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
530 bScrew = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
531 if (fshift == nullptr && !bScrew)
533 bShiftForcesNeedPbc = FALSE;
535 /* Determine which shift vector we need */
541 for (p = cd->np-1; p >= 0; p--)
544 nat_tot -= ind->nrecv[nzone+1];
551 sbuf = comm->vbuf2.v;
/* Pack the halo forces to send back to their owning rank. */
553 for (zone = 0; zone < nzone; zone++)
555 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
557 copy_rvec(f[i], sbuf[j]);
562 /* Communicate the forces */
563 dd_sendrecv_rvec(dd, d, dddirForward,
564 sbuf, ind->nrecv[nzone+1],
565 buf, ind->nsend[nzone+1]);
567 /* Add the received forces */
/* Case 1: no shift-force bookkeeping needed — plain accumulation. */
569 if (!bShiftForcesNeedPbc)
571 for (i = 0; i < ind->nsend[nzone]; i++)
573 at0 = cgindex[index[i]];
574 at1 = cgindex[index[i]+1];
575 for (j = at0; j < at1; j++)
577 rvec_inc(f[j], buf[n]);
/* Case 2: also accumulate the shift force for the virial. */
584 /* fshift should always be defined if this function is
585 * called when bShiftForcesNeedPbc is true */
586 assert(NULL != fshift);
587 for (i = 0; i < ind->nsend[nzone]; i++)
589 at0 = cgindex[index[i]];
590 at1 = cgindex[index[i]+1];
591 for (j = at0; j < at1; j++)
593 rvec_inc(f[j], buf[n]);
594 /* Add this force to the shift force */
595 rvec_inc(fshift[is], buf[n]);
/* Case 3: screw PBC — y/z force components change sign (mirror). */
602 for (i = 0; i < ind->nsend[nzone]; i++)
604 at0 = cgindex[index[i]];
605 at1 = cgindex[index[i]+1];
606 for (j = at0; j < at1; j++)
608 /* Rotate the force */
609 f[j][XX] += buf[n][XX];
610 f[j][YY] -= buf[n][YY];
611 f[j][ZZ] -= buf[n][ZZ];
614 /* Add this force to the shift force */
615 rvec_inc(fshift[is], buf[n]);
/* Scalar analogue of dd_move_x: spread per-atom real values v to the halo
 * atoms, forward along all DD dimensions/pulses. Reuses the rvec comm
 * buffers reinterpreted as flat real arrays.
 * NOTE(review): pack/unpack assignment lines are elided in this excerpt. */
626 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
628 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
629 int *index, *cgindex;
630 gmx_domdec_comm_t *comm;
631 gmx_domdec_comm_dim_t *cd;
632 gmx_domdec_ind_t *ind;
637 cgindex = dd->cgindex;
639 buf = &comm->vbuf.v[0][0];
642 nat_tot = dd->nat_home;
643 for (d = 0; d < dd->ndim; d++)
646 for (p = 0; p < cd->np; p++)
/* Pack v per charge group into the send buffer. */
651 for (i = 0; i < ind->nsend[nzone]; i++)
653 at0 = cgindex[index[i]];
654 at1 = cgindex[index[i]+1];
655 for (j = at0; j < at1; j++)
668 rbuf = &comm->vbuf2.v[0][0];
670 /* Send and receive the coordinates */
671 dd_sendrecv_real(dd, d, dddirBackward,
672 buf, ind->nsend[nzone+1],
673 rbuf, ind->nrecv[nzone+1]);
/* Unpack received values into the zone atom ranges. */
677 for (zone = 0; zone < nzone; zone++)
679 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
686 nat_tot += ind->nrecv[nzone+1];
/* Scalar analogue of dd_move_f: communicate per-atom real values v backward
 * and accumulate the halo contributions onto the home atoms.
 * NOTE(review): pack/accumulate assignment lines are elided in this excerpt. */
692 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
694 int nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
695 int *index, *cgindex;
696 gmx_domdec_comm_t *comm;
697 gmx_domdec_comm_dim_t *cd;
698 gmx_domdec_ind_t *ind;
703 cgindex = dd->cgindex;
705 buf = &comm->vbuf.v[0][0];
707 nzone = comm->zones.n/2;
708 nat_tot = dd->nat_tot;
709 for (d = dd->ndim-1; d >= 0; d--)
712 for (p = cd->np-1; p >= 0; p--)
715 nat_tot -= ind->nrecv[nzone+1];
722 sbuf = &comm->vbuf2.v[0][0];
/* Pack the halo values to send back to their owning rank. */
724 for (zone = 0; zone < nzone; zone++)
726 for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
733 /* Communicate the forces */
734 dd_sendrecv_real(dd, d, dddirForward,
735 sbuf, ind->nrecv[nzone+1],
736 buf, ind->nsend[nzone+1]);
738 /* Add the received forces */
740 for (i = 0; i < ind->nsend[nzone]; i++)
742 at0 = cgindex[index[i]];
743 at1 = cgindex[index[i]+1];
744 for (j = at0; j < at1; j++)
755 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
757 fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
759 zone->min0, zone->max1,
760 zone->mch0, zone->mch0,
761 zone->p1_0, zone->p1_1);
/* Up to 5 zone records are exchanged per pulse; each record is serialized
 * into DDZONECOMM_BUFSIZE rvecs (3 reals each). */
765 #define DDZONECOMM_MAXZONE 5
766 #define DDZONECOMM_BUFSIZE 3
/* Exchange n_s/n_r gmx_ddzone_t records with the neighbor along DD dimension
 * ddimind by marshalling them through rvec buffers and dd_sendrecv_rvec.
 * NOTE(review): the dd_sendrecv_rvec argument lines are elided here. */
768 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
769 int ddimind, int direction,
770 gmx_ddzone_t *buf_s, int n_s,
771 gmx_ddzone_t *buf_r, int n_r)
773 #define ZBS DDZONECOMM_BUFSIZE
774 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
775 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
/* Serialize: 3 rvecs per zone; the two unused slots are zero-filled. */
778 for (i = 0; i < n_s; i++)
780 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
781 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
782 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
783 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
784 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
785 vbuf_s[i*ZBS+1][2] = 0;
786 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
787 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
788 vbuf_s[i*ZBS+2][2] = 0;
791 dd_sendrecv_rvec(dd, ddimind, direction,
/* Deserialize the received records (inverse of the packing above). */
795 for (i = 0; i < n_r; i++)
797 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
798 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
799 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
800 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
801 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
802 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
803 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
/* With DLB, communicate the cell boundary extents between neighboring
 * domains along the higher DD dimensions so each domain knows the min/max
 * coordinates (cell_ns_x0/cell_ns_x1) it must cover for neighbor searching.
 * Extremes are sent forward; full zone records are pulsed backward and
 * accumulated (min/max) over all pulses.
 * NOTE(review): this function is heavily elided in this excerpt (loop
 * braces, several assignments and conditions are missing); the comments
 * below only annotate the visible fragments. */
809 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
810 rvec cell_ns_x0, rvec cell_ns_x1)
812 int d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
814 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
815 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
816 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
817 rvec extr_s[2], extr_r[2];
819 real dist_d, c = 0, det;
820 gmx_domdec_comm_t *comm;
/* Initialize this domain's own zone record(s) from its cell extents. */
825 for (d = 1; d < dd->ndim; d++)
828 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
829 zp->min0 = cell_ns_x0[dim];
830 zp->max1 = cell_ns_x1[dim];
831 zp->min1 = cell_ns_x1[dim];
832 zp->mch0 = cell_ns_x0[dim];
833 zp->mch1 = cell_ns_x1[dim];
834 zp->p1_0 = cell_ns_x0[dim];
835 zp->p1_1 = cell_ns_x1[dim];
838 for (d = dd->ndim-2; d >= 0; d--)
841 bPBC = (dim < ddbox->npbcdim);
843 /* Use an rvec to store two reals */
844 extr_s[d][0] = comm->cell_f0[d+1];
845 extr_s[d][1] = comm->cell_f1[d+1];
846 extr_s[d][2] = comm->cell_f1[d+1];
849 /* Store the extremes in the backward sending buffer,
850 * so the get updated separately from the forward communication.
852 for (d1 = d; d1 < dd->ndim-1; d1++)
854 /* We invert the order to be able to use the same loop for buf_e */
855 buf_s[pos].min0 = extr_s[d1][1];
856 buf_s[pos].max1 = extr_s[d1][0];
857 buf_s[pos].min1 = extr_s[d1][2];
860 /* Store the cell corner of the dimension we communicate along */
861 buf_s[pos].p1_0 = comm->cell_x0[dim];
866 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
869 if (dd->ndim == 3 && d == 0)
871 buf_s[pos] = comm->zone_d2[0][1];
873 buf_s[pos] = comm->zone_d1[0];
877 /* We only need to communicate the extremes
878 * in the forward direction
880 npulse = comm->cd[d].np;
883 /* Take the minimum to avoid double communication */
884 npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
888 /* Without PBC we should really not communicate over
889 * the boundaries, but implementing that complicates
890 * the communication setup and therefore we simply
891 * do all communication, but ignore some data.
895 for (p = 0; p < npulse_min; p++)
897 /* Communicate the extremes forward */
898 bUse = (bPBC || dd->ci[dim] > 0);
900 dd_sendrecv_rvec(dd, d, dddirForward,
901 extr_s+d, dd->ndim-d-1,
902 extr_r+d, dd->ndim-d-1);
/* Merge the received extremes into our running ones. */
906 for (d1 = d; d1 < dd->ndim-1; d1++)
908 extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
909 extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
910 extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
916 for (p = 0; p < npulse; p++)
918 /* Communicate all the zone information backward */
919 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
921 dd_sendrecv_ddzone(dd, d, dddirBackward,
928 for (d1 = d+1; d1 < dd->ndim; d1++)
930 /* Determine the decrease of maximum required
931 * communication height along d1 due to the distance along d,
932 * this avoids a lot of useless atom communication.
934 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
936 if (ddbox->tric_dir[dim])
938 /* c is the off-diagonal coupling between the cell planes
939 * along directions d and d1.
941 c = ddbox->v[dim][dd->dim[d1]][dim];
947 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
950 dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
954 /* A negative value signals out of range */
960 /* Accumulate the extremes over all pulses */
961 for (i = 0; i < buf_size; i++)
971 buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
972 buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
973 buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
976 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
984 if (bUse && dh[d1] >= 0)
986 buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
987 buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
990 /* Copy the received buffer to the send buffer,
991 * to pass the data through with the next pulse.
995 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
996 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
998 /* Store the extremes */
1001 for (d1 = d; d1 < dd->ndim-1; d1++)
1003 extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1004 extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1005 extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1009 if (d == 1 || (d == 0 && dd->ndim == 3))
1011 for (i = d; i < 2; i++)
1013 comm->zone_d2[1-d][i] = buf_e[pos];
1019 comm->zone_d1[1] = buf_e[pos];
/* Fold the accumulated zone extents into the neighbor-search bounds. */
1029 for (i = 0; i < 2; i++)
1033 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1035 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1036 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1042 for (i = 0; i < 2; i++)
1044 for (j = 0; j < 2; j++)
1048 print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1050 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1051 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1055 for (d = 1; d < dd->ndim; d++)
1057 comm->cell_f_max0[d] = extr_s[d-1][0];
1058 comm->cell_f_min1[d] = extr_s[d-1][1];
1061 fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1062 d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
/* Gather the per-rank charge-group distribution onto the master rank so a
 * subsequent state vector collection knows where each atom belongs.
 * Skips work when the master's cached distribution is already current.
 * NOTE(review): several lines (buf2 setup, early return, braces) are elided
 * in this excerpt. */
1067 static void dd_collect_cg(gmx_domdec_t *dd,
1068 const t_state *state_local)
1070 gmx_domdec_master_t *ma = nullptr;
1071 int buf2[2], *ibuf, i, ncg_home = 0, nat_home = 0;
1073 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1075 /* The master has the correct distribution */
1081 if (state_local->ddp_count == dd->ddp_count)
1083 /* The local state and DD are in sync, use the DD indices */
1084 ncg_home = dd->ncg_home;
1086 nat_home = dd->nat_home;
1088 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1090 /* The DD is out of sync with the local state, but we have stored
1091 * the cg indices with the local state, so we can use those.
1095 cgs_gl = &dd->comm->cgs_gl;
1097 ncg_home = state_local->cg_gl.size();
1098 cg = state_local->cg_gl.data();
/* Count home atoms from the stored global charge-group indices. */
1100 for (i = 0; i < ncg_home; i++)
1102 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1107 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1121 /* Collect the charge group and atom counts on the master */
1122 dd_gather(dd, 2*sizeof(int), buf2, ibuf);
/* Master: unpack counts and build the per-rank cg index offsets. */
1127 for (i = 0; i < dd->nnodes; i++)
1129 ma->ncg[i] = ma->ibuf[2*i];
1130 ma->nat[i] = ma->ibuf[2*i+1];
1131 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1134 /* Make byte counts and indices */
1135 for (i = 0; i < dd->nnodes; i++)
1137 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1138 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1142 fprintf(debug, "Initial charge group distribution: ");
1143 for (i = 0; i < dd->nnodes; i++)
1145 fprintf(debug, " %d", ma->ncg[i]);
1147 fprintf(debug, "\n");
1151 /* Collect the charge group indices on the master */
1153 ncg_home*sizeof(int), cg,
1154 DDMASTER(dd) ? ma->ibuf : nullptr,
1155 DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
1156 DDMASTER(dd) ? ma->cg : nullptr);
/* Remember which partitioning this master-side distribution matches. */
1158 dd->comm->master_cg_ddp_count = state_local->ddp_count;
/* Collect the local rvec vector lv from all ranks into the global vector v
 * on the master using point-to-point MPI_Send/MPI_Recv (used for small node
 * counts, see GMX_DD_NNODES_SENDRECV).
 * NOTE(review): non-master/master branch structure and some braces are
 * elided in this excerpt. */
1161 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1162 gmx::ArrayRef<const gmx::RVec> lv,
1163 gmx::ArrayRef<gmx::RVec> v)
1165 gmx_domdec_master_t *ma;
1166 int n, i, c, a, nalloc = 0;
1167 rvec *buf = nullptr;
/* Non-master ranks: ship the home atoms to the master. */
1175 MPI_Send(const_cast<void *>(static_cast<const void *>(lv.data())), dd->nat_home*sizeof(rvec), MPI_BYTE,
1176 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
1181 /* Copy the master coordinates to the global array */
1182 cgs_gl = &dd->comm->cgs_gl;
1184 n = DDMASTERRANK(dd);
1186 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1188 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1190 copy_rvec(lv[a++], v[c]);
/* Receive each remote rank's atoms and scatter them to global positions. */
1194 for (n = 0; n < dd->nnodes; n++)
1198 if (ma->nat[n] > nalloc)
1200 nalloc = over_alloc_dd(ma->nat[n]);
1201 srenew(buf, nalloc);
1204 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1205 n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1208 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1210 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1212 copy_rvec(buf[a++], v[c]);
/* Build the per-rank byte counts and displacements (in the master's ibuf)
 * needed by MPI gatherv/scatterv of rvec data. */
1221 static void get_commbuffer_counts(gmx_domdec_t *dd,
1222 int **counts, int **disps)
1224 gmx_domdec_master_t *ma;
1229 /* Make the rvec count and displacment arrays */
1231 *disps = ma->ibuf + dd->nnodes;
1232 for (n = 0; n < dd->nnodes; n++)
1234 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1235 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
/* Collect the local rvec vector lv into the global v on the master using a
 * collective gatherv (used for larger node counts).
 * NOTE(review): buffer setup lines are elided in this excerpt. */
1239 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1240 gmx::ArrayRef<const gmx::RVec> lv,
1241 gmx::ArrayRef<gmx::RVec> v)
1243 gmx_domdec_master_t *ma;
1244 int *rcounts = nullptr, *disps = nullptr;
1246 rvec *buf = nullptr;
1253 get_commbuffer_counts(dd, &rcounts, &disps);
1258 dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv.data(), rcounts, disps, buf);
/* Master: scatter the gathered buffer to global atom positions via the
 * charge-group indices. */
1262 cgs_gl = &dd->comm->cgs_gl;
1265 for (n = 0; n < dd->nnodes; n++)
1267 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1269 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1271 copy_rvec(buf[a++], v[c]);
/* Collect a distributed per-atom vector lv into the global vector v on the
 * master rank. First refreshes the master's charge-group distribution, then
 * picks send/recv for few ranks or gatherv for many. */
1278 void dd_collect_vec(gmx_domdec_t *dd,
1279 const t_state *state_local,
1280 gmx::ArrayRef<const gmx::RVec> lv,
1281 gmx::ArrayRef<gmx::RVec> v)
1283 dd_collect_cg(dd, state_local);
1285 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1287 dd_collect_vec_sendrecv(dd, lv, v)
1291 dd_collect_vec_gatherv(dd, lv, v);
/* Assemble the full global t_state on the master: copy the replicated
 * scalar/matrix state from the local state and collect the distributed
 * per-atom vectors (x, v, cg_p) that the local state flags as present.
 * NOTE(review): the DDMASTER guard around the scalar copies is elided in
 * this excerpt. */
1296 void dd_collect_state(gmx_domdec_t *dd,
1297 const t_state *state_local, t_state *state)
1299 int nh = state_local->nhchainlength;
1303 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
1305 for (int i = 0; i < efptNR; i++)
1307 state->lambda[i] = state_local->lambda[i];
1309 state->fep_state = state_local->fep_state;
1310 state->veta = state_local->veta;
1311 state->vol0 = state_local->vol0;
1312 copy_mat(state_local->box, state->box);
1313 copy_mat(state_local->boxv, state->boxv);
1314 copy_mat(state_local->svir_prev, state->svir_prev);
1315 copy_mat(state_local->fvir_prev, state->fvir_prev);
1316 copy_mat(state_local->pres_prev, state->pres_prev);
/* Nose-Hoover chains: ngtc thermostat chains of length nh each. */
1318 for (int i = 0; i < state_local->ngtc; i++)
1320 for (int j = 0; j < nh; j++)
1322 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1323 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1325 state->therm_integral[i] = state_local->therm_integral[i];
1327 for (int i = 0; i < state_local->nnhpres; i++)
1329 for (int j = 0; j < nh; j++)
1331 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1332 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1335 state->baros_integral = state_local->baros_integral;
/* Distributed per-atom arrays: only collect what the state carries. */
1337 if (state_local->flags & (1 << estX))
1339 gmx::ArrayRef<gmx::RVec> globalXRef = state ? gmx::makeArrayRef(state->x) : gmx::EmptyArrayRef();
1340 dd_collect_vec(dd, state_local, state_local->x, globalXRef);
1342 if (state_local->flags & (1 << estV))
1344 gmx::ArrayRef<gmx::RVec> globalVRef = state ? gmx::makeArrayRef(state->v) : gmx::EmptyArrayRef();
1345 dd_collect_vec(dd, state_local, state_local->v, globalVRef);
1347 if (state_local->flags & (1 << estCGP))
1349 gmx::ArrayRef<gmx::RVec> globalCgpRef = state ? gmx::makeArrayRef(state->cg_p) : gmx::EmptyArrayRef();
1350 dd_collect_vec(dd, state_local, state_local->cg_p, globalCgpRef);
/* Resize the local state (and optionally the force buffer f) to hold
 * natoms atoms; f gets one extra padded element for unaligned SIMD loads. */
1354 static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
1358 fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
1361 state_change_natoms(state, natoms);
1365 /* We need to allocate one element extra, since we might use
1366 * (unaligned) 4-wide SIMD loads to access rvec entries.
1368 f->resize(paddedRVecVectorSize(natoms));
/* Ensure the forcerec charge-group arrays (and, for the Verlet scheme, the
 * state/force buffers) can hold numChargeGroups entries, growing with the
 * DD over-allocation factor. */
1372 static void dd_check_alloc_ncg(t_forcerec *fr,
1374 PaddedRVecVector *f,
1375 int numChargeGroups)
1377 if (numChargeGroups > fr->cg_nalloc)
1381 fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
1383 fr->cg_nalloc = over_alloc_dd(numChargeGroups);
1384 srenew(fr->cginfo, fr->cg_nalloc);
1385 if (fr->cutoff_scheme == ecutsGROUP)
1387 srenew(fr->cg_cm, fr->cg_nalloc);
1390 if (fr->cutoff_scheme == ecutsVERLET)
1392 /* We don't use charge groups, we use x in state to set up
1393 * the atom communication.
1395 dd_resize_state(state, f, numChargeGroups);
/* Inverse of dd_collect_vec_sendrecv: the master packs each rank's slice of
 * the global vector v (by charge group) and MPI_Sends it; every rank ends
 * with its home atoms in lv.
 * NOTE(review): the master/non-master branch structure is elided in this
 * excerpt. */
1399 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1400 const rvec *v, rvec *lv)
1402 gmx_domdec_master_t *ma;
1403 int n, i, c, a, nalloc = 0;
1404 rvec *buf = nullptr;
/* Master: pack and send every other rank's portion. */
1410 for (n = 0; n < dd->nnodes; n++)
1414 if (ma->nat[n] > nalloc)
1416 nalloc = over_alloc_dd(ma->nat[n]);
1417 srenew(buf, nalloc);
1419 /* Use lv as a temporary buffer */
1421 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1423 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1425 copy_rvec(v[c], buf[a++]);
/* Sanity check: packed atom count must match the bookkeeping. */
1428 if (a != ma->nat[n])
1430 gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1435 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1436 DDRANK(dd, n), n, dd->mpi_comm_all);
/* Master's own portion is copied directly into lv. */
1441 n = DDMASTERRANK(dd);
1443 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1445 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1447 copy_rvec(v[c], lv[a++]);
/* Non-master ranks: receive their home atoms from the master. */
1454 MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1455 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
/* Inverse of dd_collect_vec_gatherv: the master packs the global vector v in
 * per-rank order and scatters it so each rank receives its home atoms in lv.
 * NOTE(review): buffer setup lines are elided in this excerpt. */
1460 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1461 const rvec *v, rvec *lv)
1463 gmx_domdec_master_t *ma;
1464 int *scounts = nullptr, *disps = nullptr;
1466 rvec *buf = nullptr;
1472 get_commbuffer_counts(dd, &scounts, &disps);
/* Pack v in rank order using the master's charge-group bookkeeping. */
1476 for (n = 0; n < dd->nnodes; n++)
1478 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1480 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1482 copy_rvec(v[c], buf[a++]);
1488 dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
/* Dispatcher: point-to-point for few ranks, collective scatterv otherwise
 * (mirrors dd_collect_vec). */
1491 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs,
1492 const rvec *v, rvec *lv)
1494 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1496 dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1500 dd_distribute_vec_scatterv(dd, cgs, v, lv);
/* Broadcast the free-energy (expanded-ensemble) history from the master to
 * all ranks: scalars first, then the per-lambda arrays and the
 * nlambda x nlambda matrices row by row. No-op when dfhist is null. */
1504 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1506 if (dfhist == nullptr)
1511 dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1512 dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1513 dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1515 if (dfhist->nlambda > 0)
1517 int nlam = dfhist->nlambda;
1518 dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1519 dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1520 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1521 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1522 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1523 dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
/* Matrix members: broadcast one row of nlam reals per lambda state. */
1525 for (int i = 0; i < nlam; i++)
1527 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1528 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1529 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1530 dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1531 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1532 dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
/* Distribute the global t_state from the master to the local state on
 * every rank: (1) master copies global scalar/matrix fields into its
 * local state, (2) those fields are broadcast to all ranks, (3) the
 * df_history is broadcast, (4) the per-atom vectors x, v and cg_p are
 * distributed according to the home charge groups of each rank. */
1537 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1538 t_state *state, t_state *state_local,
1539 PaddedRVecVector *f)
1541 int nh = state_local->nhchainlength;
1545 GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
/* Master-side copy of global scalars into the local state */
1547 for (int i = 0; i < efptNR; i++)
1549 state_local->lambda[i] = state->lambda[i];
1551 state_local->fep_state = state->fep_state;
1552 state_local->veta = state->veta;
1553 state_local->vol0 = state->vol0;
1554 copy_mat(state->box, state_local->box);
1555 copy_mat(state->box_rel, state_local->box_rel);
1556 copy_mat(state->boxv, state_local->boxv);
1557 copy_mat(state->svir_prev, state_local->svir_prev);
1558 copy_mat(state->fvir_prev, state_local->fvir_prev);
1559 if (state->dfhist != nullptr)
1561 copy_df_history(state_local->dfhist, state->dfhist);
/* Nose-Hoover thermostat chains: ngtc groups, nh entries per chain */
1563 for (int i = 0; i < state_local->ngtc; i++)
1565 for (int j = 0; j < nh; j++)
1567 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1568 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1570 state_local->therm_integral[i] = state->therm_integral[i];
/* Nose-Hoover barostat (pressure-coupling) chains */
1572 for (int i = 0; i < state_local->nnhpres; i++)
1574 for (int j = 0; j < nh; j++)
1576 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1577 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1580 state_local->baros_integral = state->baros_integral;
/* Broadcast the now-filled local scalar/matrix fields to all ranks */
1582 dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
1583 dd_bcast(dd, sizeof(int), &state_local->fep_state);
1584 dd_bcast(dd, sizeof(real), &state_local->veta);
1585 dd_bcast(dd, sizeof(real), &state_local->vol0);
1586 dd_bcast(dd, sizeof(state_local->box), state_local->box);
1587 dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1588 dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1589 dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1590 dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1591 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
1592 dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
1593 dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
1594 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
1595 dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
1597 /* communicate df_history -- required for restarting from checkpoint */
1598 dd_distribute_dfhist(dd, state_local->dfhist);
/* Make room for the home atoms before distributing per-atom vectors */
1600 dd_resize_state(state_local, f, dd->nat_home);
/* Per-atom vectors are only present when flagged in state_local->flags;
 * only the master passes a non-null global source pointer */
1602 if (state_local->flags & (1 << estX))
1604 const rvec *xGlobal = (DDMASTER(dd) ? as_rvec_array(state->x.data()) : nullptr);
1605 dd_distribute_vec(dd, cgs, xGlobal, as_rvec_array(state_local->x.data()));
1607 if (state_local->flags & (1 << estV))
1609 const rvec *vGlobal = (DDMASTER(dd) ? as_rvec_array(state->v.data()) : nullptr);
1610 dd_distribute_vec(dd, cgs, vGlobal, as_rvec_array(state_local->v.data()));
1612 if (state_local->flags & (1 << estCGP))
1614 const rvec *cgpGlobal = (DDMASTER(dd) ? as_rvec_array(state->cg_p.data()) : nullptr);
1615 dd_distribute_vec(dd, cgs, cgpGlobal, as_rvec_array(state_local->cg_p.data()));
/* Map a Cartesian dimension index (XX/YY/ZZ) to its character label
 * 'X'/'Y'/'Z'; any other value is a fatal internal error. */
1619 static char dim2char(int dim)
1625 case XX: c = 'X'; break;
1626 case YY: c = 'Y'; break;
1627 case ZZ: c = 'Z'; break;
1628 default: gmx_fatal(FARGS, "Unknown dim %d", dim);
/* Debug output: write the DD grid for the given step as a PDB file.
 * All ranks gather their cell corners (cell_x0/cell_x1) to the master,
 * which writes one pseudo-atom ("CA"/"GLY") per cell corner, with the
 * cell volume in the B-factor column, and CONECT records drawing the
 * cell edges. Coordinates are scaled by 10 (nm -> Angstrom). */
1634 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1635 gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1637 rvec grid_s[2], *grid_r = nullptr, cx, r;
1638 char fname[STRLEN], buf[22];
1640 int a, i, d, z, y, x;
/* This rank's cell lower and upper corners */
1644 copy_rvec(dd->comm->cell_x0, grid_s[0]);
1645 copy_rvec(dd->comm->cell_x1, grid_s[1]);
1649 snew(grid_r, 2*dd->nnodes);
/* Collect all cell corners on the master */
1652 dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
/* Build the triclinic skew matrix for dimensions with >1 DD cell
 * (identity entries fall in the extraction gap here) */
1656 for (d = 0; d < DIM; d++)
1658 for (i = 0; i < DIM; i++)
1666 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1668 tric[d][i] = box[i][d]/box[i][i];
1677 sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1678 out = gmx_fio_fopen(fname, "w");
1679 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1681 for (i = 0; i < dd->nnodes; i++)
/* Cell volume relative to the (rectangular) box volume per cell */
1683 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1684 for (d = 0; d < DIM; d++)
1686 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
/* Emit the 8 corners of cell i (grid_r[2i] = low, grid_r[2i+1] = high) */
1688 for (z = 0; z < 2; z++)
1690 for (y = 0; y < 2; y++)
1692 for (x = 0; x < 2; x++)
1694 cx[XX] = grid_r[i*2+x][XX];
1695 cx[YY] = grid_r[i*2+y][YY];
1696 cx[ZZ] = grid_r[i*2+z][ZZ];
1698 gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1699 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
/* CONECT records: connect corner pairs along each dimension d;
 * atom serials are derived from the 8-corner numbering per cell */
1703 for (d = 0; d < DIM; d++)
1705 for (x = 0; x < 4; x++)
1709 case 0: y = 1 + i*8 + 2*x; break;
1710 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1711 case 2: y = 1 + i*8 + x; break;
1713 fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
1717 gmx_fio_fclose(out);
/* Debug output: write this rank's local atoms (home + communicated,
 * up to ddnatVSITE) as a per-rank PDB file. The B-factor column encodes
 * the DD zone of each atom (zones.n for vsites, zones.n+1 for the rest)
 * so zones can be visualized. Coordinates scaled by 10 (nm -> A). */
1722 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
1723 const gmx_mtop_t *mtop, t_commrec *cr,
1724 int natoms, rvec x[], matrix box)
1726 char fname[STRLEN], buf[22];
1728 int i, ii, resnr, c;
1729 const char *atomname, *resname;
/* Include atoms up to and including vsite-communicated ones */
1736 natoms = dd->comm->nat[ddnatVSITE];
1739 sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
1741 out = gmx_fio_fopen(fname, "w");
1743 fprintf(out, "TITLE     %s\n", title);
1744 gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1746 for (i = 0; i < natoms; i++)
/* ii is the global atom index of local atom i */
1748 ii = dd->gatindex[i];
1749 mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
1750 if (i < dd->comm->nat[ddnatZONE])
/* Advance zone counter c until atom i falls inside zone c's cg range */
1753 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1759 else if (i < dd->comm->nat[ddnatVSITE])
1761 b = dd->comm->zones.n;
1765 b = dd->comm->zones.n + 1;
1767 gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
1768 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
1770 fprintf(out, "TER\n");
1772 gmx_fio_fclose(out);
/* Return the effective cut-off distance for multi-body (>2 atoms)
 * bonded interactions. With inter-charge-group bondeds: use the
 * explicit cutoff_mbody if set; otherwise fall back to the minimum DD
 * cell size over all decomposed dimensions (cutoff_mbody==0 means no
 * DLB). The result is never larger than the non-bonded cut-off. */
1775 real dd_cutoff_multibody(const gmx_domdec_t *dd)
1777 gmx_domdec_comm_t *comm;
1784 if (comm->bInterCGBondeds)
1786 if (comm->cutoff_mbody > 0)
1788 r = comm->cutoff_mbody;
1792 /* cutoff_mbody=0 means we do not have DLB */
1793 r = comm->cellsize_min[dd->dim[0]];
1794 for (di = 1; di < dd->ndim; di++)
1796 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
/* With bonded communication the explicit mbody cut-off is a floor */
1798 if (comm->bBondComm)
1800 r = std::max(r, comm->cutoff_mbody);
1804 r = std::min(r, comm->cutoff);
/* Return the cut-off for two-body bonded interactions: at least the
 * non-bonded cut-off, and never smaller than the multi-body cut-off. */
1812 real dd_cutoff_twobody(const gmx_domdec_t *dd)
1816 r_mb = dd_cutoff_multibody(dd);
1818 return std::max(dd->comm->cutoff, r_mb);
/* Convert a Cartesian PP-rank coordinate to the coordinate of its PME
 * rank: copy the coordinate and remap the component along cartpmedim
 * from the nc PP slots onto the (ntot - nc) PME slots, offset past the
 * PP range and rounded to the middle of the target interval. */
1822 static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
1827 nc = dd->nc[dd->comm->cartpmedim];
1828 ntot = dd->comm->ntot[dd->comm->cartpmedim];
1829 copy_ivec(coord, coord_pme);
1830 coord_pme[dd->comm->cartpmedim] =
1831 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
/* Map a DD rank index to the index of the PME rank it communicates
 * with, distributing the npp PP ranks evenly over the npme PME ranks
 * (the npme/2 offset centers the assignment). */
1834 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
1839 npme = dd->comm->npmenodes;
1841 /* Here we assign a PME node to communicate with this DD node
1842 * by assuming that the major index of both is x.
1843 * We add cr->npmenodes/2 to obtain an even distribution.
1845 return (ddindex*npme + npme/2)/npp;
/* Build and return the list of PME rank ids for the interleaved
 * PP/PME rank layout: whenever the PME index assigned to consecutive
 * DD ranks increases (or at the last DD rank), the next simulation
 * rank id is a PME rank. Caller owns the snew'd array. */
1848 static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
1853 snew(pme_rank, dd->comm->npmenodes);
1855 for (i = 0; i < dd->nnodes; i++)
1857 p0 = ddindex2pmeindex(dd, i);
1858 p1 = ddindex2pmeindex(dd, i+1);
1859 if (i+1 == dd->nnodes || p1 > p0)
1863 fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
/* Simulation rank id: DD ranks seen so far plus PME ranks inserted */
1865 pme_rank[n] = i + 1 + n;
/* Return the PME slab/rank index for the DD cell at grid coordinates
 * (x,y,z). NOTE(review): the older, differently-styled lines below
 * (dd_coords2pmecoords, dd->ntot, cr->npmenodes) look like historical
 * code that in the upstream file is commented out; only the final
 * ddindex2pmeindex() path appears live — confirm against upstream. */
1873 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
1881 if (dd->comm->bCartesian) {
1882 gmx_ddindex2xyz(dd->nc,ddindex,coords);
1883 dd_coords2pmecoords(dd,coords,coords_pme);
1884 copy_ivec(dd->ntot,nc);
1885 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1886 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
1888 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
1890 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
1896 slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
/* Return the simulation rank id of the PP rank at DD grid coordinates
 * (x,y,z): via MPI_Cart_rank for a Cartesian PP+PME communicator, via
 * the ddindex2simnodeid lookup table for Cartesian PP-only, or by
 * offsetting the DD index with the preceding PME ranks otherwise. */
1901 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
1903 gmx_domdec_comm_t *comm;
1905 int ddindex, nodeid = -1;
1907 comm = cr->dd->comm;
1912 if (comm->bCartesianPP_PME)
1915 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
1920 ddindex = dd_index(cr->dd->nc, coords);
1921 if (comm->bCartesianPP)
1923 nodeid = comm->ddindex2simnodeid[ddindex];
/* Interleaved layout: shift by the number of PME ranks before us */
1929 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
/* Return the simulation rank id of the PME rank that the PP rank
 * sim_nodeid sends to (three layouts: Cartesian PP+PME, Cartesian
 * PP-only, and interleaved with an optional explicit pmenodes list).
 * Presumably returns -1 when sim_nodeid is itself a PME rank — the
 * initialisation falls in an extraction gap; verify upstream. */
1941 static int dd_simnode2pmenode(const gmx_domdec_t *dd,
1942 const t_commrec gmx_unused *cr,
1947 const gmx_domdec_comm_t *comm = dd->comm;
1949 /* This assumes a uniform x domain decomposition grid cell size */
1950 if (comm->bCartesianPP_PME)
1953 ivec coord, coord_pme;
1954 MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
1955 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
1957 /* This is a PP node */
1958 dd_cart_coord2pmecoord(dd, coord, coord_pme);
1959 MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
1963 else if (comm->bCartesianPP)
1965 if (sim_nodeid < dd->nnodes)
1967 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
1972 /* This assumes DD cells with identical x coordinates
1973 * are numbered sequentially.
1975 if (dd->comm->pmenodes == nullptr)
1977 if (sim_nodeid < dd->nnodes)
1979 /* The DD index equals the nodeid */
1980 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid)
/* Explicit pmenodes list: scan for the first PME rank after us */
1986 while (sim_nodeid > dd->comm->pmenodes[i])
1990 if (sim_nodeid < dd->comm->pmenodes[i])
1992 pmenode = dd->comm->pmenodes[i];
/* Report the PME grid decomposition: number of PME ranks in x and y. */
2000 void get_pme_nnodes(const gmx_domdec_t *dd,
2001 int *npmenodes_x, int *npmenodes_y)
2005 *npmenodes_x = dd->comm->npmenodes_x;
2006 *npmenodes_y = dd->comm->npmenodes_y;
/* Return the simulation rank ids of all PP ranks that send their
 * charges/coordinates to PME rank pmenodeid. Scans the whole DD grid;
 * a PP cell belongs to this PME rank when its mapped PME coordinate
 * (Cartesian case) or PME slab index (otherwise) matches. */
2015 std::vector<int> get_pme_ddranks(t_commrec *cr, int pmenodeid)
2019 ivec coord, coord_pme;
2023 std::vector<int> ddranks;
/* Upper bound on PP ranks per PME rank, to avoid reallocation */
2024 ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2026 for (x = 0; x < dd->nc[XX]; x++)
2028 for (y = 0; y < dd->nc[YY]; y++)
2030 for (z = 0; z < dd->nc[ZZ]; z++)
2032 if (dd->comm->bCartesianPP_PME)
2037 dd_cart_coord2pmecoord(dd, coord, coord_pme);
/* NOTE(review): comparison against dd->ci suggests this branch is
 * executed on the PME rank itself — confirm caller context upstream */
2038 if (dd->ci[XX] == coord_pme[XX] &&
2039 dd->ci[YY] == coord_pme[YY] &&
2040 dd->ci[ZZ] == coord_pme[ZZ])
2042 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
2047 /* The slab corresponds to the nodeid in the PME group */
2048 if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2050 ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
/* Decide whether this PP rank should receive virial/energy data from
 * its PME rank: only the LAST PP rank assigned to a given PME rank
 * receives, to avoid duplicate communication. With fewer PME than DD
 * ranks, check whether the next PP rank maps to the same PME rank. */
2059 static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
2061 gmx_bool bReceive = TRUE;
2063 if (cr->npmenodes < dd->nnodes)
2065 gmx_domdec_comm_t *comm = dd->comm;
2066 if (comm->bCartesianPP_PME)
2069 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
/* Look at the neighbour one step further along cartpmedim */
2071 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2072 coords[comm->cartpmedim]++;
2073 if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2076 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2077 if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
2079 /* This is not the last PP node for pmenode */
2084 GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
2089 int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
2090 if (cr->sim_nodeid+1 < cr->nnodes &&
2091 dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
2093 /* This is not the last PP node for pmenode */
/* Reset the zone charge-group ranges so that only zone 0 (the home
 * zone) is populated: every zone boundary after 0 is set to ncg_home,
 * making all non-home zones empty. */
2102 static void set_zones_ncg_home(gmx_domdec_t *dd)
2104 gmx_domdec_zones_t *zones;
2107 zones = &dd->comm->zones;
2109 zones->cg_range[0] = 0;
2110 for (i = 1; i < zones->n+1; i++)
2112 zones->cg_range[i] = dd->ncg_home;
2114 /* zone_ncg1[0] should always be equal to ncg_home */
2115 dd->comm->zone_ncg1[0] = dd->ncg_home;
/* Rebuild the local charge-group bookkeeping from the state's global
 * charge-group list (e.g. after reading a checkpoint): copy the global
 * cg indices into dd->index_gl and rebuild dd->cgindex by accumulating
 * atom counts from the global cg index gcgs_index. Also updates
 * ncg_home and the home-zone ranges. */
2118 static void rebuild_cgindex(gmx_domdec_t *dd,
2119 const int *gcgs_index, const t_state *state)
2121 int * gmx_restrict dd_cg_gl = dd->index_gl;
2122 int * gmx_restrict cgindex = dd->cgindex;
2125 /* Copy back the global charge group indices from state
2126 * and rebuild the local charge group to atom index.
2129 for (unsigned int i = 0; i < state->cg_gl.size(); i++)
2132 int cg_gl = state->cg_gl[i];
2133 dd_cg_gl[i] = cg_gl;
/* nat accumulates the atom count of the cgs processed so far */
2134 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2136 cgindex[state->cg_gl.size()] = nat;
2138 dd->ncg_home = state->cg_gl.size();
2141 set_zones_ncg_home(dd);
/* Look up the cginfo flags for global charge group cg: advance through
 * the per-moleculeblock cginfo_mb entries until the block containing cg
 * is found, then index its (possibly repeating, hence % cg_mod) table. */
2144 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2146 while (cg >= cginfo_mb->cg_end)
2151 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
/* Fill fr->cginfo for the local charge groups in [cg0, cg1) from the
 * per-moleculeblock tables, and, when a bLocalCG array is provided,
 * mark those global charge groups as local. */
2154 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2155 t_forcerec *fr, char *bLocalCG)
2157 cginfo_mb_t *cginfo_mb;
2163 cginfo_mb = fr->cginfo_mb;
2164 cginfo = fr->cginfo;
2166 for (cg = cg0; cg < cg1; cg++)
2168 cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2172 if (bLocalCG != nullptr)
2174 for (cg = cg0; cg < cg1; cg++)
2176 bLocalCG[index_gl[cg]] = TRUE;
/* Build the local->global (gatindex) and global->local (ga2la) atom
 * index tables for all zones, starting from charge group cg_start.
 * With charge groups (bCGs), each cg expands to its atom range from
 * gcgs_index; without, each "cg" is a single atom. zone1 marks cgs
 * beyond the first communication pulse of a zone. */
2181 static void make_dd_indices(gmx_domdec_t *dd,
2182 const int *gcgs_index, int cg_start)
2184 int nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2185 int *zone2cg, *zone_ncg1, *index_gl, *gatindex;
/* Grow the local->global table to cover all local atoms */
2188 if (dd->nat_tot > dd->gatindex_nalloc)
2190 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2191 srenew(dd->gatindex, dd->gatindex_nalloc);
2194 nzone = dd->comm->zones.n;
2195 zone2cg = dd->comm->zones.cg_range;
2196 zone_ncg1 = dd->comm->zone_ncg1;
2197 index_gl = dd->index_gl;
2198 gatindex = dd->gatindex;
2199 bCGs = dd->comm->bCGs;
2201 if (zone2cg[1] != dd->ncg_home)
2203 gmx_incons("dd->ncg_zone is not up to date");
2206 /* Make the local to global and global to local atom index */
2207 a = dd->cgindex[cg_start];
2208 for (zone = 0; zone < nzone; zone++)
2216 cg0 = zone2cg[zone];
2218 cg1 = zone2cg[zone+1];
/* cg1_p1: end of the first-pulse cgs within this zone */
2219 cg1_p1 = cg0 + zone_ncg1[zone];
2221 for (cg = cg0; cg < cg1; cg++)
2226 /* Signal that this cg is from more than one pulse away */
2229 cg_gl = index_gl[cg];
/* Charge-group path: register every atom of the cg */
2232 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2235 ga2la_set(dd->ga2la, a_gl, a, zone1);
/* No-charge-group path: the cg index IS the global atom index */
2241 gatindex[a] = cg_gl;
2242 ga2la_set(dd->ga2la, cg_gl, a, zone1);
/* Consistency check (debug aid): verify that exactly the charge groups
 * in dd->index_gl are marked in bLocalCG, and that the total marked
 * count matches dd->ncg_tot. Returns the number of errors found
 * (0 immediately when bLocalCG is not allocated). */
2249 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2255 if (bLocalCG == nullptr)
/* Every local cg must be marked */
2259 for (i = 0; i < dd->ncg_tot; i++)
2261 if (!bLocalCG[dd->index_gl[i]])
2264 "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
/* Count all marked cgs in the whole system and compare totals */
2269 for (i = 0; i < ncg_sys; i++)
2276 if (ngl != dd->ncg_tot)
2278 fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
/* Debug-level consistency check of the local/global atom index tables:
 * detects duplicate global atoms in gatindex, ga2la entries pointing
 * outside [0, nat_tot) or disagreeing with gatindex, local atoms with
 * no ga2la entry, and bLocalCG inconsistencies. Any error is fatal. */
2285 static void check_index_consistency(gmx_domdec_t *dd,
2286 int natoms_sys, int ncg_sys,
2289 int nerr, ngl, i, a, cell;
/* Expensive duplicate check only at the highest debug level */
2294 if (dd->comm->DD_debug > 1)
2296 snew(have, natoms_sys);
2297 for (a = 0; a < dd->nat_tot; a++)
2299 if (have[dd->gatindex[a]] > 0)
2301 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
/* Store a+1 so 0 can mean "not seen" */
2305 have[dd->gatindex[a]] = a + 1;
2311 snew(have, dd->nat_tot);
/* Cross-check every global atom's ga2la entry against gatindex */
2314 for (i = 0; i < natoms_sys; i++)
2316 if (ga2la_get(dd->ga2la, i, &a, &cell))
2318 if (a >= dd->nat_tot)
2320 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2326 if (dd->gatindex[a] != i)
2328 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2335 if (ngl != dd->nat_tot)
2338 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2339 dd->rank, where, ngl, dd->nat_tot);
/* Every local atom must have been reached via ga2la above */
2341 for (a = 0; a < dd->nat_tot; a++)
2346 "DD rank %d, %s: local atom %d, global %d has no global index\n",
2347 dd->rank, where, a+1, dd->gatindex[a]+1);
2352 nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2356 gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2357 dd->rank, where, nerr);
/* Invalidate local index data from cg_start/a_start onward before a
 * redecomposition: clear (or selectively delete from) the ga2la table,
 * unmark the affected cgs in bLocalCG, and reset the local vsite and
 * constraint indices. */
2361 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2368 /* Clear the whole list without searching */
2369 ga2la_clear(dd->ga2la);
/* Alternative path (branch condition in extraction gap): delete only
 * the entries from a_start onward */
2373 for (i = a_start; i < dd->nat_tot; i++)
2375 ga2la_del(dd->ga2la, dd->gatindex[i]);
2379 bLocalCG = dd->comm->bLocalCG;
2382 for (i = cg_start; i < dd->ncg_tot; i++)
2384 bLocalCG[dd->index_gl[i]] = FALSE;
2388 dd_clear_local_vsite_indices(dd);
2390 if (dd->constraints)
2392 dd_clear_local_constraint_indices(dd);
2396 /* This function should be used for moving the domain boudaries during DLB,
2397 * for obtaining the minimum cell size. It checks the initially set limit
2398 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2399 * and, possibly, a longer cut-off limit set for PME load balancing.
2401 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2405 cellsize_min = comm->cellsize_min[dim];
2407 if (!comm->bVacDLBNoLimit)
2409 /* The cut-off might have changed, e.g. by PME load balacning,
2410 * from the value used to set comm->cellsize_min, so check it.
/* Each cell must span at least cutoff / (number of DLB pulses) */
2412 cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2414 if (comm->bPMELoadBalDLBLimits)
2416 /* Check for the cut-off limit set by the PME load balancing */
2417 cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2421 return cellsize_min;
/* Return the maximum allowed displacement ("jump") of staggered cell
 * boundaries in dimension dim_ind, derived from the cell size limit
 * and the cut-off divided by the number of communication pulses. */
2424 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2427 real grid_jump_limit;
2429 /* The distance between the boundaries of cells at distance
2430 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2431 * and by the fact that cells should not be shifted by more than
2432 * half their size, such that cg's only shift by one cell
2433 * at redecomposition.
2435 grid_jump_limit = comm->cellsize_limit;
2436 if (!comm->bVacDLBNoLimit)
2438 if (comm->bPMELoadBalDLBLimits)
2440 cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2442 grid_jump_limit = std::max(grid_jump_limit,
2443 cutoff/comm->cd[dim_ind].np);
2446 return grid_jump_limit;
/* Check for each decomposed dimension (beyond the first) that the
 * staggered cell boundaries have not shifted past the allowed jump
 * limit relative to the neighbouring rows. Scaled cell fractions are
 * converted to distances via the (skew-corrected) box size. With the
 * fatal-error flag set (parameter in extraction gap), a violation
 * aborts the run; otherwise presumably FALSE/TRUE is returned. */
2449 static gmx_bool check_grid_jump(gmx_int64_t step,
2455 gmx_domdec_comm_t *comm;
2464 for (d = 1; d < dd->ndim; d++)
2467 limit = grid_jump_limit(comm, cutoff, d);
2468 bfac = ddbox->box_size[dim];
2469 if (ddbox->tric_dir[dim])
2471 bfac *= ddbox->skew_fac[dim];
/* Compare own boundaries against the neighbour extrema
 * (cell_f_max0/cell_f_min1) communicated during DLB */
2473 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2474 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2482 /* This error should never be triggered under normal
2483 * circumstances, but you never know ...
2485 gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2486 gmx_step_str(step, buf),
2487 dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
/* Return the number of load measurements accumulated since the last
 * load balancing: flop counts when flop counting is enabled, otherwise
 * force-cycle counts. */
2495 static int dd_load_count(gmx_domdec_comm_t *comm)
2497 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF])
/* Return this rank's force-computation load metric for dynamic load
 * balancing: either the flop count (optionally with artificial random
 * imbalance for eFlop > 1, for testing) or the force cycle count with
 * the maximum outlier subtracted; when a GPU is shared between ranks,
 * the GPU wait time is replaced by the average over the sharing ranks. */
2500 static float dd_force_load(gmx_domdec_comm_t *comm)
2507 if (comm->eFlop > 1)
/* Test mode: perturb the load by up to +-5% per eFlop level.
 * NOTE(review): rand() here is intentionally non-reproducible? confirm */
2509 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2514 load = comm->cycl[ddCyclF];
2515 if (comm->cycl_n[ddCyclF] > 1)
2517 /* Subtract the maximum of the last n cycle counts
2518 * to get rid of possible high counts due to other sources,
2519 * for instance system activity, that would otherwise
2520 * affect the dynamic load balancing.
2522 load -= comm->cycl_max[ddCyclF];
2526 if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2528 float gpu_wait, gpu_wait_sum;
2530 gpu_wait = comm->cycl[ddCyclWaitGPU];
2531 if (comm->cycl_n[ddCyclF] > 1)
2533 /* We should remove the WaitGPU time of the same MD step
2534 * as the one with the maximum F time, since the F time
2535 * and the wait time are not independent.
2536 * Furthermore, the step for the max F time should be chosen
2537 * the same on all ranks that share the same GPU.
2538 * But to keep the code simple, we remove the average instead.
2539 * The main reason for artificially long times at some steps
2540 * is spurious CPU activity or MPI time, so we don't expect
2541 * that changes in the GPU wait time matter a lot here.
2543 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2545 /* Sum the wait times over the ranks that share the same GPU */
2546 MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2547 comm->mpi_comm_gpu_shared);
2548 /* Replace the wait time by the average over the ranks */
2549 load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
/* Allocate and fill *dim_f with the cumulative cell-boundary fractions
 * along dimension dim for static load balancing: user-specified
 * per-cell fractions (slb_frac) when present, uniform i/nc otherwise.
 * Entry 0 is implicitly 0 (allocation in extraction gap); last is 1. */
2557 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2559 gmx_domdec_comm_t *comm;
2564 snew(*dim_f, dd->nc[dim]+1);
2566 for (i = 1; i < dd->nc[dim]; i++)
2568 if (comm->slb_frac[dim])
2570 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2574 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2577 (*dim_f)[dd->nc[dim]] = 1;
/* Initialize the PME slab bookkeeping (ddpme) for PME decomposition
 * dimension dimind: determine the PME dimension and slab count, then
 * for every slab record the min/max PP cell coordinate (pp_min/pp_max)
 * of the DD ranks that map onto it, and finally the static boundary
 * fractions for the PME dimension. */
2580 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2582 int pmeindex, slab, nso, i;
/* Special case: DD decomposes y first while PME-x is not decomposed
 * (the adjustment made here falls in an extraction gap) */
2585 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2591 ddpme->dim = dimind;
2593 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2595 ddpme->nslab = (ddpme->dim == 0 ?
2596 dd->comm->npmenodes_x :
2597 dd->comm->npmenodes_y);
2599 if (ddpme->nslab <= 1)
/* nso: number of "super"-slab groups when npmenodes > nslab */
2604 nso = dd->comm->npmenodes/ddpme->nslab;
2605 /* Determine for each PME slab the PP location range for dimension dim */
2606 snew(ddpme->pp_min, ddpme->nslab);
2607 snew(ddpme->pp_max, ddpme->nslab);
/* Start with an empty (inverted) range per slab */
2608 for (slab = 0; slab < ddpme->nslab; slab++)
2610 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2611 ddpme->pp_max[slab] = 0;
2613 for (i = 0; i < dd->nnodes; i++)
2615 ddindex2xyz(dd->nc, i, xyz);
2616 /* For y only use our y/z slab.
2617 * This assumes that the PME x grid size matches the DD grid size.
2619 if (dimind == 0 || xyz[XX] == dd->ci[XX])
2621 pmeindex = ddindex2pmeindex(dd, i);
2624 slab = pmeindex/nso;
2628 slab = pmeindex % ddpme->nslab;
2630 ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2631 ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2635 set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
/* Return the maximum PME slab shift in x (0 if PME decomposition
 * dimension 0 is not x; the else-branch is in an extraction gap). */
2638 int dd_pme_maxshift_x(const gmx_domdec_t *dd)
2640 if (dd->comm->ddpme[0].dim == XX)
2642 return dd->comm->ddpme[0].maxshift;
/* Return the maximum PME slab shift in y, checking whichever of the
 * (up to two) PME decomposition dimensions is y. */
2650 int dd_pme_maxshift_y(const gmx_domdec_t *dd)
2652 if (dd->comm->ddpme[0].dim == YY)
2654 return dd->comm->ddpme[0].maxshift;
2656 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2658 return dd->comm->ddpme[1].maxshift;
/* Determine ddpme->maxshift: the maximum number of neighbouring PME
 * slabs a PP rank may need to communicate with, given the current cell
 * boundaries (cell_f) and the allowed atom displacement of 2/3 of the
 * cut-off outside a DD cell. Iterates sh upward until no slab boundary
 * is crossed (the loop over sh itself falls in an extraction gap). */
2666 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2667 gmx_bool bUniform, const gmx_ddbox_t *ddbox,
2670 gmx_domdec_comm_t *comm;
2673 real range, pme_boundary;
2677 nc = dd->nc[ddpme->dim];
2680 if (!ddpme->dim_match)
2682 /* PP decomposition is not along dim: the worst situation */
2685 else if (ns <= 3 || (bUniform && ns == nc))
2687 /* The optimal situation */
2692 /* We need to check for all pme nodes which nodes they
2693 * could possibly need to communicate with.
2695 xmin = ddpme->pp_min;
2696 xmax = ddpme->pp_max;
2697 /* Allow for atoms to be maximally 2/3 times the cut-off
2698 * out of their DD cell. This is a reasonable balance between
2699 * between performance and support for most charge-group/cut-off
2702 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2703 /* Avoid extra communication when we are exactly at a boundary */
2707 for (s = 0; s < ns; s++)
2709 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2710 pme_boundary = (real)s/ns;
/* Lower boundary check: can slab s-(sh+1) (with periodic wrap)
 * reach past this slab's lower boundary? */
2713 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2715 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2719 pme_boundary = (real)(s+1)/ns;
/* Upper boundary check: can slab s+(sh+1) (with periodic wrap)
 * reach below this slab's upper boundary? */
2722 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2724 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2731 ddpme->maxshift = sh;
2735 fprintf(debug, "PME slab communication range for dim %d is %d\n",
2736 ddpme->dim, ddpme->maxshift);
/* Fatal-error check that every bounded, decomposed box dimension is
 * large enough: box size times skew factor must be at least the cell
 * count times the minimum cell size (with margin DD_CELL_MARGIN). */
2740 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
2744 for (d = 0; d < dd->ndim; d++)
2747 if (dim < ddbox->nboundeddim &&
2748 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2749 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2751 gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2752 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2753 dd->nc[dim], dd->comm->cellsize_limit);
2759 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2762 /* Set the domain boundaries. Use for static (or no) load balancing,
2763 * and also for the starting state for dynamic load balancing.
2764 * setmode determine if and where the boundaries are stored, use enum above.
2765 * Returns the number communication pulses in npulse.
2767 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2768 int setmode, ivec npulse)
2770 gmx_domdec_comm_t *comm;
2773 real *cell_x, cell_dx, cellsize;
2777 for (d = 0; d < DIM; d++)
/* Start from the full (skew-corrected) box size in this dimension */
2779 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
/* Uniform case: single cell or no user-specified fractions */
2781 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2784 cell_dx = ddbox->box_size[d]/dd->nc[d];
2787 case setcellsizeslbMASTER:
/* Master stores all nc+1 boundaries for distribution */
2788 for (j = 0; j < dd->nc[d]+1; j++)
2790 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2793 case setcellsizeslbLOCAL:
2794 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2795 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
/* Count the pulses needed to cover the cut-off with this cell size */
2800 cellsize = cell_dx*ddbox->skew_fac[d];
2801 while (cellsize*npulse[d] < comm->cutoff)
2805 cellsize_min[d] = cellsize;
2809 /* Statically load balanced grid */
2810 /* Also when we are not doing a master distribution we determine
2811 * all cell borders in a loop to obtain identical values
2812 * to the master distribution case and to determine npulse.
2814 if (setmode == setcellsizeslbMASTER)
2816 cell_x = dd->ma->cell_x[d];
2820 snew(cell_x, dd->nc[d]+1);
2822 cell_x[0] = ddbox->box0[d];
2823 for (j = 0; j < dd->nc[d]; j++)
2825 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2826 cell_x[j+1] = cell_x[j] + cell_dx;
2827 cellsize = cell_dx*ddbox->skew_fac[d];
2828 while (cellsize*npulse[d] < comm->cutoff &&
2829 npulse[d] < dd->nc[d]-1)
2833 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2835 if (setmode == setcellsizeslbLOCAL)
2837 comm->cell_x0[d] = cell_x[dd->ci[d]];
2838 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2840 if (setmode != setcellsizeslbMASTER)
2845 /* The following limitation is to avoid that a cell would receive
2846 * some of its own home charge groups back over the periodic boundary.
2847 * Double charge groups cause trouble with the global indices.
2849 if (d < ddbox->npbcdim &&
2850 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2852 char error_string[STRLEN];
2854 sprintf(error_string,
2855 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2856 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2858 dd->nc[d], dd->nc[d],
2859 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
/* Collective fatal on the local path, plain fatal on the master path */
2861 if (setmode == setcellsizeslbLOCAL)
2863 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2868 gmx_fatal(FARGS, error_string);
2875 copy_rvec(cellsize_min, comm->cellsize_min);
/* Update the PME slab communication ranges for the new boundaries */
2878 for (d = 0; d < comm->npmedecompdim; d++)
2880 set_pme_maxshift(dd, &comm->ddpme[d],
2881 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2882 comm->ddpme[d].slb_dim_f);
2887 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2888 int d, int dim, domdec_root_t *root,
2889 const gmx_ddbox_t *ddbox,
2890 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2892 gmx_domdec_comm_t *comm;
2893 int ncd, i, j, nmin, nmin_old;
2894 gmx_bool bLimLo, bLimHi;
2896 real fac, halfway, cellsize_limit_f_i, region_size;
2897 gmx_bool bPBC, bLastHi = FALSE;
2898 int nrange[] = {range[0], range[1]};
2900 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2906 bPBC = (dim < ddbox->npbcdim);
2908 cell_size = root->buf_ncd;
2912 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2915 /* First we need to check if the scaling does not make cells
2916 * smaller than the smallest allowed size.
2917 * We need to do this iteratively, since if a cell is too small,
2918 * it needs to be enlarged, which makes all the other cells smaller,
2919 * which could in turn make another cell smaller than allowed.
2921 for (i = range[0]; i < range[1]; i++)
2923 root->bCellMin[i] = FALSE;
2929 /* We need the total for normalization */
2931 for (i = range[0]; i < range[1]; i++)
2933 if (root->bCellMin[i] == FALSE)
2935 fac += cell_size[i];
2938 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
2939 /* Determine the cell boundaries */
2940 for (i = range[0]; i < range[1]; i++)
2942 if (root->bCellMin[i] == FALSE)
2944 cell_size[i] *= fac;
2945 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
2947 cellsize_limit_f_i = 0;
2951 cellsize_limit_f_i = cellsize_limit_f;
2953 if (cell_size[i] < cellsize_limit_f_i)
2955 root->bCellMin[i] = TRUE;
2956 cell_size[i] = cellsize_limit_f_i;
2960 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
2963 while (nmin > nmin_old);
2966 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
2967 /* For this check we should not use DD_CELL_MARGIN,
2968 * but a slightly smaller factor,
2969 * since rounding could get use below the limit.
2971 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
2974 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
2975 gmx_step_str(step, buf),
2976 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
2977 ncd, comm->cellsize_min[dim]);
2980 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
2984 /* Check if the boundary did not displace more than halfway
2985 * each of the cells it bounds, as this could cause problems,
2986 * especially when the differences between cell sizes are large.
2987 * If changes are applied, they will not make cells smaller
2988 * than the cut-off, as we check all the boundaries which
2989 * might be affected by a change and if the old state was ok,
2990 * the cells will at most be shrunk back to their old size.
2992 for (i = range[0]+1; i < range[1]; i++)
2994 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
2995 if (root->cell_f[i] < halfway)
2997 root->cell_f[i] = halfway;
2998 /* Check if the change also causes shifts of the next boundaries */
2999 for (j = i+1; j < range[1]; j++)
3001 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3003 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3007 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3008 if (root->cell_f[i] > halfway)
3010 root->cell_f[i] = halfway;
3011 /* Check if the change also causes shifts of the next boundaries */
3012 for (j = i-1; j >= range[0]+1; j--)
3014 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3016 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3023 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3024 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3025 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3026 * for a and b nrange is used */
3029 /* Take care of the staggering of the cell boundaries */
3032 for (i = range[0]; i < range[1]; i++)
3034 root->cell_f_max0[i] = root->cell_f[i];
3035 root->cell_f_min1[i] = root->cell_f[i+1];
3040 for (i = range[0]+1; i < range[1]; i++)
3042 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3043 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3044 if (bLimLo && bLimHi)
3046 /* Both limits violated, try the best we can */
3047 /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
3048 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3049 nrange[0] = range[0];
3051 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3054 nrange[1] = range[1];
3055 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3061 /* root->cell_f[i] = root->bound_min[i]; */
3062 nrange[1] = i; /* only store violation location. There could be a LimLo violation following with an higher index */
3065 else if (bLimHi && !bLastHi)
3068 if (nrange[1] < range[1]) /* found a LimLo before */
3070 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3071 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3072 nrange[0] = nrange[1];
3074 root->cell_f[i] = root->bound_max[i];
3076 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3078 nrange[1] = range[1];
3081 if (nrange[1] < range[1]) /* found last a LimLo */
3083 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3084 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3085 nrange[0] = nrange[1];
3086 nrange[1] = range[1];
3087 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3089 else if (nrange[0] > range[0]) /* found at least one LimHi */
3091 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
/* Recompute the relative cell boundaries root->cell_f along
 * decomposition index d (Cartesian dimension dim) on the root rank of
 * this cell row, based on the measured loads in comm->load[d].
 * With bUniform all cells get equal size; otherwise each cell size is
 * rescaled by an under-relaxed fraction of its load imbalance, with the
 * overall rescaling capped by comm->dlb_scale_lim. The boundaries are
 * then clamped by dd_cell_sizes_dlb_root_enforce_limits().
 * The lower-dimension boundary fractions and the PME maxshift are
 * appended to root->cell_f so a single array can be broadcast to the
 * whole row (see distribute_dd_cell_sizes_dlb).
 */
3098 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3099 int d, int dim, domdec_root_t *root,
3100 const gmx_ddbox_t *ddbox,
3101 gmx_bool bDynamicBox,
3102 gmx_bool bUniform, gmx_int64_t step)
3104 gmx_domdec_comm_t *comm;
3105 int ncd, d1, i, pos;
3107 real load_aver, load_i, imbalance, change, change_max, sc;
3108 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3112 int range[] = { 0, 0 };
3116 /* Convert the maximum change from the input percentage to a fraction */
3117 change_limit = comm->dlb_scale_lim*0.01;
3121 bPBC = (dim < ddbox->npbcdim);
/* cell_size[] holds the relative (fractional) size of each cell in the row */
3123 cell_size = root->buf_ncd;
3125 /* Store the original boundaries */
3126 for (i = 0; i < ncd+1; i++)
3128 root->old_cell_f[i] = root->cell_f[i];
3132 for (i = 0; i < ncd; i++)
3134 cell_size[i] = 1.0/ncd;
3137 else if (dd_load_count(comm) > 0)
3139 load_aver = comm->load[d].sum_m/ncd;
3141 for (i = 0; i < ncd; i++)
3143 /* Determine the relative imbalance of cell i */
3144 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3145 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3146 /* Determine the change of the cell size using underrelaxation */
3147 change = -relax*imbalance;
3148 change_max = std::max(change_max, std::max(change, -change));
3150 /* Limit the amount of scaling.
3151 * We need to use the same rescaling for all cells in one row,
3152 * otherwise the load balancing might not converge.
3155 if (change_max > change_limit)
3157 sc *= change_limit/change_max;
3159 for (i = 0; i < ncd; i++)
3161 /* Determine the relative imbalance of cell i */
3162 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3163 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3164 /* Determine the change of the cell size using underrelaxation */
3165 change = -sc*imbalance;
3166 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
/* Minimum cell size and boundary-staggering limits as fractions of the
 * box, corrected for triclinic skew; with a dynamic box (e.g. pressure
 * coupling) the staggering margin is enlarged by DD_PRES_SCALE_MARGIN.
 */
3170 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3171 cellsize_limit_f *= DD_CELL_MARGIN;
3172 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3173 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3174 if (ddbox->tric_dir[dim])
3176 cellsize_limit_f /= ddbox->skew_fac[dim];
3177 dist_min_f /= ddbox->skew_fac[dim];
3179 if (bDynamicBox && d > 0)
3181 dist_min_f *= DD_PRES_SCALE_MARGIN;
3183 if (d > 0 && !bUniform)
3185 /* Make sure that the grid is not shifted too much */
3186 for (i = 1; i < ncd; i++)
3188 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3190 gmx_incons("Inconsistent DD boundary staggering limits!");
3192 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3193 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3196 root->bound_min[i] += 0.5*space;
3198 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3199 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3202 root->bound_max[i] += 0.5*space;
3207 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3209 root->cell_f_max0[i-1] + dist_min_f,
3210 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3211 root->cell_f_min1[i] - dist_min_f);
/* Pin the outer boundaries of the row and enforce all size limits */
3216 root->cell_f[0] = 0;
3217 root->cell_f[ncd] = 1;
3218 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3221 /* After the checks above, the cells should obey the cut-off
3222 * restrictions, but it does not hurt to check.
3224 for (i = 0; i < ncd; i++)
3228 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3229 dim, i, root->cell_f[i], root->cell_f[i+1]);
3232 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3233 root->cell_f[i+1] - root->cell_f[i] <
3234 cellsize_limit_f/DD_CELL_MARGIN)
3238 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3239 gmx_step_str(step, buf), dim2char(dim), i,
3240 (root->cell_f[i+1] - root->cell_f[i])
3241 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3246 /* Store the cell boundaries of the lower dimensions at the end */
3247 for (d1 = 0; d1 < d; d1++)
3249 root->cell_f[pos++] = comm->cell_f0[d1];
3250 root->cell_f[pos++] = comm->cell_f1[d1];
3253 if (d < comm->npmedecompdim)
3255 /* The master determines the maximum shift for
3256 * the coordinate communication between separate PME nodes.
3258 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
/* The integer maxshift is stored as a real so it can be broadcast
 * together with the boundaries in one array; receivers round it back
 * to int (see distribute_dd_cell_sizes_dlb).
 */
3260 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3263 root->cell_f[pos++] = comm->ddpme[1].maxshift;
/* Convert the relative cell boundary fractions comm->cell_f0/cell_f1
 * for decomposition index dimind into absolute coordinates
 * comm->cell_x0/cell_x1 by scaling with the box size along the
 * corresponding Cartesian dimension; dimensions that are not bounded
 * by the box are additionally shifted by the box origin box0.
 */
3267 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3268 const gmx_ddbox_t *ddbox,
3271 gmx_domdec_comm_t *comm;
3276 /* Set the cell dimensions */
3277 dim = dd->dim[dimind];
3278 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3279 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3280 if (dim >= ddbox->nboundeddim)
3282 comm->cell_x0[dim] += ddbox->box0[dim];
3283 comm->cell_x1[dim] += ddbox->box0[dim];
/* Broadcast the new relative cell boundaries (cell_f_row, computed on
 * the row root) to all ranks of this cell row and extract the parts
 * this rank needs: its own boundary fractions for dimension index d,
 * the fractions of the lower decomposition dimensions (converted to
 * absolute bounds), and the PME maxshift values, which were
 * communicated as reals and are rounded back to int here.
 */
3287 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3288 int d, int dim, real *cell_f_row,
3289 const gmx_ddbox_t *ddbox)
3291 gmx_domdec_comm_t *comm;
3297 /* Each node would only need to know two fractions,
3298 * but it is probably cheaper to broadcast the whole array.
3300 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3301 0, comm->mpi_comm_load[d]);
3303 /* Copy the fractions for this dimension from the buffer */
3304 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3305 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3306 /* The whole array was communicated, so set the buffer position */
3307 pos = dd->nc[dim] + 1;
3308 for (d1 = 0; d1 <= d; d1++)
3312 /* Copy the cell fractions of the lower dimensions */
3313 comm->cell_f0[d1] = cell_f_row[pos++];
3314 comm->cell_f1[d1] = cell_f_row[pos++];
3316 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3318 /* Convert the communicated shift from float to int */
3319 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3322 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
/* Recompute the DLB cell sizes for every decomposition dimension: for
 * each dimension the root rank of the cell row determines the new
 * boundaries with set_dd_cell_sizes_dlb_root(), then the whole row
 * receives them via distribute_dd_cell_sizes_dlb().
 */
3326 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3327 const gmx_ddbox_t *ddbox,
3328 gmx_bool bDynamicBox,
3329 gmx_bool bUniform, gmx_int64_t step)
3331 gmx_domdec_comm_t *comm;
3333 gmx_bool bRowMember, bRowRoot;
3338 for (d = 0; d < dd->ndim; d++)
/* bRowMember/bRowRoot are derived from this rank's coordinates in
 * dimension d and above; a nonzero coordinate in a higher dimension
 * disqualifies it from being the row root.
 */
3343 for (d1 = d; d1 < dd->ndim; d1++)
3345 if (dd->ci[dd->dim[d1]] > 0)
3358 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3359 ddbox, bDynamicBox, bUniform, step);
3360 cell_f_row = comm->root[d]->cell_f;
3364 cell_f_row = comm->cell_f_row;
3366 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
/* Refresh the absolute cell boundaries from the stored relative
 * fractions without recomputing the fractions themselves.
 */
3371 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3372 const gmx_ddbox_t *ddbox)
3376 /* This function assumes the box is static and should therefore
3377 * not be called when the box has changed since the last
3378 * call to dd_partition_system.
3380 for (d = 0; d < dd->ndim; d++)
3382 relative_to_absolute_cell_bounds(dd, ddbox, d);
/* Set the DLB cell sizes: when bDoDLB the boundaries are fully
 * recomputed (timed under the ewcDDCOMMBOUND cycle counter); when only
 * the box changed the stored fractions are simply rescaled. Dimensions
 * without domain decomposition (nc[dim] == 1) span the whole box.
 */
3388 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3389 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3390 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3391 gmx_wallcycle_t wcycle)
3393 gmx_domdec_comm_t *comm;
3400 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3401 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3402 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3404 else if (bDynamicBox)
3406 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3409 /* Set the dimensions for which no DD is used */
3410 for (dim = 0; dim < DIM; dim++)
3412 if (dd->nc[dim] == 1)
3414 comm->cell_x0[dim] = 0;
3415 comm->cell_x1[dim] = ddbox->box_size[dim];
3416 if (dim >= ddbox->nboundeddim)
3418 comm->cell_x0[dim] += ddbox->box0[dim];
3419 comm->cell_x1[dim] += ddbox->box0[dim];
/* Make sure the communication index arrays dd->comm->cd[d].ind can
 * hold npulse[] pulses for each decomposition dimension, reallocating
 * and clearing the newly added entries when the pulse count grows.
 */
3425 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3428 gmx_domdec_comm_dim_t *cd;
3430 for (d = 0; d < dd->ndim; d++)
3432 cd = &dd->comm->cd[d];
3433 np = npulse[dd->dim[d]];
3434 if (np > cd->np_nalloc)
3438 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3439 dim2char(dd->dim[d]), np);
/* Only note growth after the initial allocation (np_nalloc > 0) */
3441 if (DDMASTER(dd) && cd->np_nalloc > 0)
3443 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3445 srenew(cd->ind, np);
3446 for (i = cd->np_nalloc; i < np; i++)
3448 cd->ind[i].index = nullptr;
3449 cd->ind[i].nalloc = 0;
/* Top-level cell-size update for a repartitioning step: save the old
 * boundaries (needed for the charge-group displacement check), then
 * take either the dynamic-load-balancing path (set_dd_cell_sizes_dlb)
 * or the static path (set_dd_cell_sizes_slb + realloc_comm_ind).
 */
3458 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3459 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3460 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3461 gmx_wallcycle_t wcycle)
3463 gmx_domdec_comm_t *comm;
3469 /* Copy the old cell boundaries for the cg displacement check */
3470 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3471 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3477 check_box_size(dd, ddbox);
3479 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3483 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3484 realloc_comm_ind(dd, npulse);
3489 for (d = 0; d < DIM; d++)
3491 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3492 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
/* Check that each local cell (corrected for triclinic skew) is not
 * smaller than the minimum allowed size (fatal error otherwise), then
 * communicate the neighbor-search boundaries cell_ns_x0/cell_ns_x1.
 * With DLB active in more than one dimension the staggered grid is
 * additionally checked for too-large boundary jumps.
 */
3497 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3499 rvec cell_ns_x0, rvec cell_ns_x1,
3502 gmx_domdec_comm_t *comm;
3507 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3509 dim = dd->dim[dim_ind];
3511 /* Without PBC we don't have restrictions on the outer cells */
3512 if (!(dim >= ddbox->npbcdim &&
3513 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3515 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3516 comm->cellsize_min[dim])
3519 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3520 gmx_step_str(step, buf), dim2char(dim),
3521 comm->cell_x1[dim] - comm->cell_x0[dim],
3522 ddbox->skew_fac[dim],
3523 dd->comm->cellsize_min[dim],
3524 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3528 if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3530 /* Communicate the boundaries and update cell_ns_x0/1 */
3531 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3532 if (isDlbOn(dd->comm) && dd->ndim > 1)
3534 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
/* Build the triclinic correction matrix tcm used to map Cartesian
 * positions onto box-lattice coordinates: each off-diagonal entry
 * removes the tilt contribution of a box row (e.g. tcm[YY][XX]
 * cancels the y-row's x tilt, scaled by 1/box_yy).
 */
3539 static void make_tric_corr_matrix(int npbcdim, const matrix box, matrix tcm)
3543 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3551 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3552 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
/* Verify that the box shape is compatible with screw PBC; issues a
 * fatal error for non-zero off-diagonal x components or box_zy.
 */
3561 static void check_screw_box(const matrix box)
3563 /* Mathematical limitation */
3564 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3566 gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3569 /* Limitation due to the asymmetry of the eighth shell method */
3570 if (box[ZZ][YY] != 0)
3572 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
/* Master-only initial distribution of charge groups over the DD grid:
 * compute each charge group's center of geometry, wrap it (and its
 * atoms) into the box — handling triclinic dimensions via tcm and
 * mirroring y/z for screw PBC when crossing in x — find the owning
 * cell from ma->cell_x, collect per-cell index lists into ma->cg and
 * ma->index, and finally write atom-distribution statistics to fplog.
 */
3576 static void distribute_cg(FILE *fplog,
3577 const matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3580 gmx_domdec_master_t *ma;
3581 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3582 int i, icg, j, k, k0, k1, d;
3586 real nrcg, inv_ncg, pos_d;
/* Per-node temporary cg index lists, grown on demand below */
3592 snew(tmp_nalloc, dd->nnodes);
3593 snew(tmp_ind, dd->nnodes);
3594 for (i = 0; i < dd->nnodes; i++)
3596 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3597 snew(tmp_ind[i], tmp_nalloc[i]);
3600 /* Clear the count */
3601 for (i = 0; i < dd->nnodes; i++)
3607 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3609 cgindex = cgs->index;
3611 /* Compute the center of geometry for all charge groups */
3612 for (icg = 0; icg < cgs->nr; icg++)
3615 k1 = cgindex[icg+1];
3619 copy_rvec(pos[k0], cg_cm);
3626 for (k = k0; (k < k1); k++)
3628 rvec_inc(cg_cm, pos[k]);
3630 for (d = 0; (d < DIM); d++)
3632 cg_cm[d] *= inv_ncg;
3635 /* Put the charge group in the box and determine the cell index */
3636 for (d = DIM-1; d >= 0; d--)
3639 if (d < dd->npbcdim)
3641 bScrew = (dd->bScrewPBC && d == XX);
3642 if (tric_dir[d] && dd->nc[d] > 1)
3644 /* Use triclinic coordinates for this dimension */
3645 for (j = d+1; j < DIM; j++)
3647 pos_d += cg_cm[j]*tcm[j][d];
3650 while (pos_d >= box[d][d])
3653 rvec_dec(cg_cm, box[d]);
/* Screw PBC: a shift in x mirrors the y and z coordinates */
3656 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3657 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3659 for (k = k0; (k < k1); k++)
3661 rvec_dec(pos[k], box[d]);
3664 pos[k][YY] = box[YY][YY] - pos[k][YY];
3665 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3672 rvec_inc(cg_cm, box[d]);
3675 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3676 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3678 for (k = k0; (k < k1); k++)
3680 rvec_inc(pos[k], box[d]);
3683 pos[k][YY] = box[YY][YY] - pos[k][YY];
3684 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3689 /* This could be done more efficiently */
3691 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3696 i = dd_index(dd->nc, ind);
3697 if (ma->ncg[i] == tmp_nalloc[i])
3699 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3700 srenew(tmp_ind[i], tmp_nalloc[i]);
3702 tmp_ind[i][ma->ncg[i]] = icg;
3704 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
/* Concatenate the per-node lists into the master arrays */
3708 for (i = 0; i < dd->nnodes; i++)
3711 for (k = 0; k < ma->ncg[i]; k++)
3713 ma->cg[k1++] = tmp_ind[i][k];
3716 ma->index[dd->nnodes] = k1;
3718 for (i = 0; i < dd->nnodes; i++)
3727 // Use double for the sums to avoid natoms^2 overflowing
3729 int nat_sum, nat_min, nat_max;
3734 nat_min = ma->nat[0];
3735 nat_max = ma->nat[0];
3736 for (i = 0; i < dd->nnodes; i++)
3738 nat_sum += ma->nat[i];
3739 // cast to double to avoid integer overflows when squaring
3740 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3741 nat_min = std::min(nat_min, ma->nat[i]);
3742 nat_max = std::max(nat_max, ma->nat[i]);
3744 nat_sum /= dd->nnodes;
3745 nat2_sum /= dd->nnodes;
3747 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3750 static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
/* Establish the initial charge-group distribution: on the master, set
 * the static cell sizes and assign cgs to cells (distribute_cg), then
 * scatter per-rank cg/atom counts and the global cg indices to all
 * ranks; each rank builds its local cgindex from the received global
 * indices.
 */
3755 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3756 t_block *cgs, const matrix box, gmx_ddbox_t *ddbox,
3759 gmx_domdec_master_t *ma = nullptr;
3762 int *ibuf, buf2[2] = { 0, 0 };
3763 gmx_bool bMaster = DDMASTER(dd);
3771 check_screw_box(box);
3774 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3776 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
/* Pack cg and atom counts pairwise for the scatter below */
3777 for (i = 0; i < dd->nnodes; i++)
3779 ma->ibuf[2*i] = ma->ncg[i];
3780 ma->ibuf[2*i+1] = ma->nat[i];
3788 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3790 dd->ncg_home = buf2[0];
3791 dd->nat_home = buf2[1];
3792 dd->ncg_tot = dd->ncg_home;
3793 dd->nat_tot = dd->nat_home;
3794 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3796 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3797 srenew(dd->index_gl, dd->cg_nalloc);
3798 srenew(dd->cgindex, dd->cg_nalloc+1);
/* Byte counts and displacements for the variable-size scatter of the
 * global cg indices */
3802 for (i = 0; i < dd->nnodes; i++)
3804 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3805 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3810 bMaster ? ma->ibuf : nullptr,
3811 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3812 bMaster ? ma->cg : nullptr,
3813 dd->ncg_home*sizeof(int), dd->index_gl);
3815 /* Determine the home charge group sizes */
3817 for (i = 0; i < dd->ncg_home; i++)
3819 cg_gl = dd->index_gl[i];
3821 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3826 fprintf(debug, "Home charge groups:\n");
3827 for (i = 0; i < dd->ncg_home; i++)
3829 fprintf(debug, " %d", dd->index_gl[i]);
3832 fprintf(debug, "\n");
3835 fprintf(debug, "\n");
/* Atom-wise companion of compact_and_copy_vec_cg: compact the rvec
 * array src in place for charge groups that stay home and copy the
 * atoms of leaving charge groups into the per-direction buffers
 * comm->cgcm_state[m], at interleaved slot 'vec' of nvec vectors per
 * entry. Returns the updated home position counter.
 */
3839 static int compact_and_copy_vec_at(int ncg, int *move,
3842 rvec *src, gmx_domdec_comm_t *comm,
3845 int m, icg, i, i0, i1, nrcg;
3851 for (m = 0; m < DIM*2; m++)
3857 for (icg = 0; icg < ncg; icg++)
3859 i1 = cgindex[icg+1];
3865 /* Compact the home array in place */
3866 for (i = i0; i < i1; i++)
3868 copy_rvec(src[i], src[home_pos++]);
3874 /* Copy to the communication buffer */
/* Skip the header slot plus the preceding 'vec' vectors of this cg */
3876 pos_vec[m] += 1 + vec*nrcg;
3877 for (i = i0; i < i1; i++)
3879 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
/* Skip the remaining vectors so pos_vec ends past this cg's record */
3881 pos_vec[m] += (nvec - vec - 1)*nrcg;
3885 home_pos += i1 - i0;
/* Compact the per-charge-group rvec array src (one entry per cg, e.g.
 * cg_cm) in place for charge groups that stay home, and copy the entry
 * of each leaving cg into the header slot of its record in the
 * per-direction buffer comm->cgcm_state[m]; each record then reserves
 * room for nvec vectors per atom. Returns the updated home position.
 */
3893 static int compact_and_copy_vec_cg(int ncg, int *move,
3895 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3898 int m, icg, i0, i1, nrcg;
3904 for (m = 0; m < DIM*2; m++)
3910 for (icg = 0; icg < ncg; icg++)
3912 i1 = cgindex[icg+1];
3918 /* Compact the home array in place */
3919 copy_rvec(src[icg], src[home_pos++]);
3925 /* Copy to the communication buffer */
3926 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
/* Advance past the header slot and the nvec atom vectors of this cg */
3927 pos_vec[m] += 1 + nrcg*nvec;
/* Compact the charge-group bookkeeping arrays (cgindex, gatindex,
 * index_gl, cginfo) in place for cgs that stay on this rank, updating
 * the global-to-local atom lookup (ga2la) to the new local indices.
 * Atoms of moved cgs are removed from ga2la and their bLocalCG flag is
 * cleared. Returns the compacted home charge-group count.
 */
3939 static int compact_ind(int ncg, int *move,
3940 int *index_gl, int *cgindex,
3942 gmx_ga2la_t *ga2la, char *bLocalCG,
3945 int cg, nat, a0, a1, a, a_gl;
3950 for (cg = 0; cg < ncg; cg++)
3956 /* Compact the home arrays in place.
3957 * Anything that can be done here avoids access to global arrays.
3959 cgindex[home_pos] = nat;
3960 for (a = a0; a < a1; a++)
3963 gatindex[nat] = a_gl;
3964 /* The cell number stays 0, so we don't need to set it */
3965 ga2la_change_la(ga2la, a_gl, nat);
3968 index_gl[home_pos] = index_gl[cg];
3969 cginfo[home_pos] = cginfo[cg];
3970 /* The charge group remains local, so bLocalCG does not change */
3975 /* Clear the global indices */
3976 for (a = a0; a < a1; a++)
3978 ga2la_del(ga2la, gatindex[a]);
3982 bLocalCG[index_gl[cg]] = FALSE;
/* Close the compacted cgindex with the final atom count */
3986 cgindex[home_pos] = nat;
/* Non-compacting variant of moved-cg cleanup: for every charge group
 * flagged in move[], remove its atoms from the global-to-local lookup,
 * clear its bLocalCG flag, and mark it as moved in the NS grid's
 * cell_index (set to -1, later turned into the moved signal by
 * fill_grid).
 */
3991 static void clear_and_mark_ind(int ncg, int *move,
3992 int *index_gl, int *cgindex, int *gatindex,
3993 gmx_ga2la_t *ga2la, char *bLocalCG,
3998 for (cg = 0; cg < ncg; cg++)
4004 /* Clear the global indices */
4005 for (a = a0; a < a1; a++)
4007 ga2la_del(ga2la, gatindex[a]);
4011 bLocalCG[index_gl[cg]] = FALSE;
4013 /* Signal that this cg has moved using the ns cell index.
4014 * Here we set it to -1. fill_grid will change it
4015 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4017 cell_index[cg] = -1;
/* Write a diagnostic report for a charge group (or atom, when not
 * using charge groups) that moved farther than the decomposition
 * allows: the step, the offending dimension and direction, the
 * distance beyond the cell boundary, and the old/new coordinates and
 * cell boundaries. 'dir' selects which boundary was crossed (+1 upper,
 * otherwise lower); bHaveCgcmOld controls whether a limiting distance
 * and old coordinates are available to print.
 */
4022 static void print_cg_move(FILE *fplog,
4024 gmx_int64_t step, int cg, int dim, int dir,
4025 gmx_bool bHaveCgcmOld, real limitd,
4026 rvec cm_old, rvec cm_new, real pos_d)
4028 gmx_domdec_comm_t *comm;
4033 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4036 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4037 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4038 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4042 /* We don't have a limiting distance available: don't print it */
4043 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4044 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4045 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4047 fprintf(fplog, "distance out of cell %f\n",
4048 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4051 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4052 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4054 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4055 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4056 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4058 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4059 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4061 comm->cell_x0[dim], comm->cell_x1[dim]);
/* Report a too-large charge-group displacement via print_cg_move() to
 * the log file (when open) and to stderr, then terminate the run with
 * a fatal error explaining the likely cause (poor equilibration).
 */
4064 static void cg_move_error(FILE *fplog,
4066 gmx_int64_t step, int cg, int dim, int dir,
4067 gmx_bool bHaveCgcmOld, real limitd,
4068 rvec cm_old, rvec cm_new, real pos_d)
4072 print_cg_move(fplog, dd, step, cg, dim, dir,
4073 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4075 print_cg_move(stderr, dd, step, cg, dim, dir,
4076 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4078 "%s moved too far between two domain decomposition steps\n"
4079 "This usually means that your system is not well equilibrated",
4080 dd->comm->bCGs ? "A charge group" : "An atom");
/* Apply the screw-PBC rotation to atom a of the state: mirror the y
 * and z positions within the box and negate the y/z components of the
 * velocity and cg_p vectors, for each field present in state->flags.
 * Valid for a rectangular box only (see comment below).
 */
4083 static void rotate_state_atom(t_state *state, int a)
4085 if (state->flags & (1 << estX))
4087 /* Rotate the complete state; for a rectangular box only */
4088 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4089 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4091 if (state->flags & (1 << estV))
4093 state->v[a][YY] = -state->v[a][YY];
4094 state->v[a][ZZ] = -state->v[a][ZZ];
4096 if (state->flags & (1 << estCGP))
4098 state->cg_p[a][YY] = -state->cg_p[a][YY];
4099 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
/* Return comm->moved, first growing it (contents preserved) so it can
 * hold at least natoms entries.
 */
4103 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4105 if (natoms > comm->moved_nalloc)
4107 /* Contents should be preserved here */
4108 comm->moved_nalloc = over_alloc_dd(natoms);
4109 srenew(comm->moved, comm->moved_nalloc);
/* Thread worker for dd_redistribute_cg: for the charge groups in
 * [cg_start, cg_end) compute the new center of geometry, wrap it and
 * its atoms into the unit cell (triclinic dimensions use the tcm
 * correction matrix; screw PBC mirrors y/z via rotate_state_atom),
 * abort with cg_move_error when a cg moved beyond the allowed limits
 * (limit0/limit1) since the last partitioning, and encode the target
 * neighbor-cell index plus forward/backward direction flags into
 * move[cg].
 */
4115 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4118 ivec tric_dir, matrix tcm,
4119 rvec cell_x0, rvec cell_x1,
4120 rvec limitd, rvec limit0, rvec limit1,
4122 int cg_start, int cg_end,
4127 int cg, k, k0, k1, d, dim, d2;
4132 real inv_ncg, pos_d;
4135 npbcdim = dd->npbcdim;
4137 for (cg = cg_start; cg < cg_end; cg++)
/* Center of geometry of this charge group (atoms k0..k1-1) */
4144 copy_rvec(state->x[k0], cm_new);
4151 for (k = k0; (k < k1); k++)
4153 rvec_inc(cm_new, state->x[k]);
4155 for (d = 0; (d < DIM); d++)
4157 cm_new[d] = inv_ncg*cm_new[d];
4162 /* Do pbc and check DD cell boundary crossings */
4163 for (d = DIM-1; d >= 0; d--)
4167 bScrew = (dd->bScrewPBC && d == XX);
4168 /* Determine the location of this cg in lattice coordinates */
4172 for (d2 = d+1; d2 < DIM; d2++)
4174 pos_d += cm_new[d2]*tcm[d2][d];
4177 /* Put the charge group in the triclinic unit-cell */
4178 if (pos_d >= cell_x1[d])
/* Beyond the hard limit: the cg moved more than one cell */
4180 if (pos_d >= limit1[d])
4182 cg_move_error(fplog, dd, step, cg, d, 1,
4183 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4184 cg_cm[cg], cm_new, pos_d);
4187 if (dd->ci[d] == dd->nc[d] - 1)
4189 rvec_dec(cm_new, state->box[d]);
/* Screw PBC: crossing in x mirrors the y/z coordinates */
4192 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4193 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4195 for (k = k0; (k < k1); k++)
4197 rvec_dec(state->x[k], state->box[d]);
4200 rotate_state_atom(state, k);
4205 else if (pos_d < cell_x0[d])
4207 if (pos_d < limit0[d])
4209 cg_move_error(fplog, dd, step, cg, d, -1,
4210 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4211 cg_cm[cg], cm_new, pos_d);
4216 rvec_inc(cm_new, state->box[d]);
4219 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4220 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4222 for (k = k0; (k < k1); k++)
4224 rvec_inc(state->x[k], state->box[d]);
4227 rotate_state_atom(state, k);
4233 else if (d < npbcdim)
4235 /* Put the charge group in the rectangular unit-cell */
4236 while (cm_new[d] >= state->box[d][d])
4238 rvec_dec(cm_new, state->box[d]);
4239 for (k = k0; (k < k1); k++)
4241 rvec_dec(state->x[k], state->box[d]);
4244 while (cm_new[d] < 0)
4246 rvec_inc(cm_new, state->box[d]);
4247 for (k = k0; (k < k1); k++)
4249 rvec_inc(state->x[k], state->box[d]);
4255 copy_rvec(cm_new, cg_cm[cg]);
4257 /* Determine where this cg should go */
4260 for (d = 0; d < dd->ndim; d++)
4265 flag |= DD_FLAG_FW(d);
4271 else if (dev[dim] == -1)
4273 flag |= DD_FLAG_BW(d);
4276 if (dd->nc[dim] > 2)
4287 /* Temporarily store the flag in move */
4288 move[cg] = mc + flag;
4292 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4293 gmx_domdec_t *dd, ivec tric_dir,
4294 t_state *state, PaddedRVecVector *f,
4303 int ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
4304 int i, cg, k, d, dim, dim2, dir, d2, d3;
4305 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4306 int sbuf[2], rbuf[2];
4307 int home_pos_cg, home_pos_at, buf_pos;
4311 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4313 cginfo_mb_t *cginfo_mb;
4314 gmx_domdec_comm_t *comm;
4316 int nthread, thread;
4320 check_screw_box(state->box);
4324 if (fr->cutoff_scheme == ecutsGROUP)
4329 // Positions are always present, so there's nothing to flag
4330 bool bV = state->flags & (1<<estV);
4331 bool bCGP = state->flags & (1<<estCGP);
4333 if (dd->ncg_tot > comm->nalloc_int)
4335 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4336 srenew(comm->buf_int, comm->nalloc_int);
4338 move = comm->buf_int;
4340 npbcdim = dd->npbcdim;
4342 for (d = 0; (d < DIM); d++)
4344 limitd[d] = dd->comm->cellsize_min[d];
4345 if (d >= npbcdim && dd->ci[d] == 0)
4347 cell_x0[d] = -GMX_FLOAT_MAX;
4351 cell_x0[d] = comm->cell_x0[d];
4353 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4355 cell_x1[d] = GMX_FLOAT_MAX;
4359 cell_x1[d] = comm->cell_x1[d];
4363 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4364 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4368 /* We check after communication if a charge group moved
4369 * more than one cell. Set the pre-comm check limit to float_max.
4371 limit0[d] = -GMX_FLOAT_MAX;
4372 limit1[d] = GMX_FLOAT_MAX;
4376 make_tric_corr_matrix(npbcdim, state->box, tcm);
4378 cgindex = dd->cgindex;
4380 nthread = gmx_omp_nthreads_get(emntDomdec);
4382 /* Compute the center of geometry for all home charge groups
4383 * and put them in the box and determine where they should go.
4385 #pragma omp parallel for num_threads(nthread) schedule(static)
4386 for (thread = 0; thread < nthread; thread++)
4390 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4391 cell_x0, cell_x1, limitd, limit0, limit1,
4393 ( thread *dd->ncg_home)/nthread,
4394 ((thread+1)*dd->ncg_home)/nthread,
4395 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4398 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4401 for (cg = 0; cg < dd->ncg_home; cg++)
4406 flag = mc & ~DD_FLAG_NRCG;
4407 mc = mc & DD_FLAG_NRCG;
4410 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4412 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4413 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4415 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4416 /* We store the cg size in the lower 16 bits
4417 * and the place where the charge group should go
4418 * in the next 6 bits. This saves some communication volume.
4420 nrcg = cgindex[cg+1] - cgindex[cg];
4421 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4427 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4428 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4431 for (i = 0; i < dd->ndim*2; i++)
4433 *ncg_moved += ncg[i];
4446 /* Make sure the communication buffers are large enough */
4447 for (mc = 0; mc < dd->ndim*2; mc++)
4449 nvr = ncg[mc] + nat[mc]*nvec;
4450 if (nvr > comm->cgcm_state_nalloc[mc])
4452 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4453 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4457 switch (fr->cutoff_scheme)
4460 /* Recalculating cg_cm might be cheaper than communicating,
4461 * but that could give rise to rounding issues.
4464 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4465 nvec, cg_cm, comm, bCompact);
4468 /* Without charge groups we send the moved atom coordinates
4469 * over twice. This is so the code below can be used without
4470 * many conditionals for both for with and without charge groups.
4473 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4474 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4477 home_pos_cg -= *ncg_moved;
4481 gmx_incons("unimplemented");
4487 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4488 nvec, vec++, as_rvec_array(state->x.data()),
4492 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4493 nvec, vec++, as_rvec_array(state->v.data()),
4498 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4499 nvec, vec++, as_rvec_array(state->cg_p.data()),
4505 compact_ind(dd->ncg_home, move,
4506 dd->index_gl, dd->cgindex, dd->gatindex,
4507 dd->ga2la, comm->bLocalCG,
4512 if (fr->cutoff_scheme == ecutsVERLET)
4514 moved = get_moved(comm, dd->ncg_home);
4516 for (k = 0; k < dd->ncg_home; k++)
4523 moved = fr->ns->grid->cell_index;
4526 clear_and_mark_ind(dd->ncg_home, move,
4527 dd->index_gl, dd->cgindex, dd->gatindex,
4528 dd->ga2la, comm->bLocalCG,
4532 cginfo_mb = fr->cginfo_mb;
4534 *ncg_stay_home = home_pos_cg;
4535 for (d = 0; d < dd->ndim; d++)
4540 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4543 /* Communicate the cg and atom counts */
4548 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4549 d, dir, sbuf[0], sbuf[1]);
4551 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4553 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4555 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4556 srenew(comm->buf_int, comm->nalloc_int);
4559 /* Communicate the charge group indices, sizes and flags */
4560 dd_sendrecv_int(dd, d, dir,
4561 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4562 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4564 nvs = ncg[cdd] + nat[cdd]*nvec;
4565 i = rbuf[0] + rbuf[1] *nvec;
4566 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4568 /* Communicate cgcm and state */
4569 dd_sendrecv_rvec(dd, d, dir,
4570 comm->cgcm_state[cdd], nvs,
4571 comm->vbuf.v+nvr, i);
4572 ncg_recv += rbuf[0];
4576 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4577 if (fr->cutoff_scheme == ecutsGROUP)
4579 /* Here we resize to more than necessary and shrink later */
4580 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4583 /* Process the received charge groups */
4585 for (cg = 0; cg < ncg_recv; cg++)
4587 flag = comm->buf_int[cg*DD_CGIBS+1];
4589 if (dim >= npbcdim && dd->nc[dim] > 2)
4591 /* No pbc in this dim and more than one domain boundary.
4592 * We do a separate check if a charge group didn't move too far.
4594 if (((flag & DD_FLAG_FW(d)) &&
4595 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4596 ((flag & DD_FLAG_BW(d)) &&
4597 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4599 cg_move_error(fplog, dd, step, cg, dim,
4600 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4601 fr->cutoff_scheme == ecutsGROUP, 0,
4602 comm->vbuf.v[buf_pos],
4603 comm->vbuf.v[buf_pos],
4604 comm->vbuf.v[buf_pos][dim]);
4611 /* Check which direction this cg should go */
4612 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4614 if (isDlbOn(dd->comm))
4616 /* The cell boundaries for dimension d2 are not equal
4617 * for each cell row of the lower dimension(s),
4618 * therefore we might need to redetermine where
4619 * this cg should go.
4622 /* If this cg crosses the box boundary in dimension d2
4623 * we can use the communicated flag, so we do not
4624 * have to worry about pbc.
4626 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4627 (flag & DD_FLAG_FW(d2))) ||
4628 (dd->ci[dim2] == 0 &&
4629 (flag & DD_FLAG_BW(d2)))))
4631 /* Clear the two flags for this dimension */
4632 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4633 /* Determine the location of this cg
4634 * in lattice coordinates
4636 pos_d = comm->vbuf.v[buf_pos][dim2];
4639 for (d3 = dim2+1; d3 < DIM; d3++)
4642 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4645 /* Check of we are not at the box edge.
4646 * pbc is only handled in the first step above,
4647 * but this check could move over pbc while
4648 * the first step did not due to different rounding.
4650 if (pos_d >= cell_x1[dim2] &&
4651 dd->ci[dim2] != dd->nc[dim2]-1)
4653 flag |= DD_FLAG_FW(d2);
4655 else if (pos_d < cell_x0[dim2] &&
4658 flag |= DD_FLAG_BW(d2);
4660 comm->buf_int[cg*DD_CGIBS+1] = flag;
4663 /* Set to which neighboring cell this cg should go */
4664 if (flag & DD_FLAG_FW(d2))
4668 else if (flag & DD_FLAG_BW(d2))
4670 if (dd->nc[dd->dim[d2]] > 2)
4682 nrcg = flag & DD_FLAG_NRCG;
4685 if (home_pos_cg+1 > dd->cg_nalloc)
4687 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4688 srenew(dd->index_gl, dd->cg_nalloc);
4689 srenew(dd->cgindex, dd->cg_nalloc+1);
4691 /* Set the global charge group index and size */
4692 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4693 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4694 /* Copy the state from the buffer */
4695 if (fr->cutoff_scheme == ecutsGROUP)
4698 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4702 /* Set the cginfo */
4703 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4704 dd->index_gl[home_pos_cg]);
4707 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4710 for (i = 0; i < nrcg; i++)
4712 copy_rvec(comm->vbuf.v[buf_pos++],
4713 state->x[home_pos_at+i]);
4717 for (i = 0; i < nrcg; i++)
4719 copy_rvec(comm->vbuf.v[buf_pos++],
4720 state->v[home_pos_at+i]);
4725 for (i = 0; i < nrcg; i++)
4727 copy_rvec(comm->vbuf.v[buf_pos++],
4728 state->cg_p[home_pos_at+i]);
4732 home_pos_at += nrcg;
4736 /* Reallocate the buffers if necessary */
4737 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4739 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4740 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4742 nvr = ncg[mc] + nat[mc]*nvec;
4743 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4745 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4746 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4748 /* Copy from the receive to the send buffers */
4749 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4750 comm->buf_int + cg*DD_CGIBS,
4751 DD_CGIBS*sizeof(int));
4752 memcpy(comm->cgcm_state[mc][nvr],
4753 comm->vbuf.v[buf_pos],
4754 (1+nrcg*nvec)*sizeof(rvec));
4755 buf_pos += 1 + nrcg*nvec;
4762 /* With sorting (!bCompact) the indices are now only partially up to date
4763 * and ncg_home and nat_home are not the real count, since there are
4764 * "holes" in the arrays for the charge groups that moved to neighbors.
4766 if (fr->cutoff_scheme == ecutsVERLET)
4768 moved = get_moved(comm, home_pos_cg);
4770 for (i = dd->ncg_home; i < home_pos_cg; i++)
4775 dd->ncg_home = home_pos_cg;
4776 dd->nat_home = home_pos_at;
4778 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4780 /* We overallocated before, we need to set the right size here */
4781 dd_resize_state(state, f, dd->nat_home);
4787 "Finished repartitioning: cgs moved out %d, new home %d\n",
4788 *ncg_moved, dd->ncg_home-*ncg_moved);
4793 void dd_cycles_add(const gmx_domdec_t *dd, float cycles, int ddCycl)
4795 /* Note that the cycles value can be incorrect, either 0 or some
4796 * extremely large value, when our thread migrated to another core
4797 * with an unsynchronized cycle counter. If this happens less often
4798 * that once per nstlist steps, this will not cause issues, since
4799 * we later subtract the maximum value from the sum over nstlist steps.
4800 * A zero count will slightly lower the total, but that's a small effect.
4801 * Note that the main purpose of the subtraction of the maximum value
4802 * is to avoid throwing off the load balancing when stalls occur due
4803 * e.g. system activity or network congestion.
4805 dd->comm->cycl[ddCycl] += cycles;
4806 dd->comm->cycl_n[ddCycl]++;
4807 if (cycles > dd->comm->cycl_max[ddCycl])
4809 dd->comm->cycl_max[ddCycl] = cycles;
/* Returns a cost-weighted flop count estimate of the force calculation,
 * used as a load measure when cycle counters are unreliable (comm->eFlop).
 */
static double force_flop_count(t_nrnb *nrnb)
{
    int         i;
    double      sum;
    const char *name;

    sum = 0;
    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
    {
        /* To get closer to the real timings, we half the count
         * for the normal loops and again half it for water loops.
         */
        name = nrnb_str(i);
        if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
        {
            sum += nrnb->n[i]*0.25*cost_nrnb(i);
        }
        else
        {
            sum += nrnb->n[i]*0.50*cost_nrnb(i);
        }
    }
    /* Free-energy range: only the water kernels contribute here */
    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
    {
        name = nrnb_str(i);
        if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
        {
            sum += nrnb->n[i]*cost_nrnb(i);
        }
    }
    /* Bonded interactions count at full cost */
    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
    {
        sum += nrnb->n[i]*cost_nrnb(i);
    }

    return sum;
}
4851 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4853 if (dd->comm->eFlop)
4855 dd->comm->flop -= force_flop_count(nrnb);
4858 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4860 if (dd->comm->eFlop)
4862 dd->comm->flop += force_flop_count(nrnb);
4867 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4871 for (i = 0; i < ddCyclNr; i++)
4873 dd->comm->cycl[i] = 0;
4874 dd->comm->cycl_n[i] = 0;
4875 dd->comm->cycl_max[i] = 0;
4878 dd->comm->flop_n = 0;
/* Gathers the load measurements of all ranks, row by row along each DD
 * dimension (highest dimension first), and reduces them on the row roots,
 * so that the DD master ends up with the global load statistics.
 * NOTE(review): structural lines reconstructed from canonical GROMACS code;
 * confirm against the repository before relying on exact statement order.
 */
static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    domdec_load_t     *load;
    domdec_root_t     *root = nullptr;
    int                d, dim, i, pos;
    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
    gmx_bool           bSepPME;

    if (debug)
    {
        fprintf(debug, "get_load_distribution start\n");
    }

    wallcycle_start(wcycle, ewcDDCOMMLOAD);

    comm = dd->comm;

    bSepPME = (dd->pme_nodeid >= 0);

    if (dd->ndim == 0 && bSepPME)
    {
        /* Without decomposition, but with PME nodes, we need the load */
        comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
        comm->load[0].pme = comm->cycl[ddCyclPME];
    }

    for (d = dd->ndim-1; d >= 0; d--)
    {
        dim = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
        {
            load = &comm->load[d];
            if (isDlbOn(dd->comm))
            {
                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
            }
            pos = 0;
            if (d == dd->ndim-1)
            {
                /* Innermost dimension: pack our own measurements */
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                /* Forward the already-reduced load of the next dimension */
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (isDlbOn(dd->comm))
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are setup such that the root always has rank 0.
             */
#if GMX_MPI
            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
                       load->load, load->nload*sizeof(float), MPI_BYTE,
                       0, comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the root, process this row */
                if (isDlbOn(comm))
                {
                    root = comm->root[d];
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                pos            = 0;
                for (i = 0; i < dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max  = std::max(load->max, load->load[pos]);
                    pos++;
                    if (isDlbOn(dd->comm))
                    {
                        if (root->bLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum iso the average load.
                             */
                            load->sum_m = std::max(load->sum_m, load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = std::min(load->cvol_min, load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            load->flags = (int)(load->load[pos++] + 0.5);
                        }
                        if (d > 0)
                        {
                            root->cell_f_max0[i] = load->load[pos++];
                            root->cell_f_min1[i] = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = std::max(load->mdf, load->load[pos]);
                        pos++;
                        load->pme = std::max(load->pme, load->load[pos]);
                        pos++;
                    }
                }
                if (isDlbOn(comm) && root->bLimited)
                {
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        /* Accumulate the global statistics used for reporting */
        comm->nload      += dd_load_count(comm);
        comm->load_step  += comm->cycl[ddCyclStep];
        comm->load_sum   += comm->load[0].sum;
        comm->load_max   += comm->load[0].max;
        if (isDlbOn(comm))
        {
            for (d = 0; d < dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle, ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug, "get_load_distribution finished\n");
    }
}
5065 static float dd_force_load_fraction(gmx_domdec_t *dd)
5067 /* Return the relative performance loss on the total run time
5068 * due to the force calculation load imbalance.
5070 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5072 return dd->comm->load_sum/(dd->comm->load_step*dd->nnodes);
5080 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5082 /* Return the relative performance loss on the total run time
5083 * due to the force calculation load imbalance.
5085 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5088 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5089 (dd->comm->load_step*dd->nnodes);
/* Prints the end-of-run DLB/load-imbalance report to the log file and stderr.
 * Only the DD master rank prints, and only when loads were measured.
 * NOTE(review): structural lines (case labels, breaks, braces) reconstructed
 * from canonical GROMACS code; verify against the repository.
 */
static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm = dd->comm;

    /* Only the master rank prints loads and only if we measured loads */
    if (!DDMASTER(dd) || comm->nload == 0)
    {
        return;
    }

    char  buf[STRLEN];
    int   numPpRanks   = dd->nnodes;
    int   numPmeRanks  = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
    int   numRanks     = numPpRanks + numPmeRanks;
    float lossFraction = 0;

    /* Print the average load imbalance and performance loss */
    if (dd->nnodes > 1 && comm->load_sum > 0)
    {
        float imbalance = comm->load_max*numPpRanks/comm->load_sum - 1;
        lossFraction    = dd_force_imb_perf_loss(dd);

        std::string msg         = "\n Dynamic load balancing report:\n";
        std::string dlbStateStr = "";

        switch (dd->comm->dlbState)
        {
            case edlbsOffUser:
                dlbStateStr = "DLB was off during the run per user request.";
                break;
            case edlbsOffForever:
                /* Currently this can happen due to performance loss observed, cell size
                 * limitations or incompatibility with other settings observed during
                 * determineInitialDlbState(). */
                dlbStateStr = "DLB got disabled because it was unsuitable to use.";
                break;
            case edlbsOffCanTurnOn:
                dlbStateStr = "DLB was off during the run due to low measured imbalance.";
                break;
            case edlbsOffTemporarilyLocked:
                dlbStateStr = "DLB was locked at the end of the run due to unfinished PP-PME balancing.";
                break;
            case edlbsOnCanTurnOff:
                dlbStateStr = "DLB was turned on during the run due to measured imbalance.";
                break;
            case edlbsOnUser:
                dlbStateStr = "DLB was permanently on during the run per user request.";
                break;
            default:
                GMX_ASSERT(false, "Undocumented DLB state");
        }

        msg += " " + dlbStateStr + "\n";
        msg += gmx::formatString(" Average load imbalance: %.1f%%.\n", imbalance*100);
        msg += gmx::formatString(" The balanceable part of the MD step is %d%%, load imbalance is computed from this.\n",
                                 static_cast<int>(dd_force_load_fraction(dd)*100 + 0.5));
        msg += gmx::formatString(" Part of the total run time spent waiting due to load imbalance: %.1f%%.\n",
                                 lossFraction*100);
        fprintf(fplog, "%s", msg.c_str());
        fprintf(stderr, "%s", msg.c_str());
    }

    /* Print during what percentage of steps the load balancing was limited */
    bool dlbWasLimited = false;
    if (isDlbOn(comm))
    {
        sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
        for (int d = 0; d < dd->ndim; d++)
        {
            /* +1/2 to round the integer percentage */
            int limitPercentage = (200*comm->load_lim[d] + 1)/(2*comm->nload);
            sprintf(buf+strlen(buf), " %c %d %%",
                    dim2char(dd->dim[d]), limitPercentage);
            if (limitPercentage >= 50)
            {
                dlbWasLimited = true;
            }
        }
        sprintf(buf + strlen(buf), "\n");
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }

    /* Print the performance loss due to separate PME - PP rank imbalance */
    float lossFractionPme = 0;
    if (numPmeRanks > 0 && comm->load_mdf > 0 && comm->load_step > 0)
    {
        float pmeForceRatio = comm->load_pme/comm->load_mdf;
        lossFractionPme     = (comm->load_pme - comm->load_mdf)/comm->load_step;
        if (lossFractionPme <= 0)
        {
            /* The PME ranks have less load than the PP ranks */
            lossFractionPme *= numPmeRanks/static_cast<float>(numRanks);
        }
        else
        {
            lossFractionPme *= numPpRanks/static_cast<float>(numRanks);
        }
        sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
        sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
        fprintf(fplog, "%s", buf);
        fprintf(stderr, "%s", buf);
    }
    fprintf(fplog, "\n");
    fprintf(stderr, "\n");

    if (lossFraction >= DD_PERF_LOSS_WARN)
    {
        sprintf(buf,
                "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
                "      in the domain decomposition.\n", lossFraction*100);
        if (!isDlbOn(comm))
        {
            sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
        }
        else if (dlbWasLimited)
        {
            sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
        }
        fprintf(fplog, "%s\n", buf);
        fprintf(stderr, "%s\n", buf);
    }
    if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
    {
        sprintf(buf,
                "NOTE: %.1f %% performance was lost because the PME ranks\n"
                "      had %s work to do than the PP ranks.\n"
                "      You might want to %s the number of PME ranks\n"
                "      or %s the cut-off and the grid spacing.\n",
                fabs(lossFractionPme*100),
                (lossFractionPme < 0) ? "less" : "more",
                (lossFractionPme < 0) ? "decrease" : "increase",
                (lossFractionPme < 0) ? "decrease" : "increase");
        fprintf(fplog, "%s\n", buf);
        fprintf(stderr, "%s\n", buf);
    }
}
5235 static float dd_vol_min(gmx_domdec_t *dd)
5237 return dd->comm->load[0].cvol_min*dd->nnodes;
5240 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5242 return dd->comm->load[0].flags;
5245 static float dd_f_imbal(gmx_domdec_t *dd)
5247 if (dd->comm->load[0].sum > 0)
5249 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5253 /* Something is wrong in the cycle counting, report no load imbalance */
5258 float dd_pme_f_ratio(gmx_domdec_t *dd)
5260 /* Should only be called on the DD master rank */
5261 assert(DDMASTER(dd));
5263 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5265 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
/* Prints a one-line load report for this step to the log file.
 * NOTE(review): guards and buffer size reconstructed from canonical
 * GROMACS code; confirm against the repository.
 */
static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
{
    char  buf[22];
    int   flags, d;

    flags = dd_load_flags(dd);
    if (flags)
    {
        fprintf(fplog,
                "DD load balancing is limited by minimum cell size in dimension");
        for (d = 0; d < dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                fprintf(fplog, " %c", dim2char(dd->dim[d]));
            }
        }
        fprintf(fplog, "\n");
    }
    fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
    if (isDlbOn(dd->comm))
    {
        fprintf(fplog, " vol min/aver %5.3f%c",
                dd_vol_min(dd), flags ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
    }
    fprintf(fplog, "\n\n");
}
/* Prints a compact load report for this step to stderr (verbose mode) */
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (isDlbOn(dd->comm))
    {
        fprintf(stderr, "vol %4.2f%c ",
                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
    }
}
/* Creates the MPI communicator for one row of DD cells along dimension
 * dim_ind (the row containing grid location loc) and, on ranks that are
 * part of that row, allocates the load buffers; the row root additionally
 * gets the DLB cell-boundary bookkeeping arrays.
 * NOTE(review): braces/guards reconstructed from canonical GROMACS code.
 */
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
{
    MPI_Comm           c_row;
    int                dim, i, rank;
    ivec               loc_c;
    domdec_root_t     *root;
    gmx_bool           bPartOfGroup = FALSE;

    dim = dd->dim[dim_ind];
    copy_ivec(loc, loc_c);
    for (i = 0; i < dd->nc[dim]; i++)
    {
        loc_c[dim] = i;
        rank       = dd_index(dd->nc, loc_c);
        if (rank == dd->rank)
        {
            /* This process is part of the group */
            bPartOfGroup = TRUE;
        }
    }
    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
                   &c_row);
    if (bPartOfGroup)
    {
        dd->comm->mpi_comm_load[dim_ind] = c_row;
        if (!isDlbDisabled(dd->comm))
        {
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* This is the root process of this row */
                snew(dd->comm->root[dim_ind], 1);
                root = dd->comm->root[dim_ind];
                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
                snew(root->old_cell_f, dd->nc[dim]+1);
                snew(root->bCellMin, dd->nc[dim]);
                if (dim_ind > 0)
                {
                    snew(root->cell_f_max0, dd->nc[dim]);
                    snew(root->cell_f_min1, dd->nc[dim]);
                    snew(root->bound_min, dd->nc[dim]);
                    snew(root->bound_max, dd->nc[dim]);
                }
                snew(root->buf_ncd, dd->nc[dim]);
            }
            else
            {
                /* This is not a root process, we only need to receive cell_f */
                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
            }
        }
        if (dd->ci[dim] == dd->master_ci[dim])
        {
            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
        }
    }
}
/* Sets up communicators between PP ranks that share a GPU on the same
 * physical node, used for coordinating DLB when a GPU is oversubscribed.
 * No-op for non-PP ranks, ranks without a GPU, or non-MPI builds.
 */
void dd_setup_dlb_resource_sharing(t_commrec           *cr,
                                   int                  gpu_id)
{
#if GMX_MPI
    int           physicalnode_id_hash;
    gmx_domdec_t *dd;
    MPI_Comm      mpi_comm_pp_physicalnode;

    if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
    {
        /* Only ranks with short-ranged tasks (currently) use GPUs.
         * If we don't have GPUs assigned, there are no resources to share.
         */
        return;
    }

    physicalnode_id_hash = gmx_physicalnode_id_hash();

    dd = cr->dd;

    if (debug)
    {
        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
                dd->rank, physicalnode_id_hash, gpu_id);
    }
    /* Split the PP communicator over the physical nodes */
    /* TODO: See if we should store this (before), as it's also used for
     * the nodecomm summation.
     */
    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
                   &mpi_comm_pp_physicalnode);
    /* Within a physical node, group the ranks by assigned GPU */
    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
                   &dd->comm->mpi_comm_gpu_shared);
    MPI_Comm_free(&mpi_comm_pp_physicalnode);
    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);

    if (debug)
    {
        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
    }

    /* Note that some ranks could share a GPU, while others don't */

    if (dd->comm->nrank_gpu_shared == 1)
    {
        /* Nothing to share: release the communicator again */
        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
    }
#else
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(gpu_id);
#endif
}
/* Creates the row communicators for load measurement in all (up to 3)
 * DD dimensions; a no-op in non-MPI builds.
 * NOTE(review): loop structure reconstructed from canonical GROMACS code.
 */
static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
{
#if GMX_MPI
    int  dim0, dim1, i, j;
    ivec loc;

    if (debug)
    {
        fprintf(debug, "Making load communicators\n");
    }

    snew(dd->comm->load,          std::max(dd->ndim, 1));
    snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));

    if (dd->ndim == 0)
    {
        return;
    }

    clear_ivec(loc);
    make_load_communicator(dd, 0, loc);
    if (dd->ndim > 1)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            make_load_communicator(dd, 1, loc);
        }
    }
    if (dd->ndim > 2)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            dim1      = dd->dim[1];
            for (j = 0; j < dd->nc[dim1]; j++)
            {
                loc[dim1] = j;
                make_load_communicator(dd, 2, loc);
            }
        }
    }

    if (debug)
    {
        fprintf(debug, "Finished making load communicators\n");
    }
#endif
}
/*! \brief Sets up the relation between neighboring domains and zones
 *
 * Determines the forward/backward neighbor rank for each DD dimension,
 * fills in the zone shift vectors, and derives the i-zone/j-zone shift
 * ranges used for non-bonded pair searching.
 * NOTE(review): braces and some statements reconstructed from canonical
 * GROMACS code; confirm against the repository.
 */
static void setup_neighbor_relations(gmx_domdec_t *dd)
{
    int                     d, dim, i, j, m;
    ivec                    tmp, s;
    gmx_domdec_zones_t     *zones;
    gmx_domdec_ns_ranges_t *izone;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];
        copy_ivec(dd->ci, tmp);
        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
        copy_ivec(dd->ci, tmp);
        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
        if (debug)
        {
            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
                    dd->rank, dim,
                    dd->neighbor[d][0],
                    dd->neighbor[d][1]);
        }
    }

    int nzone  = (1 << dd->ndim);
    int nizone = (1 << std::max(dd->ndim - 1, 0));
    assert(nizone >= 1 && nizone <= DD_MAXIZONE);

    zones = &dd->comm->zones;

    for (i = 0; i < nzone; i++)
    {
        m = 0;
        clear_ivec(zones->shift[i]);
        for (d = 0; d < dd->ndim; d++)
        {
            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
        }
    }

    zones->n = nzone;
    for (i = 0; i < nzone; i++)
    {
        for (d = 0; d < DIM; d++)
        {
            /* Wrap the shifted cell coordinate back into the grid */
            s[d] = dd->ci[d] - zones->shift[i][d];
            if (s[d] < 0)
            {
                s[d] += dd->nc[d];
            }
            else if (s[d] >= dd->nc[d])
            {
                s[d] -= dd->nc[d];
            }
        }
    }
    zones->nizone = nizone;
    for (i = 0; i < zones->nizone; i++)
    {
        assert(ddNonbondedZonePairRanges[i][0] == i);

        izone     = &zones->izone[i];
        /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
         * j-zones up to nzone.
         */
        izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
        izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
        for (dim = 0; dim < DIM; dim++)
        {
            if (dd->nc[dim] == 1)
            {
                /* All shifts should be allowed */
                izone->shift0[dim] = -1;
                izone->shift1[dim] = 1;
            }
            else
            {
                /* Determine the min/max j-zone shift wrt the i-zone */
                izone->shift0[dim] = 1;
                izone->shift1[dim] = -1;
                for (j = izone->j0; j < izone->j1; j++)
                {
                    int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
                    if (shift_diff < izone->shift0[dim])
                    {
                        izone->shift0[dim] = shift_diff;
                    }
                    if (shift_diff > izone->shift1[dim])
                    {
                        izone->shift1[dim] = shift_diff;
                    }
                }
            }
        }
    }

    if (!isDlbDisabled(dd->comm))
    {
        snew(dd->comm->root, dd->ndim);
    }

    if (dd->comm->bRecordLoad)
    {
        make_load_communicators(dd);
    }
}
/* Creates (or copies) the particle-particle communicator, optionally as a
 * Cartesian communicator, and determines this rank's DD coordinates and the
 * rank/coordinates of the DD master.
 * NOTE(review): declarations, guards and the MASTER-rank branch were
 * reconstructed from canonical GROMACS code; confirm against the repository.
 */
static void make_pp_communicator(FILE                 *fplog,
                                 gmx_domdec_t         *dd,
                                 t_commrec gmx_unused *cr,
                                 int gmx_unused        reorder)
{
#if GMX_MPI
    gmx_domdec_comm_t *comm;
    int                rank, *buf;
    ivec               periods;
    MPI_Comm           comm_cart;

    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        /* Set up cartesian communication for the particle-particle part */
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
        }

        for (int i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
                        &comm_cart);
        /* We overwrite the old communicator with the new cartesian one */
        cr->mpi_comm_mygroup = comm_cart;
    }

    dd->mpi_comm_all = cr->mpi_comm_mygroup;
    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);

    if (comm->bCartesianPP_PME)
    {
        /* Since we want to use the original cartesian setup for sim,
         * and not the one after split, we need to make an index.
         */
        snew(comm->ddindex2ddnodeid, dd->nnodes);
        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
        /* Get the rank of the DD master,
         * above we made sure that the master node is a PP node.
         */
        if (MASTER(cr))
        {
            rank = dd->rank;
        }
        else
        {
            rank = 0;
        }
        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
    }
    else if (comm->bCartesianPP)
    {
        if (cr->npmenodes == 0)
        {
            /* The PP communicator is also
             * the communicator for this simulation
             */
            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
        }
        cr->nodeid = dd->rank;

        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);

        /* We need to make an index to go from the coordinates
         * to the nodeid of this simulation.
         */
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        if (thisRankHasDuty(cr, DUTY_PP))
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);

        /* Determine the master coordinates and rank.
         * The DD master should be the same node as the master of this sim.
         */
        for (int i = 0; i < dd->nnodes; i++)
        {
            if (comm->ddindex2simnodeid[i] == 0)
            {
                ddindex2xyz(dd->nc, i, dd->master_ci);
                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
            }
        }
        if (debug)
        {
            fprintf(debug, "The master rank is %d\n", dd->masterrank);
        }
    }
    else
    {
        /* No Cartesian communicators */
        /* We use the rank in dd->comm->all as DD index */
        ddindex2xyz(dd->nc, dd->rank, dd->ci);
        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
        dd->masterrank = 0;
        clear_ivec(dd->master_ci);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
    if (debug)
    {
        fprintf(debug,
                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
}
/* On PME-only ranks: participate in the all-reduce that builds the
 * DD-index to simulation-nodeid table (the PP-rank counterpart runs in
 * make_pp_communicator()). Only needed with a plain Cartesian PP grid.
 */
static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
                                      t_commrec            *cr)
{
#if GMX_MPI
    gmx_domdec_comm_t *comm = dd->comm;

    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
    {
        int *buf;
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        if (thisRankHasDuty(cr, DUTY_PP))
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);
    }
#else
    GMX_UNUSED_VALUE(dd);
    GMX_UNUSED_VALUE(cr);
#endif
}
/* Allocates the master-rank bookkeeping for distributing/collecting the
 * global state: per-rank charge-group counts/indices and cell boundaries.
 * The communication buffer vbuf is only needed when there are too many
 * ranks for plain send/receive distribution.
 */
static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
                                                     int ncg, int natoms)
{
    gmx_domdec_master_t *ma;
    int                  i;

    snew(ma, 1);

    snew(ma->ncg, dd->nnodes);
    snew(ma->index, dd->nnodes+1);
    snew(ma->cg, ncg);
    snew(ma->nat, dd->nnodes);
    snew(ma->ibuf, dd->nnodes*2);
    snew(ma->cell_x, DIM);
    for (i = 0; i < DIM; i++)
    {
        snew(ma->cell_x[i], dd->nc[i]+1);
    }

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        ma->vbuf = nullptr;
    }
    else
    {
        snew(ma->vbuf, natoms);
    }

    return ma;
}
/* Splits the simulation communicator into a PP and a PME-only part,
 * either via a combined Cartesian PP+PME grid (when the PME count divides
 * evenly into a y- or z-slab) or via a plain communicator split following
 * the requested rank order. Sets cr->duty accordingly.
 * NOTE(review): declarations, guards and duty assignments reconstructed
 * from canonical GROMACS code; confirm against the repository.
 */
static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
                               DdRankOrder gmx_unused rankOrder,
                               int gmx_unused reorder)
{
    gmx_domdec_comm_t *comm;
    int                i;
    gmx_bool           bDiv[DIM];
#if GMX_MPI
    MPI_Comm           comm_cart;
#endif

    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        for (i = 1; i < DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
                             !bDiv[YY] ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }

#if GMX_MPI
    if (comm->bCartesianPP_PME)
    {
        int  rank;
        ivec periods;

        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
        }

        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                        &comm_cart);
        MPI_Comm_rank(comm_cart, &rank);
        if (MASTER(cr) && rank != 0)
        {
            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }

        /* With this assignment we lose the link to the original communicator
         * which will usually be MPI_COMM_WORLD, unless have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid     = rank;

        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);

        if (fplog)
        {
            fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }

        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       getThisRankDuties(cr),
                       dd_index(comm->ntot, dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (rankOrder)
        {
            case DdRankOrder::pp_pme:
                if (fplog)
                {
                    fprintf(fplog, "Order of the ranks: PP first, PME last\n");
                }
                break;
            case DdRankOrder::interleave:
                /* Interleave the PP-only and PME-only ranks */
                if (fplog)
                {
                    fprintf(fplog, "Interleaving PP and PME ranks\n");
                }
                comm->pmenodes = dd_interleaved_pme_ranks(dd);
                break;
            case DdRankOrder::cartesian:
                break;
            default:
                gmx_fatal(FARGS, "Invalid ddRankOrder=%d", static_cast<int>(rankOrder));
        }

        if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       getThisRankDuties(cr),
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
    }
#endif

    if (fplog)
    {
        fprintf(fplog, "This rank does only %s work.\n\n",
                thisRankHasDuty(cr, DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}
/*! \brief Generates the MPI communicators for domain decomposition
 *
 * Splits off PME-only ranks when requested, builds (or copies) the PP
 * communicator, resolves each PP rank's PME partner, and allocates the
 * master-rank state-distribution bookkeeping on the DD master.
 * NOTE(review): guards and branches reconstructed from canonical GROMACS
 * code; confirm against the repository.
 */
static void make_dd_communicators(FILE *fplog, t_commrec *cr,
                                  gmx_domdec_t *dd, DdRankOrder ddRankOrder)
{
    gmx_domdec_comm_t *comm;
    int                CartReorder;

    comm = dd->comm;

    copy_ivec(dd->nc, comm->ntot);

    comm->bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
    comm->bCartesianPP_PME = FALSE;

    /* Reorder the nodes by default. This might change the MPI ranks.
     * Real reordering is only supported on very few architectures,
     * Blue Gene is one of them.
     */
    CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);

    if (cr->npmenodes > 0)
    {
        /* Split the communicator into a PP and PME part */
        split_communicator(fplog, cr, dd, ddRankOrder, CartReorder);
        if (comm->bCartesianPP_PME)
        {
            /* We (possibly) reordered the nodes in split_communicator,
             * so it is no longer required in make_pp_communicator.
             */
            CartReorder = FALSE;
        }
    }
    else
    {
        /* All nodes do PP and PME */
#if GMX_MPI
        /* We do not require separate communicators */
        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
#endif
    }

    if (thisRankHasDuty(cr, DUTY_PP))
    {
        /* Copy or make a new PP communicator */
        make_pp_communicator(fplog, dd, cr, CartReorder);
    }
    else
    {
        receive_ddindex2simnodeid(dd, cr);
    }

    if (!thisRankHasDuty(cr, DUTY_PME))
    {
        /* Set up the communication to our PME node */
        dd->pme_nodeid           = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
        dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
        if (debug)
        {
            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
                    dd->pme_nodeid, dd->pme_receive_vir_ener);
        }
    }
    else
    {
        dd->pme_nodeid = -1;
    }

    if (DDMASTER(dd))
    {
        dd->ma = init_gmx_domdec_master_t(dd,
                                          comm->cgs_gl.nr,
                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
    }
}
/* Parses a static load-balancing size string (mdrun -dds*-style input) for
 * direction dir with nc cells: reads nc numbers, normalizes them to
 * fractions summing to 1, and returns the allocated array; returns nullptr
 * when nc <= 1 or no string was given. Fatal error on malformed input.
 */
static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
{
    real  *slb_frac, tot;
    int    i, n;
    double dbl;

    slb_frac = nullptr;
    if (nc > 1 && size_string != nullptr)
    {
        if (fplog)
        {
            fprintf(fplog, "Using static load balancing for the %s direction\n",
                    dir);
        }
        snew(slb_frac, nc);
        tot = 0;
        for (i = 0; i < nc; i++)
        {
            dbl = 0;
            /* %n records how many characters were consumed */
            sscanf(size_string, "%20lf%n", &dbl, &n);
            if (dbl == 0)
            {
                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
            }
            slb_frac[i]  = dbl;
            size_string += n;
            tot         += slb_frac[i];
        }
        /* Normalize */
        if (fplog)
        {
            fprintf(fplog, "Relative cell sizes:");
        }
        for (i = 0; i < nc; i++)
        {
            slb_frac[i] /= tot;
            if (fplog)
            {
                fprintf(fplog, " %5.3f", slb_frac[i]);
            }
        }
        if (fplog)
        {
            fprintf(fplog, "\n");
        }
    }

    return slb_frac;
}
/* Counts the bonded interactions in the topology that involve more than
 * two atoms (these constrain the DD cell size more than pair bondeds).
 */
static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
{
    int                  n, nmol, ftype;
    gmx_mtop_ilistloop_t iloop;
    t_ilist             *il;

    n     = 0;
    iloop = gmx_mtop_ilistloop_init(mtop);
    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
    {
        for (ftype = 0; ftype < F_NRE; ftype++)
        {
            if ((interaction_function[ftype].flags & IF_BOND) &&
                NRAL(ftype) > 2)
            {
                /* Each entry is 1 type index + NRAL(ftype) atom indices */
                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
            }
        }
    }

    return n;
}
static int dd_getenv(FILE *fplog, const char *env_var, int def)
{
    /* Return the integer value of environment variable env_var, or def
     * when it is unset. A set but non-numeric value yields 1. When fplog
     * is non-NULL and the variable is set, the override is logged.
     */
    int         nst = def;
    const char *val = getenv(env_var);

    if (val != NULL)
    {
        if (sscanf(val, "%20d", &nst) <= 0)
        {
            nst = 1;
        }
        if (fplog != NULL)
        {
            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                    env_var, val, nst);
        }
    }

    return nst;
}
/* Prints a warning to stderr (master rank only) and to the log file */
static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
{
    if (MASTER(cr))
    {
        fprintf(stderr, "\n%s\n", warn_string);
    }
    if (fplog)
    {
        fprintf(fplog, "\n%s\n", warn_string);
    }
}
/* Checks run settings that are fundamentally incompatible with domain
 * decomposition and issues fatal errors (or a warning for comm-mode angular
 * with PBC, which only gives incorrect results in corner cases).
 */
static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
                                  const t_inputrec *ir, FILE *fplog)
{
    /* Screw PBC only allows decomposition along x */
    if (ir->ePBC == epbcSCREW &&
        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
    {
        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
    }

    if (ir->ns_type == ensSIMPLE)
    {
        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
    }

    if (ir->nstlist == 0)
    {
        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
    }

    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
    {
        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
    }
}
/* Return the smallest initial average cell size over all decomposed
 * dimensions, corrected for box skew. */
6142 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6147 r = ddbox->box_size[XX];
6148 for (di = 0; di < dd->ndim; di++)
6151 /* Check using the initial average cell size */
6152 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6158 /*! \brief Depending on the DLB initial value return the DLB switched off state or issue an error.
6160 static int forceDlbOffOrBail(int cmdlineDlbState,
6161 const std::string &reasonStr,
6165 std::string dlbNotSupportedErr = "Dynamic load balancing requested, but ";
6166 std::string dlbDisableNote = "NOTE: disabling dynamic load balancing as ";
/* The user explicitly asked for DLB (-dlb yes) but it cannot be used: abort. */
6168 if (cmdlineDlbState == edlbsOnUser)
/* NOTE(review): the concatenated message is passed as the format string;
 * if reasonStr ever contains '%' this misbehaves. Prefer
 * gmx_fatal(FARGS, "%s", (...).c_str()) — verify against project style. */
6170 gmx_fatal(FARGS, (dlbNotSupportedErr + reasonStr).c_str());
/* Auto mode: silently demote to permanently-off with a log note. */
6172 else if (cmdlineDlbState == edlbsOffCanTurnOn)
6174 dd_warning(cr, fplog, (dlbDisableNote + reasonStr + "\n").c_str());
6176 return edlbsOffForever;
6179 /*! \brief Return the dynamic load balancer's initial state based on initial conditions and user inputs.
6181 * This function parses the parameters of "-dlb" command line option setting
6182 * corresponding state values. Then it checks the consistency of the determined
6183 * state with other run parameters and settings. As a result, the initial state
6184 * may be altered or an error may be thrown if incompatibility of options is detected.
6186 * \param [in] fplog Pointer to mdrun log file.
6187 * \param [in] cr Pointer to MPI communication object.
6188 * \param [in] dlbOption Enum value for the DLB option.
6189 * \param [in] bRecordLoad True if the load balancer is recording load information.
6190 * \param [in] mdrunOptions Options for mdrun.
6191 * \param [in] ir Pointer mdrun to input parameters.
6192 * \returns DLB initial/startup state.
6194 static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
6195 DlbOption dlbOption, gmx_bool bRecordLoad,
6196 const MdrunOptions &mdrunOptions,
6197 const t_inputrec *ir)
6199 int dlbState = edlbsOffCanTurnOn;
6203 case DlbOption::turnOnWhenUseful: dlbState = edlbsOffCanTurnOn; break;
6204 case DlbOption::no: dlbState = edlbsOffUser; break;
6205 case DlbOption::yes: dlbState = edlbsOnUser; break;
6206 default: gmx_incons("Invalid dlbOption enum value");
6209 /* Reruns don't support DLB: bail or override auto mode */
6210 if (mdrunOptions.rerun)
6212 std::string reasonStr = "it is not supported in reruns.";
6213 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6216 /* Unsupported integrators */
6217 if (!EI_DYNAMICS(ir->eI))
6219 auto reasonStr = gmx::formatString("it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
6220 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6223 /* Without cycle counters we can't time work to balance on */
6226 std::string reasonStr = "cycle counters unsupported or not enabled in the operating system kernel.";
6227 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6230 if (mdrunOptions.reproducible)
6232 std::string reasonStr = "you started a reproducible run.";
6237 case edlbsOffForever:
6238 GMX_RELEASE_ASSERT(false, "edlbsOffForever is not a valid initial state");
6240 case edlbsOffCanTurnOn:
6241 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
6243 case edlbsOnCanTurnOff:
6244 GMX_RELEASE_ASSERT(false, "edlbsOffCanTurnOff is not a valid initial state");
6247 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", cr, fplog);
6250 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
/* Fill dd->dim[] with the decomposed dimensions (those with nc > 1), in
 * x,y,z order by default or z,y,x when GMX_DD_ORDER_ZYX is set, and set
 * dd->ndim accordingly. */
6258 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6263 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6265 /* Decomposition order z,y,x */
6268 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6270 for (dim = DIM-1; dim >= 0; dim--)
6272 if (dd->nc[dim] > 1)
6274 dd->dim[dd->ndim++] = dim;
6280 /* Decomposition order x,y,z */
6281 for (dim = 0; dim < DIM; dim++)
6283 if (dd->nc[dim] > 1)
6285 dd->dim[dd->ndim++] = dim;
/* Allocate and zero-initialize a gmx_domdec_comm_t: per-direction charge-group
 * flag/state buffers (2 per dimension), integer comm buffer, vector buffer,
 * load-balancing counters and the DD balance region. */
6291 static gmx_domdec_comm_t *init_dd_comm()
6293 gmx_domdec_comm_t *comm;
/* Two communication directions (forward/backward) per dimension. */
6297 snew(comm->cggl_flag, DIM*2);
6298 snew(comm->cgcm_state, DIM*2);
6299 for (i = 0; i < DIM*2; i++)
6301 comm->cggl_flag_nalloc[i] = 0;
6302 comm->cgcm_state_nalloc[i] = 0;
6305 comm->nalloc_int = 0;
6306 comm->buf_int = nullptr;
6308 vec_rvec_init(&comm->vbuf);
6310 comm->n_load_have = 0;
6311 comm->n_load_collect = 0;
/* Reset the per-category local atom counters used for load statistics. */
6313 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6315 comm->sum_nat[i] = 0;
6319 comm->load_step = 0;
6322 clear_ivec(comm->load_lim);
6326 /* This should be replaced by a unique pointer */
6327 comm->balanceRegion = ddBalanceRegionAllocate();
6332 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6333 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6334 const DomdecOptions &options,
6335 const MdrunOptions &mdrunOptions,
6336 const gmx_mtop_t *mtop,
6337 const t_inputrec *ir,
6338 const matrix box, const rvec *xGlobal,
6340 int *npme_x, int *npme_y)
6343 real r_bonded_limit = -1;
/* Safety margin applied to bonded distance estimates below. */
6344 const real tenPercentMargin = 1.1;
6345 gmx_domdec_comm_t *comm = dd->comm;
/* NOTE(review): these snew calls look like duplicates of the ones in
 * init_dd_comm — possible redundant allocation/leak; verify upstream. */
6347 snew(comm->cggl_flag, DIM*2);
6348 snew(comm->cgcm_state, DIM*2);
6350 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6351 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6353 dd->pme_recv_f_alloc = 0;
6354 dd->pme_recv_f_buf = nullptr;
6356 /* Initialize the GPU share count to 0, might change later */
6357 comm->nrank_gpu_shared = 0;
6359 comm->dlbState = determineInitialDlbState(fplog, cr, options.dlbOption, comm->bRecordLoad, mdrunOptions, ir);
6360 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6361 /* To consider turning DLB on after 2*nstlist steps we need to check
6362 * at partitioning count 3. Thus we need to increase the first count by 2.
6364 comm->ddPartioningCountFirstDlbOff += 2;
6368 fprintf(fplog, "Dynamic load balancing: %s\n",
6369 edlbs_names[comm->dlbState]);
6371 comm->bPMELoadBalDLBLimits = FALSE;
6373 /* Allocate the charge group/atom sorting struct */
6374 snew(comm->sort, 1);
/* True when charge groups contain more than one atom on average. */
6376 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
/* Bonded interactions between charge groups exist when there are more
 * charge groups than molecules, or explicit intermolecular interactions. */
6378 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6379 mtop->bIntermolecularInteractions);
6380 if (comm->bInterCGBondeds)
6382 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6386 comm->bInterCGMultiBody = FALSE;
6389 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6390 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6394 /* Set the cut-off to some very large value,
6395 * so we don't need if statements everywhere in the code.
6396 * We use sqrt, since the cut-off is squared in some places.
6398 comm->cutoff = GMX_CUTOFF_INF;
6402 comm->cutoff = ir->rlist;
6404 comm->cutoff_mbody = 0;
6406 /* Determine the minimum cell size limit, affected by many factors */
6407 comm->cellsize_limit = 0;
6408 comm->bBondComm = FALSE;
6410 /* We do not allow home atoms to move beyond the neighboring domain
6411 * between domain decomposition steps, which limits the cell size.
6412 * Get an estimate of cell size limit due to atom displacement.
6413 * In most cases this is a large overestimate, because it assumes
6414 * non-interaction atoms.
6415 * We set the chance to 1 in a trillion steps.
6417 constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
6418 const real limitForAtomDisplacement =
6419 minCellSizeForAtomDisplacement(*mtop, *ir, c_chanceThatAtomMovesBeyondDomain);
6423 "Minimum cell size due to atom displacement: %.3f nm\n",
6424 limitForAtomDisplacement);
6426 comm->cellsize_limit = std::max(comm->cellsize_limit,
6427 limitForAtomDisplacement);
6429 /* TODO: PME decomposition currently requires atoms not to be more than
6430 * 2/3 of comm->cutoff, which is >=rlist, outside of their domain.
6431 * In nearly all cases, limitForAtomDisplacement will be smaller
6432 * than 2/3*rlist, so the PME requirement is satisfied.
6433 * But it would be better for both correctness and performance
6434 * to use limitForAtomDisplacement instead of 2/3*comm->cutoff.
6435 * Note that we would need to improve the pairlist buffer case.
6438 if (comm->bInterCGBondeds)
/* User-specified bonded communication range (-rdd) takes precedence. */
6440 if (options.minimumCommunicationRange > 0)
6442 comm->cutoff_mbody = options.minimumCommunicationRange;
6443 if (options.useBondedCommunication)
6445 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6449 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6451 r_bonded_limit = comm->cutoff_mbody;
6453 else if (ir->bPeriodicMols)
6455 /* Can not easily determine the required cut-off */
6456 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6457 comm->cutoff_mbody = comm->cutoff/2;
6458 r_bonded_limit = comm->cutoff_mbody;
/* Measure actual bonded distances from the starting coordinates. */
6466 dd_bonded_cg_distance(fplog, mtop, ir, xGlobal, box,
6467 options.checkBondedInteractions,
6470 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6471 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6473 /* We use an initial margin of 10% for the minimum cell size,
6474 * except when we are just below the non-bonded cut-off.
6476 if (options.useBondedCommunication)
6478 if (std::max(r_2b, r_mb) > comm->cutoff)
6480 r_bonded = std::max(r_2b, r_mb);
6481 r_bonded_limit = tenPercentMargin*r_bonded;
6482 comm->bBondComm = TRUE;
6487 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6489 /* We determine cutoff_mbody later */
6493 /* No special bonded communication,
6494 * simply increase the DD cut-off.
6496 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6497 comm->cutoff_mbody = r_bonded_limit;
6498 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6504 "Minimum cell size due to bonded interactions: %.3f nm\n",
6507 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6511 if (dd->bInterCGcons && options.constraintCommunicationRange <= 0)
6513 /* There is a cell size limit due to the constraints (P-LINCS) */
6514 rconstr = constr_r_max(fplog, mtop, ir);
6518 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6520 if (rconstr > comm->cellsize_limit)
6522 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6526 else if (options.constraintCommunicationRange > 0 && fplog)
6528 /* Here we do not check for dd->bInterCGcons,
6529 * because one can also set a cell size limit for virtual sites only
6530 * and at this point we don't know yet if there are intercg v-sites.
6533 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6534 options.constraintCommunicationRange);
6535 rconstr = options.constraintCommunicationRange;
6537 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6539 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
/* A user-specified grid (-dd) bypasses automatic grid selection. */
6541 if (options.numCells[XX] > 0)
6543 copy_ivec(options.numCells, dd->nc);
6544 set_dd_dim(fplog, dd);
6545 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6547 if (options.numPmeRanks >= 0)
6549 cr->npmenodes = options.numPmeRanks;
6553 /* When the DD grid is set explicitly and -npme is set to auto,
6554 * don't use PME ranks. We check later if the DD grid is
6555 * compatible with the total number of ranks.
6560 real acs = average_cellsize_min(dd, ddbox);
6561 if (acs < comm->cellsize_limit)
6565 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6567 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6568 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6569 acs, comm->cellsize_limit);
6574 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, xGlobal, ddbox);
6576 /* We need to choose the optimal DD grid and possibly PME nodes */
6578 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6579 options.numPmeRanks,
6580 !isDlbDisabled(comm),
6582 comm->cellsize_limit, comm->cutoff,
6583 comm->bInterCGBondeds);
/* nc[XX] == 0 signals that no compatible grid was found. */
6585 if (dd->nc[XX] == 0)
6588 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6589 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6590 !bC ? "-rdd" : "-rcon",
6591 comm->dlbState != edlbsOffUser ? " or -dds" : "",
6592 bC ? " or your LINCS settings" : "");
6594 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6595 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6597 "Look in the log file for details on the domain decomposition",
6598 cr->nnodes-cr->npmenodes, limit, buf);
6600 set_dd_dim(fplog, dd);
6606 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6607 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6610 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
/* Sanity checks: grid size plus PME ranks must add up to total ranks. */
6611 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6613 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6614 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6615 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6617 if (cr->npmenodes > dd->nnodes)
6619 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6620 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6622 if (cr->npmenodes > 0)
6624 comm->npmenodes = cr->npmenodes;
6628 comm->npmenodes = dd->nnodes;
6631 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6633 /* The following choices should match those
6634 * in comm_cost_est in domdec_setup.c.
6635 * Note that here the checks have to take into account
6636 * that the decomposition might occur in a different order than xyz
6637 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6638 * in which case they will not match those in comm_cost_est,
6639 * but since that is mainly for testing purposes that's fine.
6641 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6642 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6643 getenv("GMX_PMEONEDD") == nullptr)
6645 comm->npmedecompdim = 2;
6646 comm->npmenodes_x = dd->nc[XX];
6647 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6651 /* In case nc is 1 in both x and y we could still choose to
6652 * decompose pme in y instead of x, but we use x for simplicity.
6654 comm->npmedecompdim = 1;
6655 if (dd->dim[0] == YY)
6657 comm->npmenodes_x = 1;
6658 comm->npmenodes_y = comm->npmenodes;
6662 comm->npmenodes_x = comm->npmenodes;
6663 comm->npmenodes_y = 1;
6668 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6669 comm->npmenodes_x, comm->npmenodes_y, 1);
6674 comm->npmedecompdim = 0;
6675 comm->npmenodes_x = 0;
6676 comm->npmenodes_y = 0;
6679 /* Technically we don't need both of these,
6680 * but it simplifies code not having to recalculate it.
6682 *npme_x = comm->npmenodes_x;
6683 *npme_y = comm->npmenodes_y;
6685 snew(comm->slb_frac, DIM);
6686 if (isDlbDisabled(comm))
6688 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], options.cellSizeX);
6689 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], options.cellSizeY);
6690 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], options.cellSizeZ);
6693 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6695 if (comm->bBondComm || !isDlbDisabled(comm))
6697 /* Set the bonded communication distance to halfway
6698 * the minimum and the maximum,
6699 * since the extra communication cost is nearly zero.
6701 real acs = average_cellsize_min(dd, ddbox);
6702 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6703 if (!isDlbDisabled(comm))
6705 /* Check if this does not limit the scaling */
6706 comm->cutoff_mbody = std::min(comm->cutoff_mbody,
6707 options.dlbScaling*acs);
6709 if (!comm->bBondComm)
6711 /* Without bBondComm do not go beyond the n.b. cut-off */
6712 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6713 if (comm->cellsize_limit >= comm->cutoff)
6715 /* We don't lose a lot of efficiency
6716 * when increasing it to the n.b. cut-off.
6717 * It can even be slightly faster, because we need
6718 * less checks for the communication setup.
6720 comm->cutoff_mbody = comm->cutoff;
6723 /* Check if we did not end up below our original limit */
6724 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6726 if (comm->cutoff_mbody > comm->cellsize_limit)
6728 comm->cellsize_limit = comm->cutoff_mbody;
6731 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6736 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6737 "cellsize limit %f\n",
6738 comm->bBondComm, comm->cellsize_limit);
6743 check_dd_restrictions(cr, dd, ir, fplog);
/* Activate the DLB cell-size limits: copy the precomputed DLB pulse counts
 * and minimum cell sizes into the active fields for each decomposed dim. */
6747 static void set_dlb_limits(gmx_domdec_t *dd)
6752 for (d = 0; d < dd->ndim; d++)
6754 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6755 dd->comm->cellsize_min[dd->dim[d]] =
6756 dd->comm->cellsize_min_dlb[dd->dim[d]];
/* Try to switch DLB on at the given step. If the current minimum cell size
 * is already within 5% of the limit, give up permanently instead; otherwise
 * record the pre-DLB performance and initialize a uniform cell grid. */
6761 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6764 gmx_domdec_comm_t *comm;
/* Find the smallest current cell size over all decomposed dimensions. */
6771 cellsize_min = comm->cellsize_min[dd->dim[0]];
6772 for (d = 1; d < dd->ndim; d++)
6774 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6777 /* Turn off DLB if we're too close to the cell size limit. */
6778 if (cellsize_min < comm->cellsize_limit*1.05)
6780 auto str = gmx::formatString("step %" GMX_PRId64 " Measured %.1f %% performance loss due to load imbalance, "
6781 "but the minimum cell size is smaller than 1.05 times the cell size limit."
6782 "Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6783 dd_warning(cr, fplog, str.c_str());
6785 comm->dlbState = edlbsOffForever;
6790 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6791 dd_warning(cr, fplog, buf);
6792 comm->dlbState = edlbsOnCanTurnOff;
6794 /* Store the non-DLB performance, so we can check if DLB actually
6795 * improves performance.
6797 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6798 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6802 /* We can set the required cell size info here,
6803 * so we do not need to communicate this.
6804 * The grid is completely uniform.
6806 for (d = 0; d < dd->ndim; d++)
6810 comm->load[d].sum_m = comm->load[d].sum;
6812 nc = dd->nc[dd->dim[d]];
/* Initialize uniform cell boundary fractions: cell i spans [i/nc, (i+1)/nc). */
6813 for (i = 0; i < nc; i++)
6815 comm->root[d]->cell_f[i] = i/(real)nc;
6818 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6819 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6822 comm->root[d]->cell_f[nc] = 1.0;
/* Switch DLB off (it may be turned on again later) because it is degrading
 * performance; remember the partitioning count so the first post-DLB steps
 * are excluded from cycle statistics. */
6827 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6829 gmx_domdec_t *dd = cr->dd;
6832 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6833 dd_warning(cr, fplog, buf);
6834 dd->comm->dlbState = edlbsOffCanTurnOn;
6835 dd->comm->haveTurnedOffDlb = true;
6836 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
/* Permanently disable DLB after it was tried and degraded performance.
 * Only legal from the off-but-can-turn-on state. */
6839 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6841 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6843 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6844 dd_warning(cr, fplog, buf);
6845 cr->dd->comm->dlbState = edlbsOffForever;
/* Allocate a per-charge-group "is local" flag array, initialized to FALSE
 * for all ncg_mtop(mtop) charge groups. Caller owns the returned array. */
6848 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6853 ncg = ncg_mtop(mtop);
6854 snew(bLocalCG, ncg);
6855 for (cg = 0; cg < ncg; cg++)
6857 bLocalCG[cg] = FALSE;
/* Initialize bonded-interaction bookkeeping for DD: build the reverse
 * topology, and when communicating beyond the cut-off is needed, also the
 * charge-group link table and the local-CG flags. */
6863 void dd_init_bondeds(FILE *fplog,
6865 const gmx_mtop_t *mtop,
6866 const gmx_vsite_t *vsite,
6867 const t_inputrec *ir,
6868 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6870 gmx_domdec_comm_t *comm;
6872 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6876 if (comm->bBondComm)
6878 /* Communicate atoms beyond the cut-off for bonded interactions */
6881 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6883 comm->bLocalCG = init_bLocalCG(mtop);
6887 /* Only communicate atoms based on cut-off */
6888 comm->cglink = nullptr;
6889 comm->bLocalCG = nullptr;
/* Write a human-readable summary of the DD setup to the log: pulse counts,
 * cell size limits, allowed DLB shrink, and maximum interaction distances.
 * No-op when fplog is null. */
6893 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6894 const gmx_mtop_t *mtop, const t_inputrec *ir,
6895 gmx_bool bDynLoadBal, real dlb_scale,
6896 const gmx_ddbox_t *ddbox)
6898 gmx_domdec_comm_t *comm;
6904 if (fplog == nullptr)
6913 fprintf(fplog, "The maximum number of communication pulses is:");
6914 for (d = 0; d < dd->ndim; d++)
6916 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6918 fprintf(fplog, "\n");
6919 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6920 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6921 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6922 for (d = 0; d < DIM; d++)
6926 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
/* Shrink factor = DLB minimum cell size over the uniform cell size. */
6933 comm->cellsize_min_dlb[d]/
6934 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6936 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6939 fprintf(fplog, "\n");
6943 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6944 fprintf(fplog, "The initial number of communication pulses is:");
6945 for (d = 0; d < dd->ndim; d++)
6947 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6949 fprintf(fplog, "\n");
6950 fprintf(fplog, "The initial domain decomposition cell size is:");
6951 for (d = 0; d < DIM; d++)
6955 fprintf(fplog, " %c %.2f nm",
6956 dim2char(d), dd->comm->cellsize_min[d]);
6959 fprintf(fplog, "\n\n");
6962 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6964 if (comm->bInterCGBondeds ||
6966 dd->bInterCGcons || dd->bInterCGsettles)
6968 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6969 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6970 "non-bonded interactions", "", comm->cutoff);
6974 limit = dd->comm->cellsize_limit;
6978 if (dynamic_dd_box(ddbox, ir))
6980 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
/* With a dynamic box, report the smallest current per-dimension cell size. */
6982 limit = dd->comm->cellsize_min[XX];
6983 for (d = 1; d < DIM; d++)
6985 limit = std::min(limit, dd->comm->cellsize_min[d]);
6989 if (comm->bInterCGBondeds)
6991 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6992 "two-body bonded interactions", "(-rdd)",
6993 std::max(comm->cutoff, comm->cutoff_mbody));
6994 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6995 "multi-body bonded interactions", "(-rdd)",
6996 (comm->bBondComm || isDlbOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7000 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7001 "virtual site constructions", "(-rcon)", limit);
7003 if (dd->bInterCGcons || dd->bInterCGsettles)
7005 sprintf(buf, "atoms separated by up to %d constraints",
7007 fprintf(fplog, "%40s %-7s %6.3f nm\n",
7008 buf, "(-rcon)", limit);
7010 fprintf(fplog, "\n");
/* Determine, for DLB, the maximum number of communication pulses per
 * dimension and the corresponding minimum DLB cell sizes, honoring the
 * -dds scaling factor and the GMX_DD_NPULSE override. */
7016 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7018 const t_inputrec *ir,
7019 const gmx_ddbox_t *ddbox)
7021 gmx_domdec_comm_t *comm;
7022 int d, dim, npulse, npulse_d_max, npulse_d;
/* Zero cut-offs mean there is effectively no interaction range limit. */
7027 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7029 /* Determine the maximum number of comm. pulses in one dimension */
7031 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7033 /* Determine the maximum required number of grid pulses */
7034 if (comm->cellsize_limit >= comm->cutoff)
7036 /* Only a single pulse is required */
7039 else if (!bNoCutOff && comm->cellsize_limit > 0)
7041 /* We round down slightly here to avoid overhead due to the latency
7042 * of extra communication calls when the cut-off
7043 * would be only slightly longer than the cell size.
7044 * Later cellsize_limit is redetermined,
7045 * so we can not miss interactions due to this rounding.
7047 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7051 /* There is no cell size limit */
7052 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7055 if (!bNoCutOff && npulse > 1)
7057 /* See if we can do with less pulses, based on dlb_scale */
7059 for (d = 0; d < dd->ndim; d++)
7062 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7063 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7064 npulse_d_max = std::max(npulse_d_max, npulse_d);
7066 npulse = std::min(npulse, npulse_d_max);
7069 /* This env var can override npulse */
7070 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7077 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7078 for (d = 0; d < dd->ndim; d++)
/* A cell can never need more pulses than it has neighbor cells. */
7080 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7081 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7082 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7083 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7084 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7086 comm->bVacDLBNoLimit = FALSE;
7090 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7091 if (!comm->bVacDLBNoLimit)
7093 comm->cellsize_limit = std::max(comm->cellsize_limit,
7094 comm->cutoff/comm->maxpulse);
7096 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7097 /* Set the minimum cell size for each DD dimension */
7098 for (d = 0; d < dd->ndim; d++)
7100 if (comm->bVacDLBNoLimit ||
7101 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7103 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7107 comm->cellsize_min_dlb[dd->dim[d]] =
7108 comm->cutoff/comm->cd[d].np_dlb;
7111 if (comm->cutoff_mbody <= 0)
7113 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
/* Return whether PBC must be taken into account for bonded interactions
 * with this decomposition (true PBC, inter-CG bondeds present, and not
 * every periodic dimension decomposed). */
7121 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7123 /* If each molecule is a single charge group
7124 * or we use domain decomposition for each periodic dimension,
7125 * we do not need to take pbc into account for the bonded interactions.
7127 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7130 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7133 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7134 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7135 const gmx_mtop_t *mtop, const t_inputrec *ir,
7136 const gmx_ddbox_t *ddbox)
7138 gmx_domdec_comm_t *comm;
7144 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7146 init_ddpme(dd, &comm->ddpme[0], 0);
7147 if (comm->npmedecompdim >= 2)
7149 init_ddpme(dd, &comm->ddpme[1], 1);
/* Without PME electrostatics, separate PME ranks make no sense. */
7154 comm->npmenodes = 0;
7155 if (dd->pme_nodeid >= 0)
7157 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7158 "Can not have separate PME ranks without PME electrostatics");
7164 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7166 if (!isDlbDisabled(comm))
7168 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7171 print_dd_settings(fplog, dd, mtop, ir, isDlbOn(comm), dlb_scale, ddbox);
/* When DLB might turn on later, also show the settings it would use. */
7172 if (comm->dlbState == edlbsOffCanTurnOn)
7176 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7178 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
/* Estimate the fraction of all atoms a rank sees (home + zone atoms),
 * used to size the global-to-local atom lookup table. */
7181 if (ir->ePBC == epbcNONE)
7183 vol_frac = 1 - 1/(double)dd->nnodes;
7188 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7192 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7194 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7196 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7199 /*! \brief Set some important DD parameters that can be modified by env.vars */
7200 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7202 gmx_domdec_comm_t *comm = dd->comm;
7204 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7205 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7206 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7207 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7208 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7209 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7210 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7212 if (dd->bSendRecv2 && fplog)
7214 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7221 fprintf(fplog, "Will load balance based on FLOP count\n");
/* eFlop > 1 adds rank-dependent noise to the FLOP count for testing;
 * seed the RNG per rank so ranks differ. */
7223 if (comm->eFlop > 1)
7225 srand(1 + rank_mysim);
7227 comm->bRecordLoad = TRUE;
/* Otherwise only record load when cycle counters work and recording
 * was not disabled via GMX_DD_RECORD_LOAD. */
7231 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
/* Default constructor: check bonded interactions and use bonded
 * communication, PP-PME rank ordering, automatic ranges (0 = auto),
 * DLB on-when-useful, and an unset (zeroed) DD grid. */
7235 DomdecOptions::DomdecOptions() :
7236 checkBondedInteractions(TRUE),
7237 useBondedCommunication(TRUE),
7239 rankOrder(DdRankOrder::pp_pme),
7240 minimumCommunicationRange(0),
7241 constraintCommunicationRange(0),
7242 dlbOption(DlbOption::turnOnWhenUseful),
7248 clear_ivec(numCells);
/* Top-level DD setup: allocate the DD struct, apply env-var overrides,
 * choose limits and the grid, create the communicators, and (on PP ranks)
 * finish the grid parameters and neighbor relations. Returns the new dd. */
7251 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7252 const DomdecOptions &options,
7253 const MdrunOptions &mdrunOptions,
7254 const gmx_mtop_t *mtop,
7255 const t_inputrec *ir,
7257 const rvec *xGlobal,
7259 int *npme_x, int *npme_y)
7266 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7271 dd->comm = init_dd_comm();
7273 set_dd_envvar_options(fplog, dd, cr->nodeid);
7275 set_dd_limits_and_grid(fplog, cr, dd, options, mdrunOptions,
7281 make_dd_communicators(fplog, cr, dd, options.rankOrder);
7283 if (thisRankHasDuty(cr, DUTY_PP))
7285 set_ddgrid_parameters(fplog, dd, options.dlbScaling, mtop, ir, ddbox);
7287 setup_neighbor_relations(dd);
7290 /* Set overallocation to avoid frequent reallocation of arrays */
7291 set_over_alloc_dd(TRUE);
7293 /* Initialize DD paritioning counters */
7294 dd->comm->partition_step = INT_MIN;
7297 /* We currently don't know the number of threads yet, we set this later */
7300 clear_dd_cycle_counts(dd);
/* Test whether the DD setup can accommodate the requested cut-off
 * cutoff_req: per dimension, the needed number of pulses must not exceed
 * the DLB pulse limit, and current cell sizes must be large enough.
 * The decision is made collectively (gmx_sumi) over all ranks. */
7305 static gmx_bool test_dd_cutoff(t_commrec *cr,
7306 t_state *state, const t_inputrec *ir,
7317 set_ddbox(dd, FALSE, cr, ir, state->box,
7318 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7322 for (d = 0; d < dd->ndim; d++)
/* DD_CELL_MARGIN guards against cells shrinking below the cut-off. */
7326 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7327 if (dynamic_dd_box(&ddbox, ir))
7329 inv_cell_size *= DD_PRES_SCALE_MARGIN;
/* Number of communication pulses needed for this cut-off. */
7332 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7334 if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
7336 if (np > dd->comm->cd[d].np_dlb)
7341 /* If a current local cell size is smaller than the requested
7342 * cut-off, we could still fix it, but this gets very complicated.
7343 * Without fixing here, we might actually need more checks.
7345 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7352 if (!isDlbDisabled(dd->comm))
7354 /* If DLB is not active yet, we don't need to check the grid jumps.
7355 * Actually we shouldn't, because then the grid jump data is not set.
7357 if (isDlbOn(dd->comm) &&
7358 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
/* Combine local verdicts across all ranks. */
7363 gmx_sumi(1, &LocallyLimited, cr);
7365 if (LocallyLimited > 0)
/* Try to change the DD communication cut-off to cutoff_req; only applies
 * the new value when test_dd_cutoff allows it. Returns whether it did. */
7374 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7377 gmx_bool bCutoffAllowed;
7379 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7383 cr->dd->comm->cutoff = cutoff_req;
7386 return bCutoffAllowed;
/* Called by the PME load balancer: turns on DLB staggering limits and
 * records the cut-off that must keep fitting in the decomposition.
 */
7389 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7391 gmx_domdec_comm_t *comm;
7393 comm = cr->dd->comm;
7395 /* Turn on the DLB limiting (might have been on already) */
7396 comm->bPMELoadBalDLBLimits = TRUE;
7398 /* Change the cut-off limit */
7399 comm->PMELoadBal_max_cutoff = cutoff;
/* NOTE(review): an if (debug) guard around this fprintf appears to be
 * missing from this extract — confirm against the full source. */
7403 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7404 comm->PMELoadBal_max_cutoff);
7408 /* Sets whether we should later check the load imbalance data, so that
7409 * we can trigger dynamic load balancing if enough imbalance has
7412 * Used after PME load balancing unlocks DLB, so that the check
7413 * whether DLB will be useful can happen immediately.
/* Only meaningful while DLB is off but allowed to turn on automatically */
7415 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7417 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7419 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7423 /* Store the DD partitioning count, so we can ignore cycle counts
7424 * over the next nstlist steps, which are often slower.
/* NOTE(review): this store presumably sits in a branch (e.g. only when
 * bValue is FALSE) whose guard was lost in this extract — verify. */
7426 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7431 /* Returns if we should check whether there has been enough load
7432 * imbalance to trigger dynamic load balancing.
7434 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
/* Never check unless DLB is in the off-but-can-turn-on state */
7436 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7441 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7443 /* We ignore the first nstlist steps at the start of the run
7444 * or after PME load balancing or after turning DLB off, since
7445 * these often have extra allocation or cache miss overhead.
7450 if (dd->comm->cycl_n[ddCyclStep] == 0)
7452 /* We can have zero timed steps when dd_partition_system is called
7453 * more than once at the same step, e.g. with replica exchange.
7454 * Turning on DLB would trigger an assertion failure later, but is
7455 * also useless right after exchanging replicas.
7460 /* We should check whether we should use DLB directly after
7462 if (dd->comm->bCheckWhetherToTurnDlbOn)
7464 /* This flag was set when the PME load-balancing routines
7465 unlocked DLB, and should now be cleared. */
7466 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7469 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7470 * partitionings (we do not do this every partioning, so that we
7471 * avoid excessive communication). */
7472 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7480 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7482 return isDlbOn(dd->comm);
7485 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7487 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7490 void dd_dlb_lock(gmx_domdec_t *dd)
7492 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7493 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7495 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7499 void dd_dlb_unlock(gmx_domdec_t *dd)
7501 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7502 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7504 dd->comm->dlbState = edlbsOffCanTurnOn;
7505 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
/* Merges newly received charge-group data (recv_i global indices and
 * recv_vr positions) into the per-zone arrays, shifting the data already
 * stored from previous communication pulses to make room.
 * NOTE(review): braces and some declarations are missing from this
 * extract; comments annotate the visible logic only.
 */
7509 static void merge_cg_buffers(int ncell,
7510 gmx_domdec_comm_dim_t *cd, int pulse,
7512 int *index_gl, int *recv_i,
7513 rvec *cg_cm, rvec *recv_vr,
7515 cginfo_mb_t *cginfo_mb, int *cginfo)
7517 gmx_domdec_ind_t *ind, *ind_p;
7518 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7519 int shift, shift_at;
7521 ind = &cd->ind[pulse];
7523 /* First correct the already stored data */
/* Total number of newly received charge groups over all cells */
7524 shift = ind->nrecv[ncell];
/* Walk cells from last to first so shifted data is not overwritten */
7525 for (cell = ncell-1; cell >= 0; cell--)
7527 shift -= ind->nrecv[cell];
7530 /* Move the cg's present from previous grid pulses */
7531 cg0 = ncg_cell[ncell+cell];
7532 cg1 = ncg_cell[ncell+cell+1];
7533 cgindex[cg1+shift] = cgindex[cg1];
/* Shift the stored global indices, positions, cg index and cg info */
7534 for (cg = cg1-1; cg >= cg0; cg--)
7536 index_gl[cg+shift] = index_gl[cg];
7537 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7538 cgindex[cg+shift] = cgindex[cg];
7539 cginfo[cg+shift] = cginfo[cg];
7541 /* Correct the already stored send indices for the shift */
7542 for (p = 1; p <= pulse; p++)
7544 ind_p = &cd->ind[p];
7546 for (c = 0; c < cell; c++)
7548 cg0 += ind_p->nsend[c];
7550 cg1 = cg0 + ind_p->nsend[cell];
7551 for (cg = cg0; cg < cg1; cg++)
7553 ind_p->index[cg] += shift;
7559 /* Merge in the communicated buffers */
7563 for (cell = 0; cell < ncell; cell++)
7565 cg1 = ncg_cell[ncell+cell+1] + shift;
7568 /* Correct the old cg indices */
7569 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7571 cgindex[cg+1] += shift_at;
/* Append the received charge groups for this cell */
7574 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7576 /* Copy this charge group from the buffer */
7577 index_gl[cg1] = recv_i[cg0];
7578 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7579 /* Add it to the cgindex */
7580 cg_gl = index_gl[cg1];
7581 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7582 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7583 cgindex[cg1+1] = cgindex[cg1] + nat;
7588 shift += ind->nrecv[cell];
7589 ncg_cell[ncell+cell+1] = cg1;
/* Records, for every pulse and zone, the atom index range
 * [cell2at0, cell2at1) received in that pulse, derived from cgindex.
 * Used for fast copying of communication buffers later on.
 */
7593 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7594 int nzone, int cg0, const int *cgindex)
7598 /* Store the atom block boundaries for easy copying of communication buffers
7601 for (zone = 0; zone < nzone; zone++)
7603 for (p = 0; p < cd->np; p++)
7605 cd->ind[p].cell2at0[zone] = cgindex[cg];
/* Advance by the number of charge groups received in this pulse/zone */
7606 cg += cd->ind[p].nrecv[zone];
7607 cd->ind[p].cell2at1[zone] = cgindex[cg];
/* Returns whether charge group cg_gl is bonded-linked to any charge group
 * that is not present locally (bLocalCG is a per-global-cg presence flag).
 * NOTE(review): the declaration of the result flag and the return
 * statements are missing from this extract.
 */
7612 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
/* Scan all link partners of cg_gl */
7618 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7620 if (!bLocalCG[link->a[i]])
7629 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
/* NOTE(review): the struct/typedef wrapper lines are missing from this
 * extract; the members below belong to the dd_corners_t type used by
 * set_dd_corners() and get_zone_pulse_cgs(). */
7631 real c[DIM][4]; /* the corners for the non-bonded communication */
7632 real cr0; /* corner for rounding */
7633 real cr1[4]; /* corners for rounding */
7634 real bc[DIM]; /* corners for bounded communication */
7635 real bcr1; /* corner for rounding for bonded communication */
7638 /* Determine the corners of the domain(s) we are communicating with */
/* Fills in the dd_corners_t c for up to 3 DD dimensions (dim0/dim1/dim2),
 * using staggered (DLB) cell limits where active. bc/bcr1 corners are
 * only relevant for multi-body bonded communication (bDistMB).
 * NOTE(review): braces and some guards are missing from this extract.
 */
7640 set_dd_corners(const gmx_domdec_t *dd,
7641 int dim0, int dim1, int dim2,
7645 const gmx_domdec_comm_t *comm;
7646 const gmx_domdec_zones_t *zones;
7651 zones = &comm->zones;
7653 /* Keep the compiler happy */
7657 /* The first dimension is equal for all cells */
7658 c->c[0][0] = comm->cell_x0[dim0];
7661 c->bc[0] = c->c[0][0];
7666 /* This cell row is only seen from the first row */
7667 c->c[1][0] = comm->cell_x0[dim1];
7668 /* All rows can see this row */
7669 c->c[1][1] = comm->cell_x0[dim1];
7670 if (isDlbOn(dd->comm))
/* With staggered cells take the highest boundary any sender row sees */
7672 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7675 /* For the multi-body distance we need the maximum */
7676 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7679 /* Set the upper-right corner for rounding */
7680 c->cr0 = comm->cell_x1[dim0];
7685 for (j = 0; j < 4; j++)
7687 c->c[2][j] = comm->cell_x0[dim2];
7689 if (isDlbOn(dd->comm))
7691 /* Use the maximum of the i-cells that see a j-cell */
7692 for (i = 0; i < zones->nizone; i++)
7694 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7699 std::max(c->c[2][j-4],
7700 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7706 /* For the multi-body distance we need the maximum */
7707 c->bc[2] = comm->cell_x0[dim2];
7708 for (i = 0; i < 2; i++)
7710 for (j = 0; j < 2; j++)
7712 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7718 /* Set the upper-right corner for rounding */
7719 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7720 * Only cell (0,0,0) can see cell 7 (1,1,1)
7722 c->cr1[0] = comm->cell_x1[dim1];
7723 c->cr1[3] = comm->cell_x1[dim1];
7724 if (isDlbOn(dd->comm))
7726 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7729 /* For the multi-body distance we need the maximum */
7730 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7737 /* Determine which cg's we need to send in this pulse from this zone */
/* For every charge group in [cg0,cg1) of zone zonei, computes its distance
 * to the corners of the receiving zone (rectangular or triclinic metric)
 * and appends it to the send list when it is within the non-bonded
 * (r_comm2) or bonded (r_bcomm2) communication distance. Output goes to
 * ind->index, *ibuf (global indices) and vbuf (pbc-corrected positions).
 * NOTE(review): braces, several declarations and some guards are missing
 * from this extract; comments annotate the visible logic only.
 */
7739 get_zone_pulse_cgs(gmx_domdec_t *dd,
7740 int zonei, int zone,
7742 const int *index_gl,
7744 int dim, int dim_ind,
7745 int dim0, int dim1, int dim2,
7746 real r_comm2, real r_bcomm2,
7750 real skew_fac2_d, real skew_fac_01,
7751 rvec *v_d, rvec *v_0, rvec *v_1,
7752 const dd_corners_t *c,
7754 gmx_bool bDistBonded,
7760 gmx_domdec_ind_t *ind,
7761 int **ibuf, int *ibuf_nalloc,
7767 gmx_domdec_comm_t *comm;
7769 gmx_bool bDistMB_pulse;
7771 real r2, rb2, r, tric_sh;
7774 int nsend_z, nsend, nat;
/* Screw pbc only applies along x */
7778 bScrew = (dd->bScrewPBC && dim == XX);
/* Multi-body distances only matter in pulses that carry bonded atoms */
7780 bDistMB_pulse = (bDistMB && bDistBonded);
7786 for (cg = cg0; cg < cg1; cg++)
7790 if (tric_dist[dim_ind] == 0)
7792 /* Rectangular direction, easy */
7793 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7800 r = cg_cm[cg][dim] - c->bc[dim_ind];
7806 /* Rounding gives at most a 16% reduction
7807 * in communicated atoms
7809 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7811 r = cg_cm[cg][dim0] - c->cr0;
7812 /* This is the first dimension, so always r >= 0 */
7819 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7821 r = cg_cm[cg][dim1] - c->cr1[zone];
7828 r = cg_cm[cg][dim1] - c->bcr1;
7838 /* Triclinic direction, more complicated */
7841 /* Rounding, conservative as the skew_fac multiplication
7842 * will slightly underestimate the distance.
7844 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7846 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7847 for (i = dim0+1; i < DIM; i++)
7849 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7851 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7854 rb[dim0] = rn[dim0];
7857 /* Take care that the cell planes along dim0 might not
7858 * be orthogonal to those along dim1 and dim2.
7860 for (i = 1; i <= dim_ind; i++)
7863 if (normal[dim0][dimd] > 0)
7865 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7868 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7873 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7875 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7877 for (i = dim1+1; i < DIM; i++)
7879 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7881 rn[dim1] += tric_sh;
7884 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7885 /* Take care of coupling of the distances
7886 * to the planes along dim0 and dim1 through dim2.
7888 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7889 /* Take care that the cell planes along dim1
7890 * might not be orthogonal to that along dim2.
7892 if (normal[dim1][dim2] > 0)
7894 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7900 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7903 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7904 /* Take care of coupling of the distances
7905 * to the planes along dim0 and dim1 through dim2.
7907 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7908 /* Take care that the cell planes along dim1
7909 * might not be orthogonal to that along dim2.
7911 if (normal[dim1][dim2] > 0)
7913 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7918 /* The distance along the communication direction */
7919 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7921 for (i = dim+1; i < DIM; i++)
7923 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7928 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7929 /* Take care of coupling of the distances
7930 * to the planes along dim0 and dim1 through dim2.
7932 if (dim_ind == 1 && zonei == 1)
7934 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7940 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7943 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7944 /* Take care of coupling of the distances
7945 * to the planes along dim0 and dim1 through dim2.
7947 if (dim_ind == 1 && zonei == 1)
7949 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
/* Bonded-communication criterion: within the bonded distance and,
 * with bBondComm, only cg's with a missing bonded link are sent */
7957 ((bDistMB && rb2 < r_bcomm2) ||
7958 (bDist2B && r2 < r_bcomm2)) &&
7960 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7961 missing_link(comm->cglink, index_gl[cg],
7964 /* Make an index to the local charge groups */
7965 if (nsend+1 > ind->nalloc)
7967 ind->nalloc = over_alloc_large(nsend+1);
7968 srenew(ind->index, ind->nalloc);
7970 if (nsend+1 > *ibuf_nalloc)
7972 *ibuf_nalloc = over_alloc_large(nsend+1);
7973 srenew(*ibuf, *ibuf_nalloc);
7975 ind->index[nsend] = cg;
7976 (*ibuf)[nsend] = index_gl[cg];
7978 vec_rvec_check_alloc(vbuf, nsend+1);
7980 if (dd->ci[dim] == 0)
7982 /* Correct cg_cm for pbc */
7983 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
/* Screw pbc: mirror y and z when shifting over the x boundary */
7986 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7987 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7992 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
/* Count the atoms in the sent charge group */
7995 nat += cgindex[cg+1] - cgindex[cg];
8001 *nsend_z_ptr = nsend_z;
/* Sets up the halo communication for all DD dimensions and pulses:
 * determines which charge groups to send (threaded via OpenMP),
 * exchanges counts, global indices and positions with the neighbor
 * ranks, and updates the zone charge-group ranges and cg info.
 * NOTE(review): braces, declarations and some guards are missing from
 * this extract; comments annotate the visible logic only.
 */
8004 static void setup_dd_communication(gmx_domdec_t *dd,
8005 matrix box, gmx_ddbox_t *ddbox,
8007 t_state *state, PaddedRVecVector *f)
8009 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8010 int nzone, nzone_send, zone, zonei, cg0, cg1;
8011 int c, i, cg, cg_gl, nrcg;
8012 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8013 gmx_domdec_comm_t *comm;
8014 gmx_domdec_zones_t *zones;
8015 gmx_domdec_comm_dim_t *cd;
8016 gmx_domdec_ind_t *ind;
8017 cginfo_mb_t *cginfo_mb;
8018 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
8019 real r_comm2, r_bcomm2;
8020 dd_corners_t corners;
8022 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
8023 real skew_fac2_d, skew_fac_01;
8030 fprintf(debug, "Setting up DD communication\n");
8037 /* Initialize the thread data.
8038 * This can not be done in init_domain_decomposition,
8039 * as the numbers of threads is determined later.
8041 comm->nth = gmx_omp_nthreads_get(emntDomdec);
8044 snew(comm->dth, comm->nth);
/* Pick the position source depending on the cut-off scheme */
8048 switch (fr->cutoff_scheme)
8054 cg_cm = as_rvec_array(state->x.data());
8057 gmx_incons("unimplemented");
8061 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8063 /* Check if we need to use triclinic distances */
8064 tric_dist[dim_ind] = 0;
8065 for (i = 0; i <= dim_ind; i++)
8067 if (ddbox->tric_dir[dd->dim[i]])
8069 tric_dist[dim_ind] = 1;
8074 bBondComm = comm->bBondComm;
8076 /* Do we need to determine extra distances for multi-body bondeds? */
8077 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8079 /* Do we need to determine extra distances for only two-body bondeds? */
8080 bDist2B = (bBondComm && !bDistMB);
8082 r_comm2 = gmx::square(comm->cutoff);
8083 r_bcomm2 = gmx::square(comm->cutoff_mbody);
8087 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8090 zones = &comm->zones;
8093 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8094 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8096 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8098 /* Triclinic stuff */
8099 normal = ddbox->normal;
8103 v_0 = ddbox->v[dim0];
8104 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8106 /* Determine the coupling coefficient for the distances
8107 * to the cell planes along dim0 and dim1 through dim2.
8108 * This is required for correct rounding.
8111 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8114 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8120 v_1 = ddbox->v[dim1];
8123 zone_cg_range = zones->cg_range;
8124 index_gl = dd->index_gl;
8125 cgindex = dd->cgindex;
8126 cginfo_mb = fr->cginfo_mb;
/* Zone 0 is the home zone */
8128 zone_cg_range[0] = 0;
8129 zone_cg_range[1] = dd->ncg_home;
8130 comm->zone_ncg1[0] = dd->ncg_home;
8131 pos_cg = dd->ncg_home;
8133 nat_tot = dd->nat_home;
/* Loop over the DD dimensions, doubling the zone count each time */
8135 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8137 dim = dd->dim[dim_ind];
8138 cd = &comm->cd[dim_ind];
8140 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8142 /* No pbc in this dimension, the first node should not comm. */
8150 v_d = ddbox->v[dim];
8151 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8153 cd->bInPlace = TRUE;
/* Loop over the communication pulses in this dimension */
8154 for (p = 0; p < cd->np; p++)
8156 /* Only atoms communicated in the first pulse are used
8157 * for multi-body bonded interactions or for bBondComm.
8159 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8164 for (zone = 0; zone < nzone_send; zone++)
8166 if (tric_dist[dim_ind] && dim_ind > 0)
8168 /* Determine slightly more optimized skew_fac's
8170 * This reduces the number of communicated atoms
8171 * by about 10% for 3D DD of rhombic dodecahedra.
8173 for (dimd = 0; dimd < dim; dimd++)
8175 sf2_round[dimd] = 1;
8176 if (ddbox->tric_dir[dimd])
8178 for (i = dd->dim[dimd]+1; i < DIM; i++)
8180 /* If we are shifted in dimension i
8181 * and the cell plane is tilted forward
8182 * in dimension i, skip this coupling.
8184 if (!(zones->shift[nzone+zone][i] &&
8185 ddbox->v[dimd][i][dimd] >= 0))
8188 gmx::square(ddbox->v[dimd][i][dimd]);
8191 sf2_round[dimd] = 1/sf2_round[dimd];
8196 zonei = zone_perm[dim_ind][zone];
8199 /* Here we permutate the zones to obtain a convenient order
8200 * for neighbor searching
8202 cg0 = zone_cg_range[zonei];
8203 cg1 = zone_cg_range[zonei+1];
8207 /* Look only at the cg's received in the previous grid pulse
8209 cg1 = zone_cg_range[nzone+zone+1];
8210 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
/* Each thread handles a contiguous slice of [cg0,cg1) */
8213 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8214 for (th = 0; th < comm->nth; th++)
8218 gmx_domdec_ind_t *ind_p;
8219 int **ibuf_p, *ibuf_nalloc_p;
8221 int *nsend_p, *nat_p;
8227 /* Thread 0 writes in the comm buffers */
8229 ibuf_p = &comm->buf_int;
8230 ibuf_nalloc_p = &comm->nalloc_int;
8231 vbuf_p = &comm->vbuf;
8234 nsend_zone_p = &ind->nsend[zone];
8238 /* Other threads write into temp buffers */
8239 ind_p = &comm->dth[th].ind;
8240 ibuf_p = &comm->dth[th].ibuf;
8241 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8242 vbuf_p = &comm->dth[th].vbuf;
8243 nsend_p = &comm->dth[th].nsend;
8244 nat_p = &comm->dth[th].nat;
8245 nsend_zone_p = &comm->dth[th].nsend_zone;
8247 comm->dth[th].nsend = 0;
8248 comm->dth[th].nat = 0;
8249 comm->dth[th].nsend_zone = 0;
8259 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8260 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8263 /* Get the cg's for this pulse in this zone */
8264 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8266 dim, dim_ind, dim0, dim1, dim2,
8269 normal, skew_fac2_d, skew_fac_01,
8270 v_d, v_0, v_1, &corners, sf2_round,
8271 bDistBonded, bBondComm,
8275 ibuf_p, ibuf_nalloc_p,
8280 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8283 /* Append data of threads>=1 to the communication buffers */
8284 for (th = 1; th < comm->nth; th++)
8286 dd_comm_setup_work_t *dth;
8289 dth = &comm->dth[th];
8291 ns1 = nsend + dth->nsend_zone;
8292 if (ns1 > ind->nalloc)
8294 ind->nalloc = over_alloc_dd(ns1);
8295 srenew(ind->index, ind->nalloc);
8297 if (ns1 > comm->nalloc_int)
8299 comm->nalloc_int = over_alloc_dd(ns1);
8300 srenew(comm->buf_int, comm->nalloc_int);
8302 if (ns1 > comm->vbuf.nalloc)
8304 comm->vbuf.nalloc = over_alloc_dd(ns1);
8305 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8308 for (i = 0; i < dth->nsend_zone; i++)
8310 ind->index[nsend] = dth->ind.index[i];
8311 comm->buf_int[nsend] = dth->ibuf[i];
8312 copy_rvec(dth->vbuf.v[i],
8313 comm->vbuf.v[nsend]);
8317 ind->nsend[zone] += dth->nsend_zone;
8320 /* Clear the counts in case we do not have pbc */
8321 for (zone = nzone_send; zone < nzone; zone++)
8323 ind->nsend[zone] = 0;
/* The two trailing entries carry the cg and atom totals */
8325 ind->nsend[nzone] = nsend;
8326 ind->nsend[nzone+1] = nat;
8327 /* Communicate the number of cg's and atoms to receive */
8328 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8329 ind->nsend, nzone+2,
8330 ind->nrecv, nzone+2);
8332 /* The rvec buffer is also required for atom buffers of size nsend
8333 * in dd_move_x and dd_move_f.
8335 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8339 /* We can receive in place if only the last zone is not empty */
8340 for (zone = 0; zone < nzone-1; zone++)
8342 if (ind->nrecv[zone] > 0)
8344 cd->bInPlace = FALSE;
8349 /* The int buffer is only required here for the cg indices */
8350 if (ind->nrecv[nzone] > comm->nalloc_int2)
8352 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8353 srenew(comm->buf_int2, comm->nalloc_int2);
8355 /* The rvec buffer is also required for atom buffers
8356 * of size nrecv in dd_move_x and dd_move_f.
8358 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8359 vec_rvec_check_alloc(&comm->vbuf2, i);
8363 /* Make space for the global cg indices */
8364 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8365 || dd->cg_nalloc == 0)
8367 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8368 srenew(index_gl, dd->cg_nalloc);
8369 srenew(cgindex, dd->cg_nalloc+1);
8371 /* Communicate the global cg indices */
/* In-place: receive directly at the end of index_gl */
8374 recv_i = index_gl + pos_cg;
8378 recv_i = comm->buf_int2;
8380 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8381 comm->buf_int, nsend,
8382 recv_i, ind->nrecv[nzone]);
8384 /* Make space for cg_cm */
8385 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8386 if (fr->cutoff_scheme == ecutsGROUP)
8392 cg_cm = as_rvec_array(state->x.data());
8394 /* Communicate cg_cm */
8397 recv_vr = cg_cm + pos_cg;
8401 recv_vr = comm->vbuf2.v;
8403 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8404 comm->vbuf.v, nsend,
8405 recv_vr, ind->nrecv[nzone]);
8407 /* Make the charge group index */
8410 zone = (p == 0 ? 0 : nzone - 1);
8411 while (zone < nzone)
8413 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8415 cg_gl = index_gl[pos_cg];
8416 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8417 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8418 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8421 /* Update the charge group presence,
8422 * so we can use it in the next pass of the loop.
8424 comm->bLocalCG[cg_gl] = TRUE;
8430 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8433 zone_cg_range[nzone+zone] = pos_cg;
8438 /* This part of the code is never executed with bBondComm. */
8439 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8440 index_gl, recv_i, cg_cm, recv_vr,
8441 cgindex, fr->cginfo_mb, fr->cginfo);
8442 pos_cg += ind->nrecv[nzone];
8444 nat_tot += ind->nrecv[nzone+1];
8448 /* Store the atom block for easy copying of communication buffers */
8449 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
/* Write back the (possibly reallocated) index arrays */
8453 dd->index_gl = index_gl;
8454 dd->cgindex = cgindex;
8456 dd->ncg_tot = zone_cg_range[zones->n];
8457 dd->nat_tot = nat_tot;
8458 comm->nat[ddnatHOME] = dd->nat_home;
8459 for (i = ddnatZONE; i < ddnatNR; i++)
8461 comm->nat[i] = dd->nat_tot;
8466 /* We don't need to update cginfo, since that was alrady done above.
8467 * So we pass NULL for the forcerec.
8469 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8470 nullptr, comm->bLocalCG);
8475 fprintf(debug, "Finished setting up DD communication, zones:");
8476 for (c = 0; c < zones->n; c++)
8478 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8480 fprintf(debug, "\n");
/* Derives per-i-zone charge-group boundaries (cg1 and the j-zone range
 * jcg0/jcg1) from the zone cg_range array.
 */
8484 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8488 for (c = 0; c < zones->nizone; c++)
8490 zones->izone[c].cg1 = zones->cg_range[c+1];
8491 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8492 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8496 /* \brief Set zone dimensions for zones \p zone_start to \p zone_end-1
8498 * Also sets the atom density for the home zone when \p zone_start=0.
8499 * For this \p numMovedChargeGroupsInHomeZone needs to be passed to tell
8500 * how many charge groups will move but are still part of the current range.
8501 * \todo When converting domdec to use proper classes, all these variables
8502 * should be private and a method should return the correct count
8503 * depending on an internal state.
8505 * \param[in,out] dd The domain decomposition struct
8506 * \param[in] box The box
8507 * \param[in] ddbox The domain decomposition box struct
8508 * \param[in] zone_start The start of the zone range to set sizes for
8509 * \param[in] zone_end The end of the zone range to set sizes for
8510 * \param[in] numMovedChargeGroupsInHomeZone The number of charge groups in the home zone that should moved but are still present in dd->comm->zones.cg_range
/* NOTE(review): braces, declarations and some guards are missing from
 * this extract; comments annotate the visible logic only. */
8512 static void set_zones_size(gmx_domdec_t *dd,
8513 matrix box, const gmx_ddbox_t *ddbox,
8514 int zone_start, int zone_end,
8515 int numMovedChargeGroupsInHomeZone)
8517 gmx_domdec_comm_t *comm;
8518 gmx_domdec_zones_t *zones;
8527 zones = &comm->zones;
8529 /* Do we need to determine extra distances for multi-body bondeds? */
8530 bDistMB = (comm->bInterCGMultiBody && isDlbOn(dd->comm) && dd->ndim > 1);
8532 for (z = zone_start; z < zone_end; z++)
8534 /* Copy cell limits to zone limits.
8535 * Valid for non-DD dims and non-shifted dims.
8537 copy_rvec(comm->cell_x0, zones->size[z].x0);
8538 copy_rvec(comm->cell_x1, zones->size[z].x1);
8541 for (d = 0; d < dd->ndim; d++)
8545 for (z = 0; z < zones->n; z++)
8547 /* With a staggered grid we have different sizes
8548 * for non-shifted dimensions.
8550 if (isDlbOn(dd->comm) && zones->shift[z][dim] == 0)
8554 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8555 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8559 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8560 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
/* rcs: cut-off; rcmbs: multi-body cut-off, both skew-corrected below */
8566 rcmbs = comm->cutoff_mbody;
8567 if (ddbox->tric_dir[dim])
8569 rcs /= ddbox->skew_fac[dim];
8570 rcmbs /= ddbox->skew_fac[dim];
8573 /* Set the lower limit for the shifted zone dimensions */
8574 for (z = zone_start; z < zone_end; z++)
8576 if (zones->shift[z][dim] > 0)
8579 if (!isDlbOn(dd->comm) || d == 0)
8581 zones->size[z].x0[dim] = comm->cell_x1[dim];
8582 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8586 /* Here we take the lower limit of the zone from
8587 * the lowest domain of the zone below.
8591 zones->size[z].x0[dim] =
8592 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8598 zones->size[z].x0[dim] =
8599 zones->size[zone_perm[2][z-4]].x0[dim];
8603 zones->size[z].x0[dim] =
8604 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8607 /* A temporary limit, is updated below */
8608 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8612 for (zi = 0; zi < zones->nizone; zi++)
8614 if (zones->shift[zi][dim] == 0)
8616 /* This takes the whole zone into account.
8617 * With multiple pulses this will lead
8618 * to a larger zone then strictly necessary.
8620 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8621 zones->size[zi].x1[dim]+rcmbs);
8629 /* Loop over the i-zones to set the upper limit of each
8632 for (zi = 0; zi < zones->nizone; zi++)
8634 if (zones->shift[zi][dim] == 0)
8636 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8638 if (zones->shift[z][dim] > 0)
8640 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8641 zones->size[zi].x1[dim]+rcs);
/* Compute bounding boxes (bb_x0/bb_x1) for the zones */
8648 for (z = zone_start; z < zone_end; z++)
8650 /* Initialization only required to keep the compiler happy */
8651 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8654 /* To determine the bounding box for a zone we need to find
8655 * the extreme corners of 4, 2 or 1 corners.
8657 nc = 1 << (ddbox->nboundeddim - 1);
8659 for (c = 0; c < nc; c++)
8661 /* Set up a zone corner at x=0, ignoring trilinic couplings */
8665 corner[YY] = zones->size[z].x0[YY];
8669 corner[YY] = zones->size[z].x1[YY];
8673 corner[ZZ] = zones->size[z].x0[ZZ];
8677 corner[ZZ] = zones->size[z].x1[ZZ];
8679 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8680 box[ZZ][1 - dd->dim[0]] != 0)
8682 /* With 1D domain decomposition the cg's are not in
8683 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8684 * Shift the corner of the z-vector back to along the box
8685 * vector of dimension d, so it will later end up at 0 along d.
8686 * This can affect the location of this corner along dd->dim[0]
8687 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8689 int d = 1 - dd->dim[0];
8691 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8693 /* Apply the triclinic couplings */
8694 assert(ddbox->npbcdim <= DIM);
8695 for (i = YY; i < ddbox->npbcdim; i++)
8697 for (j = XX; j < i; j++)
8699 corner[j] += corner[i]*box[i][j]/box[i][i];
8704 copy_rvec(corner, corner_min);
8705 copy_rvec(corner, corner_max);
8709 for (i = 0; i < DIM; i++)
8711 corner_min[i] = std::min(corner_min[i], corner[i]);
8712 corner_max[i] = std::max(corner_max[i], corner[i]);
8716 /* Copy the extreme cornes without offset along x */
8717 for (i = 0; i < DIM; i++)
8719 zones->size[z].bb_x0[i] = corner_min[i];
8720 zones->size[z].bb_x1[i] = corner_max[i];
8722 /* Add the offset along x */
8723 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8724 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
/* Home-zone density: cg count (excluding leavers) per zone volume */
8727 if (zone_start == 0)
8730 for (dim = 0; dim < DIM; dim++)
8732 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8734 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0] - numMovedChargeGroupsInHomeZone)/vol;
8739 for (z = zone_start; z < zone_end; z++)
8741 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8743 zones->size[z].x0[XX], zones->size[z].x1[XX],
8744 zones->size[z].x0[YY], zones->size[z].x1[YY],
8745 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8746 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8748 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8749 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8750 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
/* qsort comparison function for gmx_cgsort_t: primary key is the ns grid
 * cell index (nsc), secondary key is the global topology index (ind_gl).
 * NOTE(review): the tie-break condition and return statement are missing
 * from this extract.
 */
8755 static int comp_cgsort(const void *a, const void *b)
8759 gmx_cgsort_t *cga, *cgb;
8760 cga = (gmx_cgsort_t *)a;
8761 cgb = (gmx_cgsort_t *)b;
8763 comp = cga->nsc - cgb->nsc;
8766 comp = cga->ind_gl - cgb->ind_gl;
/* Permutes an int array in place according to the sort order in sort[],
 * using buf as scratch space.
 * NOTE(review): the parameter continuation line and the copy-back
 * assignment are missing from this extract.
 */
8772 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8777 /* Order the data */
8778 for (i = 0; i < n; i++)
8780 buf[i] = a[sort[i].ind];
8783 /* Copy back to the original array */
8784 for (i = 0; i < n; i++)
/* Permutes an rvec array in place according to the sort order in sort[],
 * using buf as scratch space.
 * NOTE(review): the parameter continuation line is missing from this
 * extract.
 */
8790 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8795 /* Order the data */
8796 for (i = 0; i < n; i++)
8798 copy_rvec(v[sort[i].ind], buf[i]);
8801 /* Copy back to the original array */
8802 for (i = 0; i < n; i++)
8804 copy_rvec(buf[i], v[i]);
/* Permutes per-atom rvec data according to the charge-group sort order,
 * expanding each charge group to its atom range via cgindex.
 * When cgindex is nullptr there is one atom per cg and the cheaper
 * order_vec_cg() is used instead.
 */
8808 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8811 int a, atot, cg, cg0, cg1, i;
8813 if (cgindex == nullptr)
8815 /* Avoid the useless loop of the atoms within a cg */
8816 order_vec_cg(ncg, sort, v, buf);
8821 /* Order the data */
8823 for (cg = 0; cg < ncg; cg++)
/* Atom range of the charge group at this sorted position */
8825 cg0 = cgindex[sort[cg].ind];
8826 cg1 = cgindex[sort[cg].ind+1];
8827 for (i = cg0; i < cg1; i++)
8829 copy_rvec(v[i], buf[a]);
8835 /* Copy back to the original array */
8836 for (a = 0; a < atot; a++)
8838 copy_rvec(buf[a], v[a]);
/* Produces a fully sorted array sort1 by qsorting the (unordered) new
 * entries sort_new and merging them with the already-sorted sort2,
 * comparing on (nsc, ind_gl) as in comp_cgsort.
 */
8842 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8843 int nsort_new, gmx_cgsort_t *sort_new,
8844 gmx_cgsort_t *sort1)
8848 /* The new indices are not very ordered, so we qsort them */
8849 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8851 /* sort2 is already ordered, so now we can merge the two arrays */
/* Standard two-way merge: take from whichever list has the smaller key */
8855 while (i2 < nsort2 || i_new < nsort_new)
8859 sort1[i1++] = sort_new[i_new++];
8861 else if (i_new == nsort_new)
8863 sort1[i1++] = sort2[i2++];
8865 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8866 (sort2[i2].nsc == sort_new[i_new].nsc &&
8867 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8869 sort1[i1++] = sort2[i2++];
8873 sort1[i1++] = sort_new[i_new++];
/* Determines the sort order of the home charge groups for the group
 * cut-off scheme, based on their ns grid cell index. Uses an efficient
 * merge of stationary cg's with the moved ones when the previous count
 * ncg_home_old is known, otherwise falls back to a full qsort.
 * Returns the number of home charge groups that remain on this rank.
 * NOTE(review): braces, some counters' initialization and the return
 * statement are missing from this extract.
 */
8878 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8880 gmx_domdec_sort_t *sort;
8881 gmx_cgsort_t *cgsort, *sort_i;
8882 int ncg_new, nsort2, nsort_new, i, *a, moved;
8884 sort = dd->comm->sort;
/* Per-cg ns grid cell indices; values >= moved flag cg's that left */
8886 a = fr->ns->grid->cell_index;
8888 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8890 if (ncg_home_old >= 0)
8892 /* The charge groups that remained in the same ns grid cell
8893 * are completely ordered. So we can sort efficiently by sorting
8894 * the charge groups that did move into the stationary list.
8899 for (i = 0; i < dd->ncg_home; i++)
8901 /* Check if this cg did not move to another node */
8904 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8906 /* This cg is new on this node or moved ns grid cell */
8907 if (nsort_new >= sort->sort_new_nalloc)
8909 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8910 srenew(sort->sort_new, sort->sort_new_nalloc);
8912 sort_i = &(sort->sort_new[nsort_new++]);
8916 /* This cg did not move */
8917 sort_i = &(sort->sort2[nsort2++]);
8919 /* Sort on the ns grid cell indices
8920 * and the global topology index.
8921 * index_gl is irrelevant with cell ns,
8922 * but we set it here anyhow to avoid a conditional.
8925 sort_i->ind_gl = dd->index_gl[i];
8932 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8935 /* Sort efficiently */
8936 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8941 cgsort = sort->sort;
8943 for (i = 0; i < dd->ncg_home; i++)
8945 /* Sort on the ns grid cell indices
8946 * and the global topology index
8948 cgsort[i].nsc = a[i];
8949 cgsort[i].ind_gl = dd->index_gl[i];
8951 if (cgsort[i].nsc < moved)
8958 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8960 /* Determine the order of the charge groups using qsort */
8961 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
/* Determine the home charge-group (atom) sort order for the Verlet
 * scheme by copying the order already established by the nbnxn grid.
 * Returns the new home cg count (return statement not shown in this
 * excerpt). Atoms the grid flagged as moved are presumably filtered
 * out in lines omitted here -- TODO confirm against full source.
 */
8967 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8973 sort = dd->comm->sort->sort;
/* a/na: nbnxn-grid atom order and its length */
8975 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8978 for (i = 0; i < na; i++)
8982 sort[ncg_new].ind = a[i];
/* Sort the local state (x, v, cg_p), cg centers of mass, global cg
 * indices, cginfo and the local cg index, all into ns/nbnxn grid order.
 *
 * cgcm         - charge-group centers of mass (group scheme only)
 * ncg_home_old - previous home cg count, or -1 to force a full resort
 *
 * Also shrinks dd->ncg_home to drop cgs that moved to other domains
 * and writes the sorted cell indices back into the ns grid.
 */
8990 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8993 gmx_domdec_sort_t *sort;
8994 gmx_cgsort_t *cgsort;
8996 int ncg_new, i, *ibuf, cgsize;
8999 sort = dd->comm->sort;
9001 if (dd->ncg_home > sort->sort_nalloc)
9003 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9004 srenew(sort->sort, sort->sort_nalloc);
9005 srenew(sort->sort2, sort->sort_nalloc);
9007 cgsort = sort->sort;
/* The sort order itself depends on the cut-off scheme */
9009 switch (fr->cutoff_scheme)
9012 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9015 ncg_new = dd_sort_order_nbnxn(dd, fr);
9018 gmx_incons("unimplemented");
9022 /* We alloc with the old size, since cgindex is still old */
9023 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9024 vbuf = dd->comm->vbuf.v;
9028 cgindex = dd->cgindex;
9035 /* Remove the charge groups which are no longer at home here */
9036 dd->ncg_home = ncg_new;
9039 fprintf(debug, "Set the new home charge group count to %d\n",
9043 /* Reorder the state */
9044 if (state->flags & (1 << estX))
9046 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
9048 if (state->flags & (1 << estV))
9050 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
9052 if (state->flags & (1 << estCGP))
9054 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9057 if (fr->cutoff_scheme == ecutsGROUP)
/* cg centers of mass only exist with the group scheme */
9060 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9063 if (dd->ncg_home+1 > sort->ibuf_nalloc)
9065 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9066 srenew(sort->ibuf, sort->ibuf_nalloc);
9069 /* Reorder the global cg index */
9070 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9071 /* Reorder the cginfo */
9072 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9073 /* Rebuild the local cg index */
9077 for (i = 0; i < dd->ncg_home; i++)
/* ibuf becomes the prefix-sum of the (permuted) cg sizes */
9079 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9080 ibuf[i+1] = ibuf[i] + cgsize;
9082 for (i = 0; i < dd->ncg_home+1; i++)
9084 dd->cgindex[i] = ibuf[i];
9089 for (i = 0; i < dd->ncg_home+1; i++)
9094 /* Set the home atom number */
9095 dd->nat_home = dd->cgindex[dd->ncg_home];
9097 if (fr->cutoff_scheme == ecutsVERLET)
9099 /* The atoms are now exactly in grid order, update the grid order */
9100 nbnxn_set_atomorder(fr->nbv->nbs);
9104 /* Copy the sorted ns cell indices back to the ns grid struct */
9105 for (i = 0; i < dd->ncg_home; i++)
9107 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9109 fr->ns->grid->nr = dd->ncg_home;
/* Accumulate the per-zone atom-communication counts of the current
 * partitioning into the running totals used for the final statistics
 * report (print_dd_statistics).
 */
9113 static void add_dd_statistics(gmx_domdec_t *dd)
9115 gmx_domdec_comm_t *comm;
9120 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
/* Store the per-category increment (difference of cumulative counts) */
9122 comm->sum_nat[ddnat-ddnatZONE] +=
9123 comm->nat[ddnat] - comm->nat[ddnat-1];
/* Reset all DD statistics and load-balancing counters, so the totals
 * reported at the end of the run only cover the production part
 * (e.g. after counters are reset following equilibration).
 */
9128 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9130 gmx_domdec_comm_t *comm;
9135 /* Reset all the statistics and counters for total run counting */
9136 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9138 comm->sum_nat[ddnat-ddnatZONE] = 0;
9142 comm->load_step = 0;
9145 clear_ivec(comm->load_lim);
/* Print the accumulated domain-decomposition statistics to the log:
 * average number of atoms communicated per step per category (force,
 * vsites, LINCS), summed over all ranks, plus the DD load balance
 * report when load was recorded for a dynamics run.
 */
9150 void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
9152 gmx_domdec_comm_t *comm;
9156 comm = cr->dd->comm;
/* Sum the per-rank counters over all ranks */
9158 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9160 if (fplog == nullptr)
9165 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9167 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9169 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9174 " av. #atoms communicated per step for force: %d x %.1f\n",
9178 if (cr->dd->vsite_comm)
/* x is sent out and f back; with PME/Ewald x is sent twice, hence 3 */
9181 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9182 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9187 if (cr->dd->constraint_comm)
9190 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9191 1 + ir->nLincsIter, av);
9195 gmx_incons(" Unknown type for DD statistics");
9198 fprintf(fplog, "\n");
9200 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9202 print_dd_load_av(fplog, cr->dd);
/* Repartition the system over the domain-decomposition ranks.
 *
 * This is the main DD driver, called every neighbor-search step (and at
 * (re)starts with bMasterState=TRUE). In outline it:
 *  1. decides whether to do dynamic load balancing (DLB) this step and
 *     whether DLB should be turned on/off based on measured cycle counts;
 *  2. distributes or redistributes the charge groups over the domains
 *     (from the master state, from a matching local state, or by
 *     communicating only the moved cgs);
 *  3. sorts the local state into ns/nbnxn grid order when needed;
 *  4. sets up halo communication, builds the local topology, the special
 *     atom communication (vsites, constraints) and local index mappings;
 *  5. updates mdatoms, PME parameters, pull/rotation/swap/IMD groups and
 *     the DD statistics/counters.
 *
 * bMasterState   - partition starting from the global state on the master
 * state_global   - valid on the master rank only
 * state_local/f  - resized and reordered here
 *
 * NOTE(review): this excerpt omits many original lines (braces, some
 * declarations and statements); comments describe the visible code only.
 */
9206 void dd_partition_system(FILE *fplog,
9209 gmx_bool bMasterState,
9211 t_state *state_global,
9212 const gmx_mtop_t *top_global,
9213 const t_inputrec *ir,
9214 t_state *state_local,
9215 PaddedRVecVector *f,
9216 gmx::MDAtoms *mdAtoms,
9217 gmx_localtop_t *top_local,
9220 gmx_constr_t constr,
9222 gmx_wallcycle_t wcycle,
9226 gmx_domdec_comm_t *comm;
9227 gmx_ddbox_t ddbox = {0};
9229 gmx_int64_t step_pcoupl;
9230 rvec cell_ns_x0, cell_ns_x1;
9231 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9232 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9233 gmx_bool bRedist, bSortCG, bResortAll;
9234 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9238 wallcycle_start(wcycle, ewcDOMDEC);
9243 bBoxChanged = (bMasterState || inputrecDeform(ir));
9244 if (ir->epc != epcNO)
9246 /* With nstpcouple > 1 pressure coupling happens.
9247 * one step after calculating the pressure.
9248 * Box scaling happens at the end of the MD step,
9249 * after the DD partitioning.
9250 * We therefore have to do DLB in the first partitioning
9251 * after an MD step where P-coupling occurred.
9252 * We need to determine the last step in which p-coupling occurred.
9253 * MRS -- need to validate this for vv?
9258 step_pcoupl = step - 1;
9262 step_pcoupl = ((step - 1)/n)*n + 1;
9264 if (step_pcoupl >= comm->partition_step)
9270 bNStGlobalComm = (step % nstglobalcomm == 0);
9278 /* Should we do dynamic load balacing this step?
9279 * Since it requires (possibly expensive) global communication,
9280 * we might want to do DLB less frequently.
9282 if (bBoxChanged || ir->epc != epcNO)
9284 bDoDLB = bBoxChanged;
9288 bDoDLB = bNStGlobalComm;
9292 /* Check if we have recorded loads on the nodes */
9293 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9295 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9297 /* Print load every nstlog, first and last step to the log file */
9298 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9299 comm->n_load_collect == 0 ||
9301 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9303 /* Avoid extra communication due to verbose screen output
9304 * when nstglobalcomm is set.
9306 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9307 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9309 get_load_distribution(dd, wcycle);
9314 dd_print_load(fplog, dd, step-1);
9318 dd_print_load_verbose(dd);
9321 comm->n_load_collect++;
9327 /* Add the measured cycles to the running average */
9328 const float averageFactor = 0.1f;
9329 comm->cyclesPerStepDlbExpAverage =
9330 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9331 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
9333 if (comm->dlbState == edlbsOnCanTurnOff &&
9334 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9336 gmx_bool turnOffDlb;
9339 /* If the running averaged cycles with DLB are more
9340 * than before we turned on DLB, turn off DLB.
9341 * We will again run and check the cycles without DLB
9342 * and we can then decide if to turn off DLB forever.
9344 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9345 comm->cyclesPerStepBeforeDLB);
9347 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9350 /* To turn off DLB, we need to redistribute the atoms */
9351 dd_collect_state(dd, state_local, state_global);
9352 bMasterState = TRUE;
9353 turn_off_dlb(fplog, cr, step);
9357 else if (bCheckWhetherToTurnDlbOn)
9359 gmx_bool turnOffDlbForever = FALSE;
9360 gmx_bool turnOnDlb = FALSE;
9362 /* Since the timings are node dependent, the master decides */
9365 /* If we recently turned off DLB, we want to check if
9366 * performance is better without DLB. We want to do this
9367 * ASAP to minimize the chance that external factors
9368 * slowed down the DLB step are gone here and we
9369 * incorrectly conclude that DLB was causing the slowdown.
9370 * So we measure one nstlist block, no running average.
9372 if (comm->haveTurnedOffDlb &&
9373 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9374 comm->cyclesPerStepDlbExpAverage)
9376 /* After turning off DLB we ran nstlist steps in fewer
9377 * cycles than with DLB. This likely means that DLB
9378 * in not benefical, but this could be due to a one
9379 * time unlucky fluctuation, so we require two such
9380 * observations in close succession to turn off DLB
9383 if (comm->dlbSlowerPartitioningCount > 0 &&
9384 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9386 turnOffDlbForever = TRUE;
9388 comm->haveTurnedOffDlb = false;
9389 /* Register when we last measured DLB slowdown */
9390 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9394 /* Here we check if the max PME rank load is more than 0.98
9395 * the max PP force load. If so, PP DLB will not help,
9396 * since we are (almost) limited by PME. Furthermore,
9397 * DLB will cause a significant extra x/f redistribution
9398 * cost on the PME ranks, which will then surely result
9399 * in lower total performance.
9401 if (cr->npmenodes > 0 &&
9402 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9408 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9414 gmx_bool turnOffDlbForever;
/* Broadcast both decisions in one struct to save a communication */
9418 turnOffDlbForever, turnOnDlb
9420 dd_bcast(dd, sizeof(bools), &bools);
9421 if (bools.turnOffDlbForever)
9423 turn_off_dlb_forever(fplog, cr, step);
9425 else if (bools.turnOnDlb)
9427 turn_on_dlb(fplog, cr, step);
9432 comm->n_load_have++;
9435 cgs_gl = &comm->cgs_gl;
/* Branch 1: full (re)partition from the global state on the master */
9440 /* Clear the old state */
9441 clear_dd_indices(dd, 0, 0);
9444 rvec *xGlobal = (SIMMASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr);
9446 set_ddbox(dd, bMasterState, cr, ir,
9447 SIMMASTER(cr) ? state_global->box : nullptr,
9448 TRUE, cgs_gl, xGlobal,
9451 get_cg_distribution(fplog, dd, cgs_gl,
9452 SIMMASTER(cr) ? state_global->box : nullptr,
9455 dd_distribute_state(dd, cgs_gl,
9456 state_global, state_local, f);
9458 dd_make_local_cgs(dd, &top_local->cgs);
9460 /* Ensure that we have space for the new distribution */
9461 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9463 if (fr->cutoff_scheme == ecutsGROUP)
9465 calc_cgcm(fplog, 0, dd->ncg_home,
9466 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9469 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9471 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
/* Branch 2: local state is from an older partitioning (e.g. checkpoint);
 * rebuild the local indices from the stored cg ordering */
9473 else if (state_local->ddp_count != dd->ddp_count)
9475 if (state_local->ddp_count > dd->ddp_count)
9477 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9480 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9482 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9485 /* Clear the old state */
9486 clear_dd_indices(dd, 0, 0);
9488 /* Build the new indices */
9489 rebuild_cgindex(dd, cgs_gl->index, state_local);
9490 make_dd_indices(dd, cgs_gl->index, 0);
9491 ncgindex_set = dd->ncg_home;
9493 if (fr->cutoff_scheme == ecutsGROUP)
9495 /* Redetermine the cg COMs */
9496 calc_cgcm(fplog, 0, dd->ncg_home,
9497 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9500 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9502 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9504 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9505 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9507 bRedist = isDlbOn(comm);
/* Branch 3: state matches the current partitioning count */
9511 /* We have the full state, only redistribute the cgs */
9513 /* Clear the non-home indices */
9514 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9517 /* Avoid global communication for dim's without pbc and -gcom */
9518 if (!bNStGlobalComm)
9520 copy_rvec(comm->box0, ddbox.box0 );
9521 copy_rvec(comm->box_size, ddbox.box_size);
9523 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9524 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9529 /* For dim's without pbc and -gcom */
9530 copy_rvec(ddbox.box0, comm->box0 );
9531 copy_rvec(ddbox.box_size, comm->box_size);
9533 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9536 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9538 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9541 /* Check if we should sort the charge groups */
9542 bSortCG = (bMasterState || bRedist);
9544 ncg_home_old = dd->ncg_home;
9546 /* When repartitioning we mark charge groups that will move to neighboring
9547 * DD cells, but we do not move them right away for performance reasons.
9548 * Thus we need to keep track of how many charge groups will move for
9549 * obtaining correct local charge group / atom counts.
9554 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9556 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9558 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9560 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9563 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9565 &comm->cell_x0, &comm->cell_x1,
9566 dd->ncg_home, fr->cg_cm,
9567 cell_ns_x0, cell_ns_x1, &grid_density);
9571 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
/* Set up the search grid; record the old cell counts so we can later
 * decide whether the incremental sort is still valid */
9574 switch (fr->cutoff_scheme)
9577 copy_ivec(fr->ns->grid->n, ncells_old);
9578 grid_first(fplog, fr->ns->grid, dd, &ddbox,
9579 state_local->box, cell_ns_x0, cell_ns_x1,
9580 fr->rlist, grid_density);
9583 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9586 gmx_incons("unimplemented");
9588 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9589 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9593 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9595 /* Sort the state on charge group position.
9596 * This enables exact restarts from this step.
9597 * It also improves performance by about 15% with larger numbers
9598 * of atoms per node.
9601 /* Fill the ns grid with the home cell,
9602 * so we can sort with the indices.
9604 set_zones_ncg_home(dd);
9606 switch (fr->cutoff_scheme)
9609 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
9611 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9613 comm->zones.size[0].bb_x0,
9614 comm->zones.size[0].bb_x1,
9616 comm->zones.dens_zone0,
9618 as_rvec_array(state_local->x.data()),
9619 ncg_moved, bRedist ? comm->moved : nullptr,
9620 fr->nbv->grp[eintLocal].kernel_type,
9623 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9626 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9627 0, dd->ncg_home, fr->cg_cm);
9629 copy_ivec(fr->ns->grid->n, ncells_new);
9632 gmx_incons("unimplemented");
9635 bResortAll = bMasterState;
9637 /* Check if we can user the old order and ns grid cell indices
9638 * of the charge groups to sort the charge groups efficiently.
9640 if (ncells_new[XX] != ncells_old[XX] ||
9641 ncells_new[YY] != ncells_old[YY] ||
9642 ncells_new[ZZ] != ncells_old[ZZ])
9649 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9650 gmx_step_str(step, sbuf), dd->ncg_home);
9652 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9653 bResortAll ? -1 : ncg_home_old);
9655 /* After sorting and compacting we set the correct size */
9656 dd_resize_state(state_local, f, dd->nat_home);
9658 /* Rebuild all the indices */
9659 ga2la_clear(dd->ga2la);
9662 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9665 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9667 /* Setup up the communication and communicate the coordinates */
9668 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9670 /* Set the indices */
9671 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9673 /* Set the charge group boundaries for neighbor searching */
9674 set_cg_boundaries(&comm->zones);
9676 if (fr->cutoff_scheme == ecutsVERLET)
9678 set_zones_size(dd, state_local->box, &ddbox,
9679 bSortCG ? 1 : 0, comm->zones.n,
9683 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9686 write_dd_pdb("dd_home",step,"dump",top_global,cr,
9687 -1,as_rvec_array(state_local->x.data()),state_local->box);
9690 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9692 /* Extract a local topology from the global topology */
9693 for (i = 0; i < dd->ndim; i++)
9695 np[dd->dim[i]] = comm->cd[i].np;
9697 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9698 comm->cellsize_min, np,
9700 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9701 vsite, top_global, top_local);
9703 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9705 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9707 /* Set up the special atom communication */
9708 n = comm->nat[ddnatZONE];
9709 for (i = ddnatZONE+1; i < ddnatNR; i++)
9714 if (vsite && vsite->n_intercg_vsite)
9716 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9720 if (dd->bInterCGcons || dd->bInterCGsettles)
9722 /* Only for inter-cg constraints we need special code */
9723 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9724 constr, ir->nProjOrder,
9725 top_local->idef.il);
9729 gmx_incons("Unknown special atom type setup");
9734 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9736 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9738 /* Make space for the extra coordinates for virtual site
9739 * or constraint communication.
9741 state_local->natoms = comm->nat[ddnatNR-1];
9743 dd_resize_state(state_local, f, state_local->natoms);
9745 if (fr->haveDirectVirialContributions)
9747 if (vsite && vsite->n_intercg_vsite)
9749 nat_f_novirsum = comm->nat[ddnatVSITE];
9753 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9755 nat_f_novirsum = dd->nat_tot;
9759 nat_f_novirsum = dd->nat_home;
9768 /* Set the number of atoms required for the force calculation.
9769 * Forces need to be constrained when doing energy
9770 * minimization. For simple simulations we could avoid some
9771 * allocation, zeroing and copying, but this is probably not worth
9772 * the complications and checking.
9774 forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9775 dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9777 /* Update atom data for mdatoms and several algorithms */
9778 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9779 nullptr, mdAtoms, vsite, nullptr);
9781 if (ir->implicit_solvent)
9783 make_local_gb(cr, fr->born, ir->gb_algorithm);
9786 auto mdatoms = mdAtoms->mdatoms();
9787 if (!thisRankHasDuty(cr, DUTY_PME))
9789 /* Send the charges and/or c6/sigmas to our PME only node */
9790 gmx_pme_send_parameters(cr,
9792 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9793 mdatoms->chargeA, mdatoms->chargeB,
9794 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9795 mdatoms->sigmaA, mdatoms->sigmaB,
9796 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9801 set_constraints(constr, top_local, ir, mdatoms, cr);
9806 /* Update the local pull groups */
9807 dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9812 /* Update the local rotation groups */
9813 dd_make_local_rotation_groups(dd, ir->rot);
9816 if (ir->eSwapCoords != eswapNO)
9818 /* Update the local groups needed for ion swapping */
9819 dd_make_local_swap_groups(dd, ir->swap);
9822 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9823 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9825 add_dd_statistics(dd);
9827 /* Make sure we only count the cycles for this DD partitioning */
9828 clear_dd_cycle_counts(dd);
9830 /* Because the order of the atoms might have changed since
9831 * the last vsite construction, we need to communicate the constructing
9832 * atom coordinates again (for spreading the forces this MD step).
9834 dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9836 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9838 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9840 dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
9841 write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9842 -1, as_rvec_array(state_local->x.data()), state_local->box);
9845 /* Store the partitioning step */
9846 comm->partition_step = step;
9848 /* Increase the DD partitioning counter */
9850 /* The state currently matches this DD partitioning count, store it */
9851 state_local->ddp_count = dd->ddp_count;
9854 /* The DD master node knows the complete cg distribution,
9855 * store the count so we can possibly skip the cg info communication.
9857 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9860 if (comm->DD_debug > 0)
9862 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9863 check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9864 "after partitioning");
9867 wallcycle_stop(wcycle, ewcDOMDEC);