/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "domdec.h"

#include "config.h"

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>

#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/manage-threading.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/mdsetup.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nsgrid.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/df_history.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/swap/swapcoords.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/block.h"
#include "gromacs/topology/idef.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/mtop_lookup.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/qsort_threadsafe.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"

#include "domdec_constraints.h"
#include "domdec_internal.h"
#include "domdec_vsite.h"
#define DDRANK(dd, rank)    (rank)
#define DDMASTERRANK(dd)    (dd->masterrank)
struct gmx_domdec_master_t
{
    /* The cell boundaries */
    real **cell_x;
    /* The global charge group division */
    int   *ncg;    /* Number of home charge groups for each node */
    int   *index;  /* Index (size nnodes+1) into cg */
    int   *cg;     /* Global charge group index */
    int   *nat;    /* Number of home atoms for each node. */
    int   *ibuf;   /* Buffer for communication */
    rvec  *vbuf;   /* Buffer for state scattering and gathering */
};
#define DD_NLOAD_MAX 9

const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_CGIBS 2

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
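
/* Illustrative note (not from the original source): the lower 16 bits of a
 * cggl_flag entry hold the charge-group atom count, masked with DD_FLAG_NRCG,
 * and each decomposition dimension d gets two dedicated move bits above that.
 * For example, for d = 1: DD_FLAG_FW(1) == 1<<18 and DD_FLAG_BW(1) == 1<<19,
 * so a flag value of ((1<<18) | 3) means "3 atoms, move forward along dim 1".
 */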
/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
/* The non-bonded zone-pair setup for domain decomposition
 * The first number is the i-zone, the second number the first j-zone seen by
 * this i-zone, the third number the last+1 j-zone seen by this i-zone.
 * As is, this is for 3D decomposition, where there are 4 i-zones.
 * With 2D decomposition use only the first 2 i-zones and a last+1 j-zone of 4.
 * With 1D decomposition use only the first i-zone and a last+1 j-zone of 2.
 */
static const int
    ddNonbondedZonePairRanges[DD_MAXIZONE][3] = {{0, 0, 8},
                                                 {1, 3, 6},
                                                 {2, 5, 6},
                                                 {3, 5, 7}};
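
/* Worked example (illustrative, not from the original source): with 2D
 * decomposition, i-zone 0 sees j-zones 0 to 3 (last+1 j-zone capped at 4) and
 * i-zone 1, with range {1, 3, 6}, sees only j-zone 3. With 1D decomposition,
 * i-zone 0 sees j-zones 0 and 1.
 */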
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Turn on DLB when the load imbalance causes this amount of total loss.
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
#define DD_PERF_LOSS_WARN    0.05

#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
/* We check whether to turn on DLB at the first and then every 100 DD partitionings.
 * With large imbalance DLB will turn on at the first step, so we can
 * make the interval so large that the MPI overhead of the check is negligible.
 */
static const int c_checkTurnDlbOnInterval  = 100;

/* We need to check if DLB results in worse performance and then turn it off.
 * We check this more often than for turning DLB on, because the DLB can scale
 * the domains very rapidly, so if unlucky the load imbalance can go up quickly
 * and furthermore, we are already synchronizing often with DLB, so
 * the overhead of the MPI Bcast is not that high.
 */
static const int c_checkTurnDlbOffInterval = 20;

/* Forward declaration */
static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue);
/*
   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])

   static void index2xyz(ivec nc,int ind,ivec xyz)
   {
     xyz[XX] = ind % nc[XX];
     xyz[YY] = (ind / nc[XX]) % nc[YY];
     xyz[ZZ] = ind / (nc[YY]*nc[XX]);
   }
 */
/* This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])

static void ddindex2xyz(ivec nc, int ind, ivec xyz)
{
    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
    xyz[ZZ] = ind % nc[ZZ];
}
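
/* Worked example (illustrative): for nc = {2, 3, 4} and i = {1, 2, 3},
 * dd_index gives ((1*3 + 2)*4) + 3 = 23, and ddindex2xyz(nc, 23, xyz)
 * recovers xyz = {1, 2, 3}: 23/(3*4) = 1, (23/4) % 3 = 2, 23 % 4 = 3.
 * The two are exact inverses, with x as the major (slowest-varying) index.
 */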
static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
{
    int ddindex;
    int ddnodeid = -1;

    ddindex = dd_index(dd->nc, c);
    if (dd->comm->bCartesianPP_PME)
    {
        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
    }
    else if (dd->comm->bCartesianPP)
    {
#if GMX_MPI
        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
#endif
    }
    else
    {
        ddnodeid = ddindex;
    }

    return ddnodeid;
}
static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
{
    return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
}
int ddglatnr(const gmx_domdec_t *dd, int i)
{
    int atnr;

    if (i >= dd->comm->nat[ddnatNR-1])
    {
        gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
    }
    atnr = dd->gatindex[i] + 1;

    return atnr;
}
t_block *dd_charge_groups_global(gmx_domdec_t *dd)
{
    return &dd->comm->cgs_gl;
}
static bool dlbIsOn(const gmx_domdec_comm_t *comm)
{
    return (comm->dlbState == edlbsOnCanTurnOff ||
            comm->dlbState == edlbsOnForever);
}
static void vec_rvec_init(vec_rvec_t *v)
{
    v->nalloc = 0;
    v->v      = nullptr;
}

static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
{
    if (n > v->nalloc)
    {
        v->nalloc = over_alloc_dd(n);
        srenew(v->v, v->nalloc);
    }
}
void dd_store_state(gmx_domdec_t *dd, t_state *state)
{
    int i;

    if (state->ddp_count != dd->ddp_count)
    {
        gmx_incons("The state does not match the domain decomposition state");
    }

    state->cg_gl.resize(dd->ncg_home);
    for (i = 0; i < dd->ncg_home; i++)
    {
        state->cg_gl[i] = dd->index_gl[i];
    }

    state->ddp_count_cg_gl = dd->ddp_count;
}
gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
{
    return &dd->comm->zones;
}
void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
{
    gmx_domdec_zones_t *zones;
    int                 izone, d, dim;

    zones = &dd->comm->zones;

    izone = 0;
    while (icg >= zones->izone[izone].cg1)
    {
        izone++;
    }

    if (izone == 0)
    {
        *jcg0 = icg;
    }
    else if (izone < zones->nizone)
    {
        *jcg0 = zones->izone[izone].jcg0;
    }
    else
    {
        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
                  icg, izone, zones->nizone);
    }

    *jcg1 = zones->izone[izone].jcg1;

    for (d = 0; d < dd->ndim; d++)
    {
        dim         = dd->dim[d];
        shift0[dim] = zones->izone[izone].shift0[dim];
        shift1[dim] = zones->izone[izone].shift1[dim];
        if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
        {
            /* A conservative approach, this can be optimized */
            shift0[dim] -= 1;
            shift1[dim] += 1;
        }
    }
}
int dd_natoms_mdatoms(const gmx_domdec_t *dd)
{
    /* We currently set mdatoms entries for all atoms:
     * local + non-local + communicated for vsite + constraints
     */

    return dd->comm->nat[ddnatNR - 1];
}
int dd_natoms_vsite(const gmx_domdec_t *dd)
{
    return dd->comm->nat[ddnatVSITE];
}
void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
{
    *at_start = dd->comm->nat[ddnatCON-1];
    *at_end   = dd->comm->nat[ddnatCON];
}
void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
    gmx_bool               bPBC, bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = 1;
    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (bPBC)
        {
            copy_rvec(box[dd->dim[d]], shift);
        }
        cd = &comm->cd[d];
        for (p = 0; p < cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n     = 0;
            if (!bPBC)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        copy_rvec(x[j], buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* We need to shift the coordinates */
                        rvec_add(x[j], shift, buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        buf[n][XX] = x[j][XX] + shift[XX];
                        /* Rotate y and z.
                         * This operation requires a special shift force
                         * treatment, which is performed in calc_vir.
                         */
                        buf[n][YY] = box[YY][YY] - x[j][YY];
                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
                        n++;
                    }
                }
            }

            if (cd->bInPlace)
            {
                rbuf = x + nat_tot;
            }
            else
            {
                rbuf = comm->vbuf2.v;
            }
            /* Send and receive the coordinates */
            dd_sendrecv_rvec(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(rbuf[j], x[i]);
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    rvec                  *buf, *sbuf;
    ivec                   vis;
    int                    is;
    gmx_bool               bShiftForcesNeedPbc, bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        /* Only forces in domains near the PBC boundaries need to
           consider PBC in the treatment of fshift */
        bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
        bScrew              = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
        if (fshift == nullptr && !bScrew)
        {
            bShiftForcesNeedPbc = FALSE;
        }
        /* Determine which shift vector we need */
        copy_ivec(dd->ci, vis);
        vis[dd->dim[d]] = dd->nc[dd->dim[d]] - 1;
        is              = IVEC2IS(vis);

        cd = &comm->cd[d];
        for (p = cd->np-1; p >= 0; p--)
        {
            ind      = &cd->ind[p];
            index    = ind->index;
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = f + nat_tot;
            }
            else
            {
                sbuf = comm->vbuf2.v;
                j    = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(f[i], sbuf[j]);
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_rvec(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            /* Add the received forces */
            n = 0;
            if (!bShiftForcesNeedPbc)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                /* fshift should always be defined if this function is
                 * called when bShiftForcesNeedPbc is true */
                assert(NULL != fshift);
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                        /* Add this force to the shift force */
                        rvec_inc(fshift[is], buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* Rotate the force */
                        f[j][XX] += buf[n][XX];
                        f[j][YY] -= buf[n][YY];
                        f[j][ZZ] -= buf[n][ZZ];
                        if (fshift)
                        {
                            /* Add this force to the shift force */
                            rvec_inc(fshift[is], buf[n]);
                        }
                        n++;
                    }
                }
            }
        }
        nzone /= 2;
    }
}
void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    real                  *buf, *rbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = 1;
    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        cd = &comm->cd[d];
        for (p = 0; p < cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n     = 0;
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    buf[n] = v[j];
                    n++;
                }
            }

            if (cd->bInPlace)
            {
                rbuf = v + nat_tot;
            }
            else
            {
                rbuf = &comm->vbuf2.v[0][0];
            }
            /* Send and receive the coordinates */
            dd_sendrecv_real(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        v[i] = rbuf[j];
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    real                  *buf, *sbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        cd = &comm->cd[d];
        for (p = cd->np-1; p >= 0; p--)
        {
            ind      = &cd->ind[p];
            index    = ind->index;
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = v + nat_tot;
            }
            else
            {
                sbuf = &comm->vbuf2.v[0][0];
                j    = 0;
                for (zone = 0; zone < nzone; zone++)
                {
                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                    {
                        sbuf[j] = v[i];
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_real(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            /* Add the received forces */
            n = 0;
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    v[j] += buf[n];
                    n++;
                }
            }
        }
        nzone /= 2;
    }
}
static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
{
    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d, i, j,
            zone->min0, zone->max1,
            zone->mch0, zone->mch1,
            zone->p1_0, zone->p1_1);
}
#define DDZONECOMM_MAXZONE  5
#define DDZONECOMM_BUFSIZE  3
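
/* Note (illustrative): a gmx_ddzone_t carries 7 reals (min0, max1, min1,
 * mch0, mch1, p1_0 and p1_1), so dd_sendrecv_ddzone below packs each zone
 * into DDZONECOMM_BUFSIZE = 3 rvecs (9 reals, 2 of them zero padding) in
 * order to reuse the rvec send/receive routine.
 */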
static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                               int ddimind, int direction,
                               gmx_ddzone_t *buf_s, int n_s,
                               gmx_ddzone_t *buf_r, int n_r)
{
#define ZBS  DDZONECOMM_BUFSIZE
    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
    int  i;

    for (i = 0; i < n_s; i++)
    {
        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
        vbuf_s[i*ZBS+1][2] = 0;
        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
        vbuf_s[i*ZBS+2][2] = 0;
    }

    dd_sendrecv_rvec(dd, ddimind, direction,
                     vbuf_s, n_s*ZBS,
                     vbuf_r, n_r*ZBS);

    for (i = 0; i < n_r; i++)
    {
        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
    }
#undef ZBS
}
static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0, rvec cell_ns_x1)
{
    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
    gmx_ddzone_t      *zp;
    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
    rvec               extr_s[2], extr_r[2];
    rvec               dh;
    real               dist_d, c = 0, det;
    gmx_domdec_comm_t *comm;
    gmx_bool           bPBC, bUse;

    comm = dd->comm;

    for (d = 1; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    for (d = dd->ndim-2; d >= 0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (d1 = d; d1 < dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            pos++;
        }

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for (p = 0; p < npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            if (bUse)
            {
                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        buf_size = pos;
        for (p = 0; p < npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            clear_rvec(dh);
            if (p > 0)
            {
                for (d1 = d+1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
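                        /* Illustrative check (not in the original source):
                         * for rectangular cells (c = 0) this reduces to
                         * dh = cutoff - sqrt(cutoff^2 - dist_d^2), i.e. the
                         * Pythagorean reduction of the required height.
                         */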
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (i = 0; i < buf_size; i++)
            {
                if (p == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                    }

                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */
                pos = 0;

                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }

    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for (i = 0; i < 2; i++)
        {
            if (debug)
            {
                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
            cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
        }
    }
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
            }
        }
    }
    for (d = 1; d < dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
        }
    }
}
static void dd_collect_cg(gmx_domdec_t *dd,
                          t_state *state_local)
{
    gmx_domdec_master_t *ma = nullptr;
    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = nullptr, nat_home = 0;

    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
    {
        /* The master has the correct distribution */
        return;
    }

    if (state_local->ddp_count == dd->ddp_count)
    {
        /* The local state and DD are in sync, use the DD indices */
        ncg_home = dd->ncg_home;
        cg       = dd->index_gl;
        nat_home = dd->nat_home;
    }
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
    {
        /* The DD is out of sync with the local state, but we have stored
         * the cg indices with the local state, so we can use those.
         */
        t_block *cgs_gl;

        cgs_gl = &dd->comm->cgs_gl;

        ncg_home = state_local->cg_gl.size();
        cg       = state_local->cg_gl.data();
        nat_home = 0;
        for (i = 0; i < ncg_home; i++)
        {
            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
        }
    }
    else
    {
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
    }

    buf2[0] = ncg_home;
    buf2[1] = nat_home;
    if (DDMASTER(dd))
    {
        ma   = dd->ma;
        ibuf = ma->ibuf;
    }
    else
    {
        ibuf = nullptr;
    }
    /* Collect the charge group and atom counts on the master */
    dd_gather(dd, 2*sizeof(int), buf2, ibuf);

    if (DDMASTER(dd))
    {
        ma->index[0] = 0;
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ncg[i]     = ma->ibuf[2*i];
            ma->nat[i]     = ma->ibuf[2*i+1];
            ma->index[i+1] = ma->index[i] + ma->ncg[i];
        }
        /* Make byte counts and indices */
        for (i = 0; i < dd->nnodes; i++)
        {
            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
        }
        if (debug)
        {
            fprintf(debug, "Initial charge group distribution: ");
            for (i = 0; i < dd->nnodes; i++)
            {
                fprintf(debug, " %d", ma->ncg[i]);
            }
            fprintf(debug, "\n");
        }
    }

    /* Collect the charge group indices on the master */
    dd_gatherv(dd,
               ncg_home*sizeof(int), cg,
               DDMASTER(dd) ? ma->ibuf : nullptr,
               DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
               DDMASTER(dd) ? ma->cg : nullptr);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
}
static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
                                    const rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;
    rvec                *buf = nullptr;
    t_block             *cgs_gl;

    ma = dd->ma;

    if (!DDMASTER(dd))
    {
#if GMX_MPI
        MPI_Send(const_cast<void *>(static_cast<const void *>(lv)), dd->nat_home*sizeof(rvec), MPI_BYTE,
                 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
#endif
    }
    else
    {
        /* Copy the master coordinates to the global array */
        cgs_gl = &dd->comm->cgs_gl;

        n = DDMASTERRANK(dd);
        a = 0;
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(lv[a++], v[c]);
            }
        }

        for (n = 0; n < dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf, nalloc);
                }
#if GMX_MPI
                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
#endif
                a = 0;
                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                {
                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(buf[a++], v[c]);
                    }
                }
            }
        }
        sfree(buf);
    }
}
static void get_commbuffer_counts(gmx_domdec_t *dd,
                                  int **counts, int **disps)
{
    gmx_domdec_master_t *ma;
    int                  n;

    ma = dd->ma;

    /* Make the rvec count and displacement arrays */
    *counts = ma->ibuf;
    *disps  = ma->ibuf + dd->nnodes;
    for (n = 0; n < dd->nnodes; n++)
    {
        (*counts)[n] = ma->nat[n]*sizeof(rvec);
        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
    }
}
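
/* Illustrative example (not from the original source): with 3 ranks and
 * nat = {10, 12, 8}, the counts become {10, 12, 8}*sizeof(rvec) bytes and
 * the displacements {0, 10, 22}*sizeof(rvec), the prefix-sum layout expected
 * by MPI_Gatherv/MPI_Scatterv, which dd_gatherv/dd_scatterv wrap.
 */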
static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
                                   const rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                 *rcounts = nullptr, *disps = nullptr;
    int                  n, i, c, a;
    rvec                *buf = nullptr;
    t_block             *cgs_gl;

    ma = dd->ma;

    if (DDMASTER(dd))
    {
        get_commbuffer_counts(dd, &rcounts, &disps);

        buf = ma->vbuf;
    }

    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);

    if (DDMASTER(dd))
    {
        cgs_gl = &dd->comm->cgs_gl;

        a = 0;
        for (n = 0; n < dd->nnodes; n++)
        {
            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(buf[a++], v[c]);
                }
            }
        }
    }
}
void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local,
                    const PaddedRVecVector *localVector,
                    rvec *v)
{
    dd_collect_cg(dd, state_local);

    const rvec *lv = as_rvec_array(localVector->data());

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_collect_vec_sendrecv(dd, lv, v);
    }
    else
    {
        dd_collect_vec_gatherv(dd, lv, v);
    }
}

void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local,
                    const PaddedRVecVector *localVector,
                    PaddedRVecVector *vector)
{
    dd_collect_vec(dd, state_local, localVector, as_rvec_array(vector->data()));
}
void dd_collect_state(gmx_domdec_t *dd,
                      t_state *state_local, t_state *state)
{
    int est, i, j, nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        for (i = 0; i < efptNR; i++)
        {
            state->lambda[i] = state_local->lambda[i];
        }
        state->fep_state = state_local->fep_state;
        state->veta      = state_local->veta;
        state->vol0      = state_local->vol0;
        copy_mat(state_local->box, state->box);
        copy_mat(state_local->boxv, state->boxv);
        copy_mat(state_local->svir_prev, state->svir_prev);
        copy_mat(state_local->fvir_prev, state->fvir_prev);
        copy_mat(state_local->pres_prev, state->pres_prev);

        for (i = 0; i < state_local->ngtc; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state->nosehoover_xi[i*nh+j]  = state_local->nosehoover_xi[i*nh+j];
                state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
            }
            state->therm_integral[i] = state_local->therm_integral[i];
        }
        for (i = 0; i < state_local->nnhpres; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state->nhpres_xi[i*nh+j]  = state_local->nhpres_xi[i*nh+j];
                state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
            }
        }
    }
    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
        {
            switch (est)
            {
                case estX:
                    dd_collect_vec(dd, state_local, &state_local->x, &state->x);
                    break;
                case estV:
                    dd_collect_vec(dd, state_local, &state_local->v, &state->v);
                    break;
                case est_SDX_NOTSUPPORTED:
                    break;
                case estCGP:
                    dd_collect_vec(dd, state_local, &state_local->cg_p, &state->cg_p);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_collect_state");
            }
        }
    }
}
static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
{
    int est;

    if (debug)
    {
        fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
    }

    state->natoms = natoms;

    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state->flags & (1<<est)))
        {
            /* We need to allocate one element extra, since we might use
             * (unaligned) 4-wide SIMD loads to access rvec entries.
             */
            switch (est)
            {
                case estX:
                    state->x.resize(natoms + 1);
                    break;
                case estV:
                    state->v.resize(natoms + 1);
                    break;
                case est_SDX_NOTSUPPORTED:
                    break;
                case estCGP:
                    state->cg_p.resize(natoms + 1);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* No reallocation required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_resize_state");
            }
        }
    }

    if (f != nullptr)
    {
        (*f).resize(natoms + 1);
    }
}
static void dd_check_alloc_ncg(t_forcerec *fr,
                               t_state *state,
                               PaddedRVecVector *f,
                               int numChargeGroups)
{
    if (numChargeGroups > fr->cg_nalloc)
    {
        if (debug)
        {
            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
        }
        fr->cg_nalloc = over_alloc_dd(numChargeGroups);
        srenew(fr->cginfo, fr->cg_nalloc);
        if (fr->cutoff_scheme == ecutsGROUP)
        {
            srenew(fr->cg_cm, fr->cg_nalloc);
        }
    }
    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* We don't use charge groups, we use x in state to set up
         * the atom communication.
         */
        dd_resize_state(state, f, numChargeGroups);
    }
}
static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;
    rvec                *buf = nullptr;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        for (n = 0; n < dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf, nalloc);
                }
                /* Use lv as a temporary buffer */
                a = 0;
                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                {
                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(v[c], buf[a++]);
                    }
                }
                if (a != ma->nat[n])
                {
                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
                              a, ma->nat[n]);
                }

#if GMX_MPI
                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
                         DDRANK(dd, n), n, dd->mpi_comm_all);
#endif
            }
        }
        sfree(buf);
        n = DDMASTERRANK(dd);
        a = 0;
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(v[c], lv[a++]);
            }
        }
    }
    else
    {
#if GMX_MPI
        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
#endif
    }
}
static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                 *scounts = nullptr, *disps = nullptr;
    int                  n, i, c, a;
    rvec                *buf = nullptr;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        get_commbuffer_counts(dd, &scounts, &disps);

        buf = ma->vbuf;
        a   = 0;
        for (n = 0; n < dd->nnodes; n++)
        {
            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(v[c], buf[a++]);
                }
            }
        }
    }

    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
}
static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
{
    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
    }
    else
    {
        dd_distribute_vec_scatterv(dd, cgs, v, lv);
    }
}
static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
{
    if (dfhist == nullptr)
    {
        return;
    }

    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);

    if (dfhist->nlambda > 0)
    {
        int nlam = dfhist->nlambda;
        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);

        for (int i = 0; i < nlam; i++)
        {
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
        }
    }
}
static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
                                t_state *state, t_state *state_local,
                                PaddedRVecVector *f)
{
    int i, j, nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        for (i = 0; i < efptNR; i++)
        {
            state_local->lambda[i] = state->lambda[i];
        }
        state_local->fep_state = state->fep_state;
        state_local->veta      = state->veta;
        state_local->vol0      = state->vol0;
        copy_mat(state->box, state_local->box);
        copy_mat(state->box_rel, state_local->box_rel);
        copy_mat(state->boxv, state_local->boxv);
        copy_mat(state->svir_prev, state_local->svir_prev);
        copy_mat(state->fvir_prev, state_local->fvir_prev);
        if (state->dfhist != nullptr)
        {
            copy_df_history(state_local->dfhist, state->dfhist);
        }
        for (i = 0; i < state_local->ngtc; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state_local->nosehoover_xi[i*nh+j]  = state->nosehoover_xi[i*nh+j];
                state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
            }
            state_local->therm_integral[i] = state->therm_integral[i];
        }
        for (i = 0; i < state_local->nnhpres; i++)
        {
            for (j = 0; j < nh; j++)
            {
                state_local->nhpres_xi[i*nh+j]  = state->nhpres_xi[i*nh+j];
                state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
            }
        }
    }
    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
    dd_bcast(dd, sizeof(int), &state_local->fep_state);
    dd_bcast(dd, sizeof(real), &state_local->veta);
    dd_bcast(dd, sizeof(real), &state_local->vol0);
    dd_bcast(dd, sizeof(state_local->box), state_local->box);
    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());

    /* communicate df_history -- required for restarting from checkpoint */
    dd_distribute_dfhist(dd, state_local->dfhist);

    dd_resize_state(state_local, f, dd->nat_home);

    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
        {
            switch (i)
            {
                case estX:
                    dd_distribute_vec(dd, cgs, as_rvec_array(state->x.data()), as_rvec_array(state_local->x.data()));
                    break;
                case estV:
                    dd_distribute_vec(dd, cgs, as_rvec_array(state->v.data()), as_rvec_array(state_local->v.data()));
                    break;
                case est_SDX_NOTSUPPORTED:
                    break;
                case estCGP:
                    dd_distribute_vec(dd, cgs, as_rvec_array(state->cg_p.data()), as_rvec_array(state_local->cg_p.data()));
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* Not implemented yet */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
            }
        }
    }
}
static char dim2char(int dim)
{
    char c = '?';

    switch (dim)
    {
        case XX: c = 'X'; break;
        case YY: c = 'Y'; break;
        case ZZ: c = 'Z'; break;
        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
    }

    return c;
}
static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
{
    rvec   grid_s[2], *grid_r = nullptr, cx, r;
    char   fname[STRLEN], buf[22];
    FILE  *out;
    int    a, i, d, z, y, x;
    matrix tric;
    real   vol;

    copy_rvec(dd->comm->cell_x0, grid_s[0]);
    copy_rvec(dd->comm->cell_x1, grid_s[1]);

    if (DDMASTER(dd))
    {
        snew(grid_r, 2*dd->nnodes);
    }

    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);

    if (DDMASTER(dd))
    {
        for (d = 0; d < DIM; d++)
        {
            for (i = 0; i < DIM; i++)
            {
                if (d == i)
                {
                    tric[d][i] = 1;
                }
                else
                {
                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
                    {
                        tric[d][i] = box[i][d]/box[i][i];
                    }
                    else
                    {
                        tric[d][i] = 0;
                    }
                }
            }
        }
        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
        out = gmx_fio_fopen(fname, "w");
        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
        a = 1;
        for (i = 0; i < dd->nnodes; i++)
        {
            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
            for (d = 0; d < DIM; d++)
            {
                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
            }
            for (z = 0; z < 2; z++)
            {
                for (y = 0; y < 2; y++)
                {
                    for (x = 0; x < 2; x++)
                    {
                        cx[XX] = grid_r[i*2+x][XX];
                        cx[YY] = grid_r[i*2+y][YY];
                        cx[ZZ] = grid_r[i*2+z][ZZ];
                        mvmul(tric, cx, r);
                        gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
                                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
                    }
                }
            }
            for (d = 0; d < DIM; d++)
            {
                for (x = 0; x < 4; x++)
                {
                    switch (d)
                    {
                        case 0: y = 1 + i*8 + 2*x; break;
                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
                        case 2: y = 1 + i*8 + x; break;
                    }
                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
                }
            }
        }
        gmx_fio_fclose(out);
        sfree(grid_r);
    }
}
void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
                  const gmx_mtop_t *mtop, t_commrec *cr,
                  int natoms, rvec x[], matrix box)
{
    char          fname[STRLEN], buf[22];
    FILE         *out;
    int           i, ii, resnr, c;
    const char   *atomname, *resname;
    real          b;
    gmx_domdec_t *dd;
    int           molb = 0;

    dd = cr->dd;
    if (natoms == -1)
    {
        natoms = dd->comm->nat[ddnatVSITE];
    }

    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);

    out = gmx_fio_fopen(fname, "w");

    fprintf(out, "TITLE     %s\n", title);
    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
    for (i = 0; i < natoms; i++)
    {
        ii = dd->gatindex[i];
        mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
        if (i < dd->comm->nat[ddnatZONE])
        {
            c = 0;
            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
            {
                c++;
            }
            b = c;
        }
        else if (i < dd->comm->nat[ddnatVSITE])
        {
            b = dd->comm->zones.n;
        }
        else
        {
            b = dd->comm->zones.n + 1;
        }
        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
    }
    fprintf(out, "TER\n");

    gmx_fio_fclose(out);
}
real dd_cutoff_multibody(const gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                di;
    real               r;

    comm = dd->comm;

    r = -1;
    if (comm->bInterCGBondeds)
    {
        if (comm->cutoff_mbody > 0)
        {
            r = comm->cutoff_mbody;
        }
        else
        {
            /* cutoff_mbody=0 means we do not have DLB */
            r = comm->cellsize_min[dd->dim[0]];
            for (di = 1; di < dd->ndim; di++)
            {
                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
            }
            if (comm->bBondComm)
            {
                r = std::max(r, comm->cutoff_mbody);
            }
            else
            {
                r = std::min(r, comm->cutoff);
            }
        }
    }

    return r;
}
real dd_cutoff_twobody(const gmx_domdec_t *dd)
{
    real r_mb;

    r_mb = dd_cutoff_multibody(dd);

    return std::max(dd->comm->cutoff, r_mb);
}
static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
                                   ivec coord_pme)
{
    int nc, ntot;

    nc   = dd->nc[dd->comm->cartpmedim];
    ntot = dd->comm->ntot[dd->comm->cartpmedim];
    copy_ivec(coord, coord_pme);
    coord_pme[dd->comm->cartpmedim] =
        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
}
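
/* Worked example (illustrative): with nc = 4 PP cells and ntot = 6 Cartesian
 * slots along cartpmedim, ntot - nc = 2 slots are left for PME, and PP
 * coordinates 0,1 map to PME coordinate 4 while 2,3 map to 5, i.e. two
 * neighboring PP cells share each PME rank row.
 */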
static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
{
    int npp, npme;

    npp  = dd->nnodes;
    npme = dd->comm->npmenodes;

    /* Here we assign a PME node to communicate with this DD node
     * by assuming that the major index of both is x.
     * We add cr->npmenodes/2 to obtain an even distribution.
     */
    return (ddindex*npme + npme/2)/npp;
}
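
/* Worked example (illustrative): with npp = 8 and npme = 4, the formula
 * (ddindex*4 + 2)/8 maps DD indices {0,1} -> PME rank 0, {2,3} -> 1,
 * {4,5} -> 2 and {6,7} -> 3, i.e. consecutive pairs of PP ranks share
 * one PME rank.
 */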
static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
{
    int *pme_rank;
    int  n, i, p0, p1;

    snew(pme_rank, dd->comm->npmenodes);
    n = 0;
    for (i = 0; i < dd->nnodes; i++)
    {
        p0 = ddindex2pmeindex(dd, i);
        p1 = ddindex2pmeindex(dd, i+1);
        if (i+1 == dd->nnodes || p1 > p0)
        {
            if (debug)
            {
                fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
            }
            pme_rank[n] = i + 1 + n;
            n++;
        }
    }

    return pme_rank;
}
static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
{
    gmx_domdec_t *dd;
    ivec          coords;
    int           slab;

    dd = cr->dd;
    /*
       if (dd->comm->bCartesian) {
       gmx_ddindex2xyz(dd->nc,ddindex,coords);
       dd_coords2pmecoords(dd,coords,coords_pme);
       copy_ivec(dd->ntot,nc);
       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];

       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
       } else {
       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
       }
     */
    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));

    return slab;
}
static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
{
    gmx_domdec_comm_t *comm;
    ivec               coords;
    int                ddindex, nodeid = -1;

    comm = cr->dd->comm;

    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    if (comm->bCartesianPP_PME)
    {
#if GMX_MPI
        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
#endif
    }
    else
    {
        ddindex = dd_index(cr->dd->nc, coords);
        if (comm->bCartesianPP)
        {
            nodeid = comm->ddindex2simnodeid[ddindex];
        }
        else
        {
            if (comm->pmenodes)
            {
                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
            }
            else
            {
                nodeid = ddindex;
            }
        }
    }

    return nodeid;
}
static int dd_simnode2pmenode(const gmx_domdec_t *dd,
                              const t_commrec gmx_unused *cr,
                              int sim_nodeid)
{
    int pmenode = -1;

    const gmx_domdec_comm_t *comm = dd->comm;

    /* This assumes a uniform x domain decomposition grid cell size */
    if (comm->bCartesianPP_PME)
    {
#if GMX_MPI
        ivec coord, coord_pme;
        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            /* This is a PP node */
            dd_cart_coord2pmecoord(dd, coord, coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
        }
#endif
    }
    else if (comm->bCartesianPP)
    {
        if (sim_nodeid < dd->nnodes)
        {
            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
        }
    }
    else
    {
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (dd->comm->pmenodes == nullptr)
        {
            if (sim_nodeid < dd->nnodes)
            {
                /* The DD index equals the nodeid */
                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
            }
        }
        else
        {
            int i = 0;
            while (sim_nodeid > dd->comm->pmenodes[i])
            {
                i++;
            }
            if (sim_nodeid < dd->comm->pmenodes[i])
            {
                pmenode = dd->comm->pmenodes[i];
            }
        }
    }

    return pmenode;
}
void get_pme_nnodes(const gmx_domdec_t *dd,
                    int *npmenodes_x, int *npmenodes_y)
{
    if (dd != nullptr)
    {
        *npmenodes_x = dd->comm->npmenodes_x;
        *npmenodes_y = dd->comm->npmenodes_y;
    }
    else
    {
        *npmenodes_x = 1;
        *npmenodes_y = 1;
    }
}
void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
{
    gmx_domdec_t *dd;
    int           x, y, z;
    ivec          coord, coord_pme;

    dd = cr->dd;

    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);

    *nmy_ddnodes = 0;
    for (x = 0; x < dd->nc[XX]; x++)
    {
        for (y = 0; y < dd->nc[YY]; y++)
        {
            for (z = 0; z < dd->nc[ZZ]; z++)
            {
                if (dd->comm->bCartesianPP_PME)
                {
                    coord[XX] = x;
                    coord[YY] = y;
                    coord[ZZ] = z;
                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
                    if (dd->ci[XX] == coord_pme[XX] &&
                        dd->ci[YY] == coord_pme[YY] &&
                        dd->ci[ZZ] == coord_pme[ZZ])
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
                else
                {
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
            }
        }
    }

    /* The last PP-only node is the peer node */
    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];

    if (debug)
    {
        fprintf(debug, "Receive coordinates from PP ranks:");
        for (x = 0; x < *nmy_ddnodes; x++)
        {
            fprintf(debug, " %d", (*my_ddnodes)[x]);
        }
        fprintf(debug, "\n");
    }
}
static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
{
    gmx_bool bReceive = TRUE;

    if (cr->npmenodes < dd->nnodes)
    {
        gmx_domdec_comm_t *comm = dd->comm;
        if (comm->bCartesianPP_PME)
        {
#if GMX_MPI
            int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
            ivec coords;
            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
            coords[comm->cartpmedim]++;
            if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
            {
                int rank;
                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
                {
                    /* This is not the last PP node for pmenode */
                    bReceive = FALSE;
                }
            }
#else
            GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
#endif
        }
        else
        {
            int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
            if (cr->sim_nodeid+1 < cr->nnodes &&
                dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
            {
                /* This is not the last PP node for pmenode */
                bReceive = FALSE;
            }
        }
    }

    return bReceive;
}
static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;
    int                 i;

    zones = &dd->comm->zones;

    zones->cg_range[0] = 0;
    for (i = 1; i < zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
    /* zone_ncg1[0] should always be equal to ncg_home */
    dd->comm->zone_ncg1[0] = dd->ncg_home;
}
static void rebuild_cgindex(gmx_domdec_t *dd,
                            const int *gcgs_index, const t_state *state)
{
    int * gmx_restrict dd_cg_gl = dd->index_gl;
    int * gmx_restrict cgindex  = dd->cgindex;
    int                nat      = 0;

    /* Copy back the global charge group indices from state
     * and rebuild the local charge group to atom index.
     */
    for (unsigned int i = 0; i < state->cg_gl.size(); i++)
    {
        cgindex[i]  = nat;
        int cg_gl   = state->cg_gl[i];
        dd_cg_gl[i] = cg_gl;
        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
    }
    cgindex[state->cg_gl.size()] = nat;

    dd->ncg_home = state->cg_gl.size();
    dd->nat_home = nat;

    set_zones_ncg_home(dd);
}
static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
{
    while (cg >= cginfo_mb->cg_end)
    {
        cginfo_mb++;
    }

    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
}
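
/* Illustrative note (not from the original source): cginfo_mb holds one entry
 * per block of identical molecules. For a block with cg_start = 100 holding
 * 100 copies of a molecule with cg_mod = 3 charge groups, global cg 250 maps
 * to entry (250 - 100) % 3 = 0, i.e. the first charge group of that molecule
 * type.
 */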
static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
                          t_forcerec *fr, char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;
    int         *cginfo;
    int          cg;

    if (fr != nullptr)
    {
        cginfo_mb = fr->cginfo_mb;
        cginfo    = fr->cginfo;

        for (cg = cg0; cg < cg1; cg++)
        {
            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
        }
    }

    if (bLocalCG != nullptr)
    {
        for (cg = cg0; cg < cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index, int cg_start)
{
    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
    gmx_bool     bCGs;

    if (dd->nat_tot > dd->gatindex_nalloc)
    {
        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
        srenew(dd->gatindex, dd->gatindex_nalloc);
    }

    nzone     = dd->comm->zones.n;
    zone2cg   = dd->comm->zones.cg_range;
    zone_ncg1 = dd->comm->zone_ncg1;
    index_gl  = dd->index_gl;
    gatindex  = dd->gatindex;
    bCGs      = dd->comm->bCGs;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    a = dd->cgindex[cg_start];
    for (zone = 0; zone < nzone; zone++)
    {
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        cg1    = zone2cg[zone+1];
        cg1_p1 = cg0 + zone_ncg1[zone];

        for (cg = cg0; cg < cg1; cg++)
        {
            zone1 = zone;
            if (cg >= cg1_p1)
            {
                /* Signal that this cg is from more than one pulse away */
                zone1 += nzone;
            }
            cg_gl = index_gl[cg];
            if (bCGs)
            {
                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                {
                    gatindex[a] = a_gl;
                    ga2la_set(dd->ga2la, a_gl, a, zone1);
                    a++;
                }
            }
            else
            {
                gatindex[a] = cg_gl;
                ga2la_set(dd->ga2la, cg_gl, a, zone1);
                a++;
            }
        }
    }
}
static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                          const char *where)
{
    int i, ngl, nerr;

    nerr = 0;
    if (bLocalCG == nullptr)
    {
        return nerr;
    }
    for (i = 0; i < dd->ncg_tot; i++)
    {
        if (!bLocalCG[dd->index_gl[i]])
        {
            fprintf(stderr,
                    "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
            nerr++;
        }
    }
    ngl = 0;
    for (i = 0; i < ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->ncg_tot)
    {
        fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
        nerr++;
    }

    return nerr;
}
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys, int ncg_sys,
                                    const char *where)
{
    int  nerr, ngl, i, a, cell;
    int *have;

    nerr = 0;

    if (dd->comm->DD_debug > 1)
    {
        snew(have, natoms_sys);
        for (a = 0; a < dd->nat_tot; a++)
        {
            if (have[dd->gatindex[a]] > 0)
            {
                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
            }
            else
            {
                have[dd->gatindex[a]] = a + 1;
            }
        }
        sfree(have);
    }

    snew(have, dd->nat_tot);

    ngl = 0;
    for (i = 0; i < natoms_sys; i++)
    {
        if (ga2la_get(dd->ga2la, i, &a, &cell))
        {
            if (a >= dd->nat_tot)
            {
                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
                nerr++;
            }
            else
            {
                have[a] = 1;
                if (dd->gatindex[a] != i)
                {
                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != dd->nat_tot)
    {
        fprintf(stderr,
                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank, where, ngl, dd->nat_tot);
    }
    for (a = 0; a < dd->nat_tot; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank, where, a+1, dd->gatindex[a]+1);
        }
    }
    sfree(have);

    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
                  dd->rank, where, nerr);
    }
}
static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
{
    int   i;
    char *bLocalCG;

    if (a_start == 0)
    {
        /* Clear the whole list without searching */
        ga2la_clear(dd->ga2la);
    }
    else
    {
        for (i = a_start; i < dd->nat_tot; i++)
        {
            ga2la_del(dd->ga2la, dd->gatindex[i]);
        }
    }

    bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for (i = cg_start; i < dd->ncg_tot; i++)
        {
            bLocalCG[dd->index_gl[i]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}
/* This function should be used for moving the domain boundaries during DLB,
 * for obtaining the minimum cell size. It checks the initially set limit
 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 * and, possibly, a longer cut-off limit set for PME load balancing.
 */
static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
{
    real cellsize_min;

    cellsize_min = comm->cellsize_min[dim];

    if (!comm->bVacDLBNoLimit)
    {
        /* The cut-off might have changed, e.g. by PME load balancing,
         * from the value used to set comm->cellsize_min, so check it.
         */
        cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);

        if (comm->bPMELoadBalDLBLimits)
        {
            /* Check for the cut-off limit set by the PME load balancing */
            cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
        }
    }

    return cellsize_min;
}
static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
                            int dim_ind)
{
    real grid_jump_limit;

    /* The distance between the boundaries of cells at distance
     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
     * and by the fact that cells should not be shifted by more than
     * half their size, such that cg's only shift by one cell
     * at redecomposition.
     */
    grid_jump_limit = comm->cellsize_limit;
    if (!comm->bVacDLBNoLimit)
    {
        if (comm->bPMELoadBalDLBLimits)
        {
            cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
        }
        grid_jump_limit = std::max(grid_jump_limit,
                                   cutoff/comm->cd[dim_ind].np);
    }

    return grid_jump_limit;
}
static gmx_bool check_grid_jump(gmx_int64_t step,
                                gmx_domdec_t *dd,
                                real cutoff,
                                gmx_ddbox_t *ddbox,
                                gmx_bool bFatal)
{
    gmx_domdec_comm_t *comm;
    int                d, dim;
    real               limit, bfac;
    gmx_bool           bInvalid;

    bInvalid = FALSE;

    comm = dd->comm;

    for (d = 1; d < dd->ndim; d++)
    {
        dim   = dd->dim[d];
        limit = grid_jump_limit(comm, cutoff, d);
        bfac  = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
        {
            bInvalid = TRUE;

            if (bFatal)
            {
                char buf[22];

                /* This error should never be triggered under normal
                 * circumstances, but you never know ...
                 */
                gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
                          gmx_step_str(step, buf),
                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
            }
        }
    }

    return bInvalid;
}
static int dd_load_count(gmx_domdec_comm_t *comm)
{
    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
}
static float dd_force_load(gmx_domdec_comm_t *comm)
{
    float load;

    if (comm->eFlop)
    {
        load = comm->flop;
        if (comm->eFlop > 1)
        {
            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
        }
    }
    else
    {
        load = comm->cycl[ddCyclF];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* Subtract the maximum of the last n cycle counts
             * to get rid of possible high counts due to other sources,
             * for instance system activity, that would otherwise
             * affect the dynamic load balancing.
             */
            load -= comm->cycl_max[ddCyclF];
        }

#if GMX_MPI
        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
        {
            float gpu_wait, gpu_wait_sum;

            gpu_wait = comm->cycl[ddCyclWaitGPU];
            if (comm->cycl_n[ddCyclF] > 1)
            {
                /* We should remove the WaitGPU time of the same MD step
                 * as the one with the maximum F time, since the F time
                 * and the wait time are not independent.
                 * Furthermore, the step for the max F time should be chosen
                 * the same on all ranks that share the same GPU.
                 * But to keep the code simple, we remove the average instead.
                 * The main reason for artificially long times at some steps
                 * is spurious CPU activity or MPI time, so we don't expect
                 * that changes in the GPU wait time matter a lot here.
                 */
                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
            }
            /* Sum the wait times over the ranks that share the same GPU */
            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                          comm->mpi_comm_gpu_shared);
            /* Replace the wait time by the average over the ranks */
            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
        }
#endif
    }

    return load;
}
static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
{
    gmx_domdec_comm_t *comm;
    int                i;

    comm = dd->comm;

    snew(*dim_f, dd->nc[dim]+1);
    (*dim_f)[0] = 0;
    for (i = 1; i < dd->nc[dim]; i++)
    {
        if (comm->slb_frac[dim])
        {
            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
        }
        else
        {
            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
        }
    }
    (*dim_f)[dd->nc[dim]] = 1;
}
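
/* Illustrative example (not from the original source): for dd->nc[dim] = 4
 * with uniform cells, dim_f becomes {0, 0.25, 0.5, 0.75, 1}; with
 * slb_frac = {0.4, 0.3, 0.2, 0.1} it becomes the cumulative
 * {0, 0.4, 0.7, 0.9, 1}. These boundary fractions are consumed by
 * set_pme_maxshift below as the cell_f argument.
 */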
static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
{
    int  pmeindex, slab, nso, i;
    ivec xyz;

    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
    {
        ddpme->dim = YY;
    }
    else
    {
        ddpme->dim = dimind;
    }
    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);

    ddpme->nslab = (ddpme->dim == 0 ?
                    dd->comm->npmenodes_x :
                    dd->comm->npmenodes_y);

    if (ddpme->nslab <= 1)
    {
        return;
    }

    nso = dd->comm->npmenodes/ddpme->nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min, ddpme->nslab);
    snew(ddpme->pp_max, ddpme->nslab);
    for (slab = 0; slab < ddpme->nslab; slab++)
    {
        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    }
    for (i = 0; i < dd->nnodes; i++)
    {
        ddindex2xyz(dd->nc, i, xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX])
        {
            pmeindex = ddindex2pmeindex(dd, i);
            if (dimind == 0)
            {
                slab = pmeindex/nso;
            }
            else
            {
                slab = pmeindex % ddpme->nslab;
            }
            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
        }
    }

    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
}
int dd_pme_maxshift_x(const gmx_domdec_t *dd)
{
    if (dd->comm->ddpme[0].dim == XX)
    {
        return dd->comm->ddpme[0].maxshift;
    }
    else
    {
        return 0;
    }
}

int dd_pme_maxshift_y(const gmx_domdec_t *dd)
{
    if (dd->comm->ddpme[0].dim == YY)
    {
        return dd->comm->ddpme[0].maxshift;
    }
    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
    {
        return dd->comm->ddpme[1].maxshift;
    }
    else
    {
        return 0;
    }
}
static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
                             gmx_bool bUniform, const gmx_ddbox_t *ddbox,
                             const real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int                nc, ns, s;
    int               *xmin, *xmax;
    real               range, pme_boundary;
    int                sh;

    comm = dd->comm;
    nc   = dd->nc[ddpme->dim];
    ns   = ddpme->nslab;

    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance between
         * performance and support for most charge-group/cut-off
         * schemes.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        sh = 1;
        for (s = 0; s < ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug, "PME slab communication range for dim %d is %d\n",
                ddpme->dim, ddpme->maxshift);
    }
}
static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int d, dim;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];
        if (dim < ddbox->nboundeddim &&
            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
        {
            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                      dd->nc[dim], dd->comm->cellsize_limit);
        }
    }
}
2818 setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
2821 /* Set the domain boundaries. Use for static (or no) load balancing,
2822 * and also for the starting state for dynamic load balancing.
2823 * setmode determines if and where the boundaries are stored; use the enum above.
2824 * Returns the number of communication pulses in npulse.
2826 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
2827 int setmode, ivec npulse)
2829 gmx_domdec_comm_t *comm;
2832 real *cell_x, cell_dx, cellsize;
2836 for (d = 0; d < DIM; d++)
2838 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2840 if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
2843 cell_dx = ddbox->box_size[d]/dd->nc[d];
2846 case setcellsizeslbMASTER:
2847 for (j = 0; j < dd->nc[d]+1; j++)
2849 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2852 case setcellsizeslbLOCAL:
2853 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2854 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2859 cellsize = cell_dx*ddbox->skew_fac[d];
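/* Each communication pulse transfers one cell layer, so increase
 * npulse until npulse cell widths cover the cut-off distance.
 */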
2860 while (cellsize*npulse[d] < comm->cutoff)
2864 cellsize_min[d] = cellsize;
2868 /* Statically load balanced grid */
2869 /* Even when we are not doing a master distribution, we determine
2870  * all cell borders in a loop, to obtain values identical to the
2871  * master distribution case and to determine npulse.
2873 if (setmode == setcellsizeslbMASTER)
2875 cell_x = dd->ma->cell_x[d];
2879 snew(cell_x, dd->nc[d]+1);
2881 cell_x[0] = ddbox->box0[d];
2882 for (j = 0; j < dd->nc[d]; j++)
2884 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2885 cell_x[j+1] = cell_x[j] + cell_dx;
2886 cellsize = cell_dx*ddbox->skew_fac[d];
2887 while (cellsize*npulse[d] < comm->cutoff &&
2888 npulse[d] < dd->nc[d]-1)
2892 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
2894 if (setmode == setcellsizeslbLOCAL)
2896 comm->cell_x0[d] = cell_x[dd->ci[d]];
2897 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2899 if (setmode != setcellsizeslbMASTER)
2904 /* The following limitation avoids a cell receiving some of its own
2905  * home charge groups back over the periodic boundary.
2906  * Duplicated charge groups cause trouble with the global indices.
2908 if (d < ddbox->npbcdim &&
2909 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2911 char error_string[STRLEN];
2913 sprintf(error_string,
2914 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2915 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
2917 dd->nc[d], dd->nc[d],
2918 dd->nnodes > dd->nc[d] ? "cells" : "ranks");
2920 if (setmode == setcellsizeslbLOCAL)
2922 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
2927 gmx_fatal(FARGS, error_string);
2934 copy_rvec(cellsize_min, comm->cellsize_min);
2937 for (d = 0; d < comm->npmedecompdim; d++)
2939 set_pme_maxshift(dd, &comm->ddpme[d],
2940 comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
2941 comm->ddpme[d].slb_dim_f);
2946 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2947 int d, int dim, domdec_root_t *root,
2948 const gmx_ddbox_t *ddbox,
2949 gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
2951 gmx_domdec_comm_t *comm;
2952 int ncd, i, j, nmin, nmin_old;
2953 gmx_bool bLimLo, bLimHi;
2955 real fac, halfway, cellsize_limit_f_i, region_size;
2956 gmx_bool bPBC, bLastHi = FALSE;
2957 int nrange[] = {range[0], range[1]};
2959 region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
2965 bPBC = (dim < ddbox->npbcdim);
2967 cell_size = root->buf_ncd;
2971 fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
2974 /* First we need to check if the scaling does not make cells
2975 * smaller than the smallest allowed size.
2976 * We need to do this iteratively, since if a cell is too small,
2977 * it needs to be enlarged, which makes all the other cells smaller,
2978 * which could in turn make another cell smaller than allowed.
2980 for (i = range[0]; i < range[1]; i++)
2982 root->bCellMin[i] = FALSE;
2988 /* We need the total for normalization */
2990 for (i = range[0]; i < range[1]; i++)
2992 if (root->bCellMin[i] == FALSE)
2994 fac += cell_size[i];
2997 fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
2998 /* Determine the cell boundaries */
2999 for (i = range[0]; i < range[1]; i++)
3001 if (root->bCellMin[i] == FALSE)
3003 cell_size[i] *= fac;
3004 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3006 cellsize_limit_f_i = 0;
3010 cellsize_limit_f_i = cellsize_limit_f;
3012 if (cell_size[i] < cellsize_limit_f_i)
3014 root->bCellMin[i] = TRUE;
3015 cell_size[i] = cellsize_limit_f_i;
3019 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3022 while (nmin > nmin_old);
3025 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3026 /* For this check we should not use DD_CELL_MARGIN,
3027 * but a slightly smaller factor,
3028 * since rounding could get us below the limit.
3030 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3033 gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3034 gmx_step_str(step, buf),
3035 dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3036 ncd, comm->cellsize_min[dim]);
3039 root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3043 /* Check if the boundary did not displace more than halfway
3044 * each of the cells it bounds, as this could cause problems,
3045 * especially when the differences between cell sizes are large.
3046 * If changes are applied, they will not make cells smaller
3047 * than the cut-off, as we check all the boundaries which
3048 * might be affected by a change and if the old state was ok,
3049 * the cells will at most be shrunk back to their old size.
3051 for (i = range[0]+1; i < range[1]; i++)
3053 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3054 if (root->cell_f[i] < halfway)
3056 root->cell_f[i] = halfway;
3057 /* Check if the change also causes shifts of the next boundaries */
3058 for (j = i+1; j < range[1]; j++)
3060 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3062 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3066 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3067 if (root->cell_f[i] > halfway)
3069 root->cell_f[i] = halfway;
3070 /* Check if the change also causes shifts of the next boundaries */
3071 for (j = i-1; j >= range[0]+1; j--)
3073 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3075 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3082 /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
3083 /* Find the highest violation of LimLo (a) and the first following violation of LimHi (b),
3084  * then call enforce_limits for (oldb,a) and (a,b); in the next step for (b,nexta).
3085  * oldb and nexta can be the outer boundaries; nrange is used to pass a and b. */
3088 /* Take care of the staggering of the cell boundaries */
3091 for (i = range[0]; i < range[1]; i++)
3093 root->cell_f_max0[i] = root->cell_f[i];
3094 root->cell_f_min1[i] = root->cell_f[i+1];
3099 for (i = range[0]+1; i < range[1]; i++)
3101 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3102 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3103 if (bLimLo && bLimHi)
3105 /* Both limits violated, try the best we can */
3106 /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3107 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3108 nrange[0] = range[0];
3110 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3113 nrange[1] = range[1];
3114 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3120 /* root->cell_f[i] = root->bound_min[i]; */
3121 nrange[1] = i; /* only store the violation location; there could be a following LimLo violation with a higher index */
3124 else if (bLimHi && !bLastHi)
3127 if (nrange[1] < range[1]) /* found a LimLo before */
3129 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3130 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3131 nrange[0] = nrange[1];
3133 root->cell_f[i] = root->bound_max[i];
3135 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3137 nrange[1] = range[1];
3140 if (nrange[1] < range[1])   /* found a LimLo last */
3142 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3143 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3144 nrange[0] = nrange[1];
3145 nrange[1] = range[1];
3146 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3148 else if (nrange[0] > range[0]) /* found at least one LimHi */
3150 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3157 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3158 int d, int dim, domdec_root_t *root,
3159 const gmx_ddbox_t *ddbox,
3160 gmx_bool bDynamicBox,
3161 gmx_bool bUniform, gmx_int64_t step)
3163 gmx_domdec_comm_t *comm;
3164 int ncd, d1, i, pos;
3166 real load_aver, load_i, imbalance, change, change_max, sc;
3167 real cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3171 int range[] = { 0, 0 };
3175 /* Convert the maximum change from the input percentage to a fraction */
3176 change_limit = comm->dlb_scale_lim*0.01;
3180 bPBC = (dim < ddbox->npbcdim);
3182 cell_size = root->buf_ncd;
3184 /* Store the original boundaries */
3185 for (i = 0; i < ncd+1; i++)
3187 root->old_cell_f[i] = root->cell_f[i];
3191 for (i = 0; i < ncd; i++)
3193 cell_size[i] = 1.0/ncd;
3196 else if (dd_load_count(comm) > 0)
3198 load_aver = comm->load[d].sum_m/ncd;
3200 for (i = 0; i < ncd; i++)
3202 /* Determine the relative imbalance of cell i */
3203 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3204 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3205 /* Determine the change of the cell size using underrelaxation */
3206 change = -relax*imbalance;
3207 change_max = std::max(change_max, std::max(change, -change));
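/* Illustrative numbers (hypothetical): a cell carrying 10% more load
 * than average (imbalance = 0.1) with relax = 0.5 is shrunk by 5%
 * (change = -0.05); the underrelaxation damps oscillations of the
 * balancing from step to step.
 */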
3209 /* Limit the amount of scaling.
3210 * We need to use the same rescaling for all cells in one row,
3211 * otherwise the load balancing might not converge.
3214 if (change_max > change_limit)
3216 sc *= change_limit/change_max;
3218 for (i = 0; i < ncd; i++)
3220 /* Determine the relative imbalance of cell i */
3221 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3222 imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3223 /* Determine the change of the cell size using underrelaxation */
3224 change = -sc*imbalance;
3225 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3229 cellsize_limit_f = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3230 cellsize_limit_f *= DD_CELL_MARGIN;
3231 dist_min_f_hard = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3232 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3233 if (ddbox->tric_dir[dim])
3235 cellsize_limit_f /= ddbox->skew_fac[dim];
3236 dist_min_f /= ddbox->skew_fac[dim];
3238 if (bDynamicBox && d > 0)
3240 dist_min_f *= DD_PRES_SCALE_MARGIN;
3242 if (d > 0 && !bUniform)
3244 /* Make sure that the grid is not shifted too much */
3245 for (i = 1; i < ncd; i++)
3247 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3249 gmx_incons("Inconsistent DD boundary staggering limits!");
3251 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3252 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3255 root->bound_min[i] += 0.5*space;
3257 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3258 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3261 root->bound_max[i] += 0.5*space;
3266 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3268 root->cell_f_max0[i-1] + dist_min_f,
3269 root->bound_min[i], root->cell_f[i], root->bound_max[i],
3270 root->cell_f_min1[i] - dist_min_f);
3275 root->cell_f[0] = 0;
3276 root->cell_f[ncd] = 1;
3277 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3280 /* After the checks above, the cells should obey the cut-off
3281 * restrictions, but it does not hurt to check.
3283 for (i = 0; i < ncd; i++)
3287 fprintf(debug, "Relative bounds dim %d cell %d: %f %f\n",
3288 dim, i, root->cell_f[i], root->cell_f[i+1]);
3291 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3292 root->cell_f[i+1] - root->cell_f[i] <
3293 cellsize_limit_f/DD_CELL_MARGIN)
3297 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3298 gmx_step_str(step, buf), dim2char(dim), i,
3299 (root->cell_f[i+1] - root->cell_f[i])
3300 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3305 /* Store the cell boundaries of the lower dimensions at the end */
3306 for (d1 = 0; d1 < d; d1++)
3308 root->cell_f[pos++] = comm->cell_f0[d1];
3309 root->cell_f[pos++] = comm->cell_f1[d1];
3312 if (d < comm->npmedecompdim)
3314 /* The master determines the maximum shift for
3315 * the coordinate communication between separate PME nodes.
3317 set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3319 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3322 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3326 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3327 const gmx_ddbox_t *ddbox,
3330 gmx_domdec_comm_t *comm;
3335 /* Set the cell dimensions */
3336 dim = dd->dim[dimind];
3337 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3338 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3339 if (dim >= ddbox->nboundeddim)
3341 comm->cell_x0[dim] += ddbox->box0[dim];
3342 comm->cell_x1[dim] += ddbox->box0[dim];
3346 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3347 int d, int dim, real *cell_f_row,
3348 const gmx_ddbox_t *ddbox)
3350 gmx_domdec_comm_t *comm;
3356 /* Each node would only need to know two fractions,
3357 * but it is probably cheaper to broadcast the whole array.
3359 MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3360 0, comm->mpi_comm_load[d]);
3362 /* Copy the fractions for this dimension from the buffer */
3363 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3364 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3365 /* The whole array was communicated, so set the buffer position */
3366 pos = dd->nc[dim] + 1;
3367 for (d1 = 0; d1 <= d; d1++)
3371 /* Copy the cell fractions of the lower dimensions */
3372 comm->cell_f0[d1] = cell_f_row[pos++];
3373 comm->cell_f1[d1] = cell_f_row[pos++];
3375 relative_to_absolute_cell_bounds(dd, ddbox, d1);
3377 /* Convert the communicated shift from float to int */
3378 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3381 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3385 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3386 const gmx_ddbox_t *ddbox,
3387 gmx_bool bDynamicBox,
3388 gmx_bool bUniform, gmx_int64_t step)
3390 gmx_domdec_comm_t *comm;
3392 gmx_bool bRowMember, bRowRoot;
3397 for (d = 0; d < dd->ndim; d++)
3402 for (d1 = d; d1 < dd->ndim; d1++)
3404 if (dd->ci[dd->dim[d1]] > 0)
3417 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3418 ddbox, bDynamicBox, bUniform, step);
3419 cell_f_row = comm->root[d]->cell_f;
3423 cell_f_row = comm->cell_f_row;
3425 distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3430 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,
3431 const gmx_ddbox_t *ddbox)
3435 /* The relative cell fractions are unchanged in this case, but the
3436  * box dimensions may have changed, so we recompute the absolute
3437  * cell boundaries from the stored fractions.
3439 for (d = 0; d < dd->ndim; d++)
3441 relative_to_absolute_cell_bounds(dd, ddbox, d);
3447 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3448 const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3449 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3450 gmx_wallcycle_t wcycle)
3452 gmx_domdec_comm_t *comm;
3459 wallcycle_start(wcycle, ewcDDCOMMBOUND);
3460 set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3461 wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3463 else if (bDynamicBox)
3465 set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3468 /* Set the dimensions for which no DD is used */
3469 for (dim = 0; dim < DIM; dim++)
3471 if (dd->nc[dim] == 1)
3473 comm->cell_x0[dim] = 0;
3474 comm->cell_x1[dim] = ddbox->box_size[dim];
3475 if (dim >= ddbox->nboundeddim)
3477 comm->cell_x0[dim] += ddbox->box0[dim];
3478 comm->cell_x1[dim] += ddbox->box0[dim];
3484 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3487 gmx_domdec_comm_dim_t *cd;
3489 for (d = 0; d < dd->ndim; d++)
3491 cd = &dd->comm->cd[d];
3492 np = npulse[dd->dim[d]];
3493 if (np > cd->np_nalloc)
3497 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3498 dim2char(dd->dim[d]), np);
3500 if (DDMASTER(dd) && cd->np_nalloc > 0)
3502 fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3504 srenew(cd->ind, np);
3505 for (i = cd->np_nalloc; i < np; i++)
3507 cd->ind[i].index = nullptr;
3508 cd->ind[i].nalloc = 0;
3517 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3518 gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3519 gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3520 gmx_wallcycle_t wcycle)
3522 gmx_domdec_comm_t *comm;
3528 /* Copy the old cell boundaries for the cg displacement check */
3529 copy_rvec(comm->cell_x0, comm->old_cell_x0);
3530 copy_rvec(comm->cell_x1, comm->old_cell_x1);
3536 check_box_size(dd, ddbox);
3538 set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3542 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3543 realloc_comm_ind(dd, npulse);
3548 for (d = 0; d < DIM; d++)
3550 fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3551 d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3556 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3558 rvec cell_ns_x0, rvec cell_ns_x1,
3561 gmx_domdec_comm_t *comm;
3566 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3568 dim = dd->dim[dim_ind];
3570 /* Without PBC we don't have restrictions on the outer cells */
3571 if (!(dim >= ddbox->npbcdim &&
3572 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3574 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3575 comm->cellsize_min[dim])
3578 gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3579 gmx_step_str(step, buf), dim2char(dim),
3580 comm->cell_x1[dim] - comm->cell_x0[dim],
3581 ddbox->skew_fac[dim],
3582 dd->comm->cellsize_min[dim],
3583 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3587 if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3589 /* Communicate the boundaries and update cell_ns_x0/1 */
3590 dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3591 if (dlbIsOn(dd->comm) && dd->ndim > 1)
3593 check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3598 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3602 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3610 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3611 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
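/* The correction matrix tcm maps Cartesian to box-relative (lattice)
 * coordinates for triclinic boxes: pos_d = x[d] + sum_{j>d} x[j]*tcm[j][d].
 * The entries follow from inverting the lower-triangular box matrix,
 * e.g. tcm[YY][XX] = -box[YY][XX]/box[YY][YY] removes the dependence of
 * the x lattice coordinate on y.
 */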
3620 static void check_screw_box(matrix box)
3622 /* Mathematical limitation */
3623 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3625 gmx_fatal(FARGS, "With screw pbc the unit cell cannot have non-zero off-diagonal x-components");
3628 /* Limitation due to the asymmetry of the eighth shell method */
3629 if (box[ZZ][YY] != 0)
3631 gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3635 static void distribute_cg(FILE *fplog,
3636 matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3639 gmx_domdec_master_t *ma;
3640 int **tmp_ind = nullptr, *tmp_nalloc = nullptr;
3641 int i, icg, j, k, k0, k1, d;
3645 real nrcg, inv_ncg, pos_d;
3651 snew(tmp_nalloc, dd->nnodes);
3652 snew(tmp_ind, dd->nnodes);
3653 for (i = 0; i < dd->nnodes; i++)
3655 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3656 snew(tmp_ind[i], tmp_nalloc[i]);
3659 /* Clear the count */
3660 for (i = 0; i < dd->nnodes; i++)
3666 make_tric_corr_matrix(dd->npbcdim, box, tcm);
3668 cgindex = cgs->index;
3670 /* Compute the center of geometry for all charge groups */
3671 for (icg = 0; icg < cgs->nr; icg++)
3674 k1 = cgindex[icg+1];
3678 copy_rvec(pos[k0], cg_cm);
3685 for (k = k0; (k < k1); k++)
3687 rvec_inc(cg_cm, pos[k]);
3689 for (d = 0; (d < DIM); d++)
3691 cg_cm[d] *= inv_ncg;
3694 /* Put the charge group in the box and determine the cell index */
3695 for (d = DIM-1; d >= 0; d--)
3698 if (d < dd->npbcdim)
3700 bScrew = (dd->bScrewPBC && d == XX);
3701 if (tric_dir[d] && dd->nc[d] > 1)
3703 /* Use triclinic coordinates for this dimension */
3704 for (j = d+1; j < DIM; j++)
3706 pos_d += cg_cm[j]*tcm[j][d];
3709 while (pos_d >= box[d][d])
3712 rvec_dec(cg_cm, box[d]);
3715 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3716 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3718 for (k = k0; (k < k1); k++)
3720 rvec_dec(pos[k], box[d]);
3723 pos[k][YY] = box[YY][YY] - pos[k][YY];
3724 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3731 rvec_inc(cg_cm, box[d]);
3734 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3735 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3737 for (k = k0; (k < k1); k++)
3739 rvec_inc(pos[k], box[d]);
3742 pos[k][YY] = box[YY][YY] - pos[k][YY];
3743 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3748 /* This could be done more efficiently */
3750 while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3755 i = dd_index(dd->nc, ind);
3756 if (ma->ncg[i] == tmp_nalloc[i])
3758 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3759 srenew(tmp_ind[i], tmp_nalloc[i]);
3761 tmp_ind[i][ma->ncg[i]] = icg;
3763 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3767 for (i = 0; i < dd->nnodes; i++)
3770 for (k = 0; k < ma->ncg[i]; k++)
3772 ma->cg[k1++] = tmp_ind[i][k];
3775 ma->index[dd->nnodes] = k1;
3777 for (i = 0; i < dd->nnodes; i++)
3786 // Use double for the sums to avoid natoms^2 overflowing
3788 int nat_sum, nat_min, nat_max;
3793 nat_min = ma->nat[0];
3794 nat_max = ma->nat[0];
3795 for (i = 0; i < dd->nnodes; i++)
3797 nat_sum += ma->nat[i];
3798 // cast to double to avoid integer overflows when squaring
3799 nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
3800 nat_min = std::min(nat_min, ma->nat[i]);
3801 nat_max = std::max(nat_max, ma->nat[i]);
3803 nat_sum /= dd->nnodes;
3804 nat2_sum /= dd->nnodes;
3806 fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3809 static_cast<int>(std::sqrt(std::max(0.0, nat2_sum - gmx::square(static_cast<double>(nat_sum)))) + 0.5),
3814 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
3815 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3818 gmx_domdec_master_t *ma = nullptr;
3821 int *ibuf, buf2[2] = { 0, 0 };
3822 gmx_bool bMaster = DDMASTER(dd);
3830 check_screw_box(box);
3833 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
3835 distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
3836 for (i = 0; i < dd->nnodes; i++)
3838 ma->ibuf[2*i] = ma->ncg[i];
3839 ma->ibuf[2*i+1] = ma->nat[i];
3847 dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
3849 dd->ncg_home = buf2[0];
3850 dd->nat_home = buf2[1];
3851 dd->ncg_tot = dd->ncg_home;
3852 dd->nat_tot = dd->nat_home;
3853 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3855 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3856 srenew(dd->index_gl, dd->cg_nalloc);
3857 srenew(dd->cgindex, dd->cg_nalloc+1);
3861 for (i = 0; i < dd->nnodes; i++)
3863 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3864 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3869 bMaster ? ma->ibuf : nullptr,
3870 bMaster ? ma->ibuf+dd->nnodes : nullptr,
3871 bMaster ? ma->cg : nullptr,
3872 dd->ncg_home*sizeof(int), dd->index_gl);
3874 /* Determine the home charge group sizes */
3876 for (i = 0; i < dd->ncg_home; i++)
3878 cg_gl = dd->index_gl[i];
3880 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3885 fprintf(debug, "Home charge groups:\n");
3886 for (i = 0; i < dd->ncg_home; i++)
3888 fprintf(debug, " %d", dd->index_gl[i]);
3891 fprintf(debug, "\n");
3894 fprintf(debug, "\n");
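/* The send buffers in comm->cgcm_state hold, per moved charge group,
 * first the charge-group center (one rvec) and then, for each of the
 * nvec state vectors, nrcg atom rvecs. The pos_vec bookkeeping in the
 * two routines below skips over the entries belonging to other vectors.
 */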
3898 static int compact_and_copy_vec_at(int ncg, int *move,
3901 rvec *src, gmx_domdec_comm_t *comm,
3904 int m, icg, i, i0, i1, nrcg;
3910 for (m = 0; m < DIM*2; m++)
3916 for (icg = 0; icg < ncg; icg++)
3918 i1 = cgindex[icg+1];
3924 /* Compact the home array in place */
3925 for (i = i0; i < i1; i++)
3927 copy_rvec(src[i], src[home_pos++]);
3933 /* Copy to the communication buffer */
3935 pos_vec[m] += 1 + vec*nrcg;
3936 for (i = i0; i < i1; i++)
3938 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
3940 pos_vec[m] += (nvec - vec - 1)*nrcg;
3944 home_pos += i1 - i0;
3952 static int compact_and_copy_vec_cg(int ncg, int *move,
3954 int nvec, rvec *src, gmx_domdec_comm_t *comm,
3957 int m, icg, i0, i1, nrcg;
3963 for (m = 0; m < DIM*2; m++)
3969 for (icg = 0; icg < ncg; icg++)
3971 i1 = cgindex[icg+1];
3977 /* Compact the home array in place */
3978 copy_rvec(src[icg], src[home_pos++]);
3984 /* Copy to the communication buffer */
3985 copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
3986 pos_vec[m] += 1 + nrcg*nvec;
3998 static int compact_ind(int ncg, int *move,
3999 int *index_gl, int *cgindex,
4001 gmx_ga2la_t *ga2la, char *bLocalCG,
4004 int cg, nat, a0, a1, a, a_gl;
4009 for (cg = 0; cg < ncg; cg++)
4015 /* Compact the home arrays in place.
4016 * Anything that can be done here avoids access to global arrays.
4018 cgindex[home_pos] = nat;
4019 for (a = a0; a < a1; a++)
4022 gatindex[nat] = a_gl;
4023 /* The cell number stays 0, so we don't need to set it */
4024 ga2la_change_la(ga2la, a_gl, nat);
4027 index_gl[home_pos] = index_gl[cg];
4028 cginfo[home_pos] = cginfo[cg];
4029 /* The charge group remains local, so bLocalCG does not change */
4034 /* Clear the global indices */
4035 for (a = a0; a < a1; a++)
4037 ga2la_del(ga2la, gatindex[a]);
4041 bLocalCG[index_gl[cg]] = FALSE;
4045 cgindex[home_pos] = nat;
4050 static void clear_and_mark_ind(int ncg, int *move,
4051 int *index_gl, int *cgindex, int *gatindex,
4052 gmx_ga2la_t *ga2la, char *bLocalCG,
4057 for (cg = 0; cg < ncg; cg++)
4063 /* Clear the global indices */
4064 for (a = a0; a < a1; a++)
4066 ga2la_del(ga2la, gatindex[a]);
4070 bLocalCG[index_gl[cg]] = FALSE;
4072 /* Signal that this cg has moved using the ns cell index.
4073 * Here we set it to -1. fill_grid will change it
4074 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4076 cell_index[cg] = -1;
4081 static void print_cg_move(FILE *fplog,
4083 gmx_int64_t step, int cg, int dim, int dir,
4084 gmx_bool bHaveCgcmOld, real limitd,
4085 rvec cm_old, rvec cm_new, real pos_d)
4087 gmx_domdec_comm_t *comm;
4092 fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4095 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4096 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4097 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4101 /* We don't have a limiting distance available: don't print it */
4102 fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4103 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4104 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4106 fprintf(fplog, "distance out of cell %f\n",
4107 dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4110 fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4111 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4113 fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4114 cm_new[XX], cm_new[YY], cm_new[ZZ]);
4115 fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4117 comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4118 fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4120 comm->cell_x0[dim], comm->cell_x1[dim]);
4123 static void cg_move_error(FILE *fplog,
4125 gmx_int64_t step, int cg, int dim, int dir,
4126 gmx_bool bHaveCgcmOld, real limitd,
4127 rvec cm_old, rvec cm_new, real pos_d)
4131 print_cg_move(fplog, dd, step, cg, dim, dir,
4132 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4134 print_cg_move(stderr, dd, step, cg, dim, dir,
4135 bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4137 "%s moved too far between two domain decomposition steps\n"
4138 "This usually means that your system is not well equilibrated",
4139 dd->comm->bCGs ? "A charge group" : "An atom");
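/* With screw pbc, crossing the periodic boundary in x corresponds to a
 * 180 degree rotation, so rotate_state_atom below mirrors the y and z
 * position components in the box and flips the sign of the y and z
 * components of velocity-like state vectors.
 */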
4142 static void rotate_state_atom(t_state *state, int a)
4146 for (est = 0; est < estNR; est++)
4148 if (EST_DISTR(est) && (state->flags & (1<<est)))
4153 /* Rotate the complete state; for a rectangular box only */
4154 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4155 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4158 state->v[a][YY] = -state->v[a][YY];
4159 state->v[a][ZZ] = -state->v[a][ZZ];
4161 case est_SDX_NOTSUPPORTED:
4164 state->cg_p[a][YY] = -state->cg_p[a][YY];
4165 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4167 case estDISRE_INITF:
4168 case estDISRE_RM3TAV:
4169 case estORIRE_INITF:
4171 /* These are distances, so not affected by rotation */
4174 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4180 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4182 if (natoms > comm->moved_nalloc)
4184 /* Contents should be preserved here */
4185 comm->moved_nalloc = over_alloc_dd(natoms);
4186 srenew(comm->moved, comm->moved_nalloc);
4192 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4195 ivec tric_dir, matrix tcm,
4196 rvec cell_x0, rvec cell_x1,
4197 rvec limitd, rvec limit0, rvec limit1,
4199 int cg_start, int cg_end,
4204 int cg, k, k0, k1, d, dim, d2;
4209 real inv_ncg, pos_d;
4212 npbcdim = dd->npbcdim;
4214 for (cg = cg_start; cg < cg_end; cg++)
4221 copy_rvec(state->x[k0], cm_new);
4228 for (k = k0; (k < k1); k++)
4230 rvec_inc(cm_new, state->x[k]);
4232 for (d = 0; (d < DIM); d++)
4234 cm_new[d] = inv_ncg*cm_new[d];
4239 /* Do pbc and check DD cell boundary crossings */
4240 for (d = DIM-1; d >= 0; d--)
4244 bScrew = (dd->bScrewPBC && d == XX);
4245 /* Determine the location of this cg in lattice coordinates */
4249 for (d2 = d+1; d2 < DIM; d2++)
4251 pos_d += cm_new[d2]*tcm[d2][d];
4254 /* Put the charge group in the triclinic unit-cell */
4255 if (pos_d >= cell_x1[d])
4257 if (pos_d >= limit1[d])
4259 cg_move_error(fplog, dd, step, cg, d, 1,
4260 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4261 cg_cm[cg], cm_new, pos_d);
4264 if (dd->ci[d] == dd->nc[d] - 1)
4266 rvec_dec(cm_new, state->box[d]);
4269 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4270 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4272 for (k = k0; (k < k1); k++)
4274 rvec_dec(state->x[k], state->box[d]);
4277 rotate_state_atom(state, k);
4282 else if (pos_d < cell_x0[d])
4284 if (pos_d < limit0[d])
4286 cg_move_error(fplog, dd, step, cg, d, -1,
4287 cg_cm != as_rvec_array(state->x.data()), limitd[d],
4288 cg_cm[cg], cm_new, pos_d);
4293 rvec_inc(cm_new, state->box[d]);
4296 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4297 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4299 for (k = k0; (k < k1); k++)
4301 rvec_inc(state->x[k], state->box[d]);
4304 rotate_state_atom(state, k);
4310 else if (d < npbcdim)
4312 /* Put the charge group in the rectangular unit-cell */
4313 while (cm_new[d] >= state->box[d][d])
4315 rvec_dec(cm_new, state->box[d]);
4316 for (k = k0; (k < k1); k++)
4318 rvec_dec(state->x[k], state->box[d]);
4321 while (cm_new[d] < 0)
4323 rvec_inc(cm_new, state->box[d]);
4324 for (k = k0; (k < k1); k++)
4326 rvec_inc(state->x[k], state->box[d]);
4332 copy_rvec(cm_new, cg_cm[cg]);
4334 /* Determine where this cg should go */
4337 for (d = 0; d < dd->ndim; d++)
4342 flag |= DD_FLAG_FW(d);
4348 else if (dev[dim] == -1)
4350 flag |= DD_FLAG_BW(d);
4353 if (dd->nc[dim] > 2)
4364 /* Temporarily store the flag in move */
4365 move[cg] = mc + flag;
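/* A note on the encoding above: mc selects the communication direction
 * (forward or backward along the first DD dimension the cg crossed;
 * with only two cells both directions reach the same neighbor), while
 * the DD_FLAG_FW/DD_FLAG_BW bits record the crossing direction for each
 * dimension, so receiving ranks can route the cg onward.
 */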
4369 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4370 gmx_domdec_t *dd, ivec tric_dir,
4371 t_state *state, PaddedRVecVector *f,
4380 int ncg[DIM*2], nat[DIM*2];
4381 int c, i, cg, k, d, dim, dim2, dir, d2, d3;
4382 int mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4383 int sbuf[2], rbuf[2];
4384 int home_pos_cg, home_pos_at, buf_pos;
4386 gmx_bool bV = FALSE, bCGP = FALSE;
4389 rvec *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
4391 cginfo_mb_t *cginfo_mb;
4392 gmx_domdec_comm_t *comm;
4394 int nthread, thread;
4398 check_screw_box(state->box);
4402 if (fr->cutoff_scheme == ecutsGROUP)
4407 for (i = 0; i < estNR; i++)
4413 case estX: /* Always present */ break;
4414 case estV: bV = (state->flags & (1<<i)); break;
4415 case est_SDX_NOTSUPPORTED: break;
4416 case estCGP: bCGP = (state->flags & (1<<i)); break;
4419 case estDISRE_INITF:
4420 case estDISRE_RM3TAV:
4421 case estORIRE_INITF:
4423 /* No processing required */
4426 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4431 if (dd->ncg_tot > comm->nalloc_int)
4433 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4434 srenew(comm->buf_int, comm->nalloc_int);
4436 move = comm->buf_int;
4438 /* Clear the count */
4439 for (c = 0; c < dd->ndim*2; c++)
4445 npbcdim = dd->npbcdim;
4447 for (d = 0; (d < DIM); d++)
4449 limitd[d] = dd->comm->cellsize_min[d];
4450 if (d >= npbcdim && dd->ci[d] == 0)
4452 cell_x0[d] = -GMX_FLOAT_MAX;
4456 cell_x0[d] = comm->cell_x0[d];
4458 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4460 cell_x1[d] = GMX_FLOAT_MAX;
4464 cell_x1[d] = comm->cell_x1[d];
4468 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4469 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4473 /* We check after communication if a charge group moved
4474 * more than one cell. Set the pre-comm check limit to float_max.
4476 limit0[d] = -GMX_FLOAT_MAX;
4477 limit1[d] = GMX_FLOAT_MAX;
4481 make_tric_corr_matrix(npbcdim, state->box, tcm);
4483 cgindex = dd->cgindex;
4485 nthread = gmx_omp_nthreads_get(emntDomdec);
4487 /* Compute the center of geometry for all home charge groups
4488 * and put them in the box and determine where they should go.
4490 #pragma omp parallel for num_threads(nthread) schedule(static)
4491 for (thread = 0; thread < nthread; thread++)
4495 calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4496 cell_x0, cell_x1, limitd, limit0, limit1,
4498 ( thread *dd->ncg_home)/nthread,
4499 ((thread+1)*dd->ncg_home)/nthread,
4500 fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
4503 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4506 for (cg = 0; cg < dd->ncg_home; cg++)
4511 flag = mc & ~DD_FLAG_NRCG;
4512 mc = mc & DD_FLAG_NRCG;
4515 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4517 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4518 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4520 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4521 /* We store the cg size in the lower 16 bits
4522 * and the place where the charge group should go
4523 * in the next 6 bits. This saves some communication volume.
4525 nrcg = cgindex[cg+1] - cgindex[cg];
4526 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4532 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4533 inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4536 for (i = 0; i < dd->ndim*2; i++)
4538 *ncg_moved += ncg[i];
4551 /* Make sure the communication buffers are large enough */
4552 for (mc = 0; mc < dd->ndim*2; mc++)
4554 nvr = ncg[mc] + nat[mc]*nvec;
4555 if (nvr > comm->cgcm_state_nalloc[mc])
4557 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4558 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4562 switch (fr->cutoff_scheme)
4565 /* Recalculating cg_cm might be cheaper than communicating,
4566 * but that could give rise to rounding issues.
4569 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4570 nvec, cg_cm, comm, bCompact);
4573 /* Without charge groups we send the moved atom coordinates
4574 * over twice. This is so the code below can be used without
4575 * many conditionals both with and without charge groups.
4578 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4579 nvec, as_rvec_array(state->x.data()), comm, FALSE);
4582 home_pos_cg -= *ncg_moved;
4586 gmx_incons("unimplemented");
4592 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4593 nvec, vec++, as_rvec_array(state->x.data()),
4597 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4598 nvec, vec++, as_rvec_array(state->v.data()),
4603 compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4604 nvec, vec++, as_rvec_array(state->cg_p.data()),
4610 compact_ind(dd->ncg_home, move,
4611 dd->index_gl, dd->cgindex, dd->gatindex,
4612 dd->ga2la, comm->bLocalCG,
4617 if (fr->cutoff_scheme == ecutsVERLET)
4619 moved = get_moved(comm, dd->ncg_home);
4621 for (k = 0; k < dd->ncg_home; k++)
4628 moved = fr->ns->grid->cell_index;
4631 clear_and_mark_ind(dd->ncg_home, move,
4632 dd->index_gl, dd->cgindex, dd->gatindex,
4633 dd->ga2la, comm->bLocalCG,
4637 cginfo_mb = fr->cginfo_mb;
4639 *ncg_stay_home = home_pos_cg;
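/* Now loop over the DD dimensions and communicate the moved charge
 * groups: for each dimension send forward and, with more than two
 * cells, also backward, receive the incoming groups, and either store
 * them as new home groups or queue them for the next dimension.
 */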
4640 for (d = 0; d < dd->ndim; d++)
4645 for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4648 /* Communicate the cg and atom counts */
4653 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4654 d, dir, sbuf[0], sbuf[1]);
4656 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4658 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4660 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4661 srenew(comm->buf_int, comm->nalloc_int);
4664 /* Communicate the charge group indices, sizes and flags */
4665 dd_sendrecv_int(dd, d, dir,
4666 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4667 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4669 nvs = ncg[cdd] + nat[cdd]*nvec;
4670 i = rbuf[0] + rbuf[1] *nvec;
4671 vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4673 /* Communicate cgcm and state */
4674 dd_sendrecv_rvec(dd, d, dir,
4675 comm->cgcm_state[cdd], nvs,
4676 comm->vbuf.v+nvr, i);
4677 ncg_recv += rbuf[0];
4681 dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
4682 if (fr->cutoff_scheme == ecutsGROUP)
4684 /* Here we resize to more than necessary and shrink later */
4685 dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
4688 /* Process the received charge groups */
4690 for (cg = 0; cg < ncg_recv; cg++)
4692 flag = comm->buf_int[cg*DD_CGIBS+1];
4694 if (dim >= npbcdim && dd->nc[dim] > 2)
4696 /* No pbc in this dim and more than one domain boundary.
4697 * We do a separate check that the charge group did not move too far.
4699 if (((flag & DD_FLAG_FW(d)) &&
4700 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4701 ((flag & DD_FLAG_BW(d)) &&
4702 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4704 cg_move_error(fplog, dd, step, cg, dim,
4705 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4706 fr->cutoff_scheme == ecutsGROUP, 0,
4707 comm->vbuf.v[buf_pos],
4708 comm->vbuf.v[buf_pos],
4709 comm->vbuf.v[buf_pos][dim]);
4716 /* Check which direction this cg should go */
4717 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4719 if (dlbIsOn(dd->comm))
4721 /* The cell boundaries for dimension d2 are not equal
4722 * for each cell row of the lower dimension(s),
4723 * therefore we might need to redetermine where
4724 * this cg should go.
4727 /* If this cg crosses the box boundary in dimension d2
4728 * we can use the communicated flag, so we do not
4729 * have to worry about pbc.
4731 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4732 (flag & DD_FLAG_FW(d2))) ||
4733 (dd->ci[dim2] == 0 &&
4734 (flag & DD_FLAG_BW(d2)))))
4736 /* Clear the two flags for this dimension */
4737 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4738 /* Determine the location of this cg
4739 * in lattice coordinates
4741 pos_d = comm->vbuf.v[buf_pos][dim2];
4744 for (d3 = dim2+1; d3 < DIM; d3++)
4747 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4750 /* Check whether we are at the box edge.
4751  * pbc is only handled in the first step above,
4752  * but due to different rounding this check could
4753  * cross pbc where the first step did not.
4755 if (pos_d >= cell_x1[dim2] &&
4756 dd->ci[dim2] != dd->nc[dim2]-1)
4758 flag |= DD_FLAG_FW(d2);
4760 else if (pos_d < cell_x0[dim2] &&
4763 flag |= DD_FLAG_BW(d2);
4765 comm->buf_int[cg*DD_CGIBS+1] = flag;
4768 /* Set to which neighboring cell this cg should go */
4769 if (flag & DD_FLAG_FW(d2))
4773 else if (flag & DD_FLAG_BW(d2))
4775 if (dd->nc[dd->dim[d2]] > 2)
4787 nrcg = flag & DD_FLAG_NRCG;
4790 if (home_pos_cg+1 > dd->cg_nalloc)
4792 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4793 srenew(dd->index_gl, dd->cg_nalloc);
4794 srenew(dd->cgindex, dd->cg_nalloc+1);
4796 /* Set the global charge group index and size */
4797 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4798 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4799 /* Copy the state from the buffer */
4800 if (fr->cutoff_scheme == ecutsGROUP)
4803 copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4807 /* Set the cginfo */
4808 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4809 dd->index_gl[home_pos_cg]);
4812 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4815 for (i = 0; i < nrcg; i++)
4817 copy_rvec(comm->vbuf.v[buf_pos++],
4818 state->x[home_pos_at+i]);
4822 for (i = 0; i < nrcg; i++)
4824 copy_rvec(comm->vbuf.v[buf_pos++],
4825 state->v[home_pos_at+i]);
4830 for (i = 0; i < nrcg; i++)
4832 copy_rvec(comm->vbuf.v[buf_pos++],
4833 state->cg_p[home_pos_at+i]);
4837 home_pos_at += nrcg;
4841 /* Reallocate the buffers if necessary */
4842 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4844 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4845 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4847 nvr = ncg[mc] + nat[mc]*nvec;
4848 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4850 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4851 srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4853 /* Copy from the receive to the send buffers */
4854 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4855 comm->buf_int + cg*DD_CGIBS,
4856 DD_CGIBS*sizeof(int));
4857 memcpy(comm->cgcm_state[mc][nvr],
4858 comm->vbuf.v[buf_pos],
4859 (1+nrcg*nvec)*sizeof(rvec));
4860 buf_pos += 1 + nrcg*nvec;
4867 /* With sorting (!bCompact) the indices are now only partially up to date
4868 * and ncg_home and nat_home are not the real count, since there are
4869 * "holes" in the arrays for the charge groups that moved to neighbors.
4871 if (fr->cutoff_scheme == ecutsVERLET)
4873 moved = get_moved(comm, home_pos_cg);
4875 for (i = dd->ncg_home; i < home_pos_cg; i++)
4880 dd->ncg_home = home_pos_cg;
4881 dd->nat_home = home_pos_at;
4883 if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
4885 /* We overallocated before, we need to set the right size here */
4886 dd_resize_state(state, f, dd->nat_home);
4892 "Finished repartitioning: cgs moved out %d, new home %d\n",
4893 *ncg_moved, dd->ncg_home-*ncg_moved);
4898 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
4900 /* Note that the cycles value can be incorrect, either 0 or some
4901 * extremely large value, when our thread migrated to another core
4902 * with an unsynchronized cycle counter. If this happens less often
4903 * that once per nstlist steps, this will not cause issues, since
4904 * we later subtract the maximum value from the sum over nstlist steps.
4905 * A zero count will slightly lower the total, but that's a small effect.
4906 * Note that the main purpose of the subtraction of the maximum value
4907 * is to avoid throwing off the load balancing when stalls occur due
4908 * to e.g. system activity or network congestion.
4910 dd->comm->cycl[ddCycl] += cycles;
4911 dd->comm->cycl_n[ddCycl]++;
4912 if (cycles > dd->comm->cycl_max[ddCycl])
4914 dd->comm->cycl_max[ddCycl] = cycles;
4918 static double force_flop_count(t_nrnb *nrnb)
4925 for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
4927 /* To get closer to the real timings, we halve the count
4928  * for the normal loops and halve it again for water loops.
4931 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4933 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4937 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4940 for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
4943 if (strstr(name, "W3") != nullptr || strstr(name, "W4") != nullptr)
4945 sum += nrnb->n[i]*cost_nrnb(i);
4948 for (i = eNR_BONDS; i <= eNR_WALLS; i++)
4950 sum += nrnb->n[i]*cost_nrnb(i);
4956 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
4958 if (dd->comm->eFlop)
4960 dd->comm->flop -= force_flop_count(nrnb);
4963 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
4965 if (dd->comm->eFlop)
4967 dd->comm->flop += force_flop_count(nrnb);
4972 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4976 for (i = 0; i < ddCyclNr; i++)
4978 dd->comm->cycl[i] = 0;
4979 dd->comm->cycl_n[i] = 0;
4980 dd->comm->cycl_max[i] = 0;
4983 dd->comm->flop_n = 0;
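/* get_load_distribution below collects the cycle-count based loads.
 * It walks the DD dimensions from last to first: along each dimension
 * the row root gathers the loads of its row (the MPI_Gather below),
 * reduces them, and the result becomes the contribution passed up to
 * the next-lower dimension.
 */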
4986 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
4988 gmx_domdec_comm_t *comm;
4989 domdec_load_t *load;
4990 domdec_root_t *root = nullptr;
4992 float cell_frac = 0, sbuf[DD_NLOAD_MAX];
4997 fprintf(debug, "get_load_distribution start\n");
5000 wallcycle_start(wcycle, ewcDDCOMMLOAD);
5004 bSepPME = (dd->pme_nodeid >= 0);
5006 if (dd->ndim == 0 && bSepPME)
5008 /* Without decomposition, but with PME nodes, we need the load */
5009 comm->load[0].mdf = comm->cycl[ddCyclPPduringPME];
5010 comm->load[0].pme = comm->cycl[ddCyclPME];
5013 for (d = dd->ndim-1; d >= 0; d--)
5016 /* Check if we participate in the communication in this dimension */
5017 if (d == dd->ndim-1 ||
5018 (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5020 load = &comm->load[d];
5021 if (dlbIsOn(dd->comm))
5023 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5026 if (d == dd->ndim-1)
5028 sbuf[pos++] = dd_force_load(comm);
5029 sbuf[pos++] = sbuf[0];
5030 if (dlbIsOn(dd->comm))
5032 sbuf[pos++] = sbuf[0];
5033 sbuf[pos++] = cell_frac;
5036 sbuf[pos++] = comm->cell_f_max0[d];
5037 sbuf[pos++] = comm->cell_f_min1[d];
5042 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5043 sbuf[pos++] = comm->cycl[ddCyclPME];
5048 sbuf[pos++] = comm->load[d+1].sum;
5049 sbuf[pos++] = comm->load[d+1].max;
5050 if (dlbIsOn(dd->comm))
5052 sbuf[pos++] = comm->load[d+1].sum_m;
5053 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5054 sbuf[pos++] = comm->load[d+1].flags;
5057 sbuf[pos++] = comm->cell_f_max0[d];
5058 sbuf[pos++] = comm->cell_f_min1[d];
5063 sbuf[pos++] = comm->load[d+1].mdf;
5064 sbuf[pos++] = comm->load[d+1].pme;
5068 /* Communicate a row in DD direction d.
5069 * The communicators are set up such that the root always has rank 0.
5072 MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5073 load->load, load->nload*sizeof(float), MPI_BYTE,
5074 0, comm->mpi_comm_load[d]);
5076 if (dd->ci[dim] == dd->master_ci[dim])
5078 /* We are the root, process this row */
5081 root = comm->root[d];
5091 for (i = 0; i < dd->nc[dim]; i++)
5093 load->sum += load->load[pos++];
5094 load->max = std::max(load->max, load->load[pos]);
5096 if (dlbIsOn(dd->comm))
5100 /* This direction could not be load balanced properly,
5101 * therefore we need to use the maximum instead of the average load.
5103 load->sum_m = std::max(load->sum_m, load->load[pos]);
5107 load->sum_m += load->load[pos];
5110 load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5114 load->flags = (int)(load->load[pos++] + 0.5);
5118 root->cell_f_max0[i] = load->load[pos++];
5119 root->cell_f_min1[i] = load->load[pos++];
5124 load->mdf = std::max(load->mdf, load->load[pos]);
5126 load->pme = std::max(load->pme, load->load[pos]);
5130 if (dlbIsOn(comm) && root->bLimited)
5132 load->sum_m *= dd->nc[dim];
5133 load->flags |= (1<<d);
5141 comm->nload += dd_load_count(comm);
5142 comm->load_step += comm->cycl[ddCyclStep];
5143 comm->load_sum += comm->load[0].sum;
5144 comm->load_max += comm->load[0].max;
5147 for (d = 0; d < dd->ndim; d++)
5149 if (comm->load[0].flags & (1<<d))
5151 comm->load_lim[d]++;
5157 comm->load_mdf += comm->load[0].mdf;
5158 comm->load_pme += comm->load[0].pme;
5162 wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5166 fprintf(debug, "get_load_distribution finished\n");
5170 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5172 /* Return the relative performance loss on the total run time
5173 * due to the force calculation load imbalance.
5175 if (dd->comm->nload > 0 && dd->comm->load_step > 0)
5178 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5179 (dd->comm->load_step*dd->nnodes);
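/* The expression above equals (t_max - t_avg)/t_step: the average
 * fraction of the total step time that ranks spend waiting for the
 * most loaded rank.
 */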
5187 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5190 int npp, npme, nnodes, d, limp;
5191 float imbal, pme_f_ratio, lossf = 0, lossp = 0;
5193 gmx_domdec_comm_t *comm;
5196 if (DDMASTER(dd) && comm->nload > 0)
5199 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5200 nnodes = npp + npme;
5201 if (dd->nnodes > 1 && comm->load_sum > 0)
5203 imbal = comm->load_max*npp/comm->load_sum - 1;
5204 lossf = dd_force_imb_perf_loss(dd);
5205 sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5206 fprintf(fplog, "%s", buf);
5207 fprintf(stderr, "\n");
5208 fprintf(stderr, "%s", buf);
5209 sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5210 fprintf(fplog, "%s", buf);
5211 fprintf(stderr, "%s", buf);
5216 sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5217 for (d = 0; d < dd->ndim; d++)
5219 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5220 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5226 sprintf(buf+strlen(buf), "\n");
5227 fprintf(fplog, "%s", buf);
5228 fprintf(stderr, "%s", buf);
5230 if (npme > 0 && comm->load_mdf > 0 && comm->load_step > 0)
5232 pme_f_ratio = comm->load_pme/comm->load_mdf;
5233 lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
5236 lossp *= (float)npme/(float)nnodes;
5240 lossp *= (float)npp/(float)nnodes;
5242 sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5243 fprintf(fplog, "%s", buf);
5244 fprintf(stderr, "%s", buf);
5245 sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5246 fprintf(fplog, "%s", buf);
5247 fprintf(stderr, "%s", buf);
5249 fprintf(fplog, "\n");
5250 fprintf(stderr, "\n");
5252 if (lossf >= DD_PERF_LOSS_WARN)
5255 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5256 " in the domain decomposition.\n", lossf*100);
5259 sprintf(buf+strlen(buf), " You might want to use dynamic load balancing (option -dlb).\n");
5263 sprintf(buf+strlen(buf), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5265 fprintf(fplog, "%s\n", buf);
5266 fprintf(stderr, "%s\n", buf);
5268 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5271 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5272 " had %s work to do than the PP ranks.\n"
5273 " You might want to %s the number of PME ranks\n"
5274 " or %s the cut-off and the grid spacing.\n",
5276 (lossp < 0) ? "less" : "more",
5277 (lossp < 0) ? "decrease" : "increase",
5278 (lossp < 0) ? "decrease" : "increase");
5279 fprintf(fplog, "%s\n", buf);
5280 fprintf(stderr, "%s\n", buf);
5285 static float dd_vol_min(gmx_domdec_t *dd)
5287 return dd->comm->load[0].cvol_min*dd->nnodes;
5290 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5292 return dd->comm->load[0].flags;
5295 static float dd_f_imbal(gmx_domdec_t *dd)
5297 if (dd->comm->load[0].sum > 0)
5299 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1.0f;
5303 /* Something is wrong in the cycle counting, report no load imbalance */
5308 float dd_pme_f_ratio(gmx_domdec_t *dd)
5310 /* Should only be called on the DD master rank */
5311 assert(DDMASTER(dd));
5313 if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5315 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5323 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5328 flags = dd_load_flags(dd);
5332 "DD load balancing is limited by minimum cell size in dimension");
5333 for (d = 0; d < dd->ndim; d++)
5337 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5340 fprintf(fplog, "\n");
5342 fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
5343 if (dlbIsOn(dd->comm))
5345 fprintf(fplog, " vol min/aver %5.3f%c",
5346 dd_vol_min(dd), flags ? '!' : ' ');
5350 fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5352 if (dd->comm->cycl_n[ddCyclPME])
5354 fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5356 fprintf(fplog, "\n\n");
5359 static void dd_print_load_verbose(gmx_domdec_t *dd)
5361 if (dlbIsOn(dd->comm))
5363 fprintf(stderr, "vol %4.2f%c ",
5364 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5368 fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5370 if (dd->comm->cycl_n[ddCyclPME])
5372 fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
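/* make_load_communicator below creates, per DD dimension, a communicator
 * containing all ranks of one cell row along that dimension, using
 * MPI_Comm_split with the row membership as the color. The ranks are
 * ordered by dd->rank, which is set up so that the row root gets rank 0,
 * as the load gathering in get_load_distribution assumes.
 */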
5377 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5382 domdec_root_t *root;
5383 gmx_bool bPartOfGroup = FALSE;
5385 dim = dd->dim[dim_ind];
5386 copy_ivec(loc, loc_c);
5387 for (i = 0; i < dd->nc[dim]; i++)
5390 rank = dd_index(dd->nc, loc_c);
5391 if (rank == dd->rank)
5393 /* This process is part of the group */
5394 bPartOfGroup = TRUE;
5397 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5401 dd->comm->mpi_comm_load[dim_ind] = c_row;
5402 if (dd->comm->dlbState != edlbsOffForever)
5404 if (dd->ci[dim] == dd->master_ci[dim])
5406 /* This is the root process of this row */
5407 snew(dd->comm->root[dim_ind], 1);
5408 root = dd->comm->root[dim_ind];
5409 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5410 snew(root->old_cell_f, dd->nc[dim]+1);
5411 snew(root->bCellMin, dd->nc[dim]);
5414 snew(root->cell_f_max0, dd->nc[dim]);
5415 snew(root->cell_f_min1, dd->nc[dim]);
5416 snew(root->bound_min, dd->nc[dim]);
5417 snew(root->bound_max, dd->nc[dim]);
5419 snew(root->buf_ncd, dd->nc[dim]);
5423 /* This is not a root process, we only need to receive cell_f */
5424 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5427 if (dd->ci[dim] == dd->master_ci[dim])
5429 snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
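/* Ranks that share a GPU need to average their GPU wait times for load
 * balancing (see the gpu_wait averaging further up, which divides by
 * nrank_gpu_shared). dd_setup_dlb_resource_sharing below identifies them
 * by splitting the PP communicator first by a physical-node hash and
 * then by GPU id within each node.
 */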
5435 void dd_setup_dlb_resource_sharing(t_commrec gmx_unused *cr,
5436 const gmx_hw_info_t gmx_unused *hwinfo,
5437 const gmx_hw_opt_t gmx_unused *hw_opt)
5440 int physicalnode_id_hash;
5443 MPI_Comm mpi_comm_pp_physicalnode;
5445 if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
5447 /* Only PP nodes (currently) use GPUs.
5448 * If we don't have GPUs, there are no resources to share.
5453 physicalnode_id_hash = gmx_physicalnode_id_hash();
5455 gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5461 fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5462 fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5463 dd->rank, physicalnode_id_hash, gpu_id);
5465 /* Split the PP communicator over the physical nodes */
5466 /* TODO: See if we should store this (before), as it is also used
5467  * for the nodecomm summation.
5469 MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5470 &mpi_comm_pp_physicalnode);
5471 MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5472 &dd->comm->mpi_comm_gpu_shared);
5473 MPI_Comm_free(&mpi_comm_pp_physicalnode);
5474 MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5478 fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5481 /* Note that some ranks could share a GPU, while others don't */
5483 if (dd->comm->nrank_gpu_shared == 1)
5485 MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5490 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5493 int dim0, dim1, i, j;
5498 fprintf(debug, "Making load communicators\n");
5501 snew(dd->comm->load, std::max(dd->ndim, 1));
5502 snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));
5510 make_load_communicator(dd, 0, loc);
5514 for (i = 0; i < dd->nc[dim0]; i++)
5517 make_load_communicator(dd, 1, loc);
5523 for (i = 0; i < dd->nc[dim0]; i++)
5527 for (j = 0; j < dd->nc[dim1]; j++)
5530 make_load_communicator(dd, 2, loc);
5537 fprintf(debug, "Finished making load communicators\n");
5542 /*! \brief Sets up the relation between neighboring domains and zones */
5543 static void setup_neighbor_relations(gmx_domdec_t *dd)
5545 int d, dim, i, j, m;
5547 gmx_domdec_zones_t *zones;
5548 gmx_domdec_ns_ranges_t *izone;
5550 for (d = 0; d < dd->ndim; d++)
5553 copy_ivec(dd->ci, tmp);
5554 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5555 dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5556 copy_ivec(dd->ci, tmp);
5557 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5558 dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5561 fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5564 dd->neighbor[d][1]);
5568 int nzone = (1 << dd->ndim);
5569 int nizone = (1 << std::max(dd->ndim - 1, 0));
5570 assert(nizone >= 1 && nizone <= DD_MAXIZONE);
5572 zones = &dd->comm->zones;
5574 for (i = 0; i < nzone; i++)
5577 clear_ivec(zones->shift[i]);
5578 for (d = 0; d < dd->ndim; d++)
5580 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5585 for (i = 0; i < nzone; i++)
5587 for (d = 0; d < DIM; d++)
5589 s[d] = dd->ci[d] - zones->shift[i][d];
5594 else if (s[d] >= dd->nc[d])
5600 zones->nizone = nizone;
5601 for (i = 0; i < zones->nizone; i++)
5603 assert(ddNonbondedZonePairRanges[i][0] == i);
5605 izone = &zones->izone[i];
5606 /* dd_zp3 is for 3D decomposition, for fewer dimensions use only
5607 * j-zones up to nzone.
5609 izone->j0 = std::min(ddNonbondedZonePairRanges[i][1], nzone);
5610 izone->j1 = std::min(ddNonbondedZonePairRanges[i][2], nzone);
5611 for (dim = 0; dim < DIM; dim++)
5613 if (dd->nc[dim] == 1)
5615 /* All shifts should be allowed */
5616 izone->shift0[dim] = -1;
5617 izone->shift1[dim] = 1;
5621 /* Determine the min/max j-zone shift wrt the i-zone */
5622 izone->shift0[dim] = 1;
5623 izone->shift1[dim] = -1;
5624 for (j = izone->j0; j < izone->j1; j++)
5626 int shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5627 if (shift_diff < izone->shift0[dim])
5629 izone->shift0[dim] = shift_diff;
5631 if (shift_diff > izone->shift1[dim])
5633 izone->shift1[dim] = shift_diff;
5640 if (dd->comm->dlbState != edlbsOffForever)
5642 snew(dd->comm->root, dd->ndim);
5645 if (dd->comm->bRecordLoad)
5647 make_load_communicators(dd);
5651 static void make_pp_communicator(FILE *fplog,
5653 t_commrec gmx_unused *cr,
5654 int gmx_unused reorder)
5657 gmx_domdec_comm_t *comm;
5664 if (comm->bCartesianPP)
5666 /* Set up cartesian communication for the particle-particle part */
5669 fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5670 dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5673 for (int i = 0; i < DIM; i++)
5677 MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5679 /* We overwrite the old communicator with the new cartesian one */
5680 cr->mpi_comm_mygroup = comm_cart;
5683 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5684 MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5686 if (comm->bCartesianPP_PME)
5688 /* Since we want to use the original cartesian setup for sim,
5689 * and not the one after split, we need to make an index.
5691 snew(comm->ddindex2ddnodeid, dd->nnodes);
5692 comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5693 gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5694 /* Get the rank of the DD master,
5695 * above we made sure that the master node is a PP node.
5705 MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5707 else if (comm->bCartesianPP)
5709 if (cr->npmenodes == 0)
5711 /* The PP communicator is also
5712 * the communicator for this simulation
5714 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5716 cr->nodeid = dd->rank;
5718 MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5720 /* We need to make an index to go from the coordinates
5721 * to the nodeid of this simulation.
5723 snew(comm->ddindex2simnodeid, dd->nnodes);
5724 snew(buf, dd->nnodes);
5725 if (cr->duty & DUTY_PP)
5727 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5729 /* Communicate the ddindex to simulation nodeid index */
5730 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5731 cr->mpi_comm_mysim);
5734 /* Determine the master coordinates and rank.
5735 * The DD master should be the same node as the master of this sim.
5737 for (int i = 0; i < dd->nnodes; i++)
5739 if (comm->ddindex2simnodeid[i] == 0)
5741 ddindex2xyz(dd->nc, i, dd->master_ci);
5742 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5747 fprintf(debug, "The master rank is %d\n", dd->masterrank);
5752 /* No Cartesian communicators */
5753 /* We use the rank in dd->comm->all as DD index */
5754 ddindex2xyz(dd->nc, dd->rank, dd->ci);
5755 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5757 clear_ivec(dd->master_ci);
5764 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5765 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5770 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5771 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5775 static void receive_ddindex2simnodeid(gmx_domdec_t *dd,
5779 gmx_domdec_comm_t *comm = dd->comm;
5781 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5784 snew(comm->ddindex2simnodeid, dd->nnodes);
5785 snew(buf, dd->nnodes);
5786 if (cr->duty & DUTY_PP)
5788 buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5790 /* Communicate the ddindex to simulation nodeid index */
5791 MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5792 cr->mpi_comm_mysim);
5796 GMX_UNUSED_VALUE(dd);
5797 GMX_UNUSED_VALUE(cr);
5801 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5802 int ncg, int natoms)
5804 gmx_domdec_master_t *ma;
5809 snew(ma->ncg, dd->nnodes);
5810 snew(ma->index, dd->nnodes+1);
5812 snew(ma->nat, dd->nnodes);
5813 snew(ma->ibuf, dd->nnodes*2);
5814 snew(ma->cell_x, DIM);
5815 for (i = 0; i < DIM; i++)
5817 snew(ma->cell_x[i], dd->nc[i]+1);
5820 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5826 snew(ma->vbuf, natoms);
5832 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
5833 int gmx_unused dd_rank_order,
5834 int gmx_unused reorder)
5836 gmx_domdec_comm_t *comm;
5845 if (comm->bCartesianPP)
5847 for (i = 1; i < DIM; i++)
5849 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5851 if (bDiv[YY] || bDiv[ZZ])
5853 comm->bCartesianPP_PME = TRUE;
5854 /* If we have 2D PME decomposition, which is always in x+y,
5855 * we stack the PME only nodes in z.
5856 * Otherwise we choose the direction that provides the thinnest slab
5857 * of PME only nodes as this will have the least effect
5858 * on the PP communication.
5859 * But for the PME communication the opposite might be better.
5861 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5863 dd->nc[YY] > dd->nc[ZZ]))
5865 comm->cartpmedim = ZZ;
5869 comm->cartpmedim = YY;
5871 comm->ntot[comm->cartpmedim]
5872 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5876 fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
5878 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5883 if (comm->bCartesianPP_PME)
5890 fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
5893 for (i = 0; i < DIM; i++)
5897 MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
5899 MPI_Comm_rank(comm_cart, &rank);
5900 if (MASTER(cr) && rank != 0)
5902 gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5905 /* With this assigment we loose the link to the original communicator
5906 * which will usually be MPI_COMM_WORLD, unless have multisim.
5908 cr->mpi_comm_mysim = comm_cart;
5909 cr->sim_nodeid = rank;
5911 MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
5915 fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
5916 cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5919 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5923 if (cr->npmenodes == 0 ||
5924 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5926 cr->duty = DUTY_PME;
5929 /* Split the sim communicator into PP and PME only nodes */
5930 MPI_Comm_split(cr->mpi_comm_mysim,
5932 dd_index(comm->ntot, dd->ci),
5933 &cr->mpi_comm_mygroup);
5937 switch (dd_rank_order)
5939 case ddrankorderPP_PME:
5942 fprintf(fplog, "Order of the ranks: PP first, PME last\n");
5945 case ddrankorderINTERLEAVE:
5946 /* Interleave the PP-only and PME-only ranks */
5949 fprintf(fplog, "Interleaving PP and PME ranks\n");
5951 comm->pmenodes = dd_interleaved_pme_ranks(dd);
5953 case ddrankorderCARTESIAN:
5956 gmx_fatal(FARGS, "Unknown dd_rank_order=%d", dd_rank_order);
5959 if (dd_simnode2pmenode(dd, cr, cr->sim_nodeid) == -1)
5961 cr->duty = DUTY_PME;
5968 /* Split the sim communicator into PP and PME only nodes */
5969 MPI_Comm_split(cr->mpi_comm_mysim,
5972 &cr->mpi_comm_mygroup);
5973 MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
5979 fprintf(fplog, "This rank does only %s work.\n\n",
5980 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5984 /*! \brief Generates the MPI communicators for domain decomposition */
5985 static void make_dd_communicators(FILE *fplog, t_commrec *cr,
5986 gmx_domdec_t *dd, int dd_rank_order)
5988 gmx_domdec_comm_t *comm;
5993 copy_ivec(dd->nc, comm->ntot);
5995 comm->bCartesianPP = (dd_rank_order == ddrankorderCARTESIAN);
5996 comm->bCartesianPP_PME = FALSE;
5998 /* Reorder the nodes by default. This might change the MPI ranks.
5999 * Real reordering is only supported on very few architectures,
6000 * Blue Gene is one of them.
6002 CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
6004 if (cr->npmenodes > 0)
6006 /* Split the communicator into a PP and PME part */
6007 split_communicator(fplog, cr, dd, dd_rank_order, CartReorder);
6008 if (comm->bCartesianPP_PME)
6010 /* We (possibly) reordered the nodes in split_communicator,
6011 * so it is no longer required in make_pp_communicator.
6013 CartReorder = FALSE;
6018 /* All nodes do PP and PME */
6020 /* We do not require separate communicators */
6021 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6025 if (cr->duty & DUTY_PP)
6027 /* Copy or make a new PP communicator */
6028 make_pp_communicator(fplog, dd, cr, CartReorder);
6032 receive_ddindex2simnodeid(dd, cr);
6035 if (!(cr->duty & DUTY_PME))
6037 /* Set up the commnuication to our PME node */
6038 dd->pme_nodeid = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
6039 dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
6042 fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6043 dd->pme_nodeid, dd->pme_receive_vir_ener);
6048 dd->pme_nodeid = -1;
6053 dd->ma = init_gmx_domdec_master_t(dd,
6055 comm->cgs_gl.index[comm->cgs_gl.nr]);
6059 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6061 real *slb_frac, tot;
6066 if (nc > 1 && size_string != nullptr)
6070 fprintf(fplog, "Using static load balancing for the %s direction\n",
6075 for (i = 0; i < nc; i++)
6078 sscanf(size_string, "%20lf%n", &dbl, &n);
6081 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6090 fprintf(fplog, "Relative cell sizes:");
6092 for (i = 0; i < nc; i++)
6097 fprintf(fplog, " %5.3f", slb_frac[i]);
6102 fprintf(fplog, "\n");
6109 static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
6112 gmx_mtop_ilistloop_t iloop;
6116 iloop = gmx_mtop_ilistloop_init(mtop);
6117 while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6119 for (ftype = 0; ftype < F_NRE; ftype++)
6121 if ((interaction_function[ftype].flags & IF_BOND) &&
6124 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6132 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6138 val = getenv(env_var);
6141 if (sscanf(val, "%20d", &nst) <= 0)
6147 fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6155 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6159 fprintf(stderr, "\n%s\n", warn_string);
6163 fprintf(fplog, "\n%s\n", warn_string);
6167 static void check_dd_restrictions(t_commrec *cr, const gmx_domdec_t *dd,
6168 const t_inputrec *ir, FILE *fplog)
6170 if (ir->ePBC == epbcSCREW &&
6171 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6173 gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6176 if (ir->ns_type == ensSIMPLE)
6178 gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6181 if (ir->nstlist == 0)
6183 gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6186 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6188 dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6192 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6197 r = ddbox->box_size[XX];
6198 for (di = 0; di < dd->ndim; di++)
6201 /* Check using the initial average cell size */
6202 r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6208 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6209 const char *dlb_opt, gmx_bool bRecordLoad,
6210 unsigned long Flags, const t_inputrec *ir)
6217 case 'a': dlbState = edlbsOffCanTurnOn; break;
6218 case 'n': dlbState = edlbsOffForever; break;
6219 case 'y': dlbState = edlbsOnForever; break;
6220 default: gmx_incons("Unknown dlb_opt");
6223 if (Flags & MD_RERUN)
6225 return edlbsOffForever;
6228 if (!EI_DYNAMICS(ir->eI))
6230 if (dlbState == edlbsOnForever)
6232 sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6233 dd_warning(cr, fplog, buf);
6236 return edlbsOffForever;
6241 dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
6242 return edlbsOffForever;
6245 if (Flags & MD_REPRODUCIBLE)
6249 case edlbsOffForever:
6251 case edlbsOffCanTurnOn:
6252 case edlbsOnCanTurnOff:
6253 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6254 dlbState = edlbsOffForever;
6256 case edlbsOnForever:
6257 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6260 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
6268 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6273 if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
6275 /* Decomposition order z,y,x */
6278 fprintf(fplog, "Using domain decomposition order z, y, x\n");
6280 for (dim = DIM-1; dim >= 0; dim--)
6282 if (dd->nc[dim] > 1)
6284 dd->dim[dd->ndim++] = dim;
6290 /* Decomposition order x,y,z */
6291 for (dim = 0; dim < DIM; dim++)
6293 if (dd->nc[dim] > 1)
6295 dd->dim[dd->ndim++] = dim;
6301 static gmx_domdec_comm_t *init_dd_comm()
6303 gmx_domdec_comm_t *comm;
6307 snew(comm->cggl_flag, DIM*2);
6308 snew(comm->cgcm_state, DIM*2);
6309 for (i = 0; i < DIM*2; i++)
6311 comm->cggl_flag_nalloc[i] = 0;
6312 comm->cgcm_state_nalloc[i] = 0;
6315 comm->nalloc_int = 0;
6316 comm->buf_int = nullptr;
6318 vec_rvec_init(&comm->vbuf);
6320 comm->n_load_have = 0;
6321 comm->n_load_collect = 0;
6323 for (i = 0; i < ddnatNR-ddnatZONE; i++)
6325 comm->sum_nat[i] = 0;
6329 comm->load_step = 0;
6332 clear_ivec(comm->load_lim);
6339 /*! \brief Set the cell size and interaction limits, as well as the DD grid */
6340 static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
6341 unsigned long Flags,
6342 ivec nc, int nPmeRanks,
6343 real comm_distance_min, real rconstr,
6344 const char *dlb_opt, real dlb_scale,
6345 const char *sizex, const char *sizey, const char *sizez,
6346 const gmx_mtop_t *mtop,
6347 const t_inputrec *ir,
6348 matrix box, const rvec *x,
6350 int *npme_x, int *npme_y)
6353 real r_bonded_limit = -1;
6354 const real tenPercentMargin = 1.1;
6355 gmx_domdec_comm_t *comm = dd->comm;
6357 snew(comm->cggl_flag, DIM*2);
6358 snew(comm->cgcm_state, DIM*2);
6360 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6361 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6363 dd->pme_recv_f_alloc = 0;
6364 dd->pme_recv_f_buf = nullptr;
6366 /* Initialize to GPU share count to 0, might change later */
6367 comm->nrank_gpu_shared = 0;
6369 comm->dlbState = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6370 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
6371 /* To consider turning DLB on after 2*nstlist steps we need to check
6372 * at partitioning count 3. Thus we need to increase the first count by 2.
6374 comm->ddPartioningCountFirstDlbOff += 2;
6378 fprintf(fplog, "Dynamic load balancing: %s\n",
6379 edlbs_names[comm->dlbState]);
6381 comm->bPMELoadBalDLBLimits = FALSE;
6383 /* Allocate the charge group/atom sorting struct */
6384 snew(comm->sort, 1);
6386 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6388 comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
6389 mtop->bIntermolecularInteractions);
6390 if (comm->bInterCGBondeds)
6392 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6396 comm->bInterCGMultiBody = FALSE;
6399 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6400 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6404 /* Set the cut-off to some very large value,
6405 * so we don't need if statements everywhere in the code.
6406 * We use sqrt, since the cut-off is squared in some places.
6408 comm->cutoff = GMX_CUTOFF_INF;
6412 comm->cutoff = ir->rlist;
6414 comm->cutoff_mbody = 0;
6416 comm->cellsize_limit = 0;
6417 comm->bBondComm = FALSE;
6419 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6420 * within nstlist steps. Since boundaries are allowed to displace by half
6421 * a cell size, DD cells should be at least the size of the list buffer.
6423 comm->cellsize_limit = std::max(comm->cellsize_limit,
6424 ir->rlist - std::max(ir->rvdw, ir->rcoulomb));
6426 if (comm->bInterCGBondeds)
6428 if (comm_distance_min > 0)
6430 comm->cutoff_mbody = comm_distance_min;
6431 if (Flags & MD_DDBONDCOMM)
6433 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6437 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6439 r_bonded_limit = comm->cutoff_mbody;
6441 else if (ir->bPeriodicMols)
6443 /* Can not easily determine the required cut-off */
6444 dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6445 comm->cutoff_mbody = comm->cutoff/2;
6446 r_bonded_limit = comm->cutoff_mbody;
6454 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6455 Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6457 gmx_bcast(sizeof(r_2b), &r_2b, cr);
6458 gmx_bcast(sizeof(r_mb), &r_mb, cr);
6460 /* We use an initial margin of 10% for the minimum cell size,
6461 * except when we are just below the non-bonded cut-off.
6463 if (Flags & MD_DDBONDCOMM)
6465 if (std::max(r_2b, r_mb) > comm->cutoff)
6467 r_bonded = std::max(r_2b, r_mb);
6468 r_bonded_limit = tenPercentMargin*r_bonded;
6469 comm->bBondComm = TRUE;
6474 r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6476 /* We determine cutoff_mbody later */
6480 /* No special bonded communication,
6481 * simply increase the DD cut-off.
6483 r_bonded_limit = tenPercentMargin*std::max(r_2b, r_mb);
6484 comm->cutoff_mbody = r_bonded_limit;
6485 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6491 "Minimum cell size due to bonded interactions: %.3f nm\n",
6494 comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6497 if (dd->bInterCGcons && rconstr <= 0)
6499 /* There is a cell size limit due to the constraints (P-LINCS) */
6500 rconstr = constr_r_max(fplog, mtop, ir);
6504 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6506 if (rconstr > comm->cellsize_limit)
6508 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6512 else if (rconstr > 0 && fplog)
6514 /* Here we do not check for dd->bInterCGcons,
6515 * because one can also set a cell size limit for virtual sites only
6516 * and at this point we don't know yet if there are intercg v-sites.
6519 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6522 comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6524 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6528 copy_ivec(nc, dd->nc);
6529 set_dd_dim(fplog, dd);
6530 set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6534 cr->npmenodes = nPmeRanks;
6538 /* When the DD grid is set explicitly and -npme is set to auto,
6539 * don't use PME ranks. We check later if the DD grid is
6540 * compatible with the total number of ranks.
6545 real acs = average_cellsize_min(dd, ddbox);
6546 if (acs < comm->cellsize_limit)
6550 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6552 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6553 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6554 acs, comm->cellsize_limit);
6559 set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, x, ddbox);
6561 /* We need to choose the optimal DD grid and possibly PME nodes */
6563 dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6565 comm->dlbState != edlbsOffForever, dlb_scale,
6566 comm->cellsize_limit, comm->cutoff,
6567 comm->bInterCGBondeds);
6569 if (dd->nc[XX] == 0)
6572 gmx_bool bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6573 sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6574 !bC ? "-rdd" : "-rcon",
6575 comm->dlbState != edlbsOffForever ? " or -dds" : "",
6576 bC ? " or your LINCS settings" : "");
6578 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6579 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6581 "Look in the log file for details on the domain decomposition",
6582 cr->nnodes-cr->npmenodes, limit, buf);
6584 set_dd_dim(fplog, dd);
6590 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6591 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6594 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6595 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6597 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6598 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6599 dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6601 if (cr->npmenodes > dd->nnodes)
6603 gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
6604 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6606 if (cr->npmenodes > 0)
6608 comm->npmenodes = cr->npmenodes;
6612 comm->npmenodes = dd->nnodes;
6615 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6617 /* The following choices should match those
6618 * in comm_cost_est in domdec_setup.c.
6619 * Note that here the checks have to take into account
6620 * that the decomposition might occur in a different order than xyz
6621 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6622 * in which case they will not match those in comm_cost_est,
6623 * but since that is mainly for testing purposes that's fine.
6625 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6626 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6627 getenv("GMX_PMEONEDD") == nullptr)
6629 comm->npmedecompdim = 2;
6630 comm->npmenodes_x = dd->nc[XX];
6631 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6635 /* In case nc is 1 in both x and y we could still choose to
6636 * decompose pme in y instead of x, but we use x for simplicity.
6638 comm->npmedecompdim = 1;
6639 if (dd->dim[0] == YY)
6641 comm->npmenodes_x = 1;
6642 comm->npmenodes_y = comm->npmenodes;
6646 comm->npmenodes_x = comm->npmenodes;
6647 comm->npmenodes_y = 1;
6652 fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6653 comm->npmenodes_x, comm->npmenodes_y, 1);
6658 comm->npmedecompdim = 0;
6659 comm->npmenodes_x = 0;
6660 comm->npmenodes_y = 0;
6663 /* Technically we don't need both of these,
6664 * but it simplifies code not having to recalculate it.
6666 *npme_x = comm->npmenodes_x;
6667 *npme_y = comm->npmenodes_y;
6669 snew(comm->slb_frac, DIM);
6670 if (comm->dlbState == edlbsOffForever)
6672 comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6673 comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6674 comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6677 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6679 if (comm->bBondComm || comm->dlbState != edlbsOffForever)
6681 /* Set the bonded communication distance to halfway
6682 * the minimum and the maximum,
6683 * since the extra communication cost is nearly zero.
6685 real acs = average_cellsize_min(dd, ddbox);
6686 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6687 if (comm->dlbState != edlbsOffForever)
6689 /* Check if this does not limit the scaling */
6690 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
6692 if (!comm->bBondComm)
6694 /* Without bBondComm do not go beyond the n.b. cut-off */
6695 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
6696 if (comm->cellsize_limit >= comm->cutoff)
6698 /* We don't loose a lot of efficieny
6699 * when increasing it to the n.b. cut-off.
6700 * It can even be slightly faster, because we need
6701 * less checks for the communication setup.
6703 comm->cutoff_mbody = comm->cutoff;
6706 /* Check if we did not end up below our original limit */
6707 comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
6709 if (comm->cutoff_mbody > comm->cellsize_limit)
6711 comm->cellsize_limit = comm->cutoff_mbody;
6714 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6719 fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6720 "cellsize limit %f\n",
6721 comm->bBondComm, comm->cellsize_limit);
6726 check_dd_restrictions(cr, dd, ir, fplog);
6730 static void set_dlb_limits(gmx_domdec_t *dd)
6735 for (d = 0; d < dd->ndim; d++)
6737 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6738 dd->comm->cellsize_min[dd->dim[d]] =
6739 dd->comm->cellsize_min_dlb[dd->dim[d]];
6744 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6747 gmx_domdec_comm_t *comm;
6754 cellsize_min = comm->cellsize_min[dd->dim[0]];
6755 for (d = 1; d < dd->ndim; d++)
6757 cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6760 if (cellsize_min < comm->cellsize_limit*1.05)
6763 sprintf(buf, "step %" GMX_PRId64 " Measured %.1f %% performance load due to load imbalance, but the minimum cell size is smaller than 1.05 times the cell size limit. Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
6765 /* Change DLB from "auto" to "no". */
6766 comm->dlbState = edlbsOffForever;
6772 sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
6773 dd_warning(cr, fplog, buf);
6774 comm->dlbState = edlbsOnCanTurnOff;
6776 /* Store the non-DLB performance, so we can check if DLB actually
6777 * improves performance.
6779 GMX_RELEASE_ASSERT(comm->cycl_n[ddCyclStep] > 0, "When we turned on DLB, we should have measured cycles");
6780 comm->cyclesPerStepBeforeDLB = comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
6784 /* We can set the required cell size info here,
6785 * so we do not need to communicate this.
6786 * The grid is completely uniform.
6788 for (d = 0; d < dd->ndim; d++)
6792 comm->load[d].sum_m = comm->load[d].sum;
6794 nc = dd->nc[dd->dim[d]];
6795 for (i = 0; i < nc; i++)
6797 comm->root[d]->cell_f[i] = i/(real)nc;
6800 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6801 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6804 comm->root[d]->cell_f[nc] = 1.0;
6809 static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6811 gmx_domdec_t *dd = cr->dd;
6814 sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
6815 dd_warning(cr, fplog, buf);
6816 dd->comm->dlbState = edlbsOffCanTurnOn;
6817 dd->comm->haveTurnedOffDlb = true;
6818 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
6821 static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
6823 GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
6825 sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
6826 dd_warning(cr, fplog, buf);
6827 cr->dd->comm->dlbState = edlbsOffForever;
6830 static char *init_bLocalCG(const gmx_mtop_t *mtop)
6835 ncg = ncg_mtop(mtop);
6836 snew(bLocalCG, ncg);
6837 for (cg = 0; cg < ncg; cg++)
6839 bLocalCG[cg] = FALSE;
6845 void dd_init_bondeds(FILE *fplog,
6847 const gmx_mtop_t *mtop,
6848 const gmx_vsite_t *vsite,
6849 const t_inputrec *ir,
6850 gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
6852 gmx_domdec_comm_t *comm;
6854 dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
6858 if (comm->bBondComm)
6860 /* Communicate atoms beyond the cut-off for bonded interactions */
6863 comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
6865 comm->bLocalCG = init_bLocalCG(mtop);
6869 /* Only communicate atoms based on cut-off */
6870 comm->cglink = nullptr;
6871 comm->bLocalCG = nullptr;
6875 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
6876 const gmx_mtop_t *mtop, const t_inputrec *ir,
6877 gmx_bool bDynLoadBal, real dlb_scale,
6878 const gmx_ddbox_t *ddbox)
6880 gmx_domdec_comm_t *comm;
6886 if (fplog == nullptr)
6895 fprintf(fplog, "The maximum number of communication pulses is:");
6896 for (d = 0; d < dd->ndim; d++)
6898 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
6900 fprintf(fplog, "\n");
6901 fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
6902 fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
6903 fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
6904 for (d = 0; d < DIM; d++)
6908 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6915 comm->cellsize_min_dlb[d]/
6916 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6918 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
6921 fprintf(fplog, "\n");
6925 set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
6926 fprintf(fplog, "The initial number of communication pulses is:");
6927 for (d = 0; d < dd->ndim; d++)
6929 fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
6931 fprintf(fplog, "\n");
6932 fprintf(fplog, "The initial domain decomposition cell size is:");
6933 for (d = 0; d < DIM; d++)
6937 fprintf(fplog, " %c %.2f nm",
6938 dim2char(d), dd->comm->cellsize_min[d]);
6941 fprintf(fplog, "\n\n");
6944 gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
6946 if (comm->bInterCGBondeds ||
6948 dd->bInterCGcons || dd->bInterCGsettles)
6950 fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
6951 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6952 "non-bonded interactions", "", comm->cutoff);
6956 limit = dd->comm->cellsize_limit;
6960 if (dynamic_dd_box(ddbox, ir))
6962 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
6964 limit = dd->comm->cellsize_min[XX];
6965 for (d = 1; d < DIM; d++)
6967 limit = std::min(limit, dd->comm->cellsize_min[d]);
6971 if (comm->bInterCGBondeds)
6973 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6974 "two-body bonded interactions", "(-rdd)",
6975 std::max(comm->cutoff, comm->cutoff_mbody));
6976 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6977 "multi-body bonded interactions", "(-rdd)",
6978 (comm->bBondComm || dlbIsOn(dd->comm)) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
6982 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6983 "virtual site constructions", "(-rcon)", limit);
6985 if (dd->bInterCGcons || dd->bInterCGsettles)
6987 sprintf(buf, "atoms separated by up to %d constraints",
6989 fprintf(fplog, "%40s %-7s %6.3f nm\n",
6990 buf, "(-rcon)", limit);
6992 fprintf(fplog, "\n");
6998 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7000 const t_inputrec *ir,
7001 const gmx_ddbox_t *ddbox)
7003 gmx_domdec_comm_t *comm;
7004 int d, dim, npulse, npulse_d_max, npulse_d;
7009 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7011 /* Determine the maximum number of comm. pulses in one dimension */
7013 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7015 /* Determine the maximum required number of grid pulses */
7016 if (comm->cellsize_limit >= comm->cutoff)
7018 /* Only a single pulse is required */
7021 else if (!bNoCutOff && comm->cellsize_limit > 0)
7023 /* We round down slightly here to avoid overhead due to the latency
7024 * of extra communication calls when the cut-off
7025 * would be only slightly longer than the cell size.
7026 * Later cellsize_limit is redetermined,
7027 * so we can not miss interactions due to this rounding.
7029 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7033 /* There is no cell size limit */
7034 npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7037 if (!bNoCutOff && npulse > 1)
7039 /* See if we can do with less pulses, based on dlb_scale */
7041 for (d = 0; d < dd->ndim; d++)
7044 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7045 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7046 npulse_d_max = std::max(npulse_d_max, npulse_d);
7048 npulse = std::min(npulse, npulse_d_max);
7051 /* This env var can override npulse */
7052 d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7059 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7060 for (d = 0; d < dd->ndim; d++)
7062 comm->cd[d].np_dlb = std::min(npulse, dd->nc[dd->dim[d]]-1);
7063 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7064 snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7065 comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7066 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7068 comm->bVacDLBNoLimit = FALSE;
7072 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7073 if (!comm->bVacDLBNoLimit)
7075 comm->cellsize_limit = std::max(comm->cellsize_limit,
7076 comm->cutoff/comm->maxpulse);
7078 comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7079 /* Set the minimum cell size for each DD dimension */
7080 for (d = 0; d < dd->ndim; d++)
7082 if (comm->bVacDLBNoLimit ||
7083 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7085 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7089 comm->cellsize_min_dlb[dd->dim[d]] =
7090 comm->cutoff/comm->cd[d].np_dlb;
7093 if (comm->cutoff_mbody <= 0)
7095 comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7103 gmx_bool dd_bonded_molpbc(const gmx_domdec_t *dd, int ePBC)
7105 /* If each molecule is a single charge group
7106 * or we use domain decomposition for each periodic dimension,
7107 * we do not need to take pbc into account for the bonded interactions.
7109 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7112 (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7115 /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
7116 static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7117 const gmx_mtop_t *mtop, const t_inputrec *ir,
7118 const gmx_ddbox_t *ddbox)
7120 gmx_domdec_comm_t *comm;
7126 if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7128 init_ddpme(dd, &comm->ddpme[0], 0);
7129 if (comm->npmedecompdim >= 2)
7131 init_ddpme(dd, &comm->ddpme[1], 1);
7136 comm->npmenodes = 0;
7137 if (dd->pme_nodeid >= 0)
7139 gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
7140 "Can not have separate PME ranks without PME electrostatics");
7146 fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7148 if (comm->dlbState != edlbsOffForever)
7150 set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7153 print_dd_settings(fplog, dd, mtop, ir, dlbIsOn(comm), dlb_scale, ddbox);
7154 if (comm->dlbState == edlbsOffCanTurnOn)
7158 fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7160 print_dd_settings(fplog, dd, mtop, ir, TRUE, dlb_scale, ddbox);
7163 if (ir->ePBC == epbcNONE)
7165 vol_frac = 1 - 1/(double)dd->nnodes;
7170 (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7174 fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7176 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7178 dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7181 /*! \brief Set some important DD parameters that can be modified by env.vars */
7182 static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
7184 gmx_domdec_comm_t *comm = dd->comm;
7186 dd->bSendRecv2 = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
7187 comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
7188 comm->eFlop = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
7189 int recload = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
7190 comm->nstDDDump = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
7191 comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
7192 comm->DD_debug = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
7194 if (dd->bSendRecv2 && fplog)
7196 fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
7203 fprintf(fplog, "Will load balance based on FLOP count\n");
7205 if (comm->eFlop > 1)
7207 srand(1 + rank_mysim);
7209 comm->bRecordLoad = TRUE;
7213 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
7217 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
7218 unsigned long Flags,
7219 ivec nc, int nPmeRanks,
7221 real comm_distance_min, real rconstr,
7222 const char *dlb_opt, real dlb_scale,
7223 const char *sizex, const char *sizey, const char *sizez,
7224 const gmx_mtop_t *mtop,
7225 const t_inputrec *ir,
7226 matrix box, rvec *x,
7228 int *npme_x, int *npme_y)
7235 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
7240 dd->comm = init_dd_comm();
7242 set_dd_envvar_options(fplog, dd, cr->nodeid);
7244 set_dd_limits_and_grid(fplog, cr, dd, Flags,
7246 comm_distance_min, rconstr,
7248 sizex, sizey, sizez,
7254 make_dd_communicators(fplog, cr, dd, dd_rank_order);
7256 if (cr->duty & DUTY_PP)
7258 set_ddgrid_parameters(fplog, dd, dlb_scale, mtop, ir, ddbox);
7260 setup_neighbor_relations(dd);
7263 /* Set overallocation to avoid frequent reallocation of arrays */
7264 set_over_alloc_dd(TRUE);
7266 /* Initialize DD paritioning counters */
7267 dd->comm->partition_step = INT_MIN;
7270 /* We currently don't know the number of threads yet, we set this later */
7273 clear_dd_cycle_counts(dd);
7278 static gmx_bool test_dd_cutoff(t_commrec *cr,
7279 t_state *state, const t_inputrec *ir,
7290 set_ddbox(dd, FALSE, cr, ir, state->box,
7291 TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
7295 for (d = 0; d < dd->ndim; d++)
7299 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7300 if (dynamic_dd_box(&ddbox, ir))
7302 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7305 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7307 if (dd->comm->dlbState != edlbsOffForever && dim < ddbox.npbcdim &&
7308 dd->comm->cd[d].np_dlb > 0)
7310 if (np > dd->comm->cd[d].np_dlb)
7315 /* If a current local cell size is smaller than the requested
7316 * cut-off, we could still fix it, but this gets very complicated.
7317 * Without fixing here, we might actually need more checks.
7319 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7326 if (dd->comm->dlbState != edlbsOffForever)
7328 /* If DLB is not active yet, we don't need to check the grid jumps.
7329 * Actually we shouldn't, because then the grid jump data is not set.
7331 if (dlbIsOn(dd->comm) &&
7332 check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7337 gmx_sumi(1, &LocallyLimited, cr);
7339 if (LocallyLimited > 0)
7348 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, const t_inputrec *ir,
7351 gmx_bool bCutoffAllowed;
7353 bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7357 cr->dd->comm->cutoff = cutoff_req;
7360 return bCutoffAllowed;
7363 void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
7365 gmx_domdec_comm_t *comm;
7367 comm = cr->dd->comm;
7369 /* Turn on the DLB limiting (might have been on already) */
7370 comm->bPMELoadBalDLBLimits = TRUE;
7372 /* Change the cut-off limit */
7373 comm->PMELoadBal_max_cutoff = cutoff;
7377 fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7378 comm->PMELoadBal_max_cutoff);
7382 /* Sets whether we should later check the load imbalance data, so that
7383 * we can trigger dynamic load balancing if enough imbalance has
7386 * Used after PME load balancing unlocks DLB, so that the check
7387 * whether DLB will be useful can happen immediately.
7389 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
7391 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7393 dd->comm->bCheckWhetherToTurnDlbOn = bValue;
7397 /* Store the DD partitioning count, so we can ignore cycle counts
7398 * over the next nstlist steps, which are often slower.
7400 dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
7405 /* Returns if we should check whether there has been enough load
7406 * imbalance to trigger dynamic load balancing.
7408 static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
7410 if (dd->comm->dlbState != edlbsOffCanTurnOn)
7415 if (dd->ddp_count <= dd->comm->ddPartioningCountFirstDlbOff)
7417 /* We ignore the first nstlist steps at the start of the run
7418 * or after PME load balancing or after turning DLB off, since
7419 * these often have extra allocation or cache miss overhead.
7424 /* We should check whether we should use DLB directly after
7426 if (dd->comm->bCheckWhetherToTurnDlbOn)
7428 /* This flag was set when the PME load-balancing routines
7429 unlocked DLB, and should now be cleared. */
7430 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
7433 /* We check whether we should use DLB every c_checkTurnDlbOnInterval
7434 * partitionings (we do not do this every partioning, so that we
7435 * avoid excessive communication). */
7436 if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
7444 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
7446 return dlbIsOn(dd->comm);
7449 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7451 return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
7454 void dd_dlb_lock(gmx_domdec_t *dd)
7456 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7457 if (dd->comm->dlbState == edlbsOffCanTurnOn)
7459 dd->comm->dlbState = edlbsOffTemporarilyLocked;
7463 void dd_dlb_unlock(gmx_domdec_t *dd)
7465 /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
7466 if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
7468 dd->comm->dlbState = edlbsOffCanTurnOn;
7469 dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
7473 static void merge_cg_buffers(int ncell,
7474 gmx_domdec_comm_dim_t *cd, int pulse,
7476 int *index_gl, int *recv_i,
7477 rvec *cg_cm, rvec *recv_vr,
7479 cginfo_mb_t *cginfo_mb, int *cginfo)
7481 gmx_domdec_ind_t *ind, *ind_p;
7482 int p, cell, c, cg, cg0, cg1, cg_gl, nat;
7483 int shift, shift_at;
7485 ind = &cd->ind[pulse];
7487 /* First correct the already stored data */
7488 shift = ind->nrecv[ncell];
7489 for (cell = ncell-1; cell >= 0; cell--)
7491 shift -= ind->nrecv[cell];
7494 /* Move the cg's present from previous grid pulses */
7495 cg0 = ncg_cell[ncell+cell];
7496 cg1 = ncg_cell[ncell+cell+1];
7497 cgindex[cg1+shift] = cgindex[cg1];
7498 for (cg = cg1-1; cg >= cg0; cg--)
7500 index_gl[cg+shift] = index_gl[cg];
7501 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7502 cgindex[cg+shift] = cgindex[cg];
7503 cginfo[cg+shift] = cginfo[cg];
7505 /* Correct the already stored send indices for the shift */
7506 for (p = 1; p <= pulse; p++)
7508 ind_p = &cd->ind[p];
7510 for (c = 0; c < cell; c++)
7512 cg0 += ind_p->nsend[c];
7514 cg1 = cg0 + ind_p->nsend[cell];
7515 for (cg = cg0; cg < cg1; cg++)
7517 ind_p->index[cg] += shift;
7523 /* Merge in the communicated buffers */
7527 for (cell = 0; cell < ncell; cell++)
7529 cg1 = ncg_cell[ncell+cell+1] + shift;
7532 /* Correct the old cg indices */
7533 for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7535 cgindex[cg+1] += shift_at;
7538 for (cg = 0; cg < ind->nrecv[cell]; cg++)
7540 /* Copy this charge group from the buffer */
7541 index_gl[cg1] = recv_i[cg0];
7542 copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7543 /* Add it to the cgindex */
7544 cg_gl = index_gl[cg1];
7545 cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
7546 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7547 cgindex[cg1+1] = cgindex[cg1] + nat;
7552 shift += ind->nrecv[cell];
7553 ncg_cell[ncell+cell+1] = cg1;
7557 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7558 int nzone, int cg0, const int *cgindex)
7562 /* Store the atom block boundaries for easy copying of communication buffers
7565 for (zone = 0; zone < nzone; zone++)
7567 for (p = 0; p < cd->np; p++)
7569 cd->ind[p].cell2at0[zone] = cgindex[cg];
7570 cg += cd->ind[p].nrecv[zone];
7571 cd->ind[p].cell2at1[zone] = cgindex[cg];
7576 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7582 for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7584 if (!bLocalCG[link->a[i]])
7593 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7595 real c[DIM][4]; /* the corners for the non-bonded communication */
7596 real cr0; /* corner for rounding */
7597 real cr1[4]; /* corners for rounding */
7598 real bc[DIM]; /* corners for bounded communication */
7599 real bcr1; /* corner for rounding for bonded communication */
7602 /* Determine the corners of the domain(s) we are communicating with */
7604 set_dd_corners(const gmx_domdec_t *dd,
7605 int dim0, int dim1, int dim2,
7609 const gmx_domdec_comm_t *comm;
7610 const gmx_domdec_zones_t *zones;
7615 zones = &comm->zones;
7617 /* Keep the compiler happy */
7621 /* The first dimension is equal for all cells */
7622 c->c[0][0] = comm->cell_x0[dim0];
7625 c->bc[0] = c->c[0][0];
7630 /* This cell row is only seen from the first row */
7631 c->c[1][0] = comm->cell_x0[dim1];
7632 /* All rows can see this row */
7633 c->c[1][1] = comm->cell_x0[dim1];
7634 if (dlbIsOn(dd->comm))
7636 c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7639 /* For the multi-body distance we need the maximum */
7640 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7643 /* Set the upper-right corner for rounding */
7644 c->cr0 = comm->cell_x1[dim0];
7649 for (j = 0; j < 4; j++)
7651 c->c[2][j] = comm->cell_x0[dim2];
7653 if (dlbIsOn(dd->comm))
7655 /* Use the maximum of the i-cells that see a j-cell */
7656 for (i = 0; i < zones->nizone; i++)
7658 for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7663 std::max(c->c[2][j-4],
7664 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7670 /* For the multi-body distance we need the maximum */
7671 c->bc[2] = comm->cell_x0[dim2];
7672 for (i = 0; i < 2; i++)
7674 for (j = 0; j < 2; j++)
7676 c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7682 /* Set the upper-right corner for rounding */
7683 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7684 * Only cell (0,0,0) can see cell 7 (1,1,1)
7686 c->cr1[0] = comm->cell_x1[dim1];
7687 c->cr1[3] = comm->cell_x1[dim1];
7688 if (dlbIsOn(dd->comm))
7690 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7693 /* For the multi-body distance we need the maximum */
7694 c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7701 /* Determine which cg's we need to send in this pulse from this zone */
7703 get_zone_pulse_cgs(gmx_domdec_t *dd,
7704 int zonei, int zone,
7706 const int *index_gl,
7708 int dim, int dim_ind,
7709 int dim0, int dim1, int dim2,
7710 real r_comm2, real r_bcomm2,
7714 real skew_fac2_d, real skew_fac_01,
7715 rvec *v_d, rvec *v_0, rvec *v_1,
7716 const dd_corners_t *c,
7718 gmx_bool bDistBonded,
7724 gmx_domdec_ind_t *ind,
7725 int **ibuf, int *ibuf_nalloc,
7731 gmx_domdec_comm_t *comm;
7733 gmx_bool bDistMB_pulse;
7735 real r2, rb2, r, tric_sh;
7738 int nsend_z, nsend, nat;
7742 bScrew = (dd->bScrewPBC && dim == XX);
7744 bDistMB_pulse = (bDistMB && bDistBonded);
7750 for (cg = cg0; cg < cg1; cg++)
7754 if (tric_dist[dim_ind] == 0)
7756 /* Rectangular direction, easy */
7757 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7764 r = cg_cm[cg][dim] - c->bc[dim_ind];
7770 /* Rounding gives at most a 16% reduction
7771 * in communicated atoms
7773 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7775 r = cg_cm[cg][dim0] - c->cr0;
7776 /* This is the first dimension, so always r >= 0 */
7783 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7785 r = cg_cm[cg][dim1] - c->cr1[zone];
7792 r = cg_cm[cg][dim1] - c->bcr1;
7802 /* Triclinic direction, more complicated */
7805 /* Rounding, conservative as the skew_fac multiplication
7806 * will slightly underestimate the distance.
7808 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7810 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7811 for (i = dim0+1; i < DIM; i++)
7813 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7815 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7818 rb[dim0] = rn[dim0];
7821 /* Take care that the cell planes along dim0 might not
7822 * be orthogonal to those along dim1 and dim2.
7824 for (i = 1; i <= dim_ind; i++)
7827 if (normal[dim0][dimd] > 0)
7829 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7832 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7837 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7839 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7841 for (i = dim1+1; i < DIM; i++)
7843 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7845 rn[dim1] += tric_sh;
7848 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7849 /* Take care of coupling of the distances
7850 * to the planes along dim0 and dim1 through dim2.
7852 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7853 /* Take care that the cell planes along dim1
7854 * might not be orthogonal to that along dim2.
7856 if (normal[dim1][dim2] > 0)
7858 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7864 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7867 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7868 /* Take care of coupling of the distances
7869 * to the planes along dim0 and dim1 through dim2.
7871 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7872 /* Take care that the cell planes along dim1
7873 * might not be orthogonal to that along dim2.
7875 if (normal[dim1][dim2] > 0)
7877 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7882 /* The distance along the communication direction */
7883 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7885 for (i = dim+1; i < DIM; i++)
7887 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7892 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7893 /* Take care of coupling of the distances
7894 * to the planes along dim0 and dim1 through dim2.
7896 if (dim_ind == 1 && zonei == 1)
7898 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7904 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7907 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7908 /* Take care of coupling of the distances
7909 * to the planes along dim0 and dim1 through dim2.
7911 if (dim_ind == 1 && zonei == 1)
7913 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7921 ((bDistMB && rb2 < r_bcomm2) ||
7922 (bDist2B && r2 < r_bcomm2)) &&
7924 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7925 missing_link(comm->cglink, index_gl[cg],
7928 /* Make an index to the local charge groups */
7929 if (nsend+1 > ind->nalloc)
7931 ind->nalloc = over_alloc_large(nsend+1);
7932 srenew(ind->index, ind->nalloc);
7934 if (nsend+1 > *ibuf_nalloc)
7936 *ibuf_nalloc = over_alloc_large(nsend+1);
7937 srenew(*ibuf, *ibuf_nalloc);
7939 ind->index[nsend] = cg;
7940 (*ibuf)[nsend] = index_gl[cg];
7942 vec_rvec_check_alloc(vbuf, nsend+1);
7944 if (dd->ci[dim] == 0)
7946 /* Correct cg_cm for pbc */
7947 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7950 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7951 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7956 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7959 nat += cgindex[cg+1] - cgindex[cg];
7965 *nsend_z_ptr = nsend_z;
7968 static void setup_dd_communication(gmx_domdec_t *dd,
7969 matrix box, gmx_ddbox_t *ddbox,
7971 t_state *state, PaddedRVecVector *f)
7973 int dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7974 int nzone, nzone_send, zone, zonei, cg0, cg1;
7975 int c, i, cg, cg_gl, nrcg;
7976 int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7977 gmx_domdec_comm_t *comm;
7978 gmx_domdec_zones_t *zones;
7979 gmx_domdec_comm_dim_t *cd;
7980 gmx_domdec_ind_t *ind;
7981 cginfo_mb_t *cginfo_mb;
7982 gmx_bool bBondComm, bDist2B, bDistMB, bDistBonded;
7983 real r_comm2, r_bcomm2;
7984 dd_corners_t corners;
7986 rvec *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
7987 real skew_fac2_d, skew_fac_01;
7994 fprintf(debug, "Setting up DD communication\n");
8001 /* Initialize the thread data.
8002 * This can not be done in init_domain_decomposition,
8003 * as the numbers of threads is determined later.
8005 comm->nth = gmx_omp_nthreads_get(emntDomdec);
8008 snew(comm->dth, comm->nth);
8012 switch (fr->cutoff_scheme)
8018 cg_cm = as_rvec_array(state->x.data());
8021 gmx_incons("unimplemented");
8025 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8027 /* Check if we need to use triclinic distances */
8028 tric_dist[dim_ind] = 0;
8029 for (i = 0; i <= dim_ind; i++)
8031 if (ddbox->tric_dir[dd->dim[i]])
8033 tric_dist[dim_ind] = 1;
8038 bBondComm = comm->bBondComm;
8040 /* Do we need to determine extra distances for multi-body bondeds? */
8041 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8043 /* Do we need to determine extra distances for only two-body bondeds? */
8044 bDist2B = (bBondComm && !bDistMB);
8046 r_comm2 = gmx::square(comm->cutoff);
8047 r_bcomm2 = gmx::square(comm->cutoff_mbody);
8051 fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
8054 zones = &comm->zones;
8057 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8058 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8060 set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8062 /* Triclinic stuff */
8063 normal = ddbox->normal;
8067 v_0 = ddbox->v[dim0];
8068 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8070 /* Determine the coupling coefficient for the distances
8071 * to the cell planes along dim0 and dim1 through dim2.
8072 * This is required for correct rounding.
8075 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8078 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8084 v_1 = ddbox->v[dim1];
8087 zone_cg_range = zones->cg_range;
8088 index_gl = dd->index_gl;
8089 cgindex = dd->cgindex;
8090 cginfo_mb = fr->cginfo_mb;
8092 zone_cg_range[0] = 0;
8093 zone_cg_range[1] = dd->ncg_home;
8094 comm->zone_ncg1[0] = dd->ncg_home;
8095 pos_cg = dd->ncg_home;
8097 nat_tot = dd->nat_home;
8099 for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8101 dim = dd->dim[dim_ind];
8102 cd = &comm->cd[dim_ind];
8104 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8106 /* No pbc in this dimension, the first node should not comm. */
8114 v_d = ddbox->v[dim];
8115 skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
8117 cd->bInPlace = TRUE;
8118 for (p = 0; p < cd->np; p++)
8120 /* Only atoms communicated in the first pulse are used
8121 * for multi-body bonded interactions or for bBondComm.
8123 bDistBonded = ((bDistMB || bDist2B) && p == 0);
8128 for (zone = 0; zone < nzone_send; zone++)
8130 if (tric_dist[dim_ind] && dim_ind > 0)
8132 /* Determine slightly more optimized skew_fac's
8134 * This reduces the number of communicated atoms
8135 * by about 10% for 3D DD of rhombic dodecahedra.
8137 for (dimd = 0; dimd < dim; dimd++)
8139 sf2_round[dimd] = 1;
8140 if (ddbox->tric_dir[dimd])
8142 for (i = dd->dim[dimd]+1; i < DIM; i++)
8144 /* If we are shifted in dimension i
8145 * and the cell plane is tilted forward
8146 * in dimension i, skip this coupling.
8148 if (!(zones->shift[nzone+zone][i] &&
8149 ddbox->v[dimd][i][dimd] >= 0))
8152 sf2_round[dimd] += gmx::square(ddbox->v[dimd][i][dimd]);
8155 sf2_round[dimd] = 1/sf2_round[dimd];
8160 zonei = zone_perm[dim_ind][zone];
8163 /* Here we permute the zones to obtain a convenient order
8164 * for neighbor searching.
8166 cg0 = zone_cg_range[zonei];
8167 cg1 = zone_cg_range[zonei+1];
8171 /* Look only at the cg's received in the previous grid pulse
8173 cg1 = zone_cg_range[nzone+zone+1];
8174 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8177 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8178 for (th = 0; th < comm->nth; th++)
8182 gmx_domdec_ind_t *ind_p;
8183 int **ibuf_p, *ibuf_nalloc_p;
8185 int *nsend_p, *nat_p;
8191 /* Thread 0 writes in the comm buffers */
8193 ibuf_p = &comm->buf_int;
8194 ibuf_nalloc_p = &comm->nalloc_int;
8195 vbuf_p = &comm->vbuf;
8198 nsend_zone_p = &ind->nsend[zone];
8202 /* Other threads write into temp buffers */
8203 ind_p = &comm->dth[th].ind;
8204 ibuf_p = &comm->dth[th].ibuf;
8205 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8206 vbuf_p = &comm->dth[th].vbuf;
8207 nsend_p = &comm->dth[th].nsend;
8208 nat_p = &comm->dth[th].nat;
8209 nsend_zone_p = &comm->dth[th].nsend_zone;
8211 comm->dth[th].nsend = 0;
8212 comm->dth[th].nat = 0;
8213 comm->dth[th].nsend_zone = 0;
8223 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8224 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
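/* Worked illustration (added comment, not original source): the static
 * split above gives each thread a nearly equal contiguous range. With
 * comm->nth = 4 and cg1 - cg0 = 10, the boundaries relative to cg0 are
 * 0, 2, 5, 7, 10, so the threads process 2, 3, 2 and 3 charge groups
 * respectively.
 */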
8227 /* Get the cg's for this pulse in this zone */
8228 get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8230 dim, dim_ind, dim0, dim1, dim2,
8233 normal, skew_fac2_d, skew_fac_01,
8234 v_d, v_0, v_1, &corners, sf2_round,
8235 bDistBonded, bBondComm,
8239 ibuf_p, ibuf_nalloc_p,
8244 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
8247 /* Append data of threads>=1 to the communication buffers */
8248 for (th = 1; th < comm->nth; th++)
8250 dd_comm_setup_work_t *dth;
8253 dth = &comm->dth[th];
8255 ns1 = nsend + dth->nsend_zone;
8256 if (ns1 > ind->nalloc)
8258 ind->nalloc = over_alloc_dd(ns1);
8259 srenew(ind->index, ind->nalloc);
8261 if (ns1 > comm->nalloc_int)
8263 comm->nalloc_int = over_alloc_dd(ns1);
8264 srenew(comm->buf_int, comm->nalloc_int);
8266 if (ns1 > comm->vbuf.nalloc)
8268 comm->vbuf.nalloc = over_alloc_dd(ns1);
8269 srenew(comm->vbuf.v, comm->vbuf.nalloc);
8272 for (i = 0; i < dth->nsend_zone; i++)
8274 ind->index[nsend] = dth->ind.index[i];
8275 comm->buf_int[nsend] = dth->ibuf[i];
8276 copy_rvec(dth->vbuf.v[i],
8277 comm->vbuf.v[nsend]);
8281 ind->nsend[zone] += dth->nsend_zone;
8284 /* Clear the counts in case we do not have pbc */
8285 for (zone = nzone_send; zone < nzone; zone++)
8287 ind->nsend[zone] = 0;
8289 ind->nsend[nzone] = nsend;
8290 ind->nsend[nzone+1] = nat;
8291 /* Communicate the number of cg's and atoms to receive */
8292 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8293 ind->nsend, nzone+2,
8294 ind->nrecv, nzone+2);
8296 /* The rvec buffer is also required for atom buffers of size nsend
8297 * in dd_move_x and dd_move_f.
8299 vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8303 /* We can receive in place if only the last zone is not empty */
8304 for (zone = 0; zone < nzone-1; zone++)
8306 if (ind->nrecv[zone] > 0)
8308 cd->bInPlace = FALSE;
8313 /* The int buffer is only required here for the cg indices */
8314 if (ind->nrecv[nzone] > comm->nalloc_int2)
8316 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8317 srenew(comm->buf_int2, comm->nalloc_int2);
8319 /* The rvec buffer is also required for atom buffers
8320 * of size nrecv in dd_move_x and dd_move_f.
8322 i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8323 vec_rvec_check_alloc(&comm->vbuf2, i);
8327 /* Make space for the global cg indices */
8328 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8329 || dd->cg_nalloc == 0)
8331 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8332 srenew(index_gl, dd->cg_nalloc);
8333 srenew(cgindex, dd->cg_nalloc+1);
8335 /* Communicate the global cg indices */
8338 recv_i = index_gl + pos_cg;
8342 recv_i = comm->buf_int2;
8344 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8345 comm->buf_int, nsend,
8346 recv_i, ind->nrecv[nzone]);
8348 /* Make space for cg_cm */
8349 dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8350 if (fr->cutoff_scheme == ecutsGROUP)
8356 cg_cm = as_rvec_array(state->x.data());
8358 /* Communicate cg_cm */
8361 recv_vr = cg_cm + pos_cg;
8365 recv_vr = comm->vbuf2.v;
8367 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8368 comm->vbuf.v, nsend,
8369 recv_vr, ind->nrecv[nzone]);
8371 /* Make the charge group index */
8374 zone = (p == 0 ? 0 : nzone - 1);
8375 while (zone < nzone)
8377 for (cg = 0; cg < ind->nrecv[zone]; cg++)
8379 cg_gl = index_gl[pos_cg];
8380 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8381 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8382 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8385 /* Update the charge group presence,
8386 * so we can use it in the next pass of the loop.
8388 comm->bLocalCG[cg_gl] = TRUE;
8394 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8397 zone_cg_range[nzone+zone] = pos_cg;
8402 /* This part of the code is never executed with bBondComm. */
8403 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8404 index_gl, recv_i, cg_cm, recv_vr,
8405 cgindex, fr->cginfo_mb, fr->cginfo);
8406 pos_cg += ind->nrecv[nzone];
8408 nat_tot += ind->nrecv[nzone+1];
8412 /* Store the atom block for easy copying of communication buffers */
8413 make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8417 dd->index_gl = index_gl;
8418 dd->cgindex = cgindex;
8420 dd->ncg_tot = zone_cg_range[zones->n];
8421 dd->nat_tot = nat_tot;
8422 comm->nat[ddnatHOME] = dd->nat_home;
8423 for (i = ddnatZONE; i < ddnatNR; i++)
8425 comm->nat[i] = dd->nat_tot;
8430 /* We don't need to update cginfo, since that was already done above.
8431 * So we pass nullptr for the forcerec.
8433 dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8434 nullptr, comm->bLocalCG);
8439 fprintf(debug, "Finished setting up DD communication, zones:");
8440 for (c = 0; c < zones->n; c++)
8442 fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8444 fprintf(debug, "\n");
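/* Illustrative sketch of the over-allocation pattern used with
 * over_alloc_dd()/srenew() in setup_dd_communication() above. The factor
 * and offset below are assumptions for illustration only, not the values
 * GROMACS uses; the point is that reserving headroom amortizes the
 * reallocation cost over many repartitioning steps.
 */
static int sketch_over_alloc(int n)
{
    /* ~20% headroom plus a small constant for tiny sizes (assumed values) */
    return static_cast<int>(1.2*n) + 10;
}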
8448 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8452 for (c = 0; c < zones->nizone; c++)
8454 zones->izone[c].cg1 = zones->cg_range[c+1];
8455 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8456 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8460 static void set_zones_size(gmx_domdec_t *dd,
8461 matrix box, const gmx_ddbox_t *ddbox,
8462 int zone_start, int zone_end)
8464 gmx_domdec_comm_t *comm;
8465 gmx_domdec_zones_t *zones;
8474 zones = &comm->zones;
8476 /* Do we need to determine extra distances for multi-body bondeds? */
8477 bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);
8479 for (z = zone_start; z < zone_end; z++)
8481 /* Copy cell limits to zone limits.
8482 * Valid for non-DD dims and non-shifted dims.
8484 copy_rvec(comm->cell_x0, zones->size[z].x0);
8485 copy_rvec(comm->cell_x1, zones->size[z].x1);
8488 for (d = 0; d < dd->ndim; d++)
8492 for (z = 0; z < zones->n; z++)
8494 /* With a staggered grid we have different sizes
8495 * for non-shifted dimensions.
8497 if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
8501 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8502 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8506 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8507 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8513 rcmbs = comm->cutoff_mbody;
8514 if (ddbox->tric_dir[dim])
8516 rcs /= ddbox->skew_fac[dim];
8517 rcmbs /= ddbox->skew_fac[dim];
8520 /* Set the lower limit for the shifted zone dimensions */
8521 for (z = zone_start; z < zone_end; z++)
8523 if (zones->shift[z][dim] > 0)
8526 if (!dlbIsOn(dd->comm) || d == 0)
8528 zones->size[z].x0[dim] = comm->cell_x1[dim];
8529 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8533 /* Here we take the lower limit of the zone from
8534 * the lowest domain of the zone below.
8538 zones->size[z].x0[dim] =
8539 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8545 zones->size[z].x0[dim] =
8546 zones->size[zone_perm[2][z-4]].x0[dim];
8550 zones->size[z].x0[dim] =
8551 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8554 /* A temporary limit; it is updated below */
8555 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8559 for (zi = 0; zi < zones->nizone; zi++)
8561 if (zones->shift[zi][dim] == 0)
8563 /* This takes the whole zone into account.
8564 * With multiple pulses this will lead
8565 * to a larger zone than strictly necessary.
8567 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8568 zones->size[zi].x1[dim]+rcmbs);
8576 /* Loop over the i-zones to set the upper limit of each j-zone. */
8579 for (zi = 0; zi < zones->nizone; zi++)
8581 if (zones->shift[zi][dim] == 0)
8583 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8585 if (zones->shift[z][dim] > 0)
8587 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8588 zones->size[zi].x1[dim]+rcs);
8595 for (z = zone_start; z < zone_end; z++)
8597 /* Initialization only required to keep the compiler happy */
8598 rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8601 /* To determine the bounding box for a zone we need to find
8602 * the extremes over 4, 2 or 1 corners.
8604 nc = 1 << (ddbox->nboundeddim - 1);
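/* Illustration (added comment): with nboundeddim = 3 this gives nc = 4
 * corner combinations; bit 0 of c selects the low or high y-limit and
 * bit 1 the low or high z-limit below, while x is handled separately
 * through the bb_x0/bb_x1 offsets added at the end.
 */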
8606 for (c = 0; c < nc; c++)
8608 /* Set up a zone corner at x=0, ignoring triclinic couplings */
8612 corner[YY] = zones->size[z].x0[YY];
8616 corner[YY] = zones->size[z].x1[YY];
8620 corner[ZZ] = zones->size[z].x0[ZZ];
8624 corner[ZZ] = zones->size[z].x1[ZZ];
8626 if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8627 box[ZZ][1 - dd->dim[0]] != 0)
8629 /* With 1D domain decomposition the cg's are not in
8630 * the triclinic box, but triclinic x-y and rectangular y/x-z.
8631 * Shift the corner of the z-vector back along the box
8632 * vector of dimension d, so that it later ends up at 0 along d.
8633 * This can affect the location of this corner along dd->dim[0]
8634 * through the matrix operation below if box[d][dd->dim[0]]!=0.
8636 int d = 1 - dd->dim[0];
8638 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8640 /* Apply the triclinic couplings */
8641 assert(ddbox->npbcdim <= DIM);
8642 for (i = YY; i < ddbox->npbcdim; i++)
8644 for (j = XX; j < i; j++)
8646 corner[j] += corner[i]*box[i][j]/box[i][i];
8651 copy_rvec(corner, corner_min);
8652 copy_rvec(corner, corner_max);
8656 for (i = 0; i < DIM; i++)
8658 corner_min[i] = std::min(corner_min[i], corner[i]);
8659 corner_max[i] = std::max(corner_max[i], corner[i]);
8663 /* Copy the extreme corners without offset along x */
8664 for (i = 0; i < DIM; i++)
8666 zones->size[z].bb_x0[i] = corner_min[i];
8667 zones->size[z].bb_x1[i] = corner_max[i];
8669 /* Add the offset along x */
8670 zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8671 zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8674 if (zone_start == 0)
8677 for (dim = 0; dim < DIM; dim++)
8679 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8681 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8686 for (z = zone_start; z < zone_end; z++)
8688 fprintf(debug, "zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8690 zones->size[z].x0[XX], zones->size[z].x1[XX],
8691 zones->size[z].x0[YY], zones->size[z].x1[YY],
8692 zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8693 fprintf(debug, "zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8695 zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8696 zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8697 zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
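/* Minimal standalone sketch (not part of the original source) of the
 * triclinic coupling applied to a zone corner in set_zones_size() above:
 * each dimension below a tilted box vector is sheared in proportion to
 * the corner's extent along that vector.
 */
static void sketch_apply_triclinic_couplings(const double box[3][3],
                                             double       corner[3])
{
    for (int i = 1; i < 3; i++)     /* the YY and ZZ box vectors */
    {
        for (int j = 0; j < i; j++) /* all dimensions below i */
        {
            /* Shear dimension j by the off-diagonal box component */
            corner[j] += corner[i]*box[i][j]/box[i][i];
        }
    }
}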
8702 static int comp_cgsort(const void *a, const void *b)
8706 gmx_cgsort_t *cga, *cgb;
8707 cga = (gmx_cgsort_t *)a;
8708 cgb = (gmx_cgsort_t *)b;
8710 comp = cga->nsc - cgb->nsc;
8713 comp = cga->ind_gl - cgb->ind_gl;
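/* Note (added): comp_cgsort thus yields an ascending lexicographic order
 * on (nsc, ind_gl), e.g. (2,7) < (3,1) < (3,5): charge groups are grouped
 * by ns grid cell and ordered by global index within a cell.
 */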
8719 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8724 /* Order the data */
8725 for (i = 0; i < n; i++)
8727 buf[i] = a[sort[i].ind];
8730 /* Copy back to the original array */
8731 for (i = 0; i < n; i++)
8737 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8742 /* Order the data */
8743 for (i = 0; i < n; i++)
8745 copy_rvec(v[sort[i].ind], buf[i]);
8748 /* Copy back to the original array */
8749 for (i = 0; i < n; i++)
8751 copy_rvec(buf[i], v[i]);
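/* Minimal standalone sketch (not original source) of the permutation
 * pattern shared by order_int_cg() and order_vec_cg() above: gather the
 * elements into a scratch buffer in the sorted order, then copy back.
 * Two linear passes, no in-place cycle chasing.
 */
static void sketch_permute(const int *order, int n, int *a, int *buf)
{
    for (int i = 0; i < n; i++)
    {
        buf[i] = a[order[i]]; /* gather in the new order */
    }
    for (int i = 0; i < n; i++)
    {
        a[i] = buf[i]; /* copy back to the original array */
    }
}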
8755 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8758 int a, atot, cg, cg0, cg1, i;
8760 if (cgindex == nullptr)
8762 /* Avoid the useless loop over the atoms within a cg */
8763 order_vec_cg(ncg, sort, v, buf);
8768 /* Order the data */
8770 for (cg = 0; cg < ncg; cg++)
8772 cg0 = cgindex[sort[cg].ind];
8773 cg1 = cgindex[sort[cg].ind+1];
8774 for (i = cg0; i < cg1; i++)
8776 copy_rvec(v[i], buf[a]);
8782 /* Copy back to the original array */
8783 for (a = 0; a < atot; a++)
8785 copy_rvec(buf[a], v[a]);
8789 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8790 int nsort_new, gmx_cgsort_t *sort_new,
8791 gmx_cgsort_t *sort1)
8795 /* The new indices are not very ordered, so we qsort them */
8796 gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8798 /* sort2 is already ordered, so now we can merge the two arrays */
8802 while (i2 < nsort2 || i_new < nsort_new)
8806 sort1[i1++] = sort_new[i_new++];
8808 else if (i_new == nsort_new)
8810 sort1[i1++] = sort2[i2++];
8812 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8813 (sort2[i2].nsc == sort_new[i_new].nsc &&
8814 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8816 sort1[i1++] = sort2[i2++];
8820 sort1[i1++] = sort_new[i_new++];
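/* Standalone illustration (not original source) of the two-way merge
 * performed by ordered_sort(): two ascending ranges are combined into one
 * ascending output in a single linear pass.
 */
static void sketch_merge_sorted(const int *a, int na,
                                const int *b, int nb,
                                int       *out)
{
    int ia = 0, ib = 0, io = 0;
    while (ia < na || ib < nb)
    {
        /* Take from a when b is exhausted or a's head is not larger */
        if (ib == nb || (ia < na && a[ia] <= b[ib]))
        {
            out[io++] = a[ia++];
        }
        else
        {
            out[io++] = b[ib++];
        }
    }
}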
8825 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8827 gmx_domdec_sort_t *sort;
8828 gmx_cgsort_t *cgsort, *sort_i;
8829 int ncg_new, nsort2, nsort_new, i, *a, moved;
8831 sort = dd->comm->sort;
8833 a = fr->ns->grid->cell_index;
8835 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
8837 if (ncg_home_old >= 0)
8839 /* The charge groups that remained in the same ns grid cell
8840 * are already completely ordered. So we can sort efficiently by
8841 * sorting only the charge groups that moved and merging them into the stationary list.
8846 for (i = 0; i < dd->ncg_home; i++)
8848 /* Check if this cg did not move to another node */
8851 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8853 /* This cg is new on this node or moved to another ns grid cell */
8854 if (nsort_new >= sort->sort_new_nalloc)
8856 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8857 srenew(sort->sort_new, sort->sort_new_nalloc);
8859 sort_i = &(sort->sort_new[nsort_new++]);
8863 /* This cg did not move */
8864 sort_i = &(sort->sort2[nsort2++]);
8866 /* Sort on the ns grid cell indices
8867 * and the global topology index.
8868 * index_gl is irrelevant with cell ns,
8869 * but we set it here anyhow to avoid a conditional.
8872 sort_i->ind_gl = dd->index_gl[i];
8879 fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8882 /* Sort efficiently */
8883 ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8888 cgsort = sort->sort;
8890 for (i = 0; i < dd->ncg_home; i++)
8892 /* Sort on the ns grid cell indices
8893 * and the global topology index
8895 cgsort[i].nsc = a[i];
8896 cgsort[i].ind_gl = dd->index_gl[i];
8898 if (cgsort[i].nsc < moved)
8905 fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8907 /* Determine the order of the charge groups using qsort */
8908 gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
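/* Cost note (added): with N home charge groups of which M moved grid
 * cell, the ordered path above costs O(M log M) for sorting the movers
 * plus O(N) for the merge, versus O(N log N) for this full qsort
 * fallback; since usually M << N, the two-list scheme pays off.
 */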
8914 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8920 sort = dd->comm->sort->sort;
8922 nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8925 for (i = 0; i < na; i++)
8929 sort[ncg_new].ind = a[i];
8937 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8940 gmx_domdec_sort_t *sort;
8941 gmx_cgsort_t *cgsort;
8943 int ncg_new, i, *ibuf, cgsize;
8946 sort = dd->comm->sort;
8948 if (dd->ncg_home > sort->sort_nalloc)
8950 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8951 srenew(sort->sort, sort->sort_nalloc);
8952 srenew(sort->sort2, sort->sort_nalloc);
8954 cgsort = sort->sort;
8956 switch (fr->cutoff_scheme)
8959 ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8962 ncg_new = dd_sort_order_nbnxn(dd, fr);
8965 gmx_incons("unimplemented");
8969 /* We alloc with the old size, since cgindex is still old */
8970 vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8971 vbuf = dd->comm->vbuf.v;
8975 cgindex = dd->cgindex;
8982 /* Remove the charge groups which are no longer at home here */
8983 dd->ncg_home = ncg_new;
8986 fprintf(debug, "Set the new home charge group count to %d\n",
8990 /* Reorder the state */
8991 for (i = 0; i < estNR; i++)
8993 if (EST_DISTR(i) && (state->flags & (1<<i)))
8998 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
9001 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
9003 case est_SDX_NOTSUPPORTED:
9006 order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
9010 case estDISRE_INITF:
9011 case estDISRE_RM3TAV:
9012 case estORIRE_INITF:
9014 /* No ordering required */
9017 gmx_incons("Unknown state entry encountered in dd_sort_state");
9022 if (fr->cutoff_scheme == ecutsGROUP)
9025 order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9028 if (dd->ncg_home+1 > sort->ibuf_nalloc)
9030 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9031 srenew(sort->ibuf, sort->ibuf_nalloc);
9034 /* Reorder the global cg index */
9035 order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9036 /* Reorder the cginfo */
9037 order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9038 /* Rebuild the local cg index */
9042 for (i = 0; i < dd->ncg_home; i++)
9044 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9045 ibuf[i+1] = ibuf[i] + cgsize;
9047 for (i = 0; i < dd->ncg_home+1; i++)
9049 dd->cgindex[i] = ibuf[i];
9054 for (i = 0; i < dd->ncg_home+1; i++)
9059 /* Set the home atom number */
9060 dd->nat_home = dd->cgindex[dd->ncg_home];
9062 if (fr->cutoff_scheme == ecutsVERLET)
9064 /* The atoms are now exactly in grid order, update the grid order */
9065 nbnxn_set_atomorder(fr->nbv->nbs);
9069 /* Copy the sorted ns cell indices back to the ns grid struct */
9070 for (i = 0; i < dd->ncg_home; i++)
9072 fr->ns->grid->cell_index[i] = cgsort[i].nsc;
9074 fr->ns->grid->nr = dd->ncg_home;
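/* Illustrative sketch (not original source): rebuilding the local cg
 * index in dd_sort_state() above is a prefix sum over the per-group
 * sizes in sorted order; index[i] is the first atom of group i and
 * index[n] the total number of home atoms.
 */
static void sketch_rebuild_index(const int *groupSize, int n, int *index)
{
    index[0] = 0;
    for (int i = 0; i < n; i++)
    {
        index[i+1] = index[i] + groupSize[i];
    }
}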
9078 static void add_dd_statistics(gmx_domdec_t *dd)
9080 gmx_domdec_comm_t *comm;
9085 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9087 comm->sum_nat[ddnat-ddnatZONE] +=
9088 comm->nat[ddnat] - comm->nat[ddnat-1];
9093 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9095 gmx_domdec_comm_t *comm;
9100 /* Reset all the statistics and counters for total run counting */
9101 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9103 comm->sum_nat[ddnat-ddnatZONE] = 0;
9107 comm->load_step = 0;
9110 clear_ivec(comm->load_lim);
9115 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9117 gmx_domdec_comm_t *comm;
9121 comm = cr->dd->comm;
9123 gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9125 if (fplog == nullptr)
9130 fprintf(fplog, "\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
9132 for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9134 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9139 " av. #atoms communicated per step for force: %d x %.1f\n",
9143 if (cr->dd->vsite_comm)
9146 " av. #atoms communicated per step for vsites: %d x %.1f\n",
9147 (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9152 if (cr->dd->constraint_comm)
9155 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
9156 1 + ir->nLincsIter, av);
9160 gmx_incons("Unknown type for DD statistics");
9163 fprintf(fplog, "\n");
9165 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9167 print_dd_load_av(fplog, cr->dd);
9171 void dd_partition_system(FILE *fplog,
9174 gmx_bool bMasterState,
9176 t_state *state_global,
9177 const gmx_mtop_t *top_global,
9178 const t_inputrec *ir,
9179 t_state *state_local,
9180 PaddedRVecVector *f,
9182 gmx_localtop_t *top_local,
9185 gmx_constr_t constr,
9187 gmx_wallcycle_t wcycle,
9191 gmx_domdec_comm_t *comm;
9192 gmx_ddbox_t ddbox = {0};
9194 gmx_int64_t step_pcoupl;
9195 rvec cell_ns_x0, cell_ns_x1;
9196 int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9197 gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
9198 gmx_bool bRedist, bSortCG, bResortAll;
9199 ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9203 wallcycle_start(wcycle, ewcDOMDEC);
9208 bBoxChanged = (bMasterState || inputrecDeform(ir));
9209 if (ir->epc != epcNO)
9211 /* With nstpcouple > 1, pressure coupling happens
9212 * one step after calculating the pressure.
9213 * Box scaling happens at the end of the MD step,
9214 * after the DD partitioning.
9215 * We therefore have to do DLB in the first partitioning
9216 * after an MD step where P-coupling occurred.
9217 * We need to determine the last step in which p-coupling occurred.
9218 * MRS -- need to validate this for vv?
9223 step_pcoupl = step - 1;
9227 step_pcoupl = ((step - 1)/n)*n + 1;
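/* Arithmetic illustration (added comment): with n = 10 and step = 57,
 * integer division gives ((57 - 1)/10)*10 + 1 = 51, the most recent
 * step of the form k*n + 1 at or before the current step.
 */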
9229 if (step_pcoupl >= comm->partition_step)
9235 bNStGlobalComm = (step % nstglobalcomm == 0);
9243 /* Should we do dynamic load balancing this step?
9244 * Since it requires (possibly expensive) global communication,
9245 * we might want to do DLB less frequently.
9247 if (bBoxChanged || ir->epc != epcNO)
9249 bDoDLB = bBoxChanged;
9253 bDoDLB = bNStGlobalComm;
9257 /* Check if we have recorded loads on the nodes */
9258 if (comm->bRecordLoad && dd_load_count(comm) > 0)
9260 bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
9262 /* Print load every nstlog, first and last step to the log file */
9263 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9264 comm->n_load_collect == 0 ||
9266 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9268 /* Avoid extra communication due to verbose screen output
9269 * when nstglobalcomm is set.
9271 if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
9272 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9274 get_load_distribution(dd, wcycle);
9279 dd_print_load(fplog, dd, step-1);
9283 dd_print_load_verbose(dd);
9286 comm->n_load_collect++;
9292 /* Add the measured cycles to the running average */
9293 const float averageFactor = 0.1f;
9294 comm->cyclesPerStepDlbExpAverage =
9295 (1 - averageFactor)*comm->cyclesPerStepDlbExpAverage +
9296 averageFactor*comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep];
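/* Worked example (added comment): with averageFactor = 0.1, a previous
 * average of 1000 cycles/step and a new measurement of 1200 cycles/step,
 * the update gives 0.9*1000 + 0.1*1200 = 1020: an exponential moving
 * average that damps one-off fluctuations.
 */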
9298 if (comm->dlbState == edlbsOnCanTurnOff &&
9299 dd->comm->n_load_have % c_checkTurnDlbOffInterval == c_checkTurnDlbOffInterval - 1)
9301 gmx_bool turnOffDlb;
9304 /* If the running-averaged cycle count with DLB is higher
9305 * than before we turned on DLB, turn off DLB.
9306 * We will again run and check the cycles without DLB
9307 * and we can then decide whether to turn off DLB forever.
9309 turnOffDlb = (comm->cyclesPerStepDlbExpAverage >
9310 comm->cyclesPerStepBeforeDLB);
9312 dd_bcast(dd, sizeof(turnOffDlb), &turnOffDlb);
9315 /* To turn off DLB, we need to redistribute the atoms */
9316 dd_collect_state(dd, state_local, state_global);
9317 bMasterState = TRUE;
9318 turn_off_dlb(fplog, cr, step);
9322 else if (bCheckWhetherToTurnDlbOn)
9324 gmx_bool turnOffDlbForever = FALSE;
9325 gmx_bool turnOnDlb = FALSE;
9327 /* Since the timings are node dependent, the master decides */
9330 /* If we recently turned off DLB, we want to check if
9331 * performance is better without DLB. We want to do this
9332 * ASAP, to minimize the chance that the external factors that
9333 * slowed down the DLB step are gone by the time we check, which
9334 * would make us incorrectly conclude that DLB caused the slowdown.
9335 * So we measure one nstlist block, no running average.
9337 if (comm->haveTurnedOffDlb &&
9338 comm->cycl[ddCyclStep]/comm->cycl_n[ddCyclStep] <
9339 comm->cyclesPerStepDlbExpAverage)
9341 /* After turning off DLB we ran nstlist steps in fewer
9342 * cycles than with DLB. This likely means that DLB
9343 * is not beneficial, but this could be due to a one-time
9344 * unlucky fluctuation, so we require two such
9345 * observations in close succession to turn off DLB forever.
9348 if (comm->dlbSlowerPartitioningCount > 0 &&
9349 dd->ddp_count < comm->dlbSlowerPartitioningCount + 10*c_checkTurnDlbOnInterval)
9351 turnOffDlbForever = TRUE;
9353 comm->haveTurnedOffDlb = false;
9354 /* Register when we last measured DLB slowdown */
9355 comm->dlbSlowerPartitioningCount = dd->ddp_count;
9359 /* Here we check if the max PME rank load is more than 0.98 times
9360 * the max PP force load. If so, PP DLB will not help,
9361 * since we are (almost) limited by PME. Furthermore,
9362 * DLB will cause a significant extra x/f redistribution
9363 * cost on the PME ranks, which will then surely result
9364 * in lower total performance.
9366 if (cr->npmenodes > 0 &&
9367 dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9373 turnOnDlb = (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9379 gmx_bool turnOffDlbForever;
9383 turnOffDlbForever, turnOnDlb
9385 dd_bcast(dd, sizeof(bools), &bools);
9386 if (bools.turnOffDlbForever)
9388 turn_off_dlb_forever(fplog, cr, step);
9390 else if (bools.turnOnDlb)
9392 turn_on_dlb(fplog, cr, step);
9397 comm->n_load_have++;
9400 cgs_gl = &comm->cgs_gl;
9405 /* Clear the old state */
9406 clear_dd_indices(dd, 0, 0);
9409 set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9410 TRUE, cgs_gl, as_rvec_array(state_global->x.data()), &ddbox);
9412 get_cg_distribution(fplog, dd, cgs_gl,
9413 state_global->box, &ddbox, as_rvec_array(state_global->x.data()));
9415 dd_distribute_state(dd, cgs_gl,
9416 state_global, state_local, f);
9418 dd_make_local_cgs(dd, &top_local->cgs);
9420 /* Ensure that we have space for the new distribution */
9421 dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9423 if (fr->cutoff_scheme == ecutsGROUP)
9425 calc_cgcm(fplog, 0, dd->ncg_home,
9426 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9429 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9431 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9433 else if (state_local->ddp_count != dd->ddp_count)
9435 if (state_local->ddp_count > dd->ddp_count)
9437 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9440 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9442 gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9445 /* Clear the old state */
9446 clear_dd_indices(dd, 0, 0);
9448 /* Build the new indices */
9449 rebuild_cgindex(dd, cgs_gl->index, state_local);
9450 make_dd_indices(dd, cgs_gl->index, 0);
9451 ncgindex_set = dd->ncg_home;
9453 if (fr->cutoff_scheme == ecutsGROUP)
9455 /* Redetermine the cg COMs */
9456 calc_cgcm(fplog, 0, dd->ncg_home,
9457 &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
9460 inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9462 dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9464 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9465 TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9467 bRedist = dlbIsOn(comm);
9471 /* We have the full state, only redistribute the cgs */
9473 /* Clear the non-home indices */
9474 clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9477 /* Avoid global communication for dim's without pbc and -gcom */
9478 if (!bNStGlobalComm)
9480 copy_rvec(comm->box0, ddbox.box0 );
9481 copy_rvec(comm->box_size, ddbox.box_size);
9483 set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9484 bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
9489 /* For dim's without pbc and -gcom */
9490 copy_rvec(ddbox.box0, comm->box0 );
9491 copy_rvec(ddbox.box_size, comm->box_size);
9493 set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9496 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9498 write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9501 /* Check if we should sort the charge groups */
9502 bSortCG = (bMasterState || bRedist);
9504 ncg_home_old = dd->ncg_home;
9509 wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9511 dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9513 !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9515 wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9518 get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9520 &comm->cell_x0, &comm->cell_x1,
9521 dd->ncg_home, fr->cg_cm,
9522 cell_ns_x0, cell_ns_x1, &grid_density);
9526 comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9529 switch (fr->cutoff_scheme)
9532 copy_ivec(fr->ns->grid->n, ncells_old);
9533 grid_first(fplog, fr->ns->grid, dd, &ddbox,
9534 state_local->box, cell_ns_x0, cell_ns_x1,
9535 fr->rlist, grid_density);
9538 nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9541 gmx_incons("unimplemented");
9543 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9544 copy_ivec(ddbox.tric_dir, comm->tric_dir);
9548 wallcycle_sub_start(wcycle, ewcsDD_GRID);
9550 /* Sort the state on charge group position.
9551 * This enables exact restarts from this step.
9552 * It also improves performance by about 15% with larger numbers
9553 * of atoms per node.
9556 /* Fill the ns grid with the home cell,
9557 * so we can sort with the indices.
9559 set_zones_ncg_home(dd);
9561 switch (fr->cutoff_scheme)
9564 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9566 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9568 comm->zones.size[0].bb_x0,
9569 comm->zones.size[0].bb_x1,
9571 comm->zones.dens_zone0,
9573 as_rvec_array(state_local->x.data()),
9574 ncg_moved, bRedist ? comm->moved : nullptr,
9575 fr->nbv->grp[eintLocal].kernel_type,
9576 fr->nbv->grp[eintLocal].nbat);
9578 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9581 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
9582 0, dd->ncg_home, fr->cg_cm);
9584 copy_ivec(fr->ns->grid->n, ncells_new);
9587 gmx_incons("unimplemented");
9590 bResortAll = bMasterState;
9592 /* Check if we can use the old order and ns grid cell indices
9593 * of the charge groups to sort the charge groups efficiently.
9595 if (ncells_new[XX] != ncells_old[XX] ||
9596 ncells_new[YY] != ncells_old[YY] ||
9597 ncells_new[ZZ] != ncells_old[ZZ])
9604 fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9605 gmx_step_str(step, sbuf), dd->ncg_home);
9607 dd_sort_state(dd, fr->cg_cm, fr, state_local,
9608 bResortAll ? -1 : ncg_home_old);
9610 /* After sorting and compacting we set the correct size */
9611 dd_resize_state(state_local, f, dd->nat_home);
9613 /* Rebuild all the indices */
9614 ga2la_clear(dd->ga2la);
9617 wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9620 wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9622 /* Set up the communication and communicate the coordinates */
9623 setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9625 /* Set the indices */
9626 make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9628 /* Set the charge group boundaries for neighbor searching */
9629 set_cg_boundaries(&comm->zones);
9631 if (fr->cutoff_scheme == ecutsVERLET)
9633 set_zones_size(dd, state_local->box, &ddbox,
9634 bSortCG ? 1 : 0, comm->zones.n);
9637 wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9640 write_dd_pdb("dd_home", step, "dump", top_global, cr,
9641 -1, as_rvec_array(state_local->x.data()), state_local->box);
9644 wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9646 /* Extract a local topology from the global topology */
9647 for (i = 0; i < dd->ndim; i++)
9649 np[dd->dim[i]] = comm->cd[i].np;
9651 dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9652 comm->cellsize_min, np,
9654 fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : as_rvec_array(state_local->x.data()),
9655 vsite, top_global, top_local);
9657 wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9659 wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9661 /* Set up the special atom communication */
9662 n = comm->nat[ddnatZONE];
9663 for (i = ddnatZONE+1; i < ddnatNR; i++)
9668 if (vsite && vsite->n_intercg_vsite)
9670 n = dd_make_local_vsites(dd, n, top_local->idef.il);
9674 if (dd->bInterCGcons || dd->bInterCGsettles)
9676 /* Only for inter-cg constraints do we need special code */
9677 n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9678 constr, ir->nProjOrder,
9679 top_local->idef.il);
9683 gmx_incons("Unknown special atom type setup");
9688 wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9690 wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9692 /* Make space for the extra coordinates for virtual site
9693 * or constraint communication.
9695 state_local->natoms = comm->nat[ddnatNR-1];
9697 dd_resize_state(state_local, f, state_local->natoms);
9699 if (fr->bF_NoVirSum)
9701 if (vsite && vsite->n_intercg_vsite)
9703 nat_f_novirsum = comm->nat[ddnatVSITE];
9707 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9709 nat_f_novirsum = dd->nat_tot;
9713 nat_f_novirsum = dd->nat_home;
9722 /* Set the number of atoms required for the force calculation.
9723 * Forces need to be constrained when doing energy
9724 * minimization. For simple simulations we could avoid some
9725 * allocation, zeroing and copying, but this is probably not worth
9726 * the complications and checking.
9728 forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9729 dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9731 /* Update atom data for mdatoms and several algorithms */
9732 mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
9733 nullptr, mdatoms, vsite, nullptr);
9735 if (ir->implicit_solvent)
9737 make_local_gb(cr, fr->born, ir->gb_algorithm);
9740 if (!(cr->duty & DUTY_PME))
9742 /* Send the charges and/or c6/sigmas to our PME-only node */
9743 gmx_pme_send_parameters(cr,
9745 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9746 mdatoms->chargeA, mdatoms->chargeB,
9747 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9748 mdatoms->sigmaA, mdatoms->sigmaB,
9749 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9754 set_constraints(constr, top_local, ir, mdatoms, cr);
9759 /* Update the local pull groups */
9760 dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
9765 /* Update the local rotation groups */
9766 dd_make_local_rotation_groups(dd, ir->rot);
9769 if (ir->eSwapCoords != eswapNO)
9771 /* Update the local groups needed for ion swapping */
9772 dd_make_local_swap_groups(dd, ir->swap);
9775 /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9776 dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9778 add_dd_statistics(dd);
9780 /* Make sure we only count the cycles for this DD partitioning */
9781 clear_dd_cycle_counts(dd);
9783 /* Because the order of the atoms might have changed since
9784 * the last vsite construction, we need to communicate the constructing
9785 * atom coordinates again (for spreading the forces this MD step).
9787 dd_move_x_vsites(dd, state_local->box, as_rvec_array(state_local->x.data()));
9789 wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9791 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9793 dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
9794 write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9795 -1, as_rvec_array(state_local->x.data()), state_local->box);
9798 /* Store the partitioning step */
9799 comm->partition_step = step;
9801 /* Increase the DD partitioning counter */
9803 /* The state currently matches this DD partitioning count, store it */
9804 state_local->ddp_count = dd->ddp_count;
9807 /* The DD master node knows the complete cg distribution,
9808 * store the count so we can possibly skip the cg info communication.
9810 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9813 if (comm->DD_debug > 0)
9815 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9816 check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9817 "after partitioning");
9820 wallcycle_stop(wcycle, ewcDOMDEC);