1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce
32 #include "domdec_network.h"
35 #include "chargegroup.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
49 #include "mtop_util.h"
51 #include "gmx_ga2la.h"
/* Map a DD node index to its MPI rank; with this setup it is the
 * identity mapping.
 */
#define DDRANK(dd,rank) (rank)
/* The MPI rank of the DD master node */
#define DDMASTERRANK(dd) (dd->masterrank)
/* Data kept only on the DD master node: the per-node charge-group
 * division and the buffers used to gather/scatter the global state.
 */
typedef struct gmx_domdec_master
    /* The cell boundaries */
    /* The global charge group division */
    int *ncg; /* Number of home charge groups for each node */
    int *index; /* Index of nnodes+1 into cg */
    int *cg; /* Global charge group index */
    int *nat; /* Number of home atoms for each node. */
    int *ibuf; /* Buffer for communication */
    rvec *vbuf; /* Buffer for state scattering and gathering */
} gmx_domdec_master_t;
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that needs to be communicated.
     */
    int nsend[DD_MAXIZONE+2];
    int nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    /* The atom range for non-in-place communication */
    int cell2at0[DD_MAXIZONE];
    int cell2at1[DD_MAXIZONE];
    int np; /* Number of grid pulses in this dimension */
    int np_dlb; /* For dlb, for use with edlbAUTO */
    gmx_domdec_ind_t *ind; /* The indices to communicate, size np */
    gmx_bool bInPlace; /* Can we communicate in place? */
} gmx_domdec_comm_dim_t;
    /* Per-cell state/temporaries for dynamic load balancing along one
     * row of the decomposition (enclosing struct starts outside this chunk).
     */
    gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */
    real *cell_f; /* State var.: cell boundaries, box relative */
    real *old_cell_f; /* Temp. var.: old cell size */
    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real *bound_min; /* Temp. var.: lower limit for cell boundary */
    real *bound_max; /* Temp. var.: upper limit for cell boundary */
    gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */
    real *buf_ncd; /* Temp. var. */
/* Maximum number of load entries recorded per node */
#define DD_NLOAD_MAX 9

/* Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */

/* Charge-group sorting buffers (fields of the sort struct, whose
 * declaration starts outside this chunk).
 */
gmx_cgsort_t *sort1,*sort2;
gmx_cgsort_t *sort_new;
/* This enum determines the order of the coordinates.
 * ddnatHOME and ddnatZONE should be first and second,
 * the others can be ordered as wanted.
 */
enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };

/* Dynamic load balancing mode: automatic, off, or on */
enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
    int dim; /* The dimension */
    gmx_bool dim_match;/* Tells if DD and PME dims match */
    int nslab; /* The number of PME slabs in this dimension */
    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */
    int *pp_min; /* The minimum pp node location, size nslab */
    int *pp_max; /* The maximum pp node location, size nslab */
    int maxshift; /* The maximum shift for coordinate redistribution in PME */

    real min0; /* The minimum bottom of this zone */
    real max1; /* The maximum top of this zone */
    real mch0; /* The maximum bottom communication height for this zone */
    real mch1; /* The maximum top communication height for this zone */
    real p1_0; /* The bottom value of the first cell in this zone */
    real p1_1; /* The top value of the first cell in this zone */
/* The main domain-decomposition communication bookkeeping struct */
typedef struct gmx_domdec_comm
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */
    /* The number of decomposition dimensions for PME, 0: no PME */
    /* The number of nodes doing PME (PP/PME or only PME) */
    /* The communication setup including the PME only nodes */
    gmx_bool bCartesianPP_PME;
    int *pmenodes; /* size npmenodes */
    int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                             * but with bCartesianPP_PME */
    gmx_ddpme_t ddpme[2];
    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
    /* The global charge groups */
    /* Should we sort the cgs */
    gmx_domdec_sort_t *sort;
    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;
    /* Data for the optional bonded interaction atom communication range */
    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;
    /* Cell sizes for static load balancing, first index cartesian */
    /* The width of the communicated boundaries */
    /* The minimum cell size (including triclinic correction) */
    /* For dlb, for use with edlbAUTO */
    rvec cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;
    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    /* box0 and box_size are required with dim's without pbc and -gcom */
    /* The cell boundaries */
    /* The old location of the cell boundaries, to check cg displacements */
    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;
    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];
    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;
    /* The number of cg's received from the direct neighbors */
    int zone_ncg1[DD_MAXZONE];
    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
    /* Communication buffer for general use */
    /* Communication buffer for general use */
    /* Communication buffers only used with multiple grid pulses */
    /* Communication buffers for local redistribution */
    int cggl_flag_nalloc[DIM*2];
    int cgcm_state_nalloc[DIM*2];
    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real cell_f_max0[DIM];
    real cell_f_min1[DIM];
    /* Stuff for load communication */
    gmx_bool bRecordLoad;
    gmx_domdec_load_t *load;
    MPI_Comm *mpi_comm_load;
    /* Maximum DLB scaling per load balancing step in percent */
    float cycl[ddCyclNr];
    int cycl_n[ddCyclNr];
    float cycl_max[ddCyclNr];
    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
    /* How often did we have load measurements */
    /* How often have we collected the load measurements */
    double sum_nat[ddnatNR-ddnatZONE];
    /* The last partition step */
    gmx_large_int_t globalcomm_step;
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
/* Low 16 bits: charge-group atom count; upper bits: per-dimension
 * forward/backward move flags.
 */
#define DD_FLAG_NRCG 65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))

/* Zone permutation required to obtain consecutive charge groups
 * for neighbor searching.
 */
static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };

/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
    {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};

/* The i-zone to j-zone ranges for 3, 2 and 1 DD dimensions */
static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};

/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN 1.0001
#define DD_CELL_MARGIN2 1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Allowed performance loss before we DLB or warn */
#define DD_PERF_LOSS 0.05

/* Size of the cell-fraction buffer for DD dimension di */
#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4

/* Linear node index from grid coordinates, x running fastest */
#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
/* Inverse of the x-fastest dd_index above: linear index -> (x,y,z) */
static void index2xyz(ivec nc,int ind,ivec xyz)
    xyz[XX] = ind % nc[XX];
    xyz[YY] = (ind / nc[XX]) % nc[YY];
    xyz[ZZ] = ind / (nc[YY]*nc[XX]);

/* This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])

/* Inverse of the z-fastest dd_index above: linear index -> (x,y,z) */
static void ddindex2xyz(ivec nc,int ind,ivec xyz)
    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
    xyz[ZZ] = ind % nc[ZZ];
/* Return the DD node id for grid coordinates c, using the lookup table
 * with a Cartesian PP+PME setup, or MPI's Cartesian rank mapping with a
 * Cartesian PP-only setup.
 */
static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
    ddindex = dd_index(dd->nc,c);
    if (dd->comm->bCartesianPP_PME)
        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
    else if (dd->comm->bCartesianPP)
        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
/* TRUE when the DD box can change during the run: not all dimensions
 * are bounded, or the box itself is dynamic (pressure coupling).
 */
static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
/* Return the 1-based global atom number for local atom index i;
 * fatal error if i is beyond the local atom count.
 */
int ddglatnr(gmx_domdec_t *dd,int i)
    if (i >= dd->comm->nat[ddnatNR-1])
        gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
    /* +1: global atom numbers reported to the user are 1-based */
    atnr = dd->gatindex[i] + 1;
/* Accessor for the global charge-group block stored in the comm struct */
t_block *dd_charge_groups_global(gmx_domdec_t *dd)
    return &dd->comm->cgs_gl;

/* Initialize an over-allocating rvec buffer */
static void vec_rvec_init(vec_rvec_t *v)

/* Grow v (with over-allocation) so it can hold at least n rvecs */
static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
    v->nalloc = over_alloc_dd(n);
    srenew(v->v,v->nalloc);
501 void dd_store_state(gmx_domdec_t *dd,t_state *state)
505 if (state->ddp_count != dd->ddp_count)
507 gmx_incons("The state does not the domain decomposition state");
510 state->ncg_gl = dd->ncg_home;
511 if (state->ncg_gl > state->cg_gl_nalloc)
513 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
514 srenew(state->cg_gl,state->cg_gl_nalloc);
516 for(i=0; i<state->ncg_gl; i++)
518 state->cg_gl[i] = dd->index_gl[i];
521 state->ddp_count_cg_gl = dd->ddp_count;
/* Accessor for the DD zones stored in the comm struct */
gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
    return &dd->comm->zones;

/* For charge group icg, return the j-charge-group range [jcg0,jcg1) to
 * search against, plus the shift vectors shift0/shift1 for the i-zone
 * containing icg.  Fatal error if icg falls outside all i-zones.
 */
void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
    gmx_domdec_zones_t *zones;

    zones = &dd->comm->zones;
    /* Find the i-zone that contains icg */
    while (icg >= zones->izone[izone].cg1)
    else if (izone < zones->nizone)
        *jcg0 = zones->izone[izone].jcg0;
        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
                  icg,izone,zones->nizone);
    *jcg1 = zones->izone[izone].jcg1;
    for(d=0; d<dd->ndim; d++)
        shift0[dim] = zones->izone[izone].shift0[dim];
        shift1[dim] = zones->izone[izone].shift1[dim];
        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
            /* A conservative approach, this can be optimized */
/* Number of local atoms up to and including the vsite range */
int dd_natoms_vsite(gmx_domdec_t *dd)
    return dd->comm->nat[ddnatVSITE];

/* Return the local atom range [at_start,at_end) communicated for
 * constraints (the slot before ddnatCON up to ddnatCON).
 */
void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
    *at_start = dd->comm->nat[ddnatCON-1];
    *at_end = dd->comm->nat[ddnatCON];
/* Communicate the halo coordinates x: for every DD dimension and every
 * communication pulse, pack the coordinates of the charge groups to send
 * (shifted by the box vector when crossing a periodic boundary, or
 * rotated for screw PBC) and exchange them with the neighbor nodes.
 * Received coordinates are appended after the home atoms.
 */
void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
    int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    rvec shift={0,0,0},*buf,*rbuf;
    gmx_bool bPBC,bScrew;

    cgindex = dd->cgindex;

    /* Start after the home atoms; halo atoms are appended per pulse */
    nat_tot = dd->nat_home;
    for(d=0; d<dd->ndim; d++)
        /* Cell index 0 along this dim: sends cross the periodic boundary */
        bPBC = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        copy_rvec(box[dd->dim[d]],shift);
        for(p=0; p<cd->np; p++)
            /* Case 1: no shift needed, plain copy */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    copy_rvec(x[j],buf[n]);
            /* Case 2: periodic shift */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    /* We need to shift the coordinates */
                    rvec_add(x[j],shift,buf[n]);
            /* Case 3: screw PBC, shift x and mirror y/z */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    buf[n][XX] = x[j][XX] + shift[XX];
                    /* Rotate y and z.
                     * This operation requires a special shift force
                     * treatment, which is performed in calc_vir.
                     */
                    buf[n][YY] = box[YY][YY] - x[j][YY];
                    buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
            rbuf = comm->vbuf2.v;
            /* Send and receive the coordinates */
            dd_sendrecv_rvec(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            /* Non-in-place: copy the received buffer into x */
            for(zone=0; zone<nzone; zone++)
                for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    copy_rvec(rbuf[j],x[i]);
            nat_tot += ind->nrecv[nzone+1];
/* Communicate the halo forces f back to their home nodes, reversing the
 * pulses of dd_move_x (backward in d and p), and accumulate the received
 * forces into f.  If fshift is non-NULL the shift forces are updated;
 * for screw PBC the y/z components are mirrored back.
 */
void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
    int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    gmx_bool bPBC,bScrew;

    cgindex = dd->cgindex;

    nzone = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    /* Loop dimensions and pulses in reverse order of dd_move_x */
    for(d=dd->ndim-1; d>=0; d--)
        bPBC = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (fshift == NULL && !bScrew)
        /* Determine which shift vector we need */
        for(p=cd->np-1; p>=0; p--) {
            nat_tot -= ind->nrecv[nzone+1];
            sbuf = comm->vbuf2.v;
            /* Non-in-place: pack the forces to send */
            for(zone=0; zone<nzone; zone++)
                for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    copy_rvec(f[i],sbuf[j]);
            /* Communicate the forces */
            dd_sendrecv_rvec(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            /* Add the received forces */
            /* Case 1: plain accumulation */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    rvec_inc(f[j],buf[n]);
            /* Case 2: also accumulate the shift force */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    rvec_inc(f[j],buf[n]);
                    /* Add this force to the shift force */
                    rvec_inc(fshift[is],buf[n]);
            /* Case 3: screw PBC, mirror y/z while accumulating */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                    /* Rotate the force */
                    f[j][XX] += buf[n][XX];
                    f[j][YY] -= buf[n][YY];
                    f[j][ZZ] -= buf[n][ZZ];
                    /* Add this force to the shift force */
                    rvec_inc(fshift[is],buf[n]);
/* Spread a per-atom real array v to the halo atoms, following the same
 * forward pulse structure as dd_move_x but for scalars (no shifts).
 */
void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
    int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;

    cgindex = dd->cgindex;

    /* Reuse the rvec buffer as a flat real buffer */
    buf = &comm->vbuf.v[0][0];

    nat_tot = dd->nat_home;
    for(d=0; d<dd->ndim; d++)
        for(p=0; p<cd->np; p++)
            /* Pack the values of the charge groups to send */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
            rbuf = &comm->vbuf2.v[0][0];
            /* Send and receive the coordinates */
            dd_sendrecv_real(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            /* Non-in-place: scatter the received values */
            for(zone=0; zone<nzone; zone++)
                for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
            nat_tot += ind->nrecv[nzone+1];
/* Sum a per-atom real array v from the halo atoms back onto the home
 * atoms, reversing dd_atom_spread_real (backward pulse order), analogous
 * to dd_move_f but for scalars.
 */
void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
    int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;

    cgindex = dd->cgindex;

    /* Reuse the rvec buffer as a flat real buffer */
    buf = &comm->vbuf.v[0][0];

    nzone = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for(d=dd->ndim-1; d>=0; d--)
        for(p=cd->np-1; p>=0; p--) {
            nat_tot -= ind->nrecv[nzone+1];
            sbuf = &comm->vbuf2.v[0][0];
            /* Non-in-place: pack the values to send back */
            for(zone=0; zone<nzone; zone++)
                for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
            /* Communicate the forces */
            dd_sendrecv_real(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);
            /* Add the received forces */
            for(i=0; i<ind->nsend[nzone]; i++)
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
939 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
941 fprintf(fp,"zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
943 zone->min0,zone->max1,
944 zone->mch0,zone->mch0,
945 zone->p1_0,zone->p1_1);
/* Exchange gmx_ddzone_t structs with a neighbor by packing each zone's
 * six reals into two rvecs (3 reals each) and using dd_sendrecv_rvec,
 * then unpacking on the receive side.  Buffers hold at most 5 zones.
 */
static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                               int ddimind,int direction,
                               gmx_ddzone_t *buf_s,int n_s,
                               gmx_ddzone_t *buf_r,int n_r)
    rvec vbuf_s[5*2],vbuf_r[5*2];

    /* Pack: zone i -> rvecs 2i and 2i+1 */
    vbuf_s[i*2 ][0] = buf_s[i].min0;
    vbuf_s[i*2 ][1] = buf_s[i].max1;
    vbuf_s[i*2 ][2] = buf_s[i].mch0;
    vbuf_s[i*2+1][0] = buf_s[i].mch1;
    vbuf_s[i*2+1][1] = buf_s[i].p1_0;
    vbuf_s[i*2+1][2] = buf_s[i].p1_1;

    dd_sendrecv_rvec(dd, ddimind, direction,

    /* Unpack in the same layout */
    buf_r[i].min0 = vbuf_r[i*2 ][0];
    buf_r[i].max1 = vbuf_r[i*2 ][1];
    buf_r[i].mch0 = vbuf_r[i*2 ][2];
    buf_r[i].mch1 = vbuf_r[i*2+1][0];
    buf_r[i].p1_0 = vbuf_r[i*2+1][1];
    buf_r[i].p1_1 = vbuf_r[i*2+1][2];
/* With dynamic load balancing, communicate the cell boundaries of the
 * neighboring cells along DD dimensions 1 and 2 and accumulate the zone
 * extremes, so each node knows the neighbor-search range cell_ns_x0/x1
 * and the zone limits (comm->zone_d1/zone_d2, cell_f_max0/min1).
 */
static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0,rvec cell_ns_x1)
    int d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
    gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
    rvec extr_s[2],extr_r[2];
    gmx_domdec_comm_t *comm;

    /* Initialize this node's own zone limits from its cell boundaries */
    for(d=1; d<dd->ndim; d++)
        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];

    for(d=dd->ndim-2; d>=0; d--)
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];

        /* Store the extremes in the backward sending buffer,
         * so the get updated separately from the forward communication.
         */
        for(d1=d; d1<dd->ndim-1; d1++)
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];

        if (dd->ndim == 3 && d == 0)
            buf_s[pos] = comm->zone_d2[0][1];
            buf_s[pos] = comm->zone_d1[0];

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        for(p=0; p<npulse_min; p++)
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);
            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);
            for(d1=d; d1<dd->ndim-1; d1++)
                extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
                extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);

        for(p=0; p<npulse; p++)
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
            dd_sendrecv_ddzone(dd, d, dddirBackward,
            for(d1=d+1; d1<dd->ndim; d1++)
                /* Determine the decrease of maximum required
                 * communication height along d1 due to the distance along d,
                 * this avoids a lot of useless atom communication.
                 */
                dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
                if (ddbox->tric_dir[dim])
                    /* c is the off-diagonal coupling between the cell planes
                     * along directions d and d1.
                     */
                    c = ddbox->v[dim][dd->dim[d1]][dim];
                det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                /* A negative value signals out of range */

            /* Accumulate the extremes over all pulses */
            for(i=0; i<buf_size; i++)
                buf_e[i] = buf_r[i];
                buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
                buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
                if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                if (bUse && dh[d1] >= 0)
                    buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
                    buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
                /* Store the extremes */
                for(d1=d; d1<dd->ndim-1; d1++)
                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
                if (d == 1 || (d == 0 && dd->ndim == 3))
                    comm->zone_d2[1-d][i] = buf_e[pos];
                    comm->zone_d1[1] = buf_e[pos];

    /* Fold the collected zone limits into the neighbor-search range */
    print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
    cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
    cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
    cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
    cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
    for(d=1; d<dd->ndim; d++)
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
                d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
/* Gather the charge-group distribution of all nodes on the master:
 * each node determines its home cg list (from dd or from state_local,
 * whichever matches state_local->ddp_count), the counts are gathered,
 * and the global cg indices are collected with a gatherv.
 * No-op when the master already holds this distribution.
 */
static void dd_collect_cg(gmx_domdec_t *dd,
                          t_state *state_local)
    gmx_domdec_master_t *ma=NULL;
    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;

    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
        /* The master has the correct distribution */

    if (state_local->ddp_count == dd->ddp_count)
        ncg_home = dd->ncg_home;
        nat_home = dd->nat_home;
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
        /* Use the cg list stored in the state; count atoms per cg */
        cgs_gl = &dd->comm->cgs_gl;
        ncg_home = state_local->ncg_gl;
        cg = state_local->cg_gl;
        for(i=0; i<ncg_home; i++)
            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");

    buf2[0] = dd->ncg_home;
    buf2[1] = dd->nat_home;
    /* Collect the charge group and atom counts on the master */
    dd_gather(dd,2*sizeof(int),buf2,ibuf);

    for(i=0; i<dd->nnodes; i++)
        ma->ncg[i] = ma->ibuf[2*i];
        ma->nat[i] = ma->ibuf[2*i+1];
        ma->index[i+1] = ma->index[i] + ma->ncg[i];

    /* Make byte counts and indices */
    for(i=0; i<dd->nnodes; i++)
        ma->ibuf[i] = ma->ncg[i]*sizeof(int);
        ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);

    fprintf(debug,"Initial charge group distribution: ");
    for(i=0; i<dd->nnodes; i++)
        fprintf(debug," %d",ma->ncg[i]);
    fprintf(debug,"\n");

    /* Collect the charge group indices on the master */
    dd->ncg_home*sizeof(int),dd->index_gl,
    DDMASTER(dd) ? ma->ibuf : NULL,
    DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
    DDMASTER(dd) ? ma->cg : NULL);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
/* Collect a distributed rvec array into the global array v on the
 * master using plain MPI_Send/MPI_Recv per node (used for small node
 * counts, see GMX_DD_NNODES_SENDRECV).  Non-master nodes send their
 * local vector; the master copies its own data, then receives and
 * scatters each node's data to the global positions via ma->cg.
 */
static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
    gmx_domdec_master_t *ma;
    int n,i,c,a,nalloc=0;

    /* Non-master: send the local vector, tagged with our rank */
    MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
             dd->rank,dd->mpi_comm_all);

    /* Copy the master coordinates to the global array */
    cgs_gl = &dd->comm->cgs_gl;

    n = DDMASTERRANK(dd);
    for(i=ma->index[n]; i<ma->index[n+1]; i++)
        for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
            copy_rvec(lv[a++],v[c]);

    for(n=0; n<dd->nnodes; n++)
        /* Grow the receive buffer when this node sends more atoms */
        if (ma->nat[n] > nalloc)
            nalloc = over_alloc_dd(ma->nat[n]);
        MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
                 n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
                copy_rvec(buf[a++],v[c]);
/* Fill per-node byte counts and displacements (in ma->ibuf) for rvec
 * gatherv/scatterv calls; *counts and *disps point into ma->ibuf.
 */
static void get_commbuffer_counts(gmx_domdec_t *dd,
                                  int **counts,int **disps)
    gmx_domdec_master_t *ma;

    /* Make the rvec count and displacement arrays */
    *disps = ma->ibuf + dd->nnodes;
    for(n=0; n<dd->nnodes; n++)
        (*counts)[n] = ma->nat[n]*sizeof(rvec);
        (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
/* Collect a distributed rvec array on the master with a single
 * MPI gatherv, then scatter the gathered buffer into the global array v
 * using the master's charge-group index (ma->cg).
 */
static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
    gmx_domdec_master_t *ma;
    int *rcounts=NULL,*disps=NULL;

    get_commbuffer_counts(dd,&rcounts,&disps);

    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);

    cgs_gl = &dd->comm->cgs_gl;

    /* Reorder: node-contiguous buffer -> global cg order */
    for(n=0; n<dd->nnodes; n++)
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
                copy_rvec(buf[a++],v[c]);
/* Collect the local rvec array lv of all nodes into the global array v
 * on the master, first making sure the master knows the current
 * charge-group distribution; picks sendrecv or gatherv by node count.
 */
void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local,rvec *lv,rvec *v)
    /* NOTE(review): ma,n,i,c,a,nalloc appear unused in the visible lines;
     * possibly copy-paste leftovers from the sendrecv variant — verify.
     */
    gmx_domdec_master_t *ma;
    int n,i,c,a,nalloc=0;

    dd_collect_cg(dd,state_local);

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
        dd_collect_vec_sendrecv(dd,lv,v);
        dd_collect_vec_gatherv(dd,lv,v);
/* Collect the full distributed t_state on the master: scalar state and
 * matrices are copied directly (master has the authoritative values),
 * distributed per-atom arrays are collected with dd_collect_vec, and
 * RNG state is gathered when stored per node.
 */
void dd_collect_state(gmx_domdec_t *dd,
                      t_state *state_local,t_state *state)
    nh = state->nhchainlength;

    state->lambda = state_local->lambda;
    state->veta = state_local->veta;
    state->vol0 = state_local->vol0;
    copy_mat(state_local->box,state->box);
    copy_mat(state_local->boxv,state->boxv);
    copy_mat(state_local->svir_prev,state->svir_prev);
    copy_mat(state_local->fvir_prev,state->fvir_prev);
    copy_mat(state_local->pres_prev,state->pres_prev);

    /* Nose-Hoover chains: nh entries per coupling group */
    for(i=0; i<state_local->ngtc; i++)
        for(j=0; j<nh; j++) {
            state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
            state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
        state->therm_integral[i] = state_local->therm_integral[i];
    for(i=0; i<state_local->nnhpres; i++)
        for(j=0; j<nh; j++) {
            state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
            state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];

    /* Collect each distributed per-atom array present in this state */
    for(est=0; est<estNR; est++)
        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
            dd_collect_vec(dd,state_local,state_local->x,state->x);
            dd_collect_vec(dd,state_local,state_local->v,state->v);
            dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
            dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
            if (state->nrngi == 1)
                for(i=0; i<state_local->nrng; i++)
                    state->ld_rng[i] = state_local->ld_rng[i];
                dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
                          state_local->ld_rng,state->ld_rng);
            if (state->nrngi == 1)
                state->ld_rngi[0] = state_local->ld_rngi[0];
                dd_gather(dd,sizeof(state->ld_rngi[0]),
                          state_local->ld_rngi,state->ld_rngi);
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            gmx_incons("Unknown state entry encountered in dd_collect_state");
/* Grow the forcerec charge-group arrays (cg_cm, cginfo) with
 * over-allocation to hold at least nalloc entries.
 */
static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
    fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
    fr->cg_nalloc = over_alloc_dd(nalloc);
    srenew(fr->cg_cm,fr->cg_nalloc);
    srenew(fr->cginfo,fr->cg_nalloc);
/* Grow (with over-allocation) every distributed per-atom array in the
 * state, plus the force array *f, to hold at least nalloc atoms.
 */
static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
    fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));

    state->nalloc = over_alloc_dd(nalloc);

    for(est=0; est<estNR; est++)
        if (EST_DISTR(est) && (state->flags & (1<<est)))
            srenew(state->x,state->nalloc);
            srenew(state->v,state->nalloc);
            srenew(state->sd_X,state->nalloc);
            srenew(state->cg_p,state->nalloc);
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            /* No reallocation required */
            gmx_incons("Unknown state entry encountered in dd_realloc_state");

    srenew(*f,state->nalloc);
/* Distribute the global rvec array v over the nodes using plain
 * MPI_Send/MPI_Recv: the master packs each node's portion (in cg order
 * via ma->cg) and sends it; its own portion is copied directly; other
 * nodes receive into lv.
 */
static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
    gmx_domdec_master_t *ma;
    int n,i,c,a,nalloc=0;

    for(n=0; n<dd->nnodes; n++)
        if (ma->nat[n] > nalloc)
            nalloc = over_alloc_dd(ma->nat[n]);
        /* Use lv as a temporary buffer */
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
                copy_rvec(v[c],buf[a++]);
        /* Sanity check: packed atom count must match the node's count */
        if (a != ma->nat[n])
            gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
        MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
                 DDRANK(dd,n),n,dd->mpi_comm_all);

    /* The master copies its own portion straight into lv */
    n = DDMASTERRANK(dd);
    for(i=ma->index[n]; i<ma->index[n+1]; i++)
        for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
            copy_rvec(v[c],lv[a++]);

    /* Non-master: receive our portion from the master */
    MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
             MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
/* Distribute the global rvec array v over the nodes with a single MPI
 * scatterv: the master packs all portions node-contiguously into buf
 * (cg order via ma->cg), then every node receives its slice into lv.
 */
static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
    gmx_domdec_master_t *ma;
    int *scounts=NULL,*disps=NULL;
    int n,i,c,a,nalloc=0;

    get_commbuffer_counts(dd,&scounts,&disps);

    for(n=0; n<dd->nnodes; n++)
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
                copy_rvec(v[c],buf[a++]);

    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
/* Distribute a global rvec array to the local arrays of all nodes,
 * choosing sendrecv or scatterv by node count (GMX_DD_NNODES_SENDRECV).
 */
static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
        dd_distribute_vec_scatterv(dd,cgs,v,lv);
/* Distribute the global t_state from the master to all nodes: scalars,
 * matrices and coupling variables are copied on the master and then
 * broadcast; distributed per-atom arrays are scattered per node after
 * making sure the local state is large enough.
 */
static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
                                t_state *state,t_state *state_local,
    int i,j,ngtch,ngtcp,nh;

    nh = state->nhchainlength;

    /* Master: copy the global scalar state into the local state */
    state_local->lambda = state->lambda;
    state_local->veta = state->veta;
    state_local->vol0 = state->vol0;
    copy_mat(state->box,state_local->box);
    copy_mat(state->box_rel,state_local->box_rel);
    copy_mat(state->boxv,state_local->boxv);
    copy_mat(state->svir_prev,state_local->svir_prev);
    copy_mat(state->fvir_prev,state_local->fvir_prev);
    for(i=0; i<state_local->ngtc; i++)
        for(j=0; j<nh; j++) {
            state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
            state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
        state_local->therm_integral[i] = state->therm_integral[i];
    for(i=0; i<state_local->nnhpres; i++)
        for(j=0; j<nh; j++) {
            state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
            state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];

    /* Broadcast the scalar state from the master to all nodes */
    dd_bcast(dd,sizeof(real),&state_local->lambda);
    dd_bcast(dd,sizeof(real),&state_local->veta);
    dd_bcast(dd,sizeof(real),&state_local->vol0);
    dd_bcast(dd,sizeof(state_local->box),state_local->box);
    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);

    /* Make sure the local arrays can hold the home atoms */
    if (dd->nat_home > state_local->nalloc)
        dd_realloc_state(state_local,f,dd->nat_home);

    /* Scatter each distributed per-atom array present in this state */
    for(i=0; i<estNR; i++)
        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
            dd_distribute_vec(dd,cgs,state->x,state_local->x);
            dd_distribute_vec(dd,cgs,state->v,state_local->v);
            dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
            dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
            if (state->nrngi == 1)
                state_local->nrng*sizeof(state_local->ld_rng[0]),
                state->ld_rng,state_local->ld_rng);
                state_local->nrng*sizeof(state_local->ld_rng[0]),
                state->ld_rng,state_local->ld_rng);
            if (state->nrngi == 1)
                dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
                          state->ld_rngi,state_local->ld_rngi);
                dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
                           state->ld_rngi,state_local->ld_rngi);
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            /* Not implemented yet */
            gmx_incons("Unknown state entry encountered in dd_distribute_state");
/* Return the character 'X', 'Y' or 'Z' for dimension index XX, YY or ZZ;
 * any other value is a fatal error. Used for user-facing messages. */
1815 static char dim2char(int dim)
1821 case XX: c = 'X'; break;
1822 case YY: c = 'Y'; break;
1823 case ZZ: c = 'Z'; break;
1824 default: gmx_fatal(FARGS,"Unknown dim %d",dim);
/* Debug/visualization helper: write the DD cell boundaries of all nodes as
 * a PDB file ("<fn>_<step>.pdb"). Every node contributes its own cell
 * corners via dd_gather; the master writes 8 pseudo-atoms (GLY CA) per cell
 * plus CONECT records drawing the cell edges. The B-factor column carries
 * the cell volume relative to a uniform grid.
 */
1830 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1831 gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1833 rvec grid_s[2],*grid_r=NULL,cx,r;
1834 char fname[STRLEN],format[STRLEN],buf[22];
/* Each node sends its own lower/upper cell corners to the master */
1840 copy_rvec(dd->comm->cell_x0,grid_s[0]);
1841 copy_rvec(dd->comm->cell_x1,grid_s[1]);
1845 snew(grid_r,2*dd->nnodes);
1848 dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
/* Triclinic skew factors for mapping cell fractions to Cartesian coords */
1852 for(d=0; d<DIM; d++)
1854 for(i=0; i<DIM; i++)
1862 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1864 tric[d][i] = box[i][d]/box[i][i];
1873 sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1874 sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1875 out = gmx_fio_fopen(fname,"w");
1876 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1878 for(i=0; i<dd->nnodes; i++)
/* Cell volume relative to the average cell volume (box volume / nnodes) */
1880 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1881 for(d=0; d<DIM; d++)
1883 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
/* The 8 corners of cell i; coordinates written in Angstrom (10*nm) */
1891 cx[XX] = grid_r[i*2+x][XX];
1892 cx[YY] = grid_r[i*2+y][YY];
1893 cx[ZZ] = grid_r[i*2+z][ZZ];
1895 fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1896 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
/* CONECT records: connect corner y to the corner one step along dim d */
1900 for(d=0; d<DIM; d++)
1906 case 0: y = 1 + i*8 + 2*x; break;
1907 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1908 case 2: y = 1 + i*8 + x; break;
1910 fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1914 gmx_fio_fclose(out);
/* Debug helper: write the local atoms of this DD node as a PDB file
 * ("<fn>_<step>_n<nodeid>.pdb"). The B-factor column encodes the zone the
 * atom belongs to (zones.n for vsites, zones.n+1 for constraint atoms),
 * which makes zone assignment visible in a viewer.
 */
1919 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1920 gmx_mtop_t *mtop,t_commrec *cr,
1921 int natoms,rvec x[],matrix box)
1923 char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1926 char *atomname,*resname;
1933 natoms = dd->comm->nat[ddnatVSITE];
1936 sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1938 sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1939 sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
1941 out = gmx_fio_fopen(fname,"w");
1943 fprintf(out,"TITLE     %s\n",title);
1944 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1945 for(i=0; i<natoms; i++)
/* Map local atom index i to its global index and look up atom/residue names */
1947 ii = dd->gatindex[i];
1948 gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1949 if (i < dd->comm->nat[ddnatZONE])
/* Find the zone this atom belongs to via the zone cg ranges */
1952 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1958 else if (i < dd->comm->nat[ddnatVSITE])
1960 b = dd->comm->zones.n;
1964 b = dd->comm->zones.n + 1;
/* PDB fixed columns: wrap atom serial at 100000 and resnr at 10000 */
1966 fprintf(out,strlen(atomname)<4 ? format : format4,
1967 "ATOM",(ii+1)%100000,
1968 atomname,resname,' ',resnr%10000,' ',
1969 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1971 fprintf(out,"TER\n");
1973 gmx_fio_fclose(out);
/* Return the effective cut-off distance for multi-body (bonded) interactions
 * between charge groups: either the explicit cutoff_mbody, or - without DLB -
 * the smallest cell size over all decomposed dimensions, capped by the
 * non-bonded cut-off.
 */
1976 real dd_cutoff_mbody(gmx_domdec_t *dd)
1978 gmx_domdec_comm_t *comm;
1985 if (comm->bInterCGBondeds)
1987 if (comm->cutoff_mbody > 0)
1989 r = comm->cutoff_mbody;
1993 /* cutoff_mbody=0 means we do not have DLB */
1994 r = comm->cellsize_min[dd->dim[0]];
1995 for(di=1; di<dd->ndim; di++)
1997 r = min(r,comm->cellsize_min[dd->dim[di]]);
1999 if (comm->bBondComm)
2001 r = max(r,comm->cutoff_mbody);
2005 r = min(r,comm->cutoff);
/* Return the effective cut-off for two-body interactions: the larger of the
 * non-bonded cut-off and the multi-body cut-off. */
2013 real dd_cutoff_twobody(gmx_domdec_t *dd)
2017 r_mb = dd_cutoff_mbody(dd);
2019 return max(dd->comm->cutoff,r_mb);
/* Convert the Cartesian communicator coordinate of a PP node into the
 * coordinate of its PME node: along cartpmedim the PP range [0,nc) is mapped
 * proportionally onto the PME range [nc,ntot), rounding to the center. */
2023 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2027 nc   = dd->nc[dd->comm->cartpmedim];
2028 ntot = dd->comm->ntot[dd->comm->cartpmedim];
2029 copy_ivec(coord,coord_pme);
2030 coord_pme[dd->comm->cartpmedim] =
2031 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
/* Map a DD node index onto the index of the PME node it communicates with.
 * Both index spaces are assumed to use x as the major dimension; the npme/2
 * offset centers the integer mapping so the PME nodes get an even share of
 * the DD nodes.
 */
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    return (npme*ddindex + npme/2)/ndd;
}
/* Map a DD node index to its PME node index using this dd's node counts. */
2043 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2045 return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
/* Same mapping as ddindex2pmeindex, but taking the counts from t_commrec. */
2048 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2050 return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
/* Build and return the list of simulation node ids that are PME-only nodes,
 * assuming PME nodes are interleaved: a PME node is inserted after the run
 * of DD nodes that maps onto it (hence the i+1+n offset, n counting the PME
 * nodes already placed). Caller owns the returned array.
 */
2053 static int *dd_pmenodes(t_commrec *cr)
2058 snew(pmenodes,cr->npmenodes);
2060 for(i=0; i<cr->dd->nnodes; i++) {
2061 p0 = cr_ddindex2pmeindex(cr,i);
2062 p1 = cr_ddindex2pmeindex(cr,i+1);
/* Place a PME node when the mapped PME index advances (or at the end) */
2063 if (i+1 == cr->dd->nnodes || p1 > p0) {
2065 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2066 pmenodes[n] = i + 1 + n;
/* Return the PME slab/node index that serves the DD cell at grid coordinates
 * (x,y,z). With a Cartesian PP+PME communicator the slab is computed from the
 * Cartesian PME coordinates; otherwise from the linear DD index.
 */
2074 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2077 ivec coords,coords_pme,nc;
2082 if (dd->comm->bCartesian) {
2083 gmx_ddindex2xyz(dd->nc,ddindex,coords);
2084 dd_coords2pmecoords(dd,coords,coords_pme);
2085 copy_ivec(dd->ntot,nc);
/* Restrict to the PME part of the grid along the PME dimension */
2086 nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2087 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2089 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2091 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2097 slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
/* Return the simulation node id of the PP node at DD grid coordinates
 * (x,y,z), handling the three rank-layout modes: full Cartesian PP+PME,
 * Cartesian PP only, and the plain interleaved ordering.
 */
2102 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2104 gmx_domdec_comm_t *comm;
2106 int ddindex,nodeid=-1;
2108 comm = cr->dd->comm;
2113 if (comm->bCartesianPP_PME)
2116 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2121 ddindex = dd_index(cr->dd->nc,coords);
2122 if (comm->bCartesianPP)
2124 nodeid = comm->ddindex2simnodeid[ddindex];
/* Interleaved: shift by the number of PME nodes preceding this DD index */
2130 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
/* Return the simulation node id of the PME node serving PP node sim_nodeid,
 * or -1 when sim_nodeid is itself a PME-only node. Handles Cartesian
 * PP+PME, Cartesian PP, and interleaved layouts.
 */
2142 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2145 gmx_domdec_comm_t *comm;
2146 ivec coord,coord_pme;
2153 /* This assumes a uniform x domain decomposition grid cell size */
2154 if (comm->bCartesianPP_PME)
2157 MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2158 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2160 /* This is a PP node */
2161 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2162 MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2166 else if (comm->bCartesianPP)
2168 if (sim_nodeid < dd->nnodes)
2170 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2175 /* This assumes DD cells with identical x coordinates
2176 * are numbered sequentially.
2178 if (dd->comm->pmenodes == NULL)
2180 if (sim_nodeid < dd->nnodes)
2182 /* The DD index equals the nodeid */
2183 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
/* Interleaved layout: scan the sorted pmenodes list for the first PME
 * node following sim_nodeid */
2189 while (sim_nodeid > dd->comm->pmenodes[i])
2193 if (sim_nodeid < dd->comm->pmenodes[i])
2195 pmenode = dd->comm->pmenodes[i];
/* Return TRUE when sim_nodeid is a PME-only node (it has no PME partner),
 * FALSE otherwise or when not using domain decomposition. */
2203 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2205 gmx_bool bPMEOnlyNode;
2207 if (DOMAINDECOMP(cr))
2209 bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2213 bPMEOnlyNode = FALSE;
2216 return bPMEOnlyNode;
/* For PME node pmenodeid, collect the simulation node ids of all PP nodes
 * that send their coordinates to it (*my_ddnodes, allocated here; count in
 * *nmy_ddnodes) and set *node_peer to the last of them (the peer node).
 */
2219 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2220 int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2224 ivec coord,coord_pme;
/* Upper bound: each PME node serves at most ceil(nnodes/npmenodes) PP nodes */
2228 snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
/* Scan the whole DD grid and pick the cells mapped to this PME node */
2231 for(x=0; x<dd->nc[XX]; x++)
2233 for(y=0; y<dd->nc[YY]; y++)
2235 for(z=0; z<dd->nc[ZZ]; z++)
2237 if (dd->comm->bCartesianPP_PME)
2242 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2243 if (dd->ci[XX] == coord_pme[XX] &&
2244 dd->ci[YY] == coord_pme[YY] &&
2245 dd->ci[ZZ] == coord_pme[ZZ])
2246 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2250 /* The slab corresponds to the nodeid in the PME group */
2251 if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2253 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2260 /* The last PP-only node is the peer node */
2261 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2265 fprintf(debug,"Receive coordinates from PP nodes:");
2266 for(x=0; x<*nmy_ddnodes; x++)
2268 fprintf(debug," %d",(*my_ddnodes)[x]);
2270 fprintf(debug,"\n");
/* Decide whether this PP node should receive the virial and energy back from
 * its PME node: only the LAST PP node sharing a given PME node does, to avoid
 * duplicate accumulation. Returns TRUE when there are no separate PME nodes
 * or when the next node maps to a different PME node.
 */
2274 static gmx_bool receive_vir_ener(t_commrec *cr)
2276 gmx_domdec_comm_t *comm;
2277 int pmenode,coords[DIM],rank;
2281 if (cr->npmenodes < cr->dd->nnodes)
2283 comm = cr->dd->comm;
2284 if (comm->bCartesianPP_PME)
2286 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
/* Check the neighbor cell along the PME dimension */
2288 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2289 coords[comm->cartpmedim]++;
2290 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2292 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2293 if (dd_simnode2pmenode(cr,rank) == pmenode)
2295 /* This is not the last PP node for pmenode */
2303 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2304 if (cr->sim_nodeid+1 < cr->nnodes &&
2305 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2307 /* This is not the last PP node for pmenode */
/* Reset the zone charge-group ranges so that only the home zone is filled:
 * zone 0 spans [0, ncg_home) and all further zone boundaries collapse onto
 * ncg_home (i.e. the non-home zones are empty). */
2316 static void set_zones_ncg_home(gmx_domdec_t *dd)
2318 gmx_domdec_zones_t *zones;
2321 zones = &dd->comm->zones;
2323 zones->cg_range[0] = 0;
2324 for(i=1; i<zones->n+1; i++)
2326 zones->cg_range[i] = dd->ncg_home;
/* Rebuild the local charge-group bookkeeping (global cg indices, local
 * cg->atom index, atom count) from the cg list stored in a checkpointed
 * t_state; gcgs_index is the global cg->atom index. */
2330 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2332 int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2335 dd_cg_gl = dd->index_gl;
2336 cgindex  = dd->cgindex;
2339 for(i=0; i<state->ncg_gl; i++)
2343 dd_cg_gl[i] = cg_gl;
/* Accumulate the local atom count from the global cg sizes */
2344 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2348 dd->ncg_home = state->ncg_gl;
2351 set_zones_ncg_home(dd);
/* Look up the cginfo entry for global charge group cg: advance to the
 * molecule-block record containing cg, then index modulo cg_mod since
 * identical molecules share one cginfo array. */
2354 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2356 while (cg >= cginfo_mb->cg_end)
2361 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
/* Fill fr->cginfo for the local charge groups in [cg0,cg1) from the global
 * per-molecule-block info, and (when tracked) mark those groups as local in
 * bLocalCG. */
2364 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2365 t_forcerec *fr,char *bLocalCG)
2367 cginfo_mb_t *cginfo_mb;
2373 cginfo_mb = fr->cginfo_mb;
2374 cginfo    = fr->cginfo;
2376 for(cg=cg0; cg<cg1; cg++)
2378 cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2382 if (bLocalCG != NULL)
2384 for(cg=cg0; cg<cg1; cg++)
2386 bLocalCG[index_gl[cg]] = TRUE;
2391 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2393 int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2394 int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2398 bLocalCG = dd->comm->bLocalCG;
2400 if (dd->nat_tot > dd->gatindex_nalloc)
2402 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2403 srenew(dd->gatindex,dd->gatindex_nalloc);
2406 nzone = dd->comm->zones.n;
2407 zone2cg = dd->comm->zones.cg_range;
2408 zone_ncg1 = dd->comm->zone_ncg1;
2409 index_gl = dd->index_gl;
2410 gatindex = dd->gatindex;
2412 if (zone2cg[1] != dd->ncg_home)
2414 gmx_incons("dd->ncg_zone is not up to date");
2417 /* Make the local to global and global to local atom index */
2418 a = dd->cgindex[cg_start];
2419 for(zone=0; zone<nzone; zone++)
2427 cg0 = zone2cg[zone];
2429 for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2432 if (cg - cg0 >= zone_ncg1[zone])
2434 /* Signal that this cg is from more than one zone away */
2437 cg_gl = index_gl[cg];
2438 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2441 ga2la_set(dd->ga2la,a_gl,a,zone1);
/* Consistency check (debug builds): verify that exactly the local charge
 * groups are marked in bLocalCG - every local cg must be marked, and the
 * total number of marked cgs must equal ncg_tot. Returns the number of
 * inconsistencies found.
 */
2448 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2454 if (bLocalCG == NULL)
2458 for(i=0; i<dd->ncg_tot; i++)
2460 if (!bLocalCG[dd->index_gl[i]])
2463 "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
/* Count all marked cgs in the whole system and compare with ncg_tot */
2468 for(i=0; i<ncg_sys; i++)
2475 if (ngl != dd->ncg_tot)
2477 fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
/* Debug check of the local/global atom index tables: detects duplicate
 * global atoms, out-of-range local indices, mismatches between ga2la and
 * gatindex, and inconsistent bLocalCG marking. Fatal when any error is found.
 */
2484 static void check_index_consistency(gmx_domdec_t *dd,
2485 int natoms_sys,int ncg_sys,
2488 int nerr,ngl,i,a,cell;
/* Expensive check: every local atom's global index must be unique */
2493 if (dd->comm->DD_debug > 1)
2495 snew(have,natoms_sys);
2496 for(a=0; a<dd->nat_tot; a++)
2498 if (have[dd->gatindex[a]] > 0)
2500 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2504 have[dd->gatindex[a]] = a + 1;
2510 snew(have,dd->nat_tot);
/* Cross-check ga2la against gatindex for every global atom */
2513 for(i=0; i<natoms_sys; i++)
2515 if (ga2la_get(dd->ga2la,i,&a,&cell))
2517 if (a >= dd->nat_tot)
2519 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2525 if (dd->gatindex[a] != i)
2527 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2534 if (ngl != dd->nat_tot)
2537 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2538 dd->rank,where,ngl,dd->nat_tot);
/* Every local atom must have a ga2la entry */
2540 for(a=0; a<dd->nat_tot; a++)
2545 "DD node %d, %s: local atom %d, global %d has no global index\n",
2546 dd->rank,where,a+1,dd->gatindex[a]+1);
2551 nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2554 gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2555 dd->rank,where,nerr);
/* Invalidate the local atom/cg index data from cg_start/a_start onward:
 * clear (or selectively delete from) the ga2la table, unmark the cgs in
 * bLocalCG, and clear local vsite and constraint indices. */
2559 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2566 /* Clear the whole list without searching */
2567 ga2la_clear(dd->ga2la);
/* Otherwise delete only the entries from a_start on */
2571 for(i=a_start; i<dd->nat_tot; i++)
2573 ga2la_del(dd->ga2la,dd->gatindex[i]);
2577 bLocalCG = dd->comm->bLocalCG;
2580 for(i=cg_start; i<dd->ncg_tot; i++)
2582 bLocalCG[dd->index_gl[i]] = FALSE;
2586 dd_clear_local_vsite_indices(dd);
2588 if (dd->constraints)
2590 dd_clear_local_constraint_indices(dd);
/* Return the minimum allowed distance between staggered cell boundaries in
 * dimension index dim_ind (the "grid jump" limit used by DLB). */
2594 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2596 real grid_jump_limit;
2598 /* The distance between the boundaries of cells at distance
2599 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2600 * and by the fact that cells should not be shifted by more than
2601 * half their size, such that cg's only shift by one cell
2602 * at redecomposition.
2604 grid_jump_limit = comm->cellsize_limit;
2605 if (!comm->bVacDLBNoLimit)
/* With a real cut-off, the limit also depends on the pulse count */
2607 grid_jump_limit = max(grid_jump_limit,
2608 comm->cutoff/comm->cd[dim_ind].np);
2611 return grid_jump_limit;
/* Verify that no DLB cell boundary has shifted beyond the grid-jump limit
 * in any decomposed dimension; fatal error (with the step number) if so. */
2614 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2616 gmx_domdec_comm_t *comm;
2622 for(d=1; d<dd->ndim; d++)
2625 limit = grid_jump_limit(comm,d);
2626 bfac = ddbox->box_size[dim];
2627 if (ddbox->tric_dir[dim])
2629 bfac *= ddbox->skew_fac[dim];
/* Compare this cell's boundaries with the extremes of its neighbors */
2631 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2632 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2635 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2636 gmx_step_str(step,buf),
2637 dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
/* Return the number of load measurements collected so far: flop counts when
 * flop-based balancing is enabled, force cycle counts otherwise. */
2642 static int dd_load_count(gmx_domdec_comm_t *comm)
2644 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
/* Return the load measure for the force calculation on this node: flops
 * (optionally randomly perturbed for testing when eFlop > 1) or force cycle
 * counts with the maximum recent count subtracted to filter outliers. */
2647 static float dd_force_load(gmx_domdec_comm_t *comm)
2654 if (comm->eFlop > 1)
/* Artificial +-5% noise, used to test load balancing robustness */
2656 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2661 load = comm->cycl[ddCyclF];
2662 if (comm->cycl_n[ddCyclF] > 1)
2664 /* Subtract the maximum of the last n cycle counts
2665 * to get rid of possible high counts due to other soures,
2666 * for instance system activity, that would otherwise
2667 * affect the dynamic load balancing.
2669 load -= comm->cycl_max[ddCyclF];
/* Allocate and fill *dim_f with the cumulative static (non-DLB) cell
 * boundary fractions along dim: either accumulated from user-supplied
 * slb_frac or uniform i/nc; the last entry is always 1. */
2676 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2678 gmx_domdec_comm_t *comm;
2683 snew(*dim_f,dd->nc[dim]+1);
2685 for(i=1; i<dd->nc[dim]; i++)
2687 if (comm->slb_frac[dim])
2689 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2693 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2696 (*dim_f)[dd->nc[dim]] = 1;
/* Initialize the PME slab bookkeeping for decomposed PME dimension dimind:
 * determines the PME dimension and slab count, and for each PME slab the
 * range of PP cell coordinates (pp_min/pp_max) that map onto it, plus the
 * static boundary fractions.
 */
2699 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2701 int  pmeindex,slab,nso,i;
/* Special case: DD decomposes y only while PME uses a single x slab */
2704 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2710 ddpme->dim = dimind;
2712 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2714 ddpme->nslab = (ddpme->dim == 0 ?
2715 dd->comm->npmenodes_x :
2716 dd->comm->npmenodes_y);
2718 if (ddpme->nslab <= 1)
2723 nso = dd->comm->npmenodes/ddpme->nslab;
2724 /* Determine for each PME slab the PP location range for dimension dim */
2725 snew(ddpme->pp_min,ddpme->nslab);
2726 snew(ddpme->pp_max,ddpme->nslab);
/* Start with an empty (inverted) range, then widen with min/max below */
2727 for(slab=0; slab<ddpme->nslab; slab++) {
2728 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2729 ddpme->pp_max[slab] = 0;
2731 for(i=0; i<dd->nnodes; i++) {
2732 ddindex2xyz(dd->nc,i,xyz);
2733 /* For y only use our y/z slab.
2734 * This assumes that the PME x grid size matches the DD grid size.
2736 if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2737 pmeindex = ddindex2pmeindex(dd,i);
2739 slab = pmeindex/nso;
2741 slab = pmeindex % ddpme->nslab;
2743 ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2744 ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2748 set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
/* Return the maximum PME slab shift in x, or 0 (elided fallback) when the
 * first PME-decomposed dimension is not x. */
2751 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2753 if (dd->comm->ddpme[0].dim == XX)
2755 return dd->comm->ddpme[0].maxshift;
/* Return the maximum PME slab shift in y, whichever decomposed PME dimension
 * (0 or 1) corresponds to y; 0 (elided fallback) otherwise. */
2763 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2765 if (dd->comm->ddpme[0].dim == YY)
2767 return dd->comm->ddpme[0].maxshift;
2769 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2771 return dd->comm->ddpme[1].maxshift;
/* Determine ddpme->maxshift: the maximum number of PME slabs a charge group
 * can end up away from its "home" slab, given the current cell boundaries
 * cell_f. This bounds the PME coordinate redistribution communication.
 */
2779 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2780 gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2782 gmx_domdec_comm_t *comm;
2785 real range,pme_boundary;
2789 nc  = dd->nc[ddpme->dim];
2792 if (!ddpme->dim_match)
2794 /* PP decomposition is not along dim: the worst situation */
2797 else if (ns <= 3 || (bUniform && ns == nc))
2799 /* The optimal situation */
2804 /* We need to check for all pme nodes which nodes they
2805 * could possibly need to communicate with.
2807 xmin = ddpme->pp_min;
2808 xmax = ddpme->pp_max;
2809 /* Allow for atoms to be maximally 2/3 times the cut-off
2810 * out of their DD cell. This is a reasonable balance between
2811 * between performance and support for most charge-group/cut-off
2814 range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2815 /* Avoid extra communication when we are exactly at a boundary */
/* Increase sh until no PP cell at distance sh+1 can reach a slab boundary;
 * the +-ns variants handle the periodic wrap-around */
2821 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2822 pme_boundary = (real)s/ns;
2825 cell_f[xmax[s-(sh+1)   ]+1] + range > pme_boundary) ||
2827 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2831 pme_boundary = (real)(s+1)/ns;
2834 cell_f[xmin[s+(sh+1)   ]  ] - range < pme_boundary) ||
2836 cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2843 ddpme->maxshift = sh;
2847 fprintf(debug,"PME slab communication range for dim %d is %d\n",
2848 ddpme->dim,ddpme->maxshift);
/* Check that every bounded box dimension is large enough to hold dd->nc[dim]
 * cells of at least the minimum allowed size (with margin); fatal otherwise. */
2852 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2856 for(d=0; d<dd->ndim; d++)
2859 if (dim < ddbox->nboundeddim &&
2860 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2861 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2863 gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2864 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2865 dd->nc[dim],dd->comm->cellsize_limit);
/* Set the cell boundaries for the statically load balanced (non-DLB) case,
 * either uniform or from user-supplied fractions (slb_frac). Also determines
 * npulse (communication pulses per dimension) from the cut-off, records the
 * minimum cell sizes, and updates the PME maxshift values. With bMaster set,
 * all cell boundaries are stored in dd->ma for the master distribution.
 */
2870 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2871 gmx_bool bMaster,ivec npulse)
2873 gmx_domdec_comm_t *comm;
2876 real *cell_x,cell_dx,cellsize;
2880 for(d=0; d<DIM; d++)
2882 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
/* Uniform grid along d: equal-width cells */
2884 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2887 cell_dx = ddbox->box_size[d]/dd->nc[d];
2890 for(j=0; j<dd->nc[d]+1; j++)
2892 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2897 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2898 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
/* Count the pulses needed to cover the cut-off with this cell size */
2900 cellsize = cell_dx*ddbox->skew_fac[d];
2901 while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2905 cellsize_min[d] = cellsize;
2909 /* Statically load balanced grid */
2910 /* Also when we are not doing a master distribution we determine
2911 * all cell borders in a loop to obtain identical values
2912 * to the master distribution case and to determine npulse.
2916 cell_x = dd->ma->cell_x[d];
2920 snew(cell_x,dd->nc[d]+1);
2922 cell_x[0] = ddbox->box0[d];
2923 for(j=0; j<dd->nc[d]; j++)
2925 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2926 cell_x[j+1] = cell_x[j] + cell_dx;
2927 cellsize = cell_dx*ddbox->skew_fac[d];
2928 while (cellsize*npulse[d] < comm->cutoff &&
2929 npulse[d] < dd->nc[d]-1)
2933 cellsize_min[d] = min(cellsize_min[d],cellsize);
2937 comm->cell_x0[d] = cell_x[dd->ci[d]];
2938 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2942 /* The following limitation is to avoid that a cell would receive
2943 * some of its own home charge groups back over the periodic boundary.
2944 * Double charge groups cause trouble with the global indices.
2946 if (d < ddbox->npbcdim &&
2947 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2949 gmx_fatal_collective(FARGS,NULL,dd,
2950 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2951 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2953 dd->nc[d],dd->nc[d],
2954 dd->nnodes > dd->nc[d] ? "cells" : "processors");
2958 if (!comm->bDynLoadBal)
2960 copy_rvec(cellsize_min,comm->cellsize_min);
2963 for(d=0; d<comm->npmedecompdim; d++)
2965 set_pme_maxshift(dd,&comm->ddpme[d],
2966 comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2967 comm->ddpme[d].slb_dim_f);
/* DLB helper, run on the row root: rescale the proposed cell sizes in
 * root->buf_ncd over boundary range [range[0],range[1]) so that no cell is
 * smaller than cellsize_limit_f, limit boundary displacement to half a cell,
 * and enforce the staggering bounds (bound_min/bound_max) with d>0, recursing
 * on sub-ranges when bounds are violated. Updates root->cell_f in place.
 */
2972 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2973 int d,int dim,gmx_domdec_root_t *root,
2975 gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2977 gmx_domdec_comm_t *comm;
2978 int  ncd,i,j,nmin,nmin_old;
2979 gmx_bool bLimLo,bLimHi;
2981 real fac,halfway,cellsize_limit_f_i,region_size;
2982 gmx_bool bPBC,bLastHi=FALSE;
2983 int nrange[]={range[0],range[1]};
2985 region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2991 bPBC = (dim < ddbox->npbcdim);
2993 cell_size = root->buf_ncd;
2997 fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3000 /* First we need to check if the scaling does not make cells
3001 * smaller than the smallest allowed size.
3002 * We need to do this iteratively, since if a cell is too small,
3003 * it needs to be enlarged, which makes all the other cells smaller,
3004 * which could in turn make another cell smaller than allowed.
3006 for(i=range[0]; i<range[1]; i++)
3008 root->bCellMin[i] = FALSE;
3014 /* We need the total for normalization */
3016 for(i=range[0]; i<range[1]; i++)
3018 if (root->bCellMin[i] == FALSE)
3020 fac += cell_size[i];
3023 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3024 /* Determine the cell boundaries */
3025 for(i=range[0]; i<range[1]; i++)
3027 if (root->bCellMin[i] == FALSE)
3029 cell_size[i] *= fac;
/* Non-PBC edge cells may shrink to zero */
3030 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3032 cellsize_limit_f_i = 0;
3036 cellsize_limit_f_i = cellsize_limit_f;
3038 if (cell_size[i] < cellsize_limit_f_i)
3040 root->bCellMin[i] = TRUE;
3041 cell_size[i] = cellsize_limit_f_i;
3045 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3048 while (nmin > nmin_old);
3051 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3052 /* For this check we should not use DD_CELL_MARGIN,
3053 * but a slightly smaller factor,
3054 * since rounding could get use below the limit.
3056 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3059 gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3060 gmx_step_str(step,buf),
3061 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3062 ncd,comm->cellsize_min[dim]);
3065 root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3069 /* Check if the boundary did not displace more than halfway
3070 * each of the cells it bounds, as this could cause problems,
3071 * especially when the differences between cell sizes are large.
3072 * If changes are applied, they will not make cells smaller
3073 * than the cut-off, as we check all the boundaries which
3074 * might be affected by a change and if the old state was ok,
3075 * the cells will at most be shrunk back to their old size.
3077 for(i=range[0]+1; i<range[1]; i++)
3079 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3080 if (root->cell_f[i] < halfway)
3082 root->cell_f[i] = halfway;
3083 /* Check if the change also causes shifts of the next boundaries */
3084 for(j=i+1; j<range[1]; j++)
3086 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3087 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3090 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3091 if (root->cell_f[i] > halfway)
3093 root->cell_f[i] = halfway;
3094 /* Check if the change also causes shifts of the next boundaries */
3095 for(j=i-1; j>=range[0]+1; j--)
3097 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3098 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3104 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3105 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3106 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3107 * for a and b nrange is used */
3110 /* Take care of the staggering of the cell boundaries */
3113 for(i=range[0]; i<range[1]; i++)
3115 root->cell_f_max0[i] = root->cell_f[i];
3116 root->cell_f_min1[i] = root->cell_f[i+1];
3121 for(i=range[0]+1; i<range[1]; i++)
3123 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3124 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3125 if (bLimLo && bLimHi)
3127 /* Both limits violated, try the best we can */
3128 /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3129 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3132 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3136 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3142 /* root->cell_f[i] = root->bound_min[i]; */
3143 nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3146 else if (bLimHi && !bLastHi)
3149 if (nrange[1] < range[1])   /* found a LimLo before */
3151 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3152 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3153 nrange[0]=nrange[1];
3155 root->cell_f[i] = root->bound_max[i];
3157 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3162 if (nrange[1] < range[1])   /* found last a LimLo */
3164 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3165 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3166 nrange[0]=nrange[1];
3168 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3170 else if (nrange[0] > range[0]) /* found at least one LimHi */
3172 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
/* DLB master routine, run on the root of a cell row: compute new relative
 * cell boundaries (root->cell_f) for dimension index d from the measured
 * loads, limit the per-step change, set the staggering bounds for d>0,
 * enforce all limits via dd_cell_sizes_dlb_root_enforce_limits, sanity-check
 * the result, and append the lower-dimension fractions and PME maxshifts to
 * the row buffer for distribution.
 */
3179 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3180 int d,int dim,gmx_domdec_root_t *root,
3181 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3182 gmx_bool bUniform,gmx_large_int_t step)
3184 gmx_domdec_comm_t *comm;
3187 real load_aver,load_i,imbalance,change,change_max,sc;
3188 real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3192 int range[] = { 0, 0 };
3196 /* Convert the maximum change from the input percentage to a fraction */
3197 change_limit = comm->dlb_scale_lim*0.01;
3201 bPBC = (dim < ddbox->npbcdim);
3203 cell_size = root->buf_ncd;
3205 /* Store the original boundaries */
3206 for(i=0; i<ncd+1; i++)
3208 root->old_cell_f[i] = root->cell_f[i];
/* Uniform target sizes when no load data is used */
3211 for(i=0; i<ncd; i++)
3213 cell_size[i] = 1.0/ncd;
3216 else if (dd_load_count(comm))
3218 load_aver = comm->load[d].sum_m/ncd;
3220 for(i=0; i<ncd; i++)
3222 /* Determine the relative imbalance of cell i */
3223 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3224 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3225 /* Determine the change of the cell size using underrelaxation */
3226 change = -relax*imbalance;
3227 change_max = max(change_max,max(change,-change));
3229 /* Limit the amount of scaling.
3230 * We need to use the same rescaling for all cells in one row,
3231 * otherwise the load balancing might not converge.
3234 if (change_max > change_limit)
3236 sc *= change_limit/change_max;
3238 for(i=0; i<ncd; i++)
3240 /* Determine the relative imbalance of cell i */
3241 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3242 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3243 /* Determine the change of the cell size using underrelaxation */
3244 change = -sc*imbalance;
3245 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
/* Minimum cell size and boundary distance, as box fractions */
3249 cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3250 cellsize_limit_f *= DD_CELL_MARGIN;
3251 dist_min_f_hard   = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3252 dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3253 if (ddbox->tric_dir[dim])
3255 cellsize_limit_f /= ddbox->skew_fac[dim];
3256 dist_min_f       /= ddbox->skew_fac[dim];
3258 if (bDynamicBox && d > 0)
3260 dist_min_f *= DD_PRES_SCALE_MARGIN;
3262 if (d > 0 && !bUniform)
3264 /* Make sure that the grid is not shifted too much */
3265 for(i=1; i<ncd; i++) {
3266 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3268 gmx_incons("Inconsistent DD boundary staggering limits!");
3270 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3271 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3273 root->bound_min[i] += 0.5*space;
3275 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3276 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3278 root->bound_max[i] += 0.5*space;
3283 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3285 root->cell_f_max0[i-1] + dist_min_f,
3286 root->bound_min[i],root->cell_f[i],root->bound_max[i],
3287 root->cell_f_min1[i] - dist_min_f);
3292 root->cell_f[0] = 0;
3293 root->cell_f[ncd] = 1;
3294 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3297 /* After the checks above, the cells should obey the cut-off
3298 * restrictions, but it does not hurt to check.
3300 for(i=0; i<ncd; i++)
3304 fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3305 dim,i,root->cell_f[i],root->cell_f[i+1]);
3308 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3309 root->cell_f[i+1] - root->cell_f[i] <
3310 cellsize_limit_f/DD_CELL_MARGIN)
3314 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3315 gmx_step_str(step,buf),dim2char(dim),i,
3316 (root->cell_f[i+1] - root->cell_f[i])
3317 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3322 /* Store the cell boundaries of the lower dimensions at the end */
3323 for(d1=0; d1<d; d1++)
3325 root->cell_f[pos++] = comm->cell_f0[d1];
3326 root->cell_f[pos++] = comm->cell_f1[d1];
3329 if (d < comm->npmedecompdim)
3331 /* The master determines the maximum shift for
3332 * the coordinate communication between separate PME nodes.
3334 set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3336 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3339 root->cell_f[pos++] = comm->ddpme[1].maxshift;
/* Convert the relative cell fractions cell_f0/cell_f1 for DD dimension
 * index dimind into absolute coordinates cell_x0/cell_x1 by scaling
 * with the box size; unbounded dimensions are shifted by the box origin.
 * NOTE(review): several original lines (braces, comm = dd->comm) are
 * elided in this excerpt.
 */
3343 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3344 gmx_ddbox_t *ddbox,int dimind)
3346 gmx_domdec_comm_t *comm;
3351 /* Set the cell dimensions */
3352 dim = dd->dim[dimind];
3353 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3354 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
/* Dimensions at or beyond nboundeddim are not bounded by the box,
 * so the absolute bounds are offset by the box origin box0. */
3355 if (dim >= ddbox->nboundeddim)
3357 comm->cell_x0[dim] += ddbox->box0[dim];
3358 comm->cell_x1[dim] += ddbox->box0[dim];
/* Broadcast the row of relative cell boundaries (cell_f_row) from the
 * row root (rank 0 of mpi_comm_load[d]) to all ranks in the row, then
 * pick out the fractions this rank needs and convert them to absolute
 * cell bounds.  The buffer layout matches what the root packed:
 * the nc[dim]+1 boundaries, then cell_f0/cell_f1 pairs for the lower
 * dimensions, then the PME maxshift values.
 */
3362 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3363 int d,int dim,real *cell_f_row,
3366 gmx_domdec_comm_t *comm;
3372 /* Each node would only need to know two fractions,
3373 * but it is probably cheaper to broadcast the whole array.
/* Broadcast as raw bytes so the same call works for single and
 * double precision real. */
3375 MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3376 0,comm->mpi_comm_load[d]);
3378 /* Copy the fractions for this dimension from the buffer */
3379 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3380 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3381 /* The whole array was communicated, so set the buffer position */
3382 pos = dd->nc[dim] + 1;
3383 for(d1=0; d1<=d; d1++)
3387 /* Copy the cell fractions of the lower dimensions */
3388 comm->cell_f0[d1] = cell_f_row[pos++];
3389 comm->cell_f1[d1] = cell_f_row[pos++];
3391 relative_to_absolute_cell_bounds(dd,ddbox,d1);
3393 /* Convert the communicated shift from float to int */
/* +0.5 rounds to nearest; the shifts are non-negative integers
 * stored as real for transport. */
3394 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3397 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
/* Recompute the dynamic-load-balanced cell boundaries for every DD
 * dimension.  For each dimension the row root computes the new sizes
 * (set_dd_cell_sizes_dlb_root) and the result is then distributed to
 * the other ranks of the row (distribute_dd_cell_sizes_dlb).
 * bRowMember/bRowRoot select which role this rank plays per dimension.
 */
3401 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3402 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3403 gmx_bool bUniform,gmx_large_int_t step)
3405 gmx_domdec_comm_t *comm;
3407 gmx_bool bRowMember,bRowRoot;
3412 for(d=0; d<dd->ndim; d++)
/* A rank is the row root when its grid coordinate is 0 in this and
 * all higher DD dimensions. */
3417 for(d1=d; d1<dd->ndim; d1++)
3419 if (dd->ci[dd->dim[d1]] > 0)
3432 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3433 ddbox,bDynamicBox,bUniform,step);
3434 cell_f_row = comm->root[d]->cell_f;
3438 cell_f_row = comm->cell_f_row;
3440 distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
/* Refresh the absolute cell bounds from the unchanged relative
 * fractions, e.g. after the box has scaled under pressure coupling.
 */
3445 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3449 /* This function assumes the box is static and should therefore
3450 * not be called when the box has changed since the last
3451 * call to dd_partition_system.
3453 for(d=0; d<dd->ndim; d++)
3455 relative_to_absolute_cell_bounds(dd,ddbox,d);
/* Top-level DLB cell-size update.  When load balancing is requested
 * (bDoDLB, branch elided in this excerpt) the boundaries are
 * recomputed and communicated, timed under the ewcDDCOMMBOUND cycle
 * counter; when only the box changed, the cheaper nochange path is
 * used.  Dimensions without decomposition (nc == 1) simply span the
 * whole box.
 */
3461 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3462 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3463 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3464 gmx_wallcycle_t wcycle)
3466 gmx_domdec_comm_t *comm;
3473 wallcycle_start(wcycle,ewcDDCOMMBOUND);
3474 set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3475 wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3477 else if (bDynamicBox)
3479 set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3482 /* Set the dimensions for which no DD is used */
3483 for(dim=0; dim<DIM; dim++) {
3484 if (dd->nc[dim] == 1) {
3485 comm->cell_x0[dim] = 0;
3486 comm->cell_x1[dim] = ddbox->box_size[dim];
3487 if (dim >= ddbox->nboundeddim)
3489 comm->cell_x0[dim] += ddbox->box0[dim];
3490 comm->cell_x1[dim] += ddbox->box0[dim];
/* Grow the per-dimension communication pulse arrays (cd->ind) so that
 * each DD dimension can hold npulse[dim] pulses; newly added entries
 * get a NULL index array and zero capacity, to be allocated lazily.
 */
3496 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3499 gmx_domdec_comm_dim_t *cd;
3501 for(d=0; d<dd->ndim; d++)
3503 cd = &dd->comm->cd[d];
3504 np = npulse[dd->dim[d]];
3505 if (np > cd->np_nalloc)
3509 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3510 dim2char(dd->dim[d]),np);
/* Warn the user only on the master and only when this is a growth of
 * an already-allocated array (i.e. not the initial allocation). */
3512 if (DDMASTER(dd) && cd->np_nalloc > 0)
3514 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
/* Initialize only the newly added pulse entries. */
3517 for(i=cd->np_nalloc; i<np; i++)
3519 cd->ind[i].index = NULL;
3520 cd->ind[i].nalloc = 0;
/* Set the DD cell boundaries for this step.  The previous boundaries
 * are saved first (needed for the charge-group displacement check in
 * dd_redistribute_cg), then either the DLB or the static (slb) path
 * computes the new ones.
 */
3529 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3530 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3531 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3532 gmx_wallcycle_t wcycle)
3534 gmx_domdec_comm_t *comm;
3540 /* Copy the old cell boundaries for the cg displacement check */
3541 copy_rvec(comm->cell_x0,comm->old_cell_x0);
3542 copy_rvec(comm->cell_x1,comm->old_cell_x1);
3544 if (comm->bDynLoadBal)
3548 check_box_size(dd,ddbox);
3550 set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
/* Static decomposition: uniform cells plus (re)allocation of the
 * communication pulse arrays for the resulting pulse counts. */
3554 set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3555 realloc_comm_ind(dd,npulse);
3560 for(d=0; d<DIM; d++)
3562 fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3563 d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
/* Sanity-check the cell sizes for neighbor searching and, when
 * staggered grids or unbounded dimensions require it, communicate the
 * cell boundaries (dd_move_cellx) and verify the staggering limits
 * (check_grid_jump).  Fatal error if a cell got smaller than the
 * minimum allowed size.
 */
3568 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3570 rvec cell_ns_x0,rvec cell_ns_x1,
3571 gmx_large_int_t step)
3573 gmx_domdec_comm_t *comm;
3578 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3580 dim = dd->dim[dim_ind];
3582 /* Without PBC we don't have restrictions on the outer cells */
/* The size check: skip the outermost cells of non-periodic
 * dimensions, and only enforce the minimum with dynamic load
 * balancing active.  skew_fac corrects for triclinic skew. */
3583 if (!(dim >= ddbox->npbcdim &&
3584 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3585 comm->bDynLoadBal &&
3586 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3587 comm->cellsize_min[dim])
3590 gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3591 gmx_step_str(step,buf),dim2char(dim),
3592 comm->cell_x1[dim] - comm->cell_x0[dim],
3593 ddbox->skew_fac[dim],
3594 dd->comm->cellsize_min[dim],
3595 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3599 if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3601 /* Communicate the boundaries and update cell_ns_x0/1 */
3602 dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3603 if (dd->bGridJump && dd->ndim > 1)
3605 check_grid_jump(step,dd,ddbox);
/* Build the triclinic correction matrix tcm from the (lower
 * triangular) box: the visible entries map cartesian coordinates to
 * box-relative fractions for the YY and ZZ rows.  NOTE(review): the
 * npbcdim-dependent branches are elided in this excerpt.
 */
3610 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3614 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3622 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3623 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
/* Verify that the box is compatible with screw PBC; issues a fatal
 * error for the unsupported off-diagonal components.
 */
3632 static void check_screw_box(matrix box)
3634 /* Mathematical limitation */
3635 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3637 gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components")
;
3640 /* Limitation due to the asymmetry of the eighth shell method */
3641 if (box[ZZ][YY] != 0)
3643 gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
/* Master-only initial distribution of charge groups over the DD
 * cells: compute each charge group's center of geometry, put it in
 * the box (handling triclinic and screw PBC), find the owning cell
 * from the master cell boundaries (ma->cell_x), and fill the master
 * arrays ma->ncg/nat/index/cg.  Ends by printing the distribution to
 * the log.
 */
3647 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3648 matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3651 gmx_domdec_master_t *ma;
3652 int **tmp_ind=NULL,*tmp_nalloc=NULL;
3653 int i,icg,j,k,k0,k1,d,npbcdim;
3655 rvec box_size,cg_cm;
3657 real nrcg,inv_ncg,pos_d;
3659 gmx_bool bUnbounded,bScrew;
/* Temporary per-node index lists, sized generously to limit
 * reallocations (over_alloc_large). */
3663 if (tmp_ind == NULL)
3665 snew(tmp_nalloc,dd->nnodes);
3666 snew(tmp_ind,dd->nnodes);
3667 for(i=0; i<dd->nnodes; i++)
3669 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3670 snew(tmp_ind[i],tmp_nalloc[i]);
3674 /* Clear the count */
3675 for(i=0; i<dd->nnodes; i++)
3681 make_tric_corr_matrix(dd->npbcdim,box,tcm);
3683 cgindex = cgs->index;
3685 /* Compute the center of geometry for all charge groups */
3686 for(icg=0; icg<cgs->nr; icg++)
3689 k1 = cgindex[icg+1];
/* Single-atom charge group: the center is just the position. */
3693 copy_rvec(pos[k0],cg_cm);
3700 for(k=k0; (k<k1); k++)
3702 rvec_inc(cg_cm,pos[k]);
3704 for(d=0; (d<DIM); d++)
3706 cg_cm[d] *= inv_ncg;
3709 /* Put the charge group in the box and determine the cell index */
3710 for(d=DIM-1; d>=0; d--) {
3712 if (d < dd->npbcdim)
3714 bScrew = (dd->bScrewPBC && d == XX);
3715 if (tric_dir[d] && dd->nc[d] > 1)
3717 /* Use triclinic coordinates for this dimension */
3718 for(j=d+1; j<DIM; j++)
3720 pos_d += cg_cm[j]*tcm[j][d];
/* Wrap down while above the upper box edge; with screw PBC a
 * shift over x also mirrors y and z. */
3723 while(pos_d >= box[d][d])
3726 rvec_dec(cg_cm,box[d]);
3729 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3730 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3732 for(k=k0; (k<k1); k++)
3734 rvec_dec(pos[k],box[d]);
3737 pos[k][YY] = box[YY][YY] - pos[k][YY];
3738 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
/* Symmetric wrap-up branch for positions below the lower edge. */
3745 rvec_inc(cg_cm,box[d]);
3748 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3749 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3751 for(k=k0; (k<k1); k++)
3753 rvec_inc(pos[k],box[d]);
3755 pos[k][YY] = box[YY][YY] - pos[k][YY];
3756 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3761 /* This could be done more efficiently */
/* Linear scan over the master cell boundaries for this dimension. */
3763 while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3768 i = dd_index(dd->nc,ind);
3769 if (ma->ncg[i] == tmp_nalloc[i])
3771 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3772 srenew(tmp_ind[i],tmp_nalloc[i]);
3774 tmp_ind[i][ma->ncg[i]] = icg;
3776 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
/* Flatten the per-node lists into the contiguous ma->cg array. */
3780 for(i=0; i<dd->nnodes; i++)
3783 for(k=0; k<ma->ncg[i]; k++)
3785 ma->cg[k1++] = tmp_ind[i][k];
3788 ma->index[dd->nnodes] = k1;
3790 for(i=0; i<dd->nnodes; i++)
3800 fprintf(fplog,"Charge group distribution at step %s:",
3801 gmx_step_str(step,buf));
3802 for(i=0; i<dd->nnodes; i++)
3804 fprintf(fplog," %d",ma->ncg[i]);
3806 fprintf(fplog,"\n");
/* Distribute the charge groups over the DD nodes at (re)start: the
 * master computes the distribution (distribute_cg), then scatters the
 * per-node cg/atom counts and the global charge-group indices; each
 * node finally builds its local cgindex from the global cgs index.
 */
3810 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3811 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3814 gmx_domdec_master_t *ma=NULL;
3817 int *ibuf,buf2[2] = { 0, 0 };
3818 gmx_bool bMaster = DDMASTER(dd);
3825 check_screw_box(box);
3828 set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3830 distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
/* Pack per-node (ncg,nat) pairs for the scatter below. */
3831 for(i=0; i<dd->nnodes; i++)
3833 ma->ibuf[2*i] = ma->ncg[i];
3834 ma->ibuf[2*i+1] = ma->nat[i];
3842 dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3844 dd->ncg_home = buf2[0];
3845 dd->nat_home = buf2[1];
3846 dd->ncg_tot = dd->ncg_home;
3847 dd->nat_tot = dd->nat_home;
3848 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3850 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3851 srenew(dd->index_gl,dd->cg_nalloc);
3852 srenew(dd->cgindex,dd->cg_nalloc+1);
/* Byte counts and displacements for the variable-size scatter of the
 * global charge-group indices. */
3856 for(i=0; i<dd->nnodes; i++)
3858 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3859 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3864 DDMASTER(dd) ? ma->ibuf : NULL,
3865 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3866 DDMASTER(dd) ? ma->cg : NULL,
3867 dd->ncg_home*sizeof(int),dd->index_gl);
3869 /* Determine the home charge group sizes */
3871 for(i=0; i<dd->ncg_home; i++)
3873 cg_gl = dd->index_gl[i];
3875 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3880 fprintf(debug,"Home charge groups:\n");
3881 for(i=0; i<dd->ncg_home; i++)
3883 fprintf(debug," %d",dd->index_gl[i]);
3885 fprintf(debug,"\n");
3887 fprintf(debug,"\n");
/* Compact one per-atom vector array (x, v, sd_X or cg_p): charge
 * groups that stay (move[icg] == -1, elided test) are compacted to the
 * front of src in place; moving groups are copied into the per-target
 * communication buffer cgcm_state[m] at the slot for vector index
 * 'vec' out of 'nvec'.  Returns the new home position count.
 */
3891 static int compact_and_copy_vec_at(int ncg,int *move,
3894 rvec *src,gmx_domdec_comm_t *comm,
3897 int m,icg,i,i0,i1,nrcg;
3903 for(m=0; m<DIM*2; m++)
3909 for(icg=0; icg<ncg; icg++)
3911 i1 = cgindex[icg+1];
3917 /* Compact the home array in place */
3918 for(i=i0; i<i1; i++)
3920 copy_rvec(src[i],src[home_pos++]);
3926 /* Copy to the communication buffer */
/* Skip the cg_cm entry (the +1) plus the vectors already written
 * for this charge group, write this vector's atoms, then skip the
 * slots of the remaining vectors. */
3928 pos_vec[m] += 1 + vec*nrcg;
3929 for(i=i0; i<i1; i++)
3931 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3933 pos_vec[m] += (nvec - vec - 1)*nrcg;
3937 home_pos += i1 - i0;
/* Like compact_and_copy_vec_at, but for the per-charge-group cg_cm
 * array: one rvec per charge group.  Staying groups are compacted in
 * place; moving groups go into slot 0 of their cgcm_state buffer
 * entry, after which the whole entry (1 + nrcg*nvec rvecs) is skipped.
 * Returns the new home position count.
 */
3945 static int compact_and_copy_vec_cg(int ncg,int *move,
3947 int nvec,rvec *src,gmx_domdec_comm_t *comm,
3950 int m,icg,i0,i1,nrcg;
3956 for(m=0; m<DIM*2; m++)
3962 for(icg=0; icg<ncg; icg++)
3964 i1 = cgindex[icg+1];
3970 /* Compact the home array in place */
3971 copy_rvec(src[icg],src[home_pos++]);
3977 /* Copy to the communication buffer */
3978 copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
/* Reserve space for cg_cm plus nvec per-atom vectors of this cg. */
3979 pos_vec[m] += 1 + nrcg*nvec;
/* Compact the home index arrays after redistribution: staying charge
 * groups keep their data (global index, cginfo, atom indices) moved to
 * the front and their global->local atom lookups updated; departed
 * groups have their global indices and bLocalCG flags cleared.
 * Returns the new number of home charge groups.
 */
3991 static int compact_ind(int ncg,int *move,
3992 int *index_gl,int *cgindex,
3994 gmx_ga2la_t ga2la,char *bLocalCG,
3997 int cg,nat,a0,a1,a,a_gl;
4002 for(cg=0; cg<ncg; cg++)
4008 /* Compact the home arrays in place.
4009 * Anything that can be done here avoids access to global arrays.
4011 cgindex[home_pos] = nat;
4012 for(a=a0; a<a1; a++)
4015 gatindex[nat] = a_gl;
4016 /* The cell number stays 0, so we don't need to set it */
4017 ga2la_change_la(ga2la,a_gl,nat);
4020 index_gl[home_pos] = index_gl[cg];
4021 cginfo[home_pos] = cginfo[cg];
4022 /* The charge group remains local, so bLocalCG does not change */
4027 /* Clear the global indices */
4028 for(a=a0; a<a1; a++)
4030 ga2la_del(ga2la,gatindex[a]);
4034 bLocalCG[index_gl[cg]] = FALSE;
/* Terminate the compacted cgindex array with the total atom count. */
4038 cgindex[home_pos] = nat;
/* Non-compacting variant used with sorting: for each charge group
 * that moved away, clear its global->local lookups and bLocalCG flag,
 * and mark it in the ns grid cell index so the sort can drop it.
 */
4043 static void clear_and_mark_ind(int ncg,int *move,
4044 int *index_gl,int *cgindex,int *gatindex,
4045 gmx_ga2la_t ga2la,char *bLocalCG,
4050 for(cg=0; cg<ncg; cg++)
4056 /* Clear the global indices */
4057 for(a=a0; a<a1; a++)
4059 ga2la_del(ga2la,gatindex[a]);
4063 bLocalCG[index_gl[cg]] = FALSE;
4065 /* Signal that this cg has moved using the ns cell index.
4066 * Here we set it to -1.
4067 * fill_grid will change it from -1 to 4*grid->ncells.
4069 cell_index[cg] = -1;
4074 static void print_cg_move(FILE *fplog,
4076 gmx_large_int_t step,int cg,int dim,int dir,
4077 gmx_bool bHaveLimitdAndCMOld,real limitd,
4078 rvec cm_old,rvec cm_new,real pos_d)
4080 gmx_domdec_comm_t *comm;
4085 fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4086 if (bHaveLimitdAndCMOld)
4088 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4089 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4093 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4094 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4096 fprintf(fplog,"distance out of cell %f\n",
4097 dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4098 if (bHaveLimitdAndCMOld)
4100 fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4101 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4103 fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4104 cm_new[XX],cm_new[YY],cm_new[ZZ]);
4105 fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4107 comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4108 fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4110 comm->cell_x0[dim],comm->cell_x1[dim]);
/* Report a too-large charge-group displacement to the log (when
 * available) and stderr via print_cg_move, then abort with a fatal
 * error explaining the likely cause (unequilibrated system).
 */
4113 static void cg_move_error(FILE *fplog,
4115 gmx_large_int_t step,int cg,int dim,int dir,
4116 gmx_bool bHaveLimitdAndCMOld,real limitd,
4117 rvec cm_old,rvec cm_new,real pos_d)
4121 print_cg_move(fplog, dd,step,cg,dim,dir,
4122 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4124 print_cg_move(stderr,dd,step,cg,dim,dir,
4125 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4127 "A charge group moved too far between two domain decomposition steps\n"
4128 "This usually means that your system is not well equilibrated");
/* Apply the screw-PBC rotation to atom a for every distributed state
 * entry that is present: positions are mirrored in y and z within the
 * (rectangular) box, while velocity-like quantities (v, sd_X, cg_p)
 * simply change sign in y and z.
 */
4131 static void rotate_state_atom(t_state *state,int a)
4135 for(est=0; est<estNR; est++)
/* Only distributed entries that are actually allocated in this run
 * (flag set) need rotating. */
4137 if (EST_DISTR(est) && (state->flags & (1<<est))) {
4140 /* Rotate the complete state; for a rectangular box only */
4141 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4142 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4145 state->v[a][YY] = -state->v[a][YY];
4146 state->v[a][ZZ] = -state->v[a][ZZ];
4149 state->sd_X[a][YY] = -state->sd_X[a][YY];
4150 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4153 state->cg_p[a][YY] = -state->cg_p[a][YY];
4154 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4156 case estDISRE_INITF:
4157 case estDISRE_RM3TAV:
4158 case estORIRE_INITF:
4160 /* These are distances, so not affected by rotation */
4163 gmx_incons("Unknown state entry encountered in rotate_state_atom");
/* Redistribute charge groups between neighboring DD cells after the
 * cell boundaries moved or atoms crossed them.  Outline:
 *   1. determine per-cg displacement, apply PBC, flag the target
 *      neighbor (move[]) and pack flags into cggl_flag,
 *   2. compact the home arrays and copy movers into send buffers,
 *   3. for each DD dimension, sendrecv counts, flags and state with
 *      both neighbors, unpacking received groups that stay here and
 *      forwarding the rest to the next dimension.
 * Returns the number of charge groups that never left this node
 * (ncg_stay_home).  Heavily elided in this excerpt; documented as-is.
 */
4169 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4170 gmx_domdec_t *dd,ivec tric_dir,
4171 t_state *state,rvec **f,
4172 t_forcerec *fr,t_mdatoms *md,
4178 int ncg[DIM*2],nat[DIM*2];
4179 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4180 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4181 int sbuf[2],rbuf[2];
4182 int home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4184 gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4189 rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4191 cginfo_mb_t *cginfo_mb;
4192 gmx_domdec_comm_t *comm;
4196 check_screw_box(state->box);
/* Determine which distributed state vectors are present; they decide
 * how many rvecs per atom travel with each charge group. */
4202 for(i=0; i<estNR; i++)
4208 case estX: /* Always present */ break;
4209 case estV: bV = (state->flags & (1<<i)); break;
4210 case estSDX: bSDX = (state->flags & (1<<i)); break;
4211 case estCGP: bCGP = (state->flags & (1<<i)); break;
4214 case estDISRE_INITF:
4215 case estDISRE_RM3TAV:
4216 case estORIRE_INITF:
4218 /* No processing required */
4221 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4226 if (dd->ncg_tot > comm->nalloc_int)
4228 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4229 srenew(comm->buf_int,comm->nalloc_int);
4231 move = comm->buf_int;
4233 /* Clear the count */
4234 for(c=0; c<dd->ndim*2; c++)
4240 npbcdim = dd->npbcdim;
/* Set up per-dimension cell bounds and displacement limits; outer
 * cells of unbounded dimensions get infinite bounds. */
4242 for(d=0; (d<DIM); d++)
4244 limitd[d] = dd->comm->cellsize_min[d];
4245 if (d >= npbcdim && dd->ci[d] == 0)
4247 cell_x0[d] = -GMX_FLOAT_MAX;
4251 cell_x0[d] = comm->cell_x0[d];
4253 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4255 cell_x1[d] = GMX_FLOAT_MAX;
4259 cell_x1[d] = comm->cell_x1[d];
4263 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4264 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4268 /* We check after communication if a charge group moved
4269 * more than one cell. Set the pre-comm check limit to float_max.
4271 limit0[d] = -GMX_FLOAT_MAX;
4272 limit1[d] = GMX_FLOAT_MAX;
4276 make_tric_corr_matrix(npbcdim,state->box,tcm);
4278 cgindex = dd->cgindex;
4280 /* Compute the center of geometry for all home charge groups
4281 * and put them in the box and determine where they should go.
4283 for(cg=0; cg<dd->ncg_home; cg++)
4290 copy_rvec(state->x[k0],cm_new);
4297 for(k=k0; (k<k1); k++)
4299 rvec_inc(cm_new,state->x[k]);
4301 for(d=0; (d<DIM); d++)
4303 cm_new[d] = inv_ncg*cm_new[d];
4308 /* Do pbc and check DD cell boundary crossings */
4309 for(d=DIM-1; d>=0; d--)
4313 bScrew = (dd->bScrewPBC && d == XX);
4314 /* Determine the location of this cg in lattice coordinates */
4318 for(d2=d+1; d2<DIM; d2++)
4320 pos_d += cm_new[d2]*tcm[d2][d];
4323 /* Put the charge group in the triclinic unit-cell */
4324 if (pos_d >= cell_x1[d])
/* Beyond the allowed displacement: diagnostic + fatal error. */
4326 if (pos_d >= limit1[d])
4328 cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4329 cg_cm[cg],cm_new,pos_d);
4332 if (dd->ci[d] == dd->nc[d] - 1)
4334 rvec_dec(cm_new,state->box[d]);
4337 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4338 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4340 for(k=k0; (k<k1); k++)
4342 rvec_dec(state->x[k],state->box[d]);
4345 rotate_state_atom(state,k);
4350 else if (pos_d < cell_x0[d])
4352 if (pos_d < limit0[d])
4354 cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4355 cg_cm[cg],cm_new,pos_d);
4360 rvec_inc(cm_new,state->box[d]);
4363 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4364 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4366 for(k=k0; (k<k1); k++)
4368 rvec_inc(state->x[k],state->box[d]);
4371 rotate_state_atom(state,k);
4377 else if (d < npbcdim)
4379 /* Put the charge group in the rectangular unit-cell */
4380 while (cm_new[d] >= state->box[d][d])
4382 rvec_dec(cm_new,state->box[d]);
4383 for(k=k0; (k<k1); k++)
4385 rvec_dec(state->x[k],state->box[d]);
4388 while (cm_new[d] < 0)
4390 rvec_inc(cm_new,state->box[d]);
4391 for(k=k0; (k<k1); k++)
4393 rvec_inc(state->x[k],state->box[d]);
4399 copy_rvec(cm_new,cg_cm[cg]);
4401 /* Determine where this cg should go */
/* Encode the move direction per DD dimension into flag bits
 * (DD_FLAG_FW/DD_FLAG_BW) based on the per-dimension deviation dev. */
4404 for(d=0; d<dd->ndim; d++)
4409 flag |= DD_FLAG_FW(d);
4415 else if (dev[dim] == -1)
4417 flag |= DD_FLAG_BW(d);
4419 if (dd->nc[dim] > 2)
/* Grow the per-target flag buffer as needed and record the global
 * index plus the packed (size | direction-flags) word. */
4433 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4435 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4436 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4438 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4439 /* We store the cg size in the lower 16 bits
4440 * and the place where the charge group should go
4441 * in the next 6 bits. This saves some communication volume.
4443 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4449 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4450 inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4466 /* Make sure the communication buffers are large enough */
4467 for(mc=0; mc<dd->ndim*2; mc++)
4469 nvr = ncg[mc] + nat[mc]*nvec;
4470 if (nvr > comm->cgcm_state_nalloc[mc])
4472 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4473 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4477 /* Recalculating cg_cm might be cheaper than communicating,
4478 * but that could give rise to rounding issues.
4481 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4482 nvec,cg_cm,comm,bCompact);
/* Pack each present state vector in a fixed order: x, v, sd_X, cg_p;
 * vec counts the slot within the per-cg buffer entry. */
4486 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4487 nvec,vec++,state->x,comm,bCompact);
4490 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4491 nvec,vec++,state->v,comm,bCompact);
4495 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4496 nvec,vec++,state->sd_X,comm,bCompact);
4500 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4501 nvec,vec++,state->cg_p,comm,bCompact);
4506 compact_ind(dd->ncg_home,move,
4507 dd->index_gl,dd->cgindex,dd->gatindex,
4508 dd->ga2la,comm->bLocalCG,
/* Without compaction (sorting will follow) only clear/mark. */
4513 clear_and_mark_ind(dd->ncg_home,move,
4514 dd->index_gl,dd->cgindex,dd->gatindex,
4515 dd->ga2la,comm->bLocalCG,
4516 fr->ns.grid->cell_index);
4519 cginfo_mb = fr->cginfo_mb;
4521 ncg_stay_home = home_pos_cg;
4522 for(d=0; d<dd->ndim; d++)
/* With only 2 cells there is a single neighbor, otherwise two. */
4528 for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4531 /* Communicate the cg and atom counts */
4536 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4537 d,dir,sbuf[0],sbuf[1]);
4539 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4541 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4543 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4544 srenew(comm->buf_int,comm->nalloc_int);
4547 /* Communicate the charge group indices, sizes and flags */
4548 dd_sendrecv_int(dd, d, dir,
4549 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4550 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4552 nvs = ncg[cdd] + nat[cdd]*nvec;
4553 i = rbuf[0] + rbuf[1] *nvec;
4554 vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4556 /* Communicate cgcm and state */
4557 dd_sendrecv_rvec(dd, d, dir,
4558 comm->cgcm_state[cdd], nvs,
4559 comm->vbuf.v+nvr, i);
4560 ncg_recv += rbuf[0];
4561 nat_recv += rbuf[1];
4565 /* Process the received charge groups */
4567 for(cg=0; cg<ncg_recv; cg++)
4569 flag = comm->buf_int[cg*DD_CGIBS+1];
4571 if (dim >= npbcdim && dd->nc[dim] > 2)
4573 /* No pbc in this dim and more than one domain boundary.
4574 * We do a separate check if a charge did not move too far.
4576 if (((flag & DD_FLAG_FW(d)) &&
4577 comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4578 ((flag & DD_FLAG_BW(d)) &&
4579 comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4581 cg_move_error(fplog,dd,step,cg,d,
4582 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4584 comm->vbuf.v[buf_pos],
4585 comm->vbuf.v[buf_pos],
4586 comm->vbuf.v[buf_pos][d]);
4593 /* Check which direction this cg should go */
4594 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4598 /* The cell boundaries for dimension d2 are not equal
4599 * for each cell row of the lower dimension(s),
4600 * therefore we might need to redetermine where
4601 * this cg should go.
4604 /* If this cg crosses the box boundary in dimension d2
4605 * we can use the communicated flag, so we do not
4606 * have to worry about pbc.
4608 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4609 (flag & DD_FLAG_FW(d2))) ||
4610 (dd->ci[dim2] == 0 &&
4611 (flag & DD_FLAG_BW(d2)))))
4613 /* Clear the two flags for this dimension */
4614 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4615 /* Determine the location of this cg
4616 * in lattice coordinates
4618 pos_d = comm->vbuf.v[buf_pos][dim2];
4621 for(d3=dim2+1; d3<DIM; d3++)
4624 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4627 /* Check of we are not at the box edge.
4628 * pbc is only handled in the first step above,
4629 * but this check could move over pbc while
4630 * the first step did not due to different rounding.
4632 if (pos_d >= cell_x1[dim2] &&
4633 dd->ci[dim2] != dd->nc[dim2]-1)
4635 flag |= DD_FLAG_FW(d2);
4637 else if (pos_d < cell_x0[dim2] &&
4640 flag |= DD_FLAG_BW(d2);
4642 comm->buf_int[cg*DD_CGIBS+1] = flag;
4645 /* Set to which neighboring cell this cg should go */
4646 if (flag & DD_FLAG_FW(d2))
4650 else if (flag & DD_FLAG_BW(d2))
4652 if (dd->nc[dd->dim[d2]] > 2)
4664 nrcg = flag & DD_FLAG_NRCG;
/* This charge group stays here: append it to the home arrays and
 * unpack its state from the receive buffer. */
4667 if (home_pos_cg+1 > dd->cg_nalloc)
4669 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4670 srenew(dd->index_gl,dd->cg_nalloc);
4671 srenew(dd->cgindex,dd->cg_nalloc+1);
4673 /* Set the global charge group index and size */
4674 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4675 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4676 /* Copy the state from the buffer */
4677 if (home_pos_cg >= fr->cg_nalloc)
4679 dd_realloc_fr_cg(fr,home_pos_cg+1);
4682 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4683 /* Set the cginfo */
4684 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4685 dd->index_gl[home_pos_cg]);
4688 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4691 if (home_pos_at+nrcg > state->nalloc)
4693 dd_realloc_state(state,f,home_pos_at+nrcg);
4695 for(i=0; i<nrcg; i++)
4697 copy_rvec(comm->vbuf.v[buf_pos++],
4698 state->x[home_pos_at+i]);
4702 for(i=0; i<nrcg; i++)
4704 copy_rvec(comm->vbuf.v[buf_pos++],
4705 state->v[home_pos_at+i]);
4710 for(i=0; i<nrcg; i++)
4712 copy_rvec(comm->vbuf.v[buf_pos++],
4713 state->sd_X[home_pos_at+i]);
4718 for(i=0; i<nrcg; i++)
4720 copy_rvec(comm->vbuf.v[buf_pos++],
4721 state->cg_p[home_pos_at+i]);
4725 home_pos_at += nrcg;
/* The charge group moves on to a further cell: copy it from the
 * receive buffer into the send buffer for target mc. */
4729 /* Reallocate the buffers if necessary */
4730 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4732 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4733 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4735 nvr = ncg[mc] + nat[mc]*nvec;
4736 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4738 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4739 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4741 /* Copy from the receive to the send buffers */
4742 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4743 comm->buf_int + cg*DD_CGIBS,
4744 DD_CGIBS*sizeof(int));
4745 memcpy(comm->cgcm_state[mc][nvr],
4746 comm->vbuf.v[buf_pos],
4747 (1+nrcg*nvec)*sizeof(rvec));
4748 buf_pos += 1 + nrcg*nvec;
4755 /* With sorting (!bCompact) the indices are now only partially up to date
4756 * and ncg_home and nat_home are not the real count, since there are
4757 * "holes" in the arrays for the charge groups that moved to neighbors.
4759 dd->ncg_home = home_pos_cg;
4760 dd->nat_home = home_pos_at;
4764 fprintf(debug,"Finished repartitioning\n");
4767 return ncg_stay_home;
/* Accumulate a cycle count for load-balancing category ddCycl:
 * add to the sum, bump the sample count, and track the maximum.
 */
4770 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4772 dd->comm->cycl[ddCycl] += cycles;
4773 dd->comm->cycl_n[ddCycl]++;
4774 if (cycles > dd->comm->cycl_max[ddCycl])
4776 dd->comm->cycl_max[ddCycl] = cycles;
/* Estimate the force-calculation cost from the flop counters,
 * weighting nonbonded kernels down (x0.5, and x0.25 for water
 * kernels) to better approximate real timings, and adding the
 * free-energy/1-4 and bonded/wall terms at full cost.
 */
4780 static double force_flop_count(t_nrnb *nrnb)
4787 for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4789 /* To get closer to the real timings, we half the count
4790 * for the normal loops and again half it for water loops.
/* Water-optimized kernels are identified by W3/W4 in their name. */
4793 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4795 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4799 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4802 for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4805 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4806 sum += nrnb->n[i]*cost_nrnb(i);
4808 for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4810 sum += nrnb->n[i]*cost_nrnb(i);
/* Start flop-based load measurement: subtract the current flop count
 * so the matching dd_force_flop_stop yields the difference.
 */
4816 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4818 if (dd->comm->eFlop)
4820 dd->comm->flop -= force_flop_count(nrnb);
/* Stop flop-based load measurement: add the current flop count,
 * completing the difference started by dd_force_flop_start.
 */
4823 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4825 if (dd->comm->eFlop)
4827 dd->comm->flop += force_flop_count(nrnb);
/* Reset all per-category cycle counters (sum, sample count, max) and
 * the flop sample count for a new load-measurement interval.
 */
4832 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4836 for(i=0; i<ddCyclNr; i++)
4838 dd->comm->cycl[i] = 0;
4839 dd->comm->cycl_n[i] = 0;
4840 dd->comm->cycl_max[i] = 0;
4843 dd->comm->flop_n = 0;
/* Gather the load measurements over the DD grid, dimension by
 * dimension from the highest to the lowest: each rank packs its load
 * (and, with DLB, its cell fraction and boundary extrema; with
 * separate PME ranks, the PP-during-PME and PME cycle counts) into
 * sbuf, the row root gathers and reduces them, and the reduced values
 * propagate down so that finally comm->load[0] holds the whole-grid
 * statistics, which are accumulated into the running load_* totals.
 */
4846 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4848 gmx_domdec_comm_t *comm;
4849 gmx_domdec_load_t *load;
4850 gmx_domdec_root_t *root=NULL;
4851 int d,dim,cid,i,pos;
4852 float cell_frac=0,sbuf[DD_NLOAD_MAX];
4857 fprintf(debug,"get_load_distribution start\n");
4860 wallcycle_start(wcycle,ewcDDCOMMLOAD);
4864 bSepPME = (dd->pme_nodeid >= 0);
4866 for(d=dd->ndim-1; d>=0; d--)
4869 /* Check if we participate in the communication in this dimension */
4870 if (d == dd->ndim-1 ||
4871 (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4873 load = &comm->load[d];
4876 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
/* Innermost dimension: measure our own force load; higher
 * dimensions forward the already-reduced values from d+1. */
4879 if (d == dd->ndim-1)
4881 sbuf[pos++] = dd_force_load(comm);
4882 sbuf[pos++] = sbuf[0];
4885 sbuf[pos++] = sbuf[0];
4886 sbuf[pos++] = cell_frac;
4889 sbuf[pos++] = comm->cell_f_max0[d];
4890 sbuf[pos++] = comm->cell_f_min1[d];
4895 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4896 sbuf[pos++] = comm->cycl[ddCyclPME];
4901 sbuf[pos++] = comm->load[d+1].sum;
4902 sbuf[pos++] = comm->load[d+1].max;
4905 sbuf[pos++] = comm->load[d+1].sum_m;
4906 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4907 sbuf[pos++] = comm->load[d+1].flags;
4910 sbuf[pos++] = comm->cell_f_max0[d];
4911 sbuf[pos++] = comm->cell_f_min1[d];
4916 sbuf[pos++] = comm->load[d+1].mdf;
4917 sbuf[pos++] = comm->load[d+1].pme;
4921 /* Communicate a row in DD direction d.
4922 * The communicators are setup such that the root always has rank 0.
4925 MPI_Gather(sbuf ,load->nload*sizeof(float),MPI_BYTE,
4926 load->load,load->nload*sizeof(float),MPI_BYTE,
4927 0,comm->mpi_comm_load[d]);
4929 if (dd->ci[dim] == dd->master_ci[dim])
4931 /* We are the root, process this row */
4932 if (comm->bDynLoadBal)
4934 root = comm->root[d];
/* Reduce the gathered per-cell records: sum/max of load, and
 * with DLB the scaled minimum cell volume, limit flags and the
 * staggering extrema for the boundary computation. */
4944 for(i=0; i<dd->nc[dim]; i++)
4946 load->sum += load->load[pos++];
4947 load->max = max(load->max,load->load[pos]);
4953 /* This direction could not be load balanced properly,
4954 * therefore we need to use the maximum iso the average load.
4956 load->sum_m = max(load->sum_m,load->load[pos]);
4960 load->sum_m += load->load[pos];
4963 load->cvol_min = min(load->cvol_min,load->load[pos]);
4967 load->flags = (int)(load->load[pos++] + 0.5);
4971 root->cell_f_max0[i] = load->load[pos++];
4972 root->cell_f_min1[i] = load->load[pos++];
4977 load->mdf = max(load->mdf,load->load[pos]);
4979 load->pme = max(load->pme,load->load[pos]);
/* If balancing was limited somewhere in this row, scale sum_m up
 * to the worst case and record the dimension in the flags. */
4983 if (comm->bDynLoadBal && root->bLimited)
4985 load->sum_m *= dd->nc[dim];
4986 load->flags |= (1<<d);
/* Accumulate the whole-grid statistics (comm->load[0]) into the
 * running totals used for the final performance report. */
4994 comm->nload += dd_load_count(comm);
4995 comm->load_step += comm->cycl[ddCyclStep];
4996 comm->load_sum += comm->load[0].sum;
4997 comm->load_max += comm->load[0].max;
4998 if (comm->bDynLoadBal)
5000 for(d=0; d<dd->ndim; d++)
5002 if (comm->load[0].flags & (1<<d))
5004 comm->load_lim[d]++;
5010 comm->load_mdf += comm->load[0].mdf;
5011 comm->load_pme += comm->load[0].pme;
5015 wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5019 fprintf(debug,"get_load_distribution finished\n");
/* Estimate the fraction of total run time lost to force-load
 * imbalance: (max - average) load relative to the step time, summed
 * over the measured intervals.  Returns 0 when nothing was measured
 * (branch elided in this excerpt).
 */
5023 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5025 /* Return the relative performance loss on the total run time
5026 * due to the force calculation load imbalance.
5028 if (dd->comm->nload > 0)
5031 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5032 (dd->comm->load_step*dd->nnodes);
/* Print, on the DD master only, a summary of the average load imbalance
 * (force imbalance, DLB-limited steps, PP/PME balance) to both the log
 * file and stderr, followed by NOTEs when the measured performance loss
 * exceeds DD_PERF_LOSS.  Requires load statistics (comm->nload > 0).
 */
5040 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5043 int npp,npme,nnodes,d,limp;
5044 float imbal,pme_f_ratio,lossf,lossp=0;
5046 gmx_domdec_comm_t *comm;
5049 if (DDMASTER(dd) && comm->nload > 0)
5052 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5053 nnodes = npp + npme;
/* Relative force-load imbalance: max load over average load, minus 1. */
5054 imbal = comm->load_max*npp/comm->load_sum - 1;
5055 lossf = dd_force_imb_perf_loss(dd);
5056 sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5057 fprintf(fplog,"%s",buf);
5058 fprintf(stderr,"\n");
5059 fprintf(stderr,"%s",buf);
5060 sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5061 fprintf(fplog,"%s",buf);
5062 fprintf(stderr,"%s",buf);
5064 if (comm->bDynLoadBal)
5066 sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5067 for(d=0; d<dd->ndim; d++)
/* Percentage of steps where DLB was limited in this dimension,
 * rounded to the nearest integer (hence the +1 over 2*nload).
 */
5069 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5070 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5076 sprintf(buf+strlen(buf),"\n");
5077 fprintf(fplog,"%s",buf);
5078 fprintf(stderr,"%s",buf);
/* PP/PME balance: lossp < 0 means the PME nodes had less work than PP. */
5082 pme_f_ratio = comm->load_pme/comm->load_mdf;
5083 lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5086 lossp *= (float)npme/(float)nnodes;
5090 lossp *= (float)npp/(float)nnodes;
5092 sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5093 fprintf(fplog,"%s",buf);
5094 fprintf(stderr,"%s",buf);
5095 sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5096 fprintf(fplog,"%s",buf);
5097 fprintf(stderr,"%s",buf);
5099 fprintf(fplog,"\n");
5100 fprintf(stderr,"\n");
/* Advise the user when the force imbalance loss is above the threshold. */
5102 if (lossf >= DD_PERF_LOSS)
5105 "NOTE: %.1f %% performance was lost due to load imbalance\n"
5106 " in the domain decomposition.\n",lossf*100);
5107 if (!comm->bDynLoadBal)
5109 sprintf(buf+strlen(buf)," You might want to use dynamic load balancing (option -dlb.)\n");
5113 sprintf(buf+strlen(buf)," You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5115 fprintf(fplog,"%s\n",buf);
5116 fprintf(stderr,"%s\n",buf);
/* Advise on the PP/PME split; the sign of lossp selects the wording. */
5118 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5121 "NOTE: %.1f %% performance was lost because the PME nodes\n"
5122 " had %s work to do than the PP nodes.\n"
5123 " You might want to %s the number of PME nodes\n"
5124 " or %s the cut-off and the grid spacing.\n",
5126 (lossp < 0) ? "less" : "more",
5127 (lossp < 0) ? "decrease" : "increase",
5128 (lossp < 0) ? "decrease" : "increase");
5129 fprintf(fplog,"%s\n",buf);
5130 fprintf(stderr,"%s\n",buf);
/* Return the minimum cell volume over all DD cells, normalized by the
 * average cell volume (cvol_min is scaled by the number of nodes).
 */
5135 static float dd_vol_min(gmx_domdec_t *dd)
5137 return dd->comm->load[0].cvol_min*dd->nnodes;
/* Return the per-dimension DLB-limited flag bits collected in the
 * global load record (bit d set when dimension d was limited).
 */
5140 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5142 return dd->comm->load[0].flags;
/* Return the relative force-load imbalance for the last measurement:
 * max load over average load, minus 1 (0 means perfectly balanced).
 */
5145 static float dd_f_imbal(gmx_domdec_t *dd)
5147 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
/* Return the ratio of PME mesh load to particle-particle force load
 * from the last measurement.
 */
5150 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5152 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
/* Write a one-line load report for the given step to the log file:
 * DLB limitation flags per dimension, min/average cell volume (with DLB),
 * force imbalance percentage, and the PME/force ratio when PME cycle
 * counts are available.
 */
5155 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5160 flags = dd_load_flags(dd);
5164 "DD load balancing is limited by minimum cell size in dimension");
5165 for(d=0; d<dd->ndim; d++)
5169 fprintf(fplog," %c",dim2char(dd->dim[d]));
5172 fprintf(fplog,"\n");
5174 fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
5175 if (dd->comm->bDynLoadBal)
/* '!' marks that DLB was limited by the minimum cell size. */
5177 fprintf(fplog," vol min/aver %5.3f%c",
5178 dd_vol_min(dd),flags ? '!' : ' ');
5180 fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5181 if (dd->comm->cycl_n[ddCyclPME])
5183 fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5185 fprintf(fplog,"\n\n");
/* Short load report to stderr for verbose runs: cell volume ratio
 * (with DLB), force imbalance percentage, and PME/force ratio.
 */
5188 static void dd_print_load_verbose(gmx_domdec_t *dd)
5190 if (dd->comm->bDynLoadBal)
5192 fprintf(stderr,"vol %4.2f%c ",
5193 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5195 fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5196 if (dd->comm->cycl_n[ddCyclPME])
5198 fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
/* Create the MPI communicator for one row of DD cells along dimension
 * dim_ind (cells that share all other grid coordinates, given in loc).
 * On the row root this also allocates the DLB bookkeeping arrays
 * (cell boundaries, min/max bounds) and the load-collection buffer.
 */
5203 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5204 int dim_ind,ivec loc)
5206 MPI_Group g_row = MPI_GROUP_EMPTY;
5210 gmx_domdec_root_t *root;
5211 gmx_bool bPartOfGroup = FALSE;
5213 dim = dd->dim[dim_ind];
5214 copy_ivec(loc,loc_c);
/* Collect the global ranks of all cells in this row. */
5215 snew(rank,dd->nc[dim]);
5216 for(i=0; i<dd->nc[dim]; i++)
5219 rank[i] = dd_index(dd->nc,loc_c);
5220 if (rank[i] == dd->rank)
5222 /* This process is part of the group */
5223 bPartOfGroup = TRUE;
/* MPI_Comm_create is collective over mpi_comm_all; processes not in
 * g_row obtain MPI_COMM_NULL.
 */
5228 MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5230 MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5233 dd->comm->mpi_comm_load[dim_ind] = c_row;
5234 if (dd->comm->eDLB != edlbNO)
5236 if (dd->ci[dim] == dd->master_ci[dim])
5238 /* This is the root process of this row */
5239 snew(dd->comm->root[dim_ind],1);
5240 root = dd->comm->root[dim_ind];
5241 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5242 snew(root->old_cell_f,dd->nc[dim]+1);
5243 snew(root->bCellMin,dd->nc[dim]);
5246 snew(root->cell_f_max0,dd->nc[dim]);
5247 snew(root->cell_f_min1,dd->nc[dim]);
5248 snew(root->bound_min,dd->nc[dim]);
5249 snew(root->bound_max,dd->nc[dim]);
5251 snew(root->buf_ncd,dd->nc[dim]);
5255 /* This is not a root process, we only need to receive cell_f */
5256 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
/* The row root also collects the per-cell load data. */
5259 if (dd->ci[dim] == dd->master_ci[dim])
5261 snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
/* Set up the load-balancing communicators for all decomposition
 * dimensions: one row communicator per (up to three) dimensions,
 * looping over all perpendicular grid coordinates so every cell joins
 * exactly one row per dimension.
 */
5268 static void make_load_communicators(gmx_domdec_t *dd)
5276 fprintf(debug,"Making load communicators\n");
5278 MPI_Comm_group(dd->mpi_comm_all,&g_all);
5280 snew(dd->comm->load,dd->ndim);
5281 snew(dd->comm->mpi_comm_load,dd->ndim);
/* Dimension 0 has a single row; higher dimensions need one row per
 * coordinate along the preceding dimension(s).
 */
5284 make_load_communicator(dd,g_all,0,loc);
5287 for(i=0; i<dd->nc[dim0]; i++) {
5289 make_load_communicator(dd,g_all,1,loc);
5294 for(i=0; i<dd->nc[dim0]; i++) {
5297 for(j=0; j<dd->nc[dim1]; j++) {
5299 make_load_communicator(dd,g_all,2,loc);
5304 MPI_Group_free(&g_all);
5307 fprintf(debug,"Finished making load communicators\n");
/* Set up the DD grid topology on this rank: the forward/backward
 * neighbor ranks in each decomposition dimension (periodic wrap-around),
 * the communication zones and their shifts for 1/2/3-D decomposition,
 * and finally the DLB root structures and load communicators when
 * load recording is enabled.
 */
5311 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5317 ivec dd_zp[DD_MAXIZONE];
5318 gmx_domdec_zones_t *zones;
5319 gmx_domdec_ns_ranges_t *izone;
/* Determine the +1/-1 neighbor rank in each DD dimension with
 * periodic wrapping of the cell index.
 */
5321 for(d=0; d<dd->ndim; d++)
5324 copy_ivec(dd->ci,tmp);
5325 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5326 dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5327 copy_ivec(dd->ci,tmp);
5328 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5329 dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5332 fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5335 dd->neighbor[d][1]);
5341 fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5342 dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5346 fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5348 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5349 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
/* Select the zone-pair table matching the dimensionality (3, 2 or 1D). */
5356 for(i=0; i<nzonep; i++)
5358 copy_ivec(dd_zp3[i],dd_zp[i]);
5364 for(i=0; i<nzonep; i++)
5366 copy_ivec(dd_zp2[i],dd_zp[i]);
5372 for(i=0; i<nzonep; i++)
5374 copy_ivec(dd_zp1[i],dd_zp[i]);
5378 gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5383 zones = &dd->comm->zones;
/* Expand the compact zone shifts (one entry per DD dimension) into
 * full DIM-sized shift vectors.
 */
5385 for(i=0; i<nzone; i++)
5388 clear_ivec(zones->shift[i]);
5389 for(d=0; d<dd->ndim; d++)
5391 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5396 for(i=0; i<nzone; i++)
5398 for(d=0; d<DIM; d++)
5400 s[d] = dd->ci[d] - zones->shift[i][d];
5405 else if (s[d] >= dd->nc[d])
/* Set up the i-zones used in neighbor searching: for each i-zone the
 * j-zone range and the allowed shift range in each dimension.
 */
5411 zones->nizone = nzonep;
5412 for(i=0; i<zones->nizone; i++)
5414 if (dd_zp[i][0] != i)
5416 gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5418 izone = &zones->izone[i];
5419 izone->j0 = dd_zp[i][1];
5420 izone->j1 = dd_zp[i][2];
5421 for(dim=0; dim<DIM; dim++)
5423 if (dd->nc[dim] == 1)
5425 /* All shifts should be allowed */
5426 izone->shift0[dim] = -1;
5427 izone->shift1[dim] = 1;
5432 izone->shift0[d] = 0;
5433 izone->shift1[d] = 0;
5434 for(j=izone->j0; j<izone->j1; j++) {
5435 if (dd->shift[j][d] > dd->shift[i][d])
5436 izone->shift0[d] = -1;
5437 if (dd->shift[j][d] < dd->shift[i][d])
5438 izone->shift1[d] = 1;
5444 /* Assume the shift are not more than 1 cell */
/* Start with an impossible range and widen it to cover the actual
 * shift differences of the j-zones.
 */
5445 izone->shift0[dim] = 1;
5446 izone->shift1[dim] = -1;
5447 for(j=izone->j0; j<izone->j1; j++)
5449 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5450 if (shift_diff < izone->shift0[dim])
5452 izone->shift0[dim] = shift_diff;
5454 if (shift_diff > izone->shift1[dim])
5456 izone->shift1[dim] = shift_diff;
5463 if (dd->comm->eDLB != edlbNO)
5465 snew(dd->comm->root,dd->ndim);
5468 if (dd->comm->bRecordLoad)
5470 make_load_communicators(dd);
/* Create the particle-particle (PP) communicator.  With Cartesian PP
 * decomposition this replaces the group communicator with an
 * MPI_Cart_create'd one; otherwise the existing rank order is used.
 * Also determines this rank's DD coordinates, the DD master rank, and
 * (when needed) the ddindex -> simulation-nodeid lookup table.
 */
5474 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5477 gmx_domdec_comm_t *comm;
5488 if (comm->bCartesianPP)
5490 /* Set up cartesian communication for the particle-particle part */
5493 fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5494 dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5497 for(i=0; i<DIM; i++)
5501 MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5503 /* We overwrite the old communicator with the new cartesian one */
5504 cr->mpi_comm_mygroup = comm_cart;
5507 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5508 MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5510 if (comm->bCartesianPP_PME)
5512 /* Since we want to use the original cartesian setup for sim,
5513 * and not the one after split, we need to make an index.
/* ddindex2ddnodeid is filled collectively: each rank sets its own
 * entry and the table is completed with a global integer sum.
 */
5515 snew(comm->ddindex2ddnodeid,dd->nnodes);
5516 comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5517 gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5518 /* Get the rank of the DD master,
5519 * above we made sure that the master node is a PP node.
5529 MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5531 else if (comm->bCartesianPP)
5533 if (cr->npmenodes == 0)
5535 /* The PP communicator is also
5536 * the communicator for this simulation
5538 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5540 cr->nodeid = dd->rank;
5542 MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5544 /* We need to make an index to go from the coordinates
5545 * to the nodeid of this simulation.
5547 snew(comm->ddindex2simnodeid,dd->nnodes);
5548 snew(buf,dd->nnodes);
5549 if (cr->duty & DUTY_PP)
5551 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5553 /* Communicate the ddindex to simulation nodeid index */
5554 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5555 cr->mpi_comm_mysim);
5558 /* Determine the master coordinates and rank.
5559 * The DD master should be the same node as the master of this sim.
5561 for(i=0; i<dd->nnodes; i++)
5563 if (comm->ddindex2simnodeid[i] == 0)
5565 ddindex2xyz(dd->nc,i,dd->master_ci);
5566 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5571 fprintf(debug,"The master rank is %d\n",dd->masterrank);
5576 /* No Cartesian communicators */
5577 /* We use the rank in dd->comm->all as DD index */
5578 ddindex2xyz(dd->nc,dd->rank,dd->ci);
5579 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5581 clear_ivec(dd->master_ci);
5588 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5589 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5594 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5595 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
/* Build the ddindex -> simulation-nodeid table on ranks that did not
 * construct it in make_pp_communicator (plain Cartesian PP without a
 * combined PP+PME Cartesian grid).  The table is assembled collectively
 * with an all-reduce sum: PP ranks contribute their own entry, others
 * contribute zeros.
 */
5599 static void receive_ddindex2simnodeid(t_commrec *cr)
5603 gmx_domdec_comm_t *comm;
5610 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5612 snew(comm->ddindex2simnodeid,dd->nnodes);
5613 snew(buf,dd->nnodes);
5614 if (cr->duty & DUTY_PP)
5616 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5619 /* Communicate the ddindex to simulation nodeid index */
5620 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5621 cr->mpi_comm_mysim);
/* Allocate the master-only bookkeeping structure used for scattering
 * and gathering state: per-node charge-group counts and indices,
 * per-dimension cell boundaries, and -- when the node count is small
 * enough for send/receive-based communication -- a vector buffer
 * sized for all atoms.
 */
5628 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5631 gmx_domdec_master_t *ma;
5636 snew(ma->ncg,dd->nnodes);
5637 snew(ma->index,dd->nnodes+1);
5639 snew(ma->nat,dd->nnodes);
5640 snew(ma->ibuf,dd->nnodes*2);
5641 snew(ma->cell_x,DIM);
5642 for(i=0; i<DIM; i++)
5644 snew(ma->cell_x[i],dd->nc[i]+1);
5647 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5653 snew(ma->vbuf,natoms);
/* Split the simulation communicator into PP and PME-only parts.
 * When Cartesian PP-PME layout is possible (the PME node count divides
 * evenly into a y or z slab), a combined Cartesian communicator is
 * created and the PME ranks are stacked along cartpmedim; otherwise the
 * split uses the chosen node ordering (PP-first or interleaved).
 * Sets cr->duty and creates cr->mpi_comm_mygroup.
 */
5659 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5663 gmx_domdec_comm_t *comm;
5674 if (comm->bCartesianPP)
/* Check in which of the y/z dimensions the PME nodes divide evenly. */
5676 for(i=1; i<DIM; i++)
5678 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5680 if (bDiv[YY] || bDiv[ZZ])
5682 comm->bCartesianPP_PME = TRUE;
5683 /* If we have 2D PME decomposition, which is always in x+y,
5684 * we stack the PME only nodes in z.
5685 * Otherwise we choose the direction that provides the thinnest slab
5686 * of PME only nodes as this will have the least effect
5687 * on the PP communication.
5688 * But for the PME communication the opposite might be better.
5690 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5692 dd->nc[YY] > dd->nc[ZZ]))
5694 comm->cartpmedim = ZZ;
5698 comm->cartpmedim = YY;
/* Extend the Cartesian grid along cartpmedim to hold the PME ranks. */
5700 comm->ntot[comm->cartpmedim]
5701 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5705 fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5707 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5712 if (comm->bCartesianPP_PME)
5716 fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5719 for(i=0; i<DIM; i++)
5723 MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5726 MPI_Comm_rank(comm_cart,&rank);
/* The setup relies on the sim master keeping rank 0 after reordering. */
5727 if (MASTERNODE(cr) && rank != 0)
5729 gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5732 /* With this assigment we loose the link to the original communicator
5733 * which will usually be MPI_COMM_WORLD, unless have multisim.
5735 cr->mpi_comm_mysim = comm_cart;
5736 cr->sim_nodeid = rank;
5738 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5742 fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5743 cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
/* Ranks beyond the PP grid extent along cartpmedim become PME-only. */
5746 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5750 if (cr->npmenodes == 0 ||
5751 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5753 cr->duty = DUTY_PME;
5756 /* Split the sim communicator into PP and PME only nodes */
5757 MPI_Comm_split(cr->mpi_comm_mysim,
5759 dd_index(comm->ntot,dd->ci),
5760 &cr->mpi_comm_mygroup);
/* Non-Cartesian layout: assign duties by the requested node order. */
5764 switch (dd_node_order)
5769 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5772 case ddnoINTERLEAVE:
5773 /* Interleave the PP-only and PME-only nodes,
5774 * as on clusters with dual-core machines this will double
5775 * the communication bandwidth of the PME processes
5776 * and thus speed up the PP <-> PME and inter PME communication.
5780 fprintf(fplog,"Interleaving PP and PME nodes\n");
5782 comm->pmenodes = dd_pmenodes(cr);
5787 gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5790 if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5792 cr->duty = DUTY_PME;
5799 /* Split the sim communicator into PP and PME only nodes */
5800 MPI_Comm_split(cr->mpi_comm_mysim,
5803 &cr->mpi_comm_mygroup);
5804 MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5810 fprintf(fplog,"This is a %s only node\n\n",
5811 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
/* Top-level communicator setup: optionally split off PME-only nodes,
 * create the PP communicator (Cartesian or not, with optional rank
 * reordering controlled by GMX_NO_CART_REORDER), connect each PP node
 * to its PME node, and allocate the master-only state buffers on the
 * DD master.
 */
5815 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5818 gmx_domdec_comm_t *comm;
5824 copy_ivec(dd->nc,comm->ntot);
5826 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5827 comm->bCartesianPP_PME = FALSE;
5829 /* Reorder the nodes by default. This might change the MPI ranks.
5830 * Real reordering is only supported on very few architectures,
5831 * Blue Gene is one of them.
5833 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5835 if (cr->npmenodes > 0)
5837 /* Split the communicator into a PP and PME part */
5838 split_communicator(fplog,cr,dd_node_order,CartReorder);
5839 if (comm->bCartesianPP_PME)
5841 /* We (possibly) reordered the nodes in split_communicator,
5842 * so it is no longer required in make_pp_communicator.
5844 CartReorder = FALSE;
5849 /* All nodes do PP and PME */
5851 /* We do not require separate communicators */
5852 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5856 if (cr->duty & DUTY_PP)
5858 /* Copy or make a new PP communicator */
5859 make_pp_communicator(fplog,cr,CartReorder);
5863 receive_ddindex2simnodeid(cr);
5866 if (!(cr->duty & DUTY_PME))
5868 /* Set up the commnuication to our PME node */
5869 dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5870 dd->pme_receive_vir_ener = receive_vir_ener(cr);
5873 fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5874 dd->pme_nodeid,dd->pme_receive_vir_ener);
5879 dd->pme_nodeid = -1;
/* The DD master allocates the scatter/gather bookkeeping, sized by
 * the total number of atoms (last entry of the global cg index).
 */
5884 dd->ma = init_gmx_domdec_master_t(dd,
5886 comm->cgs_gl.index[comm->cgs_gl.nr]);
/* Parse the user-supplied static load-balancing string (option -ddcsx/y/z)
 * for one direction into an array of relative cell sizes.  Returns the
 * allocated fractions, or (per the visible control flow) only parses
 * when nc > 1 and a size string is given.  Fatal error on malformed or
 * insufficient entries.
 */
5890 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5897 if (nc > 1 && size_string != NULL)
5901 fprintf(fplog,"Using static load balancing for the %s direction\n",
/* Read one double per cell; %n tracks how far to advance the string. */
5906 for (i=0; i<nc; i++)
5909 sscanf(size_string,"%lf%n",&dbl,&n);
5912 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5921 fprintf(fplog,"Relative cell sizes:");
5923 for (i=0; i<nc; i++)
5928 fprintf(fplog," %5.3f",slb_frac[i]);
5933 fprintf(fplog,"\n");
/* Count the number of multi-body (more than two atoms) bonded
 * interactions in the topology, summed over all molecule blocks.
 * Used to decide whether inter-charge-group multi-body communication
 * is needed.
 */
5940 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5943 gmx_mtop_ilistloop_t iloop;
5947 iloop = gmx_mtop_ilistloop_init(mtop);
5948 while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5950 for(ftype=0; ftype<F_NRE; ftype++)
5952 if ((interaction_function[ftype].flags & IF_BOND) &&
/* Each entry is 1 type index + NRAL(ftype) atom indices. */
5955 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
/* Read an integer setting from environment variable env_var, falling
 * back to def when the variable is unset (or, presumably, unparsable --
 * the elided branch is not visible here).  Logs the value when found.
 */
5963 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5969 val = getenv(env_var);
5972 if (sscanf(val,"%d",&nst) <= 0)
5978 fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
/* Emit a warning string to stderr (on the appropriate rank) and to the
 * log file when one is open.
 */
5986 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5990 fprintf(stderr,"\n%s\n",warn_string);
5994 fprintf(fplog,"\n%s\n",warn_string);
/* Check input-record settings that are incompatible with domain
 * decomposition: screw PBC restricted to x-only decomposition, simple
 * neighbor searching, nstlist=0, and warn for angular COM removal with
 * PBC.  Fatal error on hard incompatibilities.
 */
5998 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5999 t_inputrec *ir,FILE *fplog)
6001 if (ir->ePBC == epbcSCREW &&
6002 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6004 gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6007 if (ir->ns_type == ensSIMPLE)
6009 gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6012 if (ir->nstlist == 0)
6014 gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6017 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6019 dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
/* Return the smallest average cell size over the decomposed dimensions,
 * using the skew-corrected box size divided by the cell count.
 */
6023 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6028 r = ddbox->box_size[XX];
6029 for(di=0; di<dd->ndim; di++)
6032 /* Check using the initial average cell size */
6033 r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
/* Translate the -dlb option character into an eDLB enum value and
 * downgrade it where dynamic load balancing is unsupported or unwanted:
 * rerun mode, non-dynamics integrators, missing cycle counters, and
 * reproducibility requests (with a warning when DLB was forced on).
 */
6039 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6040 const char *dlb_opt,gmx_bool bRecordLoad,
6041 unsigned long Flags,t_inputrec *ir)
6049 case 'a': eDLB = edlbAUTO; break;
6050 case 'n': eDLB = edlbNO; break;
6051 case 'y': eDLB = edlbYES; break;
6052 default: gmx_incons("Unknown dlb_opt");
6055 if (Flags & MD_RERUN)
6060 if (!EI_DYNAMICS(ir->eI))
6062 if (eDLB == edlbYES)
6064 sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6065 dd_warning(cr,fplog,buf);
6073 dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
/* Reproducible runs: auto-DLB is disabled; forced DLB only warns. */
6078 if (Flags & MD_REPRODUCIBLE)
6085 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6089 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6092 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
/* Fill dd->dim[] with the decomposed dimensions (those with more than
 * one cell), in x,y,z order by default or z,y,x when GMX_DD_ORDER_ZYX
 * is set, and set dd->ndim accordingly.
 */
6100 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6105 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6107 /* Decomposition order z,y,x */
6110 fprintf(fplog,"Using domain decomposition order z, y, x\n");
6112 for(dim=DIM-1; dim>=0; dim--)
6114 if (dd->nc[dim] > 1)
6116 dd->dim[dd->ndim++] = dim;
6122 /* Decomposition order x,y,z */
6123 for(dim=0; dim<DIM; dim++)
6125 if (dd->nc[dim] > 1)
6127 dd->dim[dd->ndim++] = dim;
/* Allocate and zero-initialize a gmx_domdec_comm_t: per-direction
 * charge-group communication buffers, the shared integer/vector
 * buffers, and the load-statistics accumulators.
 */
6133 static gmx_domdec_comm_t *init_dd_comm()
6135 gmx_domdec_comm_t *comm;
6139 snew(comm->cggl_flag,DIM*2);
6140 snew(comm->cgcm_state,DIM*2);
6141 for(i=0; i<DIM*2; i++)
6143 comm->cggl_flag_nalloc[i] = 0;
6144 comm->cgcm_state_nalloc[i] = 0;
6147 comm->nalloc_int = 0;
6148 comm->buf_int = NULL;
6150 vec_rvec_init(&comm->vbuf);
6152 comm->n_load_have = 0;
6153 comm->n_load_collect = 0;
6155 for(i=0; i<ddnatNR-ddnatZONE; i++)
6157 comm->sum_nat[i] = 0;
6161 comm->load_step = 0;
6164 clear_ivec(comm->load_lim);
/* Construct and configure the gmx_domdec_t for this run:
 *  - read the GMX_DD_* / GMX_DLB_* tuning environment variables,
 *  - decide on dynamic load balancing support and charge-group sorting,
 *  - derive the cell-size lower limit from bonded interactions and
 *    P-LINCS constraints,
 *  - take the user-specified grid or call dd_choose_grid() to pick the
 *    optimal DD grid and PME node count,
 *  - set up the PME decomposition dimensions and static load-balancing
 *    fractions, and finalize cutoff_mbody.
 * Returns the new dd; npme_x/npme_y are outputs for the PME setup.
 */
6171 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6172 unsigned long Flags,
6174 real comm_distance_min,real rconstr,
6175 const char *dlb_opt,real dlb_scale,
6176 const char *sizex,const char *sizey,const char *sizez,
6177 gmx_mtop_t *mtop,t_inputrec *ir,
6180 int *npme_x,int *npme_y)
6183 gmx_domdec_comm_t *comm;
6186 real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6193 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6198 dd->comm = init_dd_comm();
/* NOTE(review): these two snew calls duplicate allocations already done
 * inside init_dd_comm() above -- confirm against the full source.
 */
6200 snew(comm->cggl_flag,DIM*2);
6201 snew(comm->cgcm_state,DIM*2);
6203 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6204 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
/* Debug/tuning switches from the environment. */
6206 dd->bSendRecv2 = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6207 comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6208 comm->eFlop = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6209 recload = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6210 comm->nstSortCG = dd_nst_env(fplog,"GMX_DD_SORT",1);
6211 comm->nstDDDump = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6212 comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6213 comm->DD_debug = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6215 dd->pme_recv_f_alloc = 0;
6216 dd->pme_recv_f_buf = NULL;
6218 if (dd->bSendRecv2 && fplog)
6220 fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n")
6226 fprintf(fplog,"Will load balance based on FLOP count\n");
/* eFlop > 1 adds rank-dependent noise for load-balance testing. */
6228 if (comm->eFlop > 1)
6230 srand(1+cr->nodeid);
6232 comm->bRecordLoad = TRUE;
6236 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6240 comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6242 comm->bDynLoadBal = (comm->eDLB == edlbYES);
6245 fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6247 dd->bGridJump = comm->bDynLoadBal;
6249 if (comm->nstSortCG)
6253 if (comm->nstSortCG == 1)
6255 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6259 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6269 fprintf(fplog,"Will not sort the charge groups\n");
/* Inter-charge-group bondeds exist when there are more charge groups
 * than molecules.
 */
6273 comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6274 if (comm->bInterCGBondeds)
6276 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6280 comm->bInterCGMultiBody = FALSE;
6283 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6285 if (ir->rlistlong == 0)
6287 /* Set the cut-off to some very large value,
6288 * so we don't need if statements everywhere in the code.
6289 * We use sqrt, since the cut-off is squared in some places.
6291 comm->cutoff = GMX_CUTOFF_INF;
6295 comm->cutoff = ir->rlistlong;
6297 comm->cutoff_mbody = 0;
6299 comm->cellsize_limit = 0;
6300 comm->bBondComm = FALSE;
/* Derive the bonded-interaction distance limits (r_bonded_limit). */
6302 if (comm->bInterCGBondeds)
6304 if (comm_distance_min > 0)
6306 comm->cutoff_mbody = comm_distance_min;
6307 if (Flags & MD_DDBONDCOMM)
6309 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6313 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6315 r_bonded_limit = comm->cutoff_mbody;
6317 else if (ir->bPeriodicMols)
6319 /* Can not easily determine the required cut-off */
6320 dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6321 comm->cutoff_mbody = comm->cutoff/2;
6322 r_bonded_limit = comm->cutoff_mbody;
/* Measure the actual bonded distances (two-body r_2b, multi-body r_mb)
 * and broadcast them so all ranks agree.
 */
6328 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6329 Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6331 gmx_bcast(sizeof(r_2b),&r_2b,cr);
6332 gmx_bcast(sizeof(r_mb),&r_mb,cr);
6334 /* We use an initial margin of 10% for the minimum cell size,
6335 * except when we are just below the non-bonded cut-off.
6337 if (Flags & MD_DDBONDCOMM)
6339 if (max(r_2b,r_mb) > comm->cutoff)
6341 r_bonded = max(r_2b,r_mb);
6342 r_bonded_limit = 1.1*r_bonded;
6343 comm->bBondComm = TRUE;
6348 r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6350 /* We determine cutoff_mbody later */
6354 /* No special bonded communication,
6355 * simply increase the DD cut-off.
6357 r_bonded_limit = 1.1*max(r_2b,r_mb);
6358 comm->cutoff_mbody = r_bonded_limit;
6359 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6362 comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6366 "Minimum cell size due to bonded interactions: %.3f nm\n",
6367 comm->cellsize_limit);
/* Account for the constraint (P-LINCS) cell-size limit. */
6371 if (dd->bInterCGcons && rconstr <= 0)
6373 /* There is a cell size limit due to the constraints (P-LINCS) */
6374 rconstr = constr_r_max(fplog,mtop,ir);
6378 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6380 if (rconstr > comm->cellsize_limit)
6382 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6386 else if (rconstr > 0 && fplog)
6388 /* Here we do not check for dd->bInterCGcons,
6389 * because one can also set a cell size limit for virtual sites only
6390 * and at this point we don't know yet if there are intercg v-sites.
6393 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6396 comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6398 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
/* Either use the user-specified grid (validated against the cell-size
 * limit) or let dd_choose_grid pick grid and PME node count.
 */
6402 copy_ivec(nc,dd->nc);
6403 set_dd_dim(fplog,dd);
6404 set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6406 if (cr->npmenodes == -1)
6410 acs = average_cellsize_min(dd,ddbox);
6411 if (acs < comm->cellsize_limit)
6415 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6417 gmx_fatal_collective(FARGS,cr,NULL,
6418 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6419 acs,comm->cellsize_limit);
6424 set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6426 /* We need to choose the optimal DD grid and possibly PME nodes */
6427 limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6428 comm->eDLB!=edlbNO,dlb_scale,
6429 comm->cellsize_limit,comm->cutoff,
6430 comm->bInterCGBondeds,comm->bInterCGMultiBody);
6432 if (dd->nc[XX] == 0)
6434 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6435 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6436 !bC ? "-rdd" : "-rcon",
6437 comm->eDLB!=edlbNO ? " or -dds" : "",
6438 bC ? " or your LINCS settings" : "");
6440 gmx_fatal_collective(FARGS,cr,NULL,
6441 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6443 "Look in the log file for details on the domain decomposition",
6444 cr->nnodes-cr->npmenodes,limit,buf);
6446 set_dd_dim(fplog,dd);
6452 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6453 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
/* Consistency checks: grid size vs node counts. */
6456 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6457 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6459 gmx_fatal_collective(FARGS,cr,NULL,
6460 "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6461 dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6463 if (cr->npmenodes > dd->nnodes)
6465 gmx_fatal_collective(FARGS,cr,NULL,
6466 "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6468 if (cr->npmenodes > 0)
6470 comm->npmenodes = cr->npmenodes;
6474 comm->npmenodes = dd->nnodes;
/* Choose the PME decomposition (1D or 2D) to match comm_cost_est. */
6477 if (EEL_PME(ir->coulombtype))
6479 /* The following choices should match those
6480 * in comm_cost_est in domdec_setup.c.
6481 * Note that here the checks have to take into account
6482 * that the decomposition might occur in a different order than xyz
6483 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6484 * in which case they will not match those in comm_cost_est,
6485 * but since that is mainly for testing purposes that's fine.
6487 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6488 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6489 getenv("GMX_PMEONEDD") == NULL)
6491 comm->npmedecompdim = 2;
6492 comm->npmenodes_x = dd->nc[XX];
6493 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6497 /* In case nc is 1 in both x and y we could still choose to
6498 * decompose pme in y instead of x, but we use x for simplicity.
6500 comm->npmedecompdim = 1;
6501 if (dd->dim[0] == YY)
6503 comm->npmenodes_x = 1;
6504 comm->npmenodes_y = comm->npmenodes;
6508 comm->npmenodes_x = comm->npmenodes;
6509 comm->npmenodes_y = 1;
6514 fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6515 comm->npmenodes_x,comm->npmenodes_y,1);
6520 comm->npmedecompdim = 0;
6521 comm->npmenodes_x = 0;
6522 comm->npmenodes_y = 0;
6525 /* Technically we don't need both of these,
6526 * but it simplifies code not having to recalculate it.
6528 *npme_x = comm->npmenodes_x;
6529 *npme_y = comm->npmenodes_y;
/* Static load-balancing fractions (only without DLB). */
6531 snew(comm->slb_frac,DIM);
6532 if (comm->eDLB == edlbNO)
6534 comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6535 comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6536 comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
/* Finalize cutoff_mbody when it was not fixed by the user above. */
6539 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6541 if (comm->bBondComm || comm->eDLB != edlbNO)
6543 /* Set the bonded communication distance to halfway
6544 * the minimum and the maximum,
6545 * since the extra communication cost is nearly zero.
6547 acs = average_cellsize_min(dd,ddbox);
6548 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6549 if (comm->eDLB != edlbNO)
6551 /* Check if this does not limit the scaling */
6552 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6554 if (!comm->bBondComm)
6556 /* Without bBondComm do not go beyond the n.b. cut-off */
6557 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6558 if (comm->cellsize_limit >= comm->cutoff)
6560 /* We don't loose a lot of efficieny
6561 * when increasing it to the n.b. cut-off.
6562 * It can even be slightly faster, because we need
6563 * less checks for the communication setup.
6565 comm->cutoff_mbody = comm->cutoff;
6568 /* Check if we did not end up below our original limit */
6569 comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6571 if (comm->cutoff_mbody > comm->cellsize_limit)
6573 comm->cellsize_limit = comm->cutoff_mbody;
6576 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6581 fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6582 "cellsize limit %f\n",
6583 comm->bBondComm,comm->cellsize_limit);
6588 check_dd_restrictions(cr,dd,ir,fplog);
6591 comm->globalcomm_step = INT_MIN;
6594 clear_dd_cycle_counts(dd);
/* Activate the DLB-specific limits: switch each dimension's pulse count
 * and minimum cell size to the values precomputed for dynamic load
 * balancing (np_dlb, cellsize_min_dlb).
 */
6599 static void set_dlb_limits(gmx_domdec_t *dd)
6604 for(d=0; d<dd->ndim; d++)
6606 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6607 dd->comm->cellsize_min[dd->dim[d]] =
6608 dd->comm->cellsize_min_dlb[dd->dim[d]];
/* Switch dynamic load balancing on at run time (DLB "auto" mode).
 * Refuses (and permanently disables DLB) when the current minimum cell
 * size leaves less than 5% headroom over the cell-size limit.  On
 * success, initializes the row-root cell boundaries to a uniform grid
 * so no communication is needed for the initial DLB state.
 */
6613 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6616 gmx_domdec_comm_t *comm;
6626 fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6629 cellsize_min = comm->cellsize_min[dd->dim[0]];
6630 for(d=1; d<dd->ndim; d++)
6632 cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6635 if (cellsize_min < comm->cellsize_limit*1.05)
6637 dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6639 /* Change DLB from "auto" to "no". */
6640 comm->eDLB = edlbNO;
6645 dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6646 comm->bDynLoadBal = TRUE;
6647 dd->bGridJump = TRUE;
6651 /* We can set the required cell size info here,
6652 * so we do not need to communicate this.
6653 * The grid is completely uniform.
6655 for(d=0; d<dd->ndim; d++)
6659 comm->load[d].sum_m = comm->load[d].sum;
6661 nc = dd->nc[dd->dim[d]];
/* Uniform relative cell boundaries: cell i spans [i/nc, (i+1)/nc). */
6664 comm->root[d]->cell_f[i] = i/(real)nc;
6667 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6668 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6671 comm->root[d]->cell_f[nc] = 1.0;
/* Build the per-charge-group locality flag array, one char per global
 * charge group, all cleared to FALSE.
 * NOTE(review): the allocation line (presumably snew) and the return
 * statement are elided in this listing — confirm against the full source.
 */
6676 static char *init_bLocalCG(gmx_mtop_t *mtop)
6681     ncg = ncg_mtop(mtop);
6683     for(cg=0; cg<ncg; cg++)
6685         bLocalCG[cg] = FALSE;
/* Initialize the data needed to assign bonded interactions during domain
 * decomposition: builds the reverse topology, and — when communication
 * beyond the non-bonded cut-off is requested (bBondComm) — the charge-group
 * link table plus the bLocalCG presence flags used to find missing links.
 */
6691 void dd_init_bondeds(FILE *fplog,
6692                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6693                      gmx_vsite_t *vsite,gmx_constr_t constr,
6694                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6696     gmx_domdec_comm_t *comm;
6700     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6704     if (comm->bBondComm)
6706         /* Communicate atoms beyond the cut-off for bonded interactions */
6709         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6711         comm->bLocalCG = init_bLocalCG(mtop);
6715         /* Only communicate atoms based on cut-off */
/* No link table needed: cut-off distance alone decides what is sent */
6716         comm->cglink = NULL;
6717         comm->bLocalCG = NULL;
/* Write a human-readable summary of the DD setup to the log file:
 * pulse counts, minimum/initial cell sizes, allowed DLB shrink factors,
 * and the maximum allowed interaction distances (-rdd / -rcon).
 * Pure reporting — only reads the comm/ddbox state, plus one call to
 * set_dd_cell_sizes_slb() to obtain the static-load-balancing pulse counts.
 * NOTE(review): branch structure (if/else for bDynLoadBal) is partly
 * elided in this listing; comments only were added.
 */
6721 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6723                               gmx_bool bDynLoadBal,real dlb_scale,
6726     gmx_domdec_comm_t *comm;
/* --- DLB branch: report the DLB pulse counts and shrink limits --- */
6741         fprintf(fplog,"The maximum number of communication pulses is:");
6742         for(d=0; d<dd->ndim; d++)
6744             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6746         fprintf(fplog,"\n");
6747         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6748         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6749         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6750         for(d=0; d<DIM; d++)
6754             if (d >= ddbox->npbcdim && dd->nc[d] == 2)
/* shrink = DLB minimum cell size relative to the uniform cell size */
6761                 comm->cellsize_min_dlb[d]/
6762                 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6764             fprintf(fplog," %c %.2f",dim2char(d),shrink);
6767         fprintf(fplog,"\n");
/* --- static (no-DLB) branch: report initial pulses and cell sizes --- */
6771         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6772         fprintf(fplog,"The initial number of communication pulses is:");
6773         for(d=0; d<dd->ndim; d++)
6775             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6777         fprintf(fplog,"\n");
6778         fprintf(fplog,"The initial domain decomposition cell size is:");
6779         for(d=0; d<DIM; d++) {
6782                 fprintf(fplog," %c %.2f nm",
6783                         dim2char(d),dd->comm->cellsize_min[d]);
6786         fprintf(fplog,"\n\n");
/* --- interaction distance limits, only relevant with inter-cg bondeds,
 *     vsite communication or constraint communication --- */
6789     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6791         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6792         fprintf(fplog,"%40s %-7s %6.3f nm\n",
6793                 "non-bonded interactions","",comm->cutoff);
6797             limit = dd->comm->cellsize_limit;
6801             if (dynamic_dd_box(ddbox,ir))
6803                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
/* The relevant limit here is the smallest cell size over all dimensions */
6805             limit = dd->comm->cellsize_min[XX];
6806             for(d=1; d<DIM; d++)
6808                 limit = min(limit,dd->comm->cellsize_min[d]);
6812         if (comm->bInterCGBondeds)
6814             fprintf(fplog,"%40s %-7s %6.3f nm\n",
6815                     "two-body bonded interactions","(-rdd)",
6816                     max(comm->cutoff,comm->cutoff_mbody));
6817             fprintf(fplog,"%40s %-7s %6.3f nm\n",
6818                     "multi-body bonded interactions","(-rdd)",
6819                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6823             fprintf(fplog,"%40s %-7s %6.3f nm\n",
6824                     "virtual site constructions","(-rcon)",limit);
6826         if (dd->constraint_comm)
6828             sprintf(buf,"atoms separated by up to %d constraints",
6830             fprintf(fplog,"%40s %-7s %6.3f nm\n",
6831                     buf,"(-rcon)",limit);
6833         fprintf(fplog,"\n");
/* Finalize the DD parameters once the input record and force record are
 * known: sets up PME decomposition, decides whether molecules need PBC
 * treatment for bondeds, determines the number of communication pulses
 * per dimension for DLB, derives the cell size limits, prints the settings
 * and initializes the global-to-local atom lookup (ga2la).
 * NOTE(review): several lines (braces, else branches, some assignments)
 * are elided in this listing; comments only were added.
 */
6839 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6840                        t_inputrec *ir,t_forcerec *fr,
6843     gmx_domdec_comm_t *comm;
6844     int  d,dim,npulse,npulse_d_max,npulse_d;
/* A zero vdW or Coulomb radius means there is effectively no cut-off */
6851     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6853     if (EEL_PME(ir->coulombtype))
6855         init_ddpme(dd,&comm->ddpme[0],0);
6856         if (comm->npmedecompdim >= 2)
6858             init_ddpme(dd,&comm->ddpme[1],1);
6863         comm->npmenodes = 0;
6864         if (dd->pme_nodeid >= 0)
6866             gmx_fatal_collective(FARGS,NULL,dd,
6867                                  "Can not have separate PME nodes without PME electrostatics");
6871     /* If each molecule is a single charge group
6872      * or we use domain decomposition for each periodic dimension,
6873      * we do not need to take pbc into account for the bonded interactions.
6875     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6876         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6878         fr->bMolPBC = FALSE;
6887         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6889     if (comm->eDLB != edlbNO)
6891         /* Determine the maximum number of comm. pulses in one dimension */
6893         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6895         /* Determine the maximum required number of grid pulses */
6896         if (comm->cellsize_limit >= comm->cutoff)
6898             /* Only a single pulse is required */
6901         else if (!bNoCutOff && comm->cellsize_limit > 0)
6903             /* We round down slightly here to avoid overhead due to the latency
6904              * of extra communication calls when the cut-off
6905              * would be only slightly longer than the cell size.
6906              * Later cellsize_limit is redetermined,
6907              * so we can not miss interactions due to this rounding.
6909             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6913             /* There is no cell size limit */
/* Worst case: communicate with every other cell along the largest axis */
6914             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6917         if (!bNoCutOff && npulse > 1)
6919             /* See if we can do with less pulses, based on dlb_scale */
6921             for(d=0; d<dd->ndim; d++)
6924                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6925                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6926                 npulse_d_max = max(npulse_d_max,npulse_d);
6928             npulse = min(npulse,npulse_d_max);
6931         /* This env var can override npulse */
6932         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
/* With vacuum (no PBC) DLB is unlimited unless a pulse count caps it */
6939         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6940         for(d=0; d<dd->ndim; d++)
6942             comm->cd[d].np_dlb    = min(npulse,dd->nc[dd->dim[d]]-1);
6943             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6944             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6945             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6946             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6948                 comm->bVacDLBNoLimit = FALSE;
6952         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6953         if (!comm->bVacDLBNoLimit)
6955             comm->cellsize_limit = max(comm->cellsize_limit,
6956                                        comm->cutoff/comm->maxpulse);
6958         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6959         /* Set the minimum cell size for each DD dimension */
6960         for(d=0; d<dd->ndim; d++)
6962             if (comm->bVacDLBNoLimit ||
6963                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6965                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
/* Otherwise the pulse count limits the cell size: np_dlb cells must
 * together cover the cut-off distance */
6969                 comm->cellsize_min_dlb[dd->dim[d]] =
6970                     comm->cutoff/comm->cd[d].np_dlb;
6973         if (comm->cutoff_mbody <= 0)
6975             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6977         if (comm->bDynLoadBal)
6983     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6984     if (comm->eDLB == edlbAUTO)
6988             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6990         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6993     if (ir->ePBC == epbcNONE)
6995         vol_frac = 1 - 1/(double)dd->nnodes;
7000             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7004         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
/* Size the global->local hash on the expected local atom fraction */
7006     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7008     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
/* Merge newly received charge groups (recv_i / recv_vr) for the current
 * pulse into the per-zone arrays, shifting the data already stored by
 * previous pulses to make room and correcting all dependent indices
 * (cgindex, send indices of earlier pulses, zone ranges).
 * Used on the non-in-place communication path; never called with bBondComm
 * (see the caller's comment).
 * NOTE(review): some brace/initializer lines are elided in this listing.
 */
7011 static void merge_cg_buffers(int ncell,
7012                              gmx_domdec_comm_dim_t *cd, int pulse,
7014                              int  *index_gl, int  *recv_i,
7015                              rvec *cg_cm,    rvec *recv_vr,
7017                              cginfo_mb_t *cginfo_mb,int *cginfo)
7019     gmx_domdec_ind_t *ind,*ind_p;
7020     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7023     ind = &cd->ind[pulse];
7025     /* First correct the already stored data */
7026     shift = ind->nrecv[ncell];
/* Walk the cells backwards so data is shifted up without overwrites */
7027     for(cell=ncell-1; cell>=0; cell--)
7029         shift -= ind->nrecv[cell];
7032             /* Move the cg's present from previous grid pulses */
7033             cg0 = ncg_cell[ncell+cell];
7034             cg1 = ncg_cell[ncell+cell+1];
7035             cgindex[cg1+shift] = cgindex[cg1];
7036             for(cg=cg1-1; cg>=cg0; cg--)
7038                 index_gl[cg+shift] = index_gl[cg];
7039                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7040                 cgindex[cg+shift] = cgindex[cg];
7041                 cginfo[cg+shift] = cginfo[cg];
7043             /* Correct the already stored send indices for the shift */
7044             for(p=1; p<=pulse; p++)
7046                 ind_p = &cd->ind[p];
/* Skip the send counts of the cells before this one to find our offset */
7048                 for(c=0; c<cell; c++)
7050                     cg0 += ind_p->nsend[c];
7052                 cg1 = cg0 + ind_p->nsend[cell];
7053                 for(cg=cg0; cg<cg1; cg++)
7055                     ind_p->index[cg] += shift;
7061     /* Merge in the communicated buffers */
7065     for(cell=0; cell<ncell; cell++)
7067         cg1 = ncg_cell[ncell+cell+1] + shift;
7070             /* Correct the old cg indices */
7071             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7073                 cgindex[cg+1] += shift_at;
7076         for(cg=0; cg<ind->nrecv[cell]; cg++)
7078             /* Copy this charge group from the buffer */
7079             index_gl[cg1] = recv_i[cg0];
7080             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7081             /* Add it to the cgindex */
7082             cg_gl = index_gl[cg1];
7083             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7084             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7085             cgindex[cg1+1] = cgindex[cg1] + nat;
7090         shift += ind->nrecv[cell];
7091         ncg_cell[ncell+cell+1] = cg1;
/* Record, for every zone and every pulse, the [cell2at0, cell2at1) atom
 * range that the received charge groups occupy, so communication buffers
 * can later be copied as contiguous atom blocks (dd_move_x / dd_move_f).
 * `cg` runs cumulatively from cg0 over the receive counts of all pulses.
 */
7095 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7096                                int nzone,int cg0,const int *cgindex)
7100     /* Store the atom block boundaries for easy copying of communication buffers
7103     for(zone=0; zone<nzone; zone++)
7105         for(p=0; p<cd->np; p++) {
7106             cd->ind[p].cell2at0[zone] = cgindex[cg];
7107             cg += cd->ind[p].nrecv[zone];
7108             cd->ind[p].cell2at1[zone] = cgindex[cg];
/* Return whether global charge group cg_gl is bonded-linked to at least one
 * charge group that is not (yet) present locally, by scanning its entries
 * in the link block against the bLocalCG presence flags.
 * NOTE(review): the return statements are elided in this listing.
 */
7113 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7119     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7121         if (!bLocalCG[link->a[i]])
/* Build the halo communication setup for this decomposition step.
 * For each decomposed dimension and each pulse, this selects the charge
 * groups whose distance to the receiving zone's boundary is within the
 * non-bonded cut-off (r_comm2) or, on the first pulse, within the bonded
 * cut-off (r_bcomm2) for multi-body/two-body bondeds, then exchanges the
 * counts, global cg indices and cg centers with the neighbor ranks and
 * extends the local zone bookkeeping (zone_cg_range, cgindex, cginfo,
 * nat_tot, bLocalCG).  Handles rectangular and triclinic cells, zone
 * "rounding" corners, screw PBC, and the non-in-place receive path
 * (merge_cg_buffers).
 * NOTE(review): this listing elides many physical lines (braces, some
 * assignments, else keywords); comments only were added, code untouched.
 */
7130 static void setup_dd_communication(gmx_domdec_t *dd,
7131                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7133     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7134     int nzone,nzone_send,zone,zonei,cg0,cg1;
7135     int c,i,j,cg,cg_gl,nrcg;
7136     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7137     gmx_domdec_comm_t *comm;
7138     gmx_domdec_zones_t *zones;
7139     gmx_domdec_comm_dim_t *cd;
7140     gmx_domdec_ind_t *ind;
7141     cginfo_mb_t *cginfo_mb;
7142     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7143     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7145     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7146     real bcorner[DIM],bcorner_round_1=0;
7148     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7149     real skew_fac2_d,skew_fac_01;
7155         fprintf(debug,"Setting up DD communication\n");
/* A dimension needs triclinic distance handling if it, or any earlier
 * decomposed dimension, has a triclinic direction */
7161     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7163         dim = dd->dim[dim_ind];
7165         /* Check if we need to use triclinic distances */
7166         tric_dist[dim_ind] = 0;
7167         for(i=0; i<=dim_ind; i++)
7169             if (ddbox->tric_dir[dd->dim[i]])
7171                 tric_dist[dim_ind] = 1;
7176     bBondComm = comm->bBondComm;
7178     /* Do we need to determine extra distances for multi-body bondeds? */
7179     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7181     /* Do we need to determine extra distances for only two-body bondeds? */
7182     bDist2B = (bBondComm && !bDistMB);
7184     r_comm2  = sqr(comm->cutoff);
7185     r_bcomm2 = sqr(comm->cutoff_mbody);
7189         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7192     zones = &comm->zones;
/* --- Determine the zone corner positions used in the distance checks --- */
7195     /* The first dimension is equal for all cells */
7196     corner[0][0] = comm->cell_x0[dim0];
7199         bcorner[0] = corner[0][0];
7204         /* This cell row is only seen from the first row */
7205         corner[1][0] = comm->cell_x0[dim1];
7206         /* All rows can see this row */
7207         corner[1][1] = comm->cell_x0[dim1];
7210             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7213             /* For the multi-body distance we need the maximum */
7214             bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7217         /* Set the upper-right corner for rounding */
7218         corner_round_0 = comm->cell_x1[dim0];
7225             corner[2][j] = comm->cell_x0[dim2];
7229             /* Use the maximum of the i-cells that see a j-cell */
7230             for(i=0; i<zones->nizone; i++)
7232                 for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7238                                 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7244                 /* For the multi-body distance we need the maximum */
7245                 bcorner[2] = comm->cell_x0[dim2];
7250                         bcorner[2] = max(bcorner[2],
7251                                          comm->zone_d2[i][j].p1_0);
7257         /* Set the upper-right corner for rounding */
7258         /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7259          * Only cell (0,0,0) can see cell 7 (1,1,1)
7261         corner_round_1[0] = comm->cell_x1[dim1];
7262         corner_round_1[3] = comm->cell_x1[dim1];
7265             corner_round_1[0] = max(comm->cell_x1[dim1],
7266                                     comm->zone_d1[1].mch1);
7269                 /* For the multi-body distance we need the maximum */
7270                 bcorner_round_1 = max(comm->cell_x1[dim1],
7271                                       comm->zone_d1[1].p1_1);
7277     /* Triclinic stuff */
7278     normal = ddbox->normal;
7282         v_0 = ddbox->v[dim0];
7283         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7285             /* Determine the coupling coefficient for the distances
7286              * to the cell planes along dim0 and dim1 through dim2.
7287              * This is required for correct rounding.
7290                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7293                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7299         v_1 = ddbox->v[dim1];
/* --- Initialize the zone bookkeeping with the home zone --- */
7302     zone_cg_range = zones->cg_range;
7303     index_gl = dd->index_gl;
7304     cgindex  = dd->cgindex;
7305     cginfo_mb = fr->cginfo_mb;
7307     zone_cg_range[0]   = 0;
7308     zone_cg_range[1]   = dd->ncg_home;
7309     comm->zone_ncg1[0] = dd->ncg_home;
7310     pos_cg             = dd->ncg_home;
7312     nat_tot = dd->nat_home;
/* --- Main loop: one communication setup per decomposed dimension --- */
7314     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7316         dim = dd->dim[dim_ind];
7317         cd = &comm->cd[dim_ind];
7319         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7321             /* No pbc in this dimension, the first node should not comm. */
7329         bScrew = (dd->bScrewPBC && dim == XX);
7331         v_d = ddbox->v[dim];
7332         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7334         cd->bInPlace = TRUE;
7335         for(p=0; p<cd->np; p++)
7337             /* Only atoms communicated in the first pulse are used
7338              * for multi-body bonded interactions or for bBondComm.
7340             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7341             bDistMB_pulse = (bDistMB && bDistBonded);
7346             for(zone=0; zone<nzone_send; zone++)
7348                 if (tric_dist[dim_ind] && dim_ind > 0)
7350                     /* Determine slightly more optimized skew_fac's
7352                      * This reduces the number of communicated atoms
7353                      * by about 10% for 3D DD of rhombic dodecahedra.
7355                     for(dimd=0; dimd<dim; dimd++)
7357                         sf2_round[dimd] = 1;
7358                         if (ddbox->tric_dir[dimd])
7360                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7362                                 /* If we are shifted in dimension i
7363                                  * and the cell plane is tilted forward
7364                                  * in dimension i, skip this coupling.
7366                                 if (!(zones->shift[nzone+zone][i] &&
7367                                       ddbox->v[dimd][i][dimd] >= 0))
7370                                         sqr(ddbox->v[dimd][i][dimd]);
7373                             sf2_round[dimd] = 1/sf2_round[dimd];
7378                 zonei = zone_perm[dim_ind][zone];
7381                     /* Here we permutate the zones to obtain a convenient order
7382                      * for neighbor searching
7384                     cg0 = zone_cg_range[zonei];
7385                     cg1 = zone_cg_range[zonei+1];
7389                     /* Look only at the cg's received in the previous grid pulse
7391                     cg1 = zone_cg_range[nzone+zone+1];
7392                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7394                 ind->nsend[zone] = 0;
/* --- Distance test per charge group: decide whether to send it --- */
7395                 for(cg=cg0; cg<cg1; cg++)
7399                     if (tric_dist[dim_ind] == 0)
7401                         /* Rectangular direction, easy */
7402                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7409                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7415                         /* Rounding gives at most a 16% reduction
7416                          * in communicated atoms
7418                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7420                             r = cg_cm[cg][dim0] - corner_round_0;
7421                             /* This is the first dimension, so always r >= 0 */
7428                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7430                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7437                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7447                         /* Triclinic direction, more complicated */
7450                         /* Rounding, conservative as the skew_fac multiplication
7451                          * will slightly underestimate the distance.
7453                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7455                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7456                             for(i=dim0+1; i<DIM; i++)
7458                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7460                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7463                                 rb[dim0] = rn[dim0];
7466                             /* Take care that the cell planes along dim0 might not
7467                              * be orthogonal to those along dim1 and dim2.
7469                             for(i=1; i<=dim_ind; i++)
7472                                 if (normal[dim0][dimd] > 0)
7474                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7477                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7482                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7484                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7486                             for(i=dim1+1; i<DIM; i++)
7488                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7490                             rn[dim1] += tric_sh;
7493                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7494                                 /* Take care of coupling of the distances
7495                                  * to the planes along dim0 and dim1 through dim2.
7497                                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7498                                 /* Take care that the cell planes along dim1
7499                                  * might not be orthogonal to that along dim2.
7501                                 if (normal[dim1][dim2] > 0)
7503                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7509                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7512                                 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7513                                 /* Take care of coupling of the distances
7514                                  * to the planes along dim0 and dim1 through dim2.
7516                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7517                                 /* Take care that the cell planes along dim1
7518                                  * might not be orthogonal to that along dim2.
7520                                 if (normal[dim1][dim2] > 0)
7522                                     rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7527                         /* The distance along the communication direction */
7528                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7530                         for(i=dim+1; i<DIM; i++)
7532                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7537                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7538                             /* Take care of coupling of the distances
7539                              * to the planes along dim0 and dim1 through dim2.
7541                             if (dim_ind == 1 && zonei == 1)
7543                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7549                                 rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7552                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7553                                 /* Take care of coupling of the distances
7554                                  * to the planes along dim0 and dim1 through dim2.
7556                                 if (dim_ind == 1 && zonei == 1)
7558                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
/* Send when within the non-bonded cut-off, or (first pulse) within the
 * bonded cut-off AND the cg has inter-cg bondeds with a missing link */
7566                          ((bDistMB && rb2 < r_bcomm2) ||
7567                           (bDist2B && r2  < r_bcomm2)) &&
7569                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7570                            missing_link(comm->cglink,index_gl[cg],
7573                         /* Make an index to the local charge groups */
7574                         if (nsend+1 > ind->nalloc)
7576                             ind->nalloc = over_alloc_large(nsend+1);
7577                             srenew(ind->index,ind->nalloc);
7579                         if (nsend+1 > comm->nalloc_int)
7581                             comm->nalloc_int = over_alloc_large(nsend+1);
7582                             srenew(comm->buf_int,comm->nalloc_int);
7584                         ind->index[nsend] = cg;
7585                         comm->buf_int[nsend] = index_gl[cg];
7587                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7589                         if (dd->ci[dim] == 0)
7591                             /* Correct cg_cm for pbc */
7592                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
/* Screw PBC: mirror y and z when wrapping around x */
7595                                 comm->vbuf.v[nsend][YY] =
7596                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7597                                 comm->vbuf.v[nsend][ZZ] =
7598                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7603                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7606                         nat += cgindex[cg+1] - cgindex[cg];
7610             /* Clear the counts in case we do not have pbc */
7611             for(zone=nzone_send; zone<nzone; zone++)
7613                 ind->nsend[zone] = 0;
7615             ind->nsend[nzone]   = nsend;
7616             ind->nsend[nzone+1] = nat;
7617             /* Communicate the number of cg's and atoms to receive */
7618             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7619                             ind->nsend, nzone+2,
7620                             ind->nrecv, nzone+2);
7622             /* The rvec buffer is also required for atom buffers of size nsend
7623              * in dd_move_x and dd_move_f.
7625             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7629                 /* We can receive in place if only the last zone is not empty */
7630                 for(zone=0; zone<nzone-1; zone++)
7632                     if (ind->nrecv[zone] > 0)
7634                         cd->bInPlace = FALSE;
7639                     /* The int buffer is only required here for the cg indices */
7640                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7642                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7643                         srenew(comm->buf_int2,comm->nalloc_int2);
7645                     /* The rvec buffer is also required for atom buffers
7646                      * of size nrecv in dd_move_x and dd_move_f.
7648                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7649                     vec_rvec_check_alloc(&comm->vbuf2,i);
7653             /* Make space for the global cg indices */
7654             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7655                 || dd->cg_nalloc == 0)
7657                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7658                 srenew(index_gl,dd->cg_nalloc);
7659                 srenew(cgindex,dd->cg_nalloc+1);
7661             /* Communicate the global cg indices */
7664                 recv_i = index_gl + pos_cg;
7668                 recv_i = comm->buf_int2;
7670             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7671                             comm->buf_int, nsend,
7672                             recv_i,        ind->nrecv[nzone]);
7674             /* Make space for cg_cm */
7675             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7677                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7680             /* Communicate cg_cm */
7683                 recv_vr = cg_cm + pos_cg;
7687                 recv_vr = comm->vbuf2.v;
7689             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7690                              comm->vbuf.v, nsend,
7691                              recv_vr,      ind->nrecv[nzone]);
7693             /* Make the charge group index */
/* In-place path: append the received cgs directly to the zone arrays */
7696                 zone = (p == 0 ? 0 : nzone - 1);
7697                 while (zone < nzone)
7699                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7701                         cg_gl = index_gl[pos_cg];
7702                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7703                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7704                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7707                             /* Update the charge group presence,
7708                              * so we can use it in the next pass of the loop.
7710                             comm->bLocalCG[cg_gl] = TRUE;
7716                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7719                     zone_cg_range[nzone+zone] = pos_cg;
7724                 /* This part of the code is never executed with bBondComm. */
7725                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7726                                  index_gl,recv_i,cg_cm,recv_vr,
7727                                  cgindex,fr->cginfo_mb,fr->cginfo);
7728                 pos_cg += ind->nrecv[nzone];
7730             nat_tot += ind->nrecv[nzone+1];
7734             /* Store the atom block for easy copying of communication buffers */
7735             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
/* Buffers may have been reallocated: store the (possibly new) pointers */
7739     dd->index_gl = index_gl;
7740     dd->cgindex  = cgindex;
7742     dd->ncg_tot = zone_cg_range[zones->n];
7743     dd->nat_tot = nat_tot;
7744     comm->nat[ddnatHOME] = dd->nat_home;
7745     for(i=ddnatZONE; i<ddnatNR; i++)
7747         comm->nat[i] = dd->nat_tot;
7752         /* We don't need to update cginfo, since that was alrady done above.
7753          * So we pass NULL for the forcerec.
7755         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7756                       NULL,comm->bLocalCG);
7761         fprintf(debug,"Finished setting up DD communication, zones:");
7762         for(c=0; c<zones->n; c++)
7764             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7766         fprintf(debug,"\n");
/* Derive, for every i-zone, the charge-group boundaries used in neighbor
 * searching: its own upper bound (cg1) and the j-zone cg range
 * [jcg0, jcg1) it interacts with, all read from zones->cg_range.
 */
7770 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7774     for(c=0; c<zones->nizone; c++)
7776         zones->izone[c].cg1  = zones->cg_range[c+1];
7777         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7778         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
/* qsort comparator for gmx_cgsort_t: primary key is the ns grid cell
 * index (nsc), tie-broken by the global topology index (ind_gl), giving a
 * deterministic total order.
 * NOTE(review): the `if (comp == 0)` wrapper and return are elided in
 * this listing.
 */
7782 static int comp_cgsort(const void *a,const void *b)
7786     gmx_cgsort_t *cga,*cgb;
7787     cga = (gmx_cgsort_t *)a;
7788     cgb = (gmx_cgsort_t *)b;
7790     comp = cga->nsc - cgb->nsc;
7793         comp = cga->ind_gl - cgb->ind_gl;
/* Permute an int array `a` of length n into the order given by
 * sort[i].ind, using `buf` as scratch, then copy the result back.
 */
7799 static void order_int_cg(int n,gmx_cgsort_t *sort,
7804     /* Order the data */
7807         buf[i] = a[sort[i].ind];
7810     /* Copy back to the original array */
/* Permute an rvec array `v` (one entry per charge group) into the order
 * given by sort[i].ind, using `buf` as scratch, then copy back in place.
 */
7817 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7822     /* Order the data */
7825         copy_rvec(v[sort[i].ind],buf[i]);
7828     /* Copy back to the original array */
7831         copy_rvec(buf[i],v[i]);
/* Permute a per-ATOM rvec array `v` into the sorted charge-group order:
 * for each sorted cg, its atom range [cgindex[ind], cgindex[ind+1]) is
 * copied contiguously into `buf`, then the whole buffer is copied back.
 */
7835 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7838     int a,atot,cg,cg0,cg1,i;
7840     /* Order the data */
7842     for(cg=0; cg<ncg; cg++)
7844         cg0 = cgindex[sort[cg].ind];
7845         cg1 = cgindex[sort[cg].ind+1];
7846         for(i=cg0; i<cg1; i++)
7848             copy_rvec(v[i],buf[a]);
7854     /* Copy back to the original array */
7855     for(a=0; a<atot; a++)
7857         copy_rvec(buf[a],v[a]);
/* Efficiently produce a fully sorted array sort1: sort2 is already
 * ordered, sort_new is not, so qsort only sort_new and then merge the
 * two runs (classic merge step) on (nsc, ind_gl) ascending.
 */
7861 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7862                          int nsort_new,gmx_cgsort_t *sort_new,
7863                          gmx_cgsort_t *sort1)
7867     /* The new indices are not very ordered, so we qsort them */
7868     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7870     /* sort2 is already ordered, so now we can merge the two arrays */
7874     while(i2 < nsort2 || i_new < nsort_new)
/* One side exhausted: drain the other */
7878             sort1[i1++] = sort_new[i_new++];
7880         else if (i_new == nsort_new)
7882             sort1[i1++] = sort2[i2++];
/* Both have elements: take the smaller by (nsc, ind_gl) */
7884         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7885                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7886                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7888             sort1[i1++] = sort2[i2++];
7892             sort1[i1++] = sort_new[i_new++];
/* Sort the home charge groups into ns-grid-cell order and reorder all
 * per-atom and per-cg state accordingly: distributed state vectors
 * (x, v, sd_X, cg_p), cg centers, global cg index, cginfo, and the local
 * cgindex; finally writes the sorted cell indices back into the ns grid.
 * Charge groups flagged with cell_index == 4*ncells (moved to another
 * node) are dropped, shrinking ncg_home/nat_home.
 * Uses an incremental merge sort when the previous ordering is known
 * (ncg_home_old >= 0), otherwise a full qsort.
 * NOTE(review): switch/case scaffolding and some lines are elided in this
 * listing; comments only were added.
 */
7897 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7898                           rvec *cgcm,t_forcerec *fr,t_state *state,
7901     gmx_domdec_sort_t *sort;
7902     gmx_cgsort_t *cgsort,*sort_i;
7903     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7906     sort = dd->comm->sort;
7908     if (dd->ncg_home > sort->sort_nalloc)
7910         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7911         srenew(sort->sort1,sort->sort_nalloc);
7912         srenew(sort->sort2,sort->sort_nalloc);
7915     if (ncg_home_old >= 0)
7917         /* The charge groups that remained in the same ns grid cell
7918          * are completely ordered. So we can sort efficiently by sorting
7919          * the charge groups that did move into the stationary list.
7924         for(i=0; i<dd->ncg_home; i++)
7926             /* Check if this cg did not move to another node */
7927             cell_index = fr->ns.grid->cell_index[i];
7928             if (cell_index !=  4*fr->ns.grid->ncells)
7930                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7932                     /* This cg is new on this node or moved ns grid cell */
7933                     if (nsort_new >= sort->sort_new_nalloc)
7935                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7936                         srenew(sort->sort_new,sort->sort_new_nalloc);
7938                     sort_i = &(sort->sort_new[nsort_new++]);
7942                     /* This cg did not move */
7943                     sort_i = &(sort->sort2[nsort2++]);
7945                 /* Sort on the ns grid cell indices
7946                  * and the global topology index
7948                 sort_i->nsc    = cell_index;
7949                 sort_i->ind_gl = dd->index_gl[i];
7956             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7959         /* Sort efficiently */
7960         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
/* --- Fallback: no usable previous order, do a full qsort --- */
7964         cgsort = sort->sort1;
7966         for(i=0; i<dd->ncg_home; i++)
7968             /* Sort on the ns grid cell indices
7969              * and the global topology index
7971             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7972             cgsort[i].ind_gl = dd->index_gl[i];
7974             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7981             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7983         /* Determine the order of the charge groups using qsort */
7984         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7986     cgsort = sort->sort1;
7988     /* We alloc with the old size, since cgindex is still old */
7989     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7990     vbuf = dd->comm->vbuf.v;
7992     /* Remove the charge groups which are no longer at home here */
7993     dd->ncg_home = ncg_new;
7995     /* Reorder the state */
7996     for(i=0; i<estNR; i++)
7998         if (EST_DISTR(i) && (state->flags & (1<<i)))
8003                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8006                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8009                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8012                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8016             case estDISRE_INITF:
8017             case estDISRE_RM3TAV:
8018             case estORIRE_INITF:
8020                 /* No ordering required */
8023                 gmx_incons("Unknown state entry encountered in dd_sort_state");
/* Reorder the per-cg center-of-geometry array */
8029     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8031     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8033         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8034         srenew(sort->ibuf,sort->ibuf_nalloc);
8037     /* Reorder the global cg index */
8038     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8039     /* Reorder the cginfo */
8040     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8041     /* Rebuild the local cg index */
8043     for(i=0; i<dd->ncg_home; i++)
8045         cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8046         ibuf[i+1] = ibuf[i] + cgsize;
8048     for(i=0; i<dd->ncg_home+1; i++)
8050         dd->cgindex[i] = ibuf[i];
8052     /* Set the home atom number */
8053     dd->nat_home = dd->cgindex[dd->ncg_home];
8055     /* Copy the sorted ns cell indices back to the ns grid struct */
8056     for(i=0; i<dd->ncg_home; i++)
8058         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8060     fr->ns.grid->nr = dd->ncg_home;
/* Accumulate per-category communicated-atom counts into the running
 * statistics: for each category the increment is the difference between
 * consecutive entries of comm->nat (atoms added by that category).
 */
8063 static void add_dd_statistics(gmx_domdec_t *dd)
8065     gmx_domdec_comm_t *comm;
8070     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8072         comm->sum_nat[ddnat-ddnatZONE] +=
8073             comm->nat[ddnat] - comm->nat[ddnat-1];
/* Zero all DD statistics accumulators (per-category atom sums, load step
 * counter, load limit vector) so counting can restart for a fresh run
 * segment.
 */
8078 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8080     gmx_domdec_comm_t *comm;
8085     /* Reset all the statistics and counters for total run counting */
8086     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8088         comm->sum_nat[ddnat-ddnatZONE] = 0;
8092     comm->load_step = 0;
8095     clear_ivec(comm->load_lim);
/* Print the end-of-run DD statistics: sums the per-category communicated
 * atom counts over all ranks (gmx_sumd), then reports the average number
 * of atoms communicated per step for forces, vsites and LINCS, and the
 * load-balance summary when load was recorded during dynamics.
 * NOTE(review): switch/case scaffolding around the per-category prints is
 * elided in this listing.
 */
8100 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8102     gmx_domdec_comm_t *comm;
8106     comm = cr->dd->comm;
/* Reduce the per-rank sums so the printed averages are global */
8108     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8115     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8117     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8119         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8124                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8128             if (cr->dd->vsite_comm)
8131                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8132                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8137             if (cr->dd->constraint_comm)
8140                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8141                         1 + ir->nLincsIter,av);
8145             gmx_incons(" Unknown type for DD statistics");
8148     fprintf(fplog,"\n");
8150     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8152         print_dd_load_av(fplog,cr->dd);
8156 void dd_partition_system(FILE *fplog,
8157 gmx_large_int_t step,
8159 gmx_bool bMasterState,
8161 t_state *state_global,
8162 gmx_mtop_t *top_global,
8164 t_state *state_local,
8167 gmx_localtop_t *top_local,
8170 gmx_shellfc_t shellfc,
8171 gmx_constr_t constr,
8173 gmx_wallcycle_t wcycle,
8177 gmx_domdec_comm_t *comm;
8178 gmx_ddbox_t ddbox={0};
8180 gmx_large_int_t step_pcoupl;
8181 rvec cell_ns_x0,cell_ns_x1;
8182 int i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8183 gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8184 gmx_bool bRedist,bSortCG,bResortAll;
8192 bBoxChanged = (bMasterState || DEFORM(*ir));
8193 if (ir->epc != epcNO)
8195 /* With nstpcouple > 1 pressure coupling happens.
8196 * one step after calculating the pressure.
8197 * Box scaling happens at the end of the MD step,
8198 * after the DD partitioning.
8199 * We therefore have to do DLB in the first partitioning
8200 * after an MD step where P-coupling occured.
8201 * We need to determine the last step in which p-coupling occurred.
8202 * MRS -- need to validate this for vv?
8207 step_pcoupl = step - 1;
8211 step_pcoupl = ((step - 1)/n)*n + 1;
8213 if (step_pcoupl >= comm->globalcomm_step)
8219 bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8221 if (!comm->bDynLoadBal)
8227 /* Should we do dynamic load balacing this step?
8228 * Since it requires (possibly expensive) global communication,
8229 * we might want to do DLB less frequently.
8231 if (bBoxChanged || ir->epc != epcNO)
8233 bDoDLB = bBoxChanged;
8237 bDoDLB = bNStGlobalComm;
8241 /* Check if we have recorded loads on the nodes */
8242 if (comm->bRecordLoad && dd_load_count(comm))
8244 if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8246 /* Check if we should use DLB at the second partitioning
8247 * and every 100 partitionings,
8248 * so the extra communication cost is negligible.
8250 n = max(100,nstglobalcomm);
8251 bCheckDLB = (comm->n_load_collect == 0 ||
8252 comm->n_load_have % n == n-1);
8259 /* Print load every nstlog, first and last step to the log file */
8260 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8261 comm->n_load_collect == 0 ||
8263 (step + ir->nstlist > ir->init_step + ir->nsteps)));
8265 /* Avoid extra communication due to verbose screen output
8266 * when nstglobalcomm is set.
8268 if (bDoDLB || bLogLoad || bCheckDLB ||
8269 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8271 get_load_distribution(dd,wcycle);
8276 dd_print_load(fplog,dd,step-1);
8280 dd_print_load_verbose(dd);
8283 comm->n_load_collect++;
8286 /* Since the timings are node dependent, the master decides */
8290 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8293 fprintf(debug,"step %s, imb loss %f\n",
8294 gmx_step_str(step,sbuf),
8295 dd_force_imb_perf_loss(dd));
8298 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8301 turn_on_dlb(fplog,cr,step);
8306 comm->n_load_have++;
8309 cgs_gl = &comm->cgs_gl;
8314 /* Clear the old state */
8315 clear_dd_indices(dd,0,0);
8317 set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8318 TRUE,cgs_gl,state_global->x,&ddbox);
8320 get_cg_distribution(fplog,step,dd,cgs_gl,
8321 state_global->box,&ddbox,state_global->x);
8323 dd_distribute_state(dd,cgs_gl,
8324 state_global,state_local,f);
8326 dd_make_local_cgs(dd,&top_local->cgs);
8328 if (dd->ncg_home > fr->cg_nalloc)
8330 dd_realloc_fr_cg(fr,dd->ncg_home);
8332 calc_cgcm(fplog,0,dd->ncg_home,
8333 &top_local->cgs,state_local->x,fr->cg_cm);
8335 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8337 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8341 else if (state_local->ddp_count != dd->ddp_count)
8343 if (state_local->ddp_count > dd->ddp_count)
8345 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8348 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8350 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8353 /* Clear the old state */
8354 clear_dd_indices(dd,0,0);
8356 /* Build the new indices */
8357 rebuild_cgindex(dd,cgs_gl->index,state_local);
8358 make_dd_indices(dd,cgs_gl->index,0);
8360 /* Redetermine the cg COMs */
8361 calc_cgcm(fplog,0,dd->ncg_home,
8362 &top_local->cgs,state_local->x,fr->cg_cm);
8364 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8366 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8368 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8369 TRUE,&top_local->cgs,state_local->x,&ddbox);
8371 bRedist = comm->bDynLoadBal;
8375 /* We have the full state, only redistribute the cgs */
8377 /* Clear the non-home indices */
8378 clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8380 /* Avoid global communication for dim's without pbc and -gcom */
8381 if (!bNStGlobalComm)
8383 copy_rvec(comm->box0 ,ddbox.box0 );
8384 copy_rvec(comm->box_size,ddbox.box_size);
8386 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8387 bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8392 /* For dim's without pbc and -gcom */
8393 copy_rvec(ddbox.box0 ,comm->box0 );
8394 copy_rvec(ddbox.box_size,comm->box_size);
8396 set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8399 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8401 write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8404 /* Check if we should sort the charge groups */
8405 if (comm->nstSortCG > 0)
8407 bSortCG = (bMasterState ||
8408 (bRedist && (step % comm->nstSortCG == 0)));
8415 ncg_home_old = dd->ncg_home;
8419 cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8420 state_local,f,fr,mdatoms,
8424 get_nsgrid_boundaries(fr->ns.grid,dd,
8425 state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8426 dd->ncg_home,fr->cg_cm,
8427 cell_ns_x0,cell_ns_x1,&grid_density);
8431 comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8434 copy_ivec(fr->ns.grid->n,ncells_old);
8435 grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8436 state_local->box,cell_ns_x0,cell_ns_x1,
8437 fr->rlistlong,grid_density);
8438 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8439 copy_ivec(ddbox.tric_dir,comm->tric_dir);
8443 /* Sort the state on charge group position.
8444 * This enables exact restarts from this step.
8445 * It also improves performance by about 15% with larger numbers
8446 * of atoms per node.
8449 /* Fill the ns grid with the home cell,
8450 * so we can sort with the indices.
8452 set_zones_ncg_home(dd);
8453 fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8454 0,dd->ncg_home,fr->cg_cm);
8456 /* Check if we can use the old order and ns grid cell indices
8457 * of the charge groups to sort the charge groups efficiently.
8459 bResortAll = (bMasterState ||
8460 fr->ns.grid->n[XX] != ncells_old[XX] ||
8461 fr->ns.grid->n[YY] != ncells_old[YY] ||
8462 fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8466 fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8467 gmx_step_str(step,sbuf),dd->ncg_home);
8469 dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8470 bResortAll ? -1 : ncg_home_old);
8471 /* Rebuild all the indices */
8473 ga2la_clear(dd->ga2la);
8476 /* Setup up the communication and communicate the coordinates */
8477 setup_dd_communication(dd,state_local->box,&ddbox,fr);
8479 /* Set the indices */
8480 make_dd_indices(dd,cgs_gl->index,cg0);
8482 /* Set the charge group boundaries for neighbor searching */
8483 set_cg_boundaries(&comm->zones);
8486 write_dd_pdb("dd_home",step,"dump",top_global,cr,
8487 -1,state_local->x,state_local->box);
8490 /* Extract a local topology from the global topology */
8491 for(i=0; i<dd->ndim; i++)
8493 np[dd->dim[i]] = comm->cd[i].np;
8495 dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8496 comm->cellsize_min,np,
8497 fr,vsite,top_global,top_local);
8499 /* Set up the special atom communication */
8500 n = comm->nat[ddnatZONE];
8501 for(i=ddnatZONE+1; i<ddnatNR; i++)
8506 if (vsite && vsite->n_intercg_vsite)
8508 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8512 if (dd->bInterCGcons)
8514 /* Only for inter-cg constraints we need special code */
8515 n = dd_make_local_constraints(dd,n,top_global,
8516 constr,ir->nProjOrder,
8517 &top_local->idef.il[F_CONSTR]);
8521 gmx_incons("Unknown special atom type setup");
8526 /* Make space for the extra coordinates for virtual site
8527 * or constraint communication.
8529 state_local->natoms = comm->nat[ddnatNR-1];
8530 if (state_local->natoms > state_local->nalloc)
8532 dd_realloc_state(state_local,f,state_local->natoms);
8535 if (fr->bF_NoVirSum)
8537 if (vsite && vsite->n_intercg_vsite)
8539 nat_f_novirsum = comm->nat[ddnatVSITE];
8543 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8545 nat_f_novirsum = dd->nat_tot;
8549 nat_f_novirsum = dd->nat_home;
8558 /* Set the number of atoms required for the force calculation.
8559 * Forces need to be constrained when using a twin-range setup
8560 * or with energy minimization. For simple simulations we could
8561 * avoid some allocation, zeroing and copying, but this is
8562 * probably not worth the complications and checking.
8564 forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8565 dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8567 /* We make all mdatoms up to nat_tot_con.
8568 * We could save some work by only setting invmass
8569 * between nat_tot and nat_tot_con.
8571 /* This call also sets the new number of home particles to dd->nat_home */
8572 atoms2md(top_global,ir,
8573 comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8575 /* Now we have the charges we can sort the FE interactions */
8576 dd_sort_local_top(dd,mdatoms,top_local);
8580 /* Make the local shell stuff, currently no communication is done */
8581 make_local_shells(cr,mdatoms,shellfc);
8584 if (ir->implicit_solvent)
8586 make_local_gb(cr,fr->born,ir->gb_algorithm);
8589 if (!(cr->duty & DUTY_PME))
8591 /* Send the charges to our PME only node */
8592 gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8593 mdatoms->chargeA,mdatoms->chargeB,
8594 dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8599 set_constraints(constr,top_local,ir,mdatoms,cr);
8602 if (ir->ePull != epullNO)
8604 /* Update the local pull groups */
8605 dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8610 /* Update the local rotation groups */
8611 dd_make_local_rotation_groups(dd,ir->rot);
8615 add_dd_statistics(dd);
8617 /* Make sure we only count the cycles for this DD partitioning */
8618 clear_dd_cycle_counts(dd);
8620 /* Because the order of the atoms might have changed since
8621 * the last vsite construction, we need to communicate the constructing
8622 * atom coordinates again (for spreading the forces this MD step).
8624 dd_move_x_vsites(dd,state_local->box,state_local->x);
8626 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8628 dd_move_x(dd,state_local->box,state_local->x);
8629 write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8630 -1,state_local->x,state_local->box);
8635 /* Store the global communication step */
8636 comm->globalcomm_step = step;
8639 /* Increase the DD partitioning counter */
8641 /* The state currently matches this DD partitioning count, store it */
8642 state_local->ddp_count = dd->ddp_count;
8645 /* The DD master node knows the complete cg distribution,
8646 * store the count so we can possibly skip the cg info communication.
8648 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8651 if (comm->DD_debug > 0)
8653 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8654 check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8655 "after partitioning");